RDF converter modified. Progress in creation of RDF structure.

This commit is contained in:
Georgy Litvinov 2020-02-24 18:30:40 +01:00
parent 0a8f3de2ed
commit 1db32e2410
3 changed files with 324 additions and 25 deletions

View file

@ -0,0 +1,91 @@
package writer2latex.rdf;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSSerializer;
import writer2latex.xhtml.XhtmlDocument;
/**
 * One part of a converted document, identified by a path string.
 *
 * <p>The regular expressions used below treat the path as decimal levels
 * separated by single spaces (for example {@code "1 0 2"}). From the path this
 * class derives the item number (the last level) and the parent path (the path
 * without its last level).
 *
 * <p>A part is created either from an {@link XhtmlDocument}, whose content node
 * supplies the {@code path} and {@code name} attributes and the serialized
 * body, or from a bare path, in which case name and body are empty strings.
 */
public class DocumentPart {
    /** Source document; stays {@code null} when the part is built from a bare path. */
    private XhtmlDocument excerptDoc;
    private String path;
    private String itemNumber;
    private String body;
    private String parentPath;
    private String name;

    /**
     * Creates a part from a converted document.
     *
     * @param document document whose content node carries the {@code path} and
     *                 {@code name} attributes and the body markup
     */
    public DocumentPart(XhtmlDocument document) {
        this.excerptDoc = document;
        extractPath();
        extractName();
        extractNumber();
        extractBody();
        calculateParentPath();
    }

    /**
     * Creates a part that has a position in the structure but no content.
     *
     * @param path path of the part; name and body are set to empty strings
     */
    public DocumentPart(String path) {
        this.path = path;
        this.name = "";
        extractNumber();
        this.body = "";
        calculateParentPath();
    }

    /** Reads the {@code name} attribute of the document's content node. */
    private void extractName() {
        Element excerptContentNode = excerptDoc.getContentNode();
        this.name = excerptContentNode.getAttribute("name");
    }

    /** Strips every leading "digits + space" group from the path, leaving the last level. */
    private void extractNumber() {
        itemNumber = path.replaceAll("([0-9]+ )+", "");
    }

    /** @return the path exactly as given or as read from the content node */
    public String getPath() {
        return path;
    }

    /** @return the path with every space replaced by an underscore */
    public String getSafePath() {
        return path.replaceAll(" ", "_");
    }

    /** @return the last level of the path; empty when the path is empty */
    public String getNumber() {
        return itemNumber;
    }

    /** @return serialized markup of the content node's children, or an empty string */
    public String getBody() {
        return body;
    }

    /** @return the path of the enclosing part; empty for a top-level part */
    public String getParentPath() {
        return parentPath;
    }

    /** @return the {@code name} attribute of the content node, or an empty string */
    public String getName() {
        return name;
    }

    /** Reads the {@code path} attribute of the document's content node. */
    private void extractPath() {
        Element excerptContentNode = excerptDoc.getContentNode();
        this.path = excerptContentNode.getAttribute("path");
    }

    /**
     * Derives the parent path by dropping the last " digits" group.
     *
     * <p>A path without any separator is a top-level path and gets an empty
     * parent path. Previously the empty value assigned for one-character paths
     * was immediately overwritten, so a top-level part ended up being its own
     * parent.
     */
    private void calculateParentPath() {
        if (path.indexOf(' ') < 0) {
            parentPath = "";
            return;
        }
        parentPath = path.replaceAll(" [0-9]+$", "");
    }

    /**
     * Serializes every child of the content node, without an XML declaration,
     * and concatenates the results into the body string.
     */
    private void extractBody() {
        Element excerptContentNode = excerptDoc.getContentNode();
        Document document = excerptContentNode.getOwnerDocument();
        DOMImplementationLS domImplLS = (DOMImplementationLS) document.getImplementation();
        LSSerializer serializer = domImplLS.createLSSerializer();
        // Each child is serialized on its own; a declaration per fragment would corrupt the body.
        serializer.getDomConfig().setParameter("xml-declaration", false);

        StringBuilder bodyBuilder = new StringBuilder();
        NodeList children = excerptContentNode.getChildNodes();
        for (int i = 0; i < children.getLength(); i++) {
            Node child = children.item(i);
            bodyBuilder.append(serializer.writeToString(child));
        }
        this.body = bodyBuilder.toString();
    }
}

View file

@ -0,0 +1,143 @@
package writer2latex.rdf;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.Vector;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Resource;
import writer2latex.xhtml.XhtmlDocument;
import org.apache.jena.rdf.model.Property;
/**
 * Builds an RDF model describing the structure of a converted document.
 *
 * <p>Every {@link DocumentPart} becomes a "TOCElement" resource. Parts with a
 * non-empty body also get an "Excerpt" resource holding that body, and parts
 * with a non-empty item number get a "TOCItem" resource that points to the
 * element and is attached to the element of the parent path.
 */
public class DocumentStructure {
    private static final String TOCITEM = "TOCItem";
    private static final String TS = "https://iph.ras.ru/text_structures#";
    private static final String PARSERNAME = "w2phtml";
    private static final String EXCERPT = "Excerpt";
    private static final String TOC_ELEMENT = "TOCElement";

    /** Element resources keyed by part path; filled by {@link #createElements()}. */
    private HashMap<String, Resource> elements;
    /** All known parts keyed by path, including the generated empty ones. */
    private HashMap<String, DocumentPart> inputParts;
    private String documentID = "DOC_ID";
    Model m;

    /** Initializes the empty lookup tables and a default Jena model. */
    private DocumentStructure() {
        this.elements = new HashMap<String, Resource>();
        this.inputParts = new HashMap<String, DocumentPart>();
        this.m = ModelFactory.createDefaultModel();
    }

    /**
     * Registers one part per input document and then adds the empty parts
     * needed for paths that contain a "0" level.
     *
     * @param files converted documents; each is wrapped in a {@link DocumentPart}
     */
    public DocumentStructure(Vector<XhtmlDocument> files) {
        this();
        System.out.println("DocStructure");
        for (XhtmlDocument inputDoc : files) {
            addPart(new DocumentPart(inputDoc));
        }
        addEmptyParts();
    }

    /**
     * For every known path, registers a body-less part for each prefix that
     * ends in a "0" level and is not yet known.
     * NOTE(review): "0" presumably marks a skipped heading level - confirm.
     */
    private void addEmptyParts() {
        // Snapshot the keys: addPart() modifies the map while we walk the paths.
        String[] knownPaths = inputParts.keySet().toArray(new String[0]);
        for (String knownPath : knownPaths) {
            String[] levels = knownPath.split(" ");
            for (int i = 0; i < levels.length; i++) {
                if (!levels[i].equals("0")) {
                    continue;
                }
                String emptyPath = createEmptyPath(levels, i);
                if (!inputParts.containsKey(emptyPath)) {
                    System.out.println("empty path added " + emptyPath);
                    addPart(new DocumentPart(emptyPath));
                }
            }
        }
    }

    /**
     * Joins {@code levels[0..i]} with single spaces.
     *
     * @param levels path split into its levels
     * @param i      index of the last level to include
     * @return the path prefix that ends at level {@code i}
     */
    private String createEmptyPath(String[] levels, int i) {
        StringBuilder emptyPath = new StringBuilder();
        for (int j = 0; j <= i; j++) {
            if (j != 0) {
                emptyPath.append(" ");
            }
            emptyPath.append(levels[j]);
        }
        return emptyPath.toString();
    }

    /** Writes the model to standard output as abbreviated RDF/XML. */
    public void printModel() {
        m.write(System.out, "RDF/XML-ABBREV");
    }

    /** Registers a part under its path, replacing any part already stored there. */
    private void addPart(DocumentPart docExcerpt) {
        inputParts.put(docExcerpt.getPath(), docExcerpt);
    }

    /** Builds the resource URI for a part; the type segment distinguishes the resource kinds. */
    private String resourceUri(String type, DocumentPart part) {
        return TS + type + "/" + PARSERNAME + "_" + documentID + part.getSafePath();
    }

    /** Creates the excerpt resource carrying the part's body as {@code htmlExcerpt}. */
    private Resource createExcerpt(DocumentPart docExcerpt) {
        Resource excerpt = m.createResource(resourceUri(EXCERPT, docExcerpt));
        Property htmlExcerpt = m.createProperty(TS + "htmlExcerpt");
        excerpt.addProperty(htmlExcerpt, docExcerpt.getBody());
        return excerpt;
    }

    /** Creates the element resource for a part, remembers it by path and attaches its excerpt. */
    private void createElement(DocumentPart docPart) {
        Resource element = m.createResource(resourceUri(TOC_ELEMENT, docPart));
        elements.put(docPart.getPath(), element);
        attachExcerpt(docPart, element);
    }

    /**
     * Creates the TOC item for a part, points it at the part's element and
     * attaches it to the parent's element.
     *
     * <p>If no element is registered for the parent path, the item is left
     * unattached and a diagnostic is written to standard error. Previously
     * this case ended in a NullPointerException.
     */
    private void createTOCItem(DocumentPart docPart) {
        Resource tocItem = m.createResource(resourceUri(TOCITEM, docPart));
        Property pointsTo = m.createProperty(TS + "pointsTo");
        Property itemNumber = m.createProperty(TS + "itemNumber");
        Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
        tocItem.addLiteral(itemNumber, docPart.getNumber());
        tocItem.addProperty(pointsTo, elements.get(docPart.getPath()));

        Resource parent = elements.get(docPart.getParentPath());
        if (parent == null) {
            System.err.println("No element for parent path '" + docPart.getParentPath()
                    + "' of part '" + docPart.getPath() + "'; TOC item left unattached");
            return;
        }
        parent.addProperty(hasTOCItem, tocItem);
    }

    /** Links the element to a new excerpt via {@code hasText}; does nothing for an empty body. */
    private void attachExcerpt(DocumentPart docPart, Resource element) {
        if (docPart.getBody().isEmpty()) {
            return;
        }
        Resource excerpt = createExcerpt(docPart);
        Property hasText = m.createProperty(TS + "hasText");
        element.addProperty(hasText, excerpt);
    }

    /** Populates the model: all elements first, then the TOC items that link them. */
    public void createTree() {
        createElements();
        createTOCItems();
    }

    /** Creates a TOC item for every part whose item number is not empty. */
    private void createTOCItems() {
        for (DocumentPart part : inputParts.values()) {
            if (!part.getNumber().equals("")) {
                createTOCItem(part);
            }
        }
    }

    /** Creates an element resource for every registered part. */
    private void createElements() {
        for (DocumentPart part : inputParts.values()) {
            createElement(part);
        }
    }
}

View file

@ -2,57 +2,122 @@ package writer2latex.rdf;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Iterator;
import org.w3c.dom.Element;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.events.Namespace;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.Resource;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSSerializer;
import com.sun.org.apache.xml.internal.utils.NameSpace;
import pro.litvinovg.xml.Debug;
import writer2latex.api.ConverterResult;
import writer2latex.api.OutputFile;
import writer2latex.base.ConverterResultImpl;
import writer2latex.epub.EPUBWriter;
import writer2latex.xhtml.Html5Converter;
import writer2latex.xhtml.Xhtml11Converter;
import writer2latex.xhtml.Converter;
import writer2latex.xhtml.XhtmlDocument;
public final class RDFConverter extends Xhtml11Converter {
public final class RDFConverter extends Converter {
// Constructor
public RDFConverter() {
super();
super(XhtmlDocument.HTML5);
this.isRDF = true;
}
/**
 * Converts the input stream and wraps the result into the RDF package.
 *
 * Disables the open publication structure, runs the inherited conversion with
 * the fixed target name "chapter", prints the "path" attribute of every
 * generated document's content node to standard output, and then builds the
 * package.
 *
 * @param is              stream containing the source document
 * @param sTargetFileName file name passed on to createPackage(ConverterResult, String)
 * @return the result of createPackage(xhtmlResult, sTargetFileName)
 * @throws IOException declared by the overridden method
 */
@Override public ConverterResult convert(InputStream is, String sTargetFileName) throws IOException {
setOpenPubStructure(false);
ConverterResult xhtmlResult = super.convert(is, "chapter");
// Diagnostic pass: list the path attribute of each produced excerpt.
Iterator<XhtmlDocument> excerptIterator = this.outFiles.iterator();
while (excerptIterator.hasNext()) {
XhtmlDocument excerptDoc = excerptIterator.next();
Element excerptContentNode = excerptDoc.getContentNode();
String exPath = excerptContentNode.getAttribute("path");
System.out.println(exPath);
}
/*
* Iterator<OutputFile> iterator = xhtmlResult.iterator();
* while(iterator.hasNext()) { OutputFile of = iterator.next();
* System.out.println(of.getFileName()); }
*/
// NOTE(review): the return value of this call is discarded; it is invoked only
// for what createPackage() does internally (it calls createRDF()).
createPackage();
return createPackage(xhtmlResult,sTargetFileName);
}
/**
 * Converts an already parsed DOM document and wraps the result into the RDF package.
 *
 * The inherited conversion is run with the fixed target name "chapter";
 * sTargetFileName is used only when building the package.
 * NOTE(review): unlike the InputStream overload, this one does not call the
 * no-argument createPackage(), so createRDF() is not run here - confirm intended.
 *
 * @param dom             document to convert
 * @param sTargetFileName file name passed on to createPackage(ConverterResult, String)
 * @param bDestructive    forwarded unchanged to the inherited convert method
 * @return the result of createPackage(xhtmlResult, sTargetFileName)
 * @throws IOException declared by the overridden method
 */
@Override public ConverterResult convert(org.w3c.dom.Document dom, String sTargetFileName, boolean bDestructive) throws IOException {
ConverterResult xhtmlResult = super.convert(dom, "chapter", bDestructive);
//System.out.println(this.outFiles.size());
/*
* Iterator<OutputFile> iterator = xhtmlResult.iterator();
* while(iterator.hasNext()) { OutputFile outfile = iterator.next();
* System.out.println(outfile.isMasterDocument() + outfile.getFileName()); }
*/
return createPackage(xhtmlResult,sTargetFileName);
}
private ConverterResult createPackage(ConverterResult xhtmlResult, String sTargetFileName) {
/**
 * Runs createRDF() and returns the converterResult field.
 *
 * @return the converterResult field, which is declared outside this method
 */
private ConverterResult createPackage() {
createRDF();
return converterResult;
}
/**
 * Builds a DocumentStructure from the generated output files, creates its
 * tree and prints the resulting model.
 *
 * The commented-out blocks below are kept from earlier experiments; none of
 * them is executed.
 */
private void createRDF() {
/*
* Resource root = m.createResource(ts + "elenphArticle");
*
* Property itemNumber = m.createProperty(ts + "itemNumber"); Property hasItem =
* m.createProperty(ts + "hasItem"); Property hasText = m.createProperty(ts +
* "hasText");
*
* elements.put("root", root);
*/
// The active logic of this method: build the structure, link it, print it.
DocumentStructure structure = new DocumentStructure(this.outFiles);
structure.createTree();
structure.printModel();
/*
* if (elements.containsKey(path)) { element = elements.get(path); } else {
* element = m.createResource(ts + "Element/" + path); }
* element.addProperty(hasText, body.toString());
*
* tocItem = m.createResource(ts + "TOCItem/" + path);
* tocItem.addProperty(itemNumber, order); String parentPath =
* calculateParentPath(path); System.out.println("parentPath " + parentPath);
* System.out.println("exPath " + path); Resource parent; if
* (elements.containsKey(parentPath)) { parent = elements.get(parentPath); }
* else { parent = m.createResource(ts + "Element/" + parentPath); }
* m.add(parent, hasItem, tocItem)
*/;
/* Resource root = m.createResource(ts + "Element");
Property P = m.createProperty(ts + "TOCItem");
Property Q = m.createProperty(nsB + "Q");
Resource y = m.createResource(ts + "Excerpt");
Resource z = m.createResource(ts + "z");*/
/* m.add(root, P, y);
m.add(y, Q, z);
m.setNsPrefix("nsA", ts);
m.write(System.out, "RDF/XML-ABBREV");*/
}
private ConverterResult createPackage(ConverterResult xhtmlResult, String sTargetFileName) {
ConverterResultImpl rdfResult = new ConverterResultImpl();
RDFWriter rdfWriter = new RDFWriter(xhtmlResult,sTargetFileName,3,getXhtmlConfig());
rdfResult.addDocument(rdfWriter);