RDF converter modified. Progress in creation of RDF structure.
This commit is contained in:
parent
0a8f3de2ed
commit
1db32e2410
3 changed files with 324 additions and 25 deletions
91
src/main/java/writer2latex/rdf/DocumentPart.java
Normal file
91
src/main/java/writer2latex/rdf/DocumentPart.java
Normal file
|
@ -0,0 +1,91 @@
|
||||||
|
package writer2latex.rdf;
|
||||||
|
|
||||||
|
import org.w3c.dom.Document;
|
||||||
|
import org.w3c.dom.Element;
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
import org.w3c.dom.NodeList;
|
||||||
|
import org.w3c.dom.ls.DOMImplementationLS;
|
||||||
|
import org.w3c.dom.ls.LSSerializer;
|
||||||
|
|
||||||
|
import writer2latex.xhtml.XhtmlDocument;
|
||||||
|
|
||||||
|
public class DocumentPart {
|
||||||
|
|
||||||
|
private XhtmlDocument excerptDoc;
|
||||||
|
private String path;
|
||||||
|
private String itemNumber;
|
||||||
|
private String body;
|
||||||
|
private String parentPath;
|
||||||
|
private String name;
|
||||||
|
|
||||||
|
public DocumentPart(XhtmlDocument document) {
|
||||||
|
this.excerptDoc = document;
|
||||||
|
extractPath();
|
||||||
|
extractName();
|
||||||
|
extractNumber();
|
||||||
|
extractBody();
|
||||||
|
calculateParentPath();
|
||||||
|
}
|
||||||
|
public DocumentPart(String path) {
|
||||||
|
this.path = path;
|
||||||
|
this.name = "";
|
||||||
|
extractNumber();
|
||||||
|
this.body = "";
|
||||||
|
calculateParentPath();
|
||||||
|
}
|
||||||
|
private void extractName() {
|
||||||
|
Element excerptContentNode = excerptDoc.getContentNode();
|
||||||
|
this.name = excerptContentNode.getAttribute("name");
|
||||||
|
}
|
||||||
|
private void extractNumber() {
|
||||||
|
itemNumber = path.replaceAll("([0-9]+ )+", "");
|
||||||
|
}
|
||||||
|
public String getPath() {
|
||||||
|
return path;
|
||||||
|
}
|
||||||
|
public String getSafePath(){
|
||||||
|
return path.replaceAll(" ","_");
|
||||||
|
}
|
||||||
|
public String getNumber() {
|
||||||
|
return itemNumber;
|
||||||
|
}
|
||||||
|
public String getBody() {
|
||||||
|
return body;
|
||||||
|
}
|
||||||
|
public String getParentPath() {
|
||||||
|
return parentPath;
|
||||||
|
}
|
||||||
|
public String getName() {
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void extractPath() {
|
||||||
|
Element excerptContentNode = excerptDoc.getContentNode();
|
||||||
|
this.path = excerptContentNode.getAttribute("path");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void calculateParentPath() {
|
||||||
|
if(path.length() == 1) {
|
||||||
|
parentPath = "";
|
||||||
|
}
|
||||||
|
parentPath = path.replaceAll(" [0-9]+$", "");
|
||||||
|
}
|
||||||
|
private void extractBody() {
|
||||||
|
Element excerptContentNode = excerptDoc.getContentNode();
|
||||||
|
StringBuilder bodyBuilder;
|
||||||
|
Document document = excerptContentNode.getOwnerDocument();
|
||||||
|
DOMImplementationLS domImplLS = (DOMImplementationLS) document.getImplementation();
|
||||||
|
LSSerializer serializer = domImplLS.createLSSerializer();
|
||||||
|
serializer.getDomConfig().setParameter("xml-declaration", false);
|
||||||
|
bodyBuilder = new StringBuilder();
|
||||||
|
NodeList excerptContentNodes = excerptContentNode.getChildNodes();
|
||||||
|
int i = 0;
|
||||||
|
while (excerptContentNodes.getLength() > i) {
|
||||||
|
Node child = excerptContentNodes.item(i);
|
||||||
|
bodyBuilder.append(serializer.writeToString(child));
|
||||||
|
i++;
|
||||||
|
|
||||||
|
}
|
||||||
|
this.body = bodyBuilder.toString();
|
||||||
|
}
|
||||||
|
}
|
143
src/main/java/writer2latex/rdf/DocumentStructure.java
Normal file
143
src/main/java/writer2latex/rdf/DocumentStructure.java
Normal file
|
@ -0,0 +1,143 @@
|
||||||
|
package writer2latex.rdf;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Iterator;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.Vector;
|
||||||
|
|
||||||
|
import org.apache.jena.rdf.model.Model;
|
||||||
|
import org.apache.jena.rdf.model.ModelFactory;
|
||||||
|
import org.apache.jena.rdf.model.Resource;
|
||||||
|
|
||||||
|
import writer2latex.xhtml.XhtmlDocument;
|
||||||
|
|
||||||
|
import org.apache.jena.rdf.model.Property;
|
||||||
|
|
||||||
|
|
||||||
|
public class DocumentStructure {
|
||||||
|
private static final String TOCITEM = "TOCItem";
|
||||||
|
private HashMap<String, Resource> elements;
|
||||||
|
private HashMap<String, DocumentPart> inputParts;
|
||||||
|
private final String TS = "https://iph.ras.ru/text_structures#";
|
||||||
|
private final String PARSERNAME = "w2phtml";
|
||||||
|
private final String EXCERPT = "Excerpt";
|
||||||
|
private final String TOC_ELEMENT = "TOCElement";
|
||||||
|
private String documentID = "DOC_ID";
|
||||||
|
|
||||||
|
Model m;
|
||||||
|
|
||||||
|
private DocumentStructure() {
|
||||||
|
this.elements = new HashMap<String, Resource>();
|
||||||
|
this.inputParts = new HashMap<String, DocumentPart>();
|
||||||
|
this.m = ModelFactory.createDefaultModel();
|
||||||
|
}
|
||||||
|
public DocumentStructure(Vector<XhtmlDocument> files) {
|
||||||
|
this();
|
||||||
|
System.out.println("DocStructure");
|
||||||
|
Iterator<XhtmlDocument> filesIterator = files.iterator();
|
||||||
|
while (filesIterator.hasNext()) {
|
||||||
|
XhtmlDocument inputDoc = filesIterator.next();
|
||||||
|
DocumentPart part = new DocumentPart(inputDoc);
|
||||||
|
addPart(part);
|
||||||
|
}
|
||||||
|
addEmptyParts();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addEmptyParts() {
|
||||||
|
Set<String> paths = inputParts.keySet();
|
||||||
|
String[] array = new String[paths.size()];
|
||||||
|
paths.toArray(array);
|
||||||
|
for (int k = 0; k < array.length;k++) {
|
||||||
|
String[] levels = array[k].split(" ");
|
||||||
|
for (int i = 0; i < levels.length; i++) {
|
||||||
|
if (levels[i].equals("0")) {
|
||||||
|
String emptyPath = createEmptyPath(levels, i);
|
||||||
|
if (!inputParts.containsKey(emptyPath)) {
|
||||||
|
System.out.println("empty path added " + emptyPath);
|
||||||
|
DocumentPart emptyPart = new DocumentPart(emptyPath);
|
||||||
|
addPart(emptyPart);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
private String createEmptyPath(String[] levels, int i) {
|
||||||
|
StringBuilder emptyPath = new StringBuilder();
|
||||||
|
for (int j = 0; j<= i;j++) {
|
||||||
|
if (j != 0) {
|
||||||
|
emptyPath.append(" ");
|
||||||
|
}
|
||||||
|
emptyPath.append(levels[j]);
|
||||||
|
}
|
||||||
|
return emptyPath.toString();
|
||||||
|
}
|
||||||
|
public void printModel() {
|
||||||
|
m.write(System.out, "RDF/XML-ABBREV");
|
||||||
|
}
|
||||||
|
|
||||||
|
private void addPart(DocumentPart docExcerpt) {
|
||||||
|
inputParts.put(docExcerpt.getPath(), docExcerpt);
|
||||||
|
}
|
||||||
|
|
||||||
|
private Resource createExcerpt(DocumentPart docExcerpt) {
|
||||||
|
Resource excerpt = m.createResource(TS + EXCERPT + "/" + PARSERNAME + "_" + documentID + docExcerpt.getSafePath());
|
||||||
|
Property htmlExcerpt = m.createProperty(TS + "htmlExcerpt");
|
||||||
|
excerpt.addProperty(htmlExcerpt, docExcerpt.getBody());
|
||||||
|
|
||||||
|
return excerpt;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createElement(DocumentPart docPart) {
|
||||||
|
String elementName = TS + TOC_ELEMENT + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
|
||||||
|
Resource element = m.createResource(elementName);
|
||||||
|
elements.put(docPart.getPath(), element);
|
||||||
|
attachExcerpt(docPart, element);
|
||||||
|
}
|
||||||
|
private void createTOCItem(DocumentPart docPart) {
|
||||||
|
String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
|
||||||
|
Resource tocItem = m.createResource(tocItemName);
|
||||||
|
Property pointsTo = m.createProperty(TS + "pointsTo");
|
||||||
|
Property itemNumber = m.createProperty(TS + "itemNumber");
|
||||||
|
Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
|
||||||
|
tocItem.addLiteral(itemNumber, docPart.getNumber());
|
||||||
|
tocItem.addProperty(pointsTo, elements.get(docPart.getPath()));
|
||||||
|
|
||||||
|
|
||||||
|
Resource parent = elements.get(docPart.getParentPath());
|
||||||
|
parent.addProperty(hasTOCItem, tocItem);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void attachExcerpt(DocumentPart docPart, Resource element) {
|
||||||
|
if (docPart.getBody().isEmpty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Resource excerpt = createExcerpt(docPart);
|
||||||
|
Property hasText = m.createProperty(TS + "hasText");
|
||||||
|
element.addProperty(hasText, excerpt);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void createTree() {
|
||||||
|
createElements();
|
||||||
|
createTOCItems();
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createTOCItems() {
|
||||||
|
Set<String> paths = inputParts.keySet();
|
||||||
|
for (String path : paths) {
|
||||||
|
DocumentPart part = inputParts.get(path);
|
||||||
|
if (!part.getNumber().equals("")) {
|
||||||
|
createTOCItem(part);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createElements() {
|
||||||
|
Set<String> paths = inputParts.keySet();
|
||||||
|
for (String path : paths) {
|
||||||
|
createElement(inputParts.get(path));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,57 +2,122 @@ package writer2latex.rdf;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.io.StringWriter;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
|
||||||
import org.w3c.dom.Element;
|
import javax.xml.parsers.DocumentBuilder;
|
||||||
|
import javax.xml.parsers.DocumentBuilderFactory;
|
||||||
|
import javax.xml.parsers.ParserConfigurationException;
|
||||||
|
import javax.xml.stream.events.Namespace;
|
||||||
|
import javax.xml.transform.OutputKeys;
|
||||||
|
import javax.xml.transform.Transformer;
|
||||||
|
import javax.xml.transform.TransformerConfigurationException;
|
||||||
|
import javax.xml.transform.TransformerException;
|
||||||
|
import javax.xml.transform.TransformerFactory;
|
||||||
|
import javax.xml.transform.TransformerFactoryConfigurationError;
|
||||||
|
import javax.xml.transform.dom.DOMSource;
|
||||||
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
|
||||||
|
import org.apache.jena.rdf.model.Model;
|
||||||
|
import org.apache.jena.rdf.model.ModelFactory;
|
||||||
|
import org.apache.jena.rdf.model.Property;
|
||||||
|
import org.apache.jena.rdf.model.Resource;
|
||||||
|
import org.w3c.dom.Document;
|
||||||
|
import org.w3c.dom.Element;
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
import org.w3c.dom.NodeList;
|
||||||
|
import org.w3c.dom.ls.DOMImplementationLS;
|
||||||
|
import org.w3c.dom.ls.LSSerializer;
|
||||||
|
|
||||||
|
import com.sun.org.apache.xml.internal.utils.NameSpace;
|
||||||
|
|
||||||
|
import pro.litvinovg.xml.Debug;
|
||||||
import writer2latex.api.ConverterResult;
|
import writer2latex.api.ConverterResult;
|
||||||
import writer2latex.api.OutputFile;
|
import writer2latex.api.OutputFile;
|
||||||
import writer2latex.base.ConverterResultImpl;
|
import writer2latex.base.ConverterResultImpl;
|
||||||
import writer2latex.epub.EPUBWriter;
|
import writer2latex.xhtml.Converter;
|
||||||
import writer2latex.xhtml.Html5Converter;
|
|
||||||
import writer2latex.xhtml.Xhtml11Converter;
|
|
||||||
import writer2latex.xhtml.XhtmlDocument;
|
import writer2latex.xhtml.XhtmlDocument;
|
||||||
|
|
||||||
|
|
||||||
public final class RDFConverter extends Xhtml11Converter {
|
public final class RDFConverter extends Converter {
|
||||||
|
|
||||||
// Constructor
|
// Constructor
|
||||||
public RDFConverter() {
|
public RDFConverter() {
|
||||||
super();
|
super(XhtmlDocument.HTML5);
|
||||||
this.isRDF = true;
|
this.isRDF = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public ConverterResult convert(InputStream is, String sTargetFileName) throws IOException {
|
@Override public ConverterResult convert(InputStream is, String sTargetFileName) throws IOException {
|
||||||
setOpenPubStructure(false);
|
setOpenPubStructure(false);
|
||||||
ConverterResult xhtmlResult = super.convert(is, "chapter");
|
ConverterResult xhtmlResult = super.convert(is, "chapter");
|
||||||
Iterator<XhtmlDocument> excerptIterator = this.outFiles.iterator();
|
createPackage();
|
||||||
while (excerptIterator.hasNext()) {
|
|
||||||
XhtmlDocument excerptDoc = excerptIterator.next();
|
|
||||||
Element excerptContentNode = excerptDoc.getContentNode();
|
|
||||||
String exPath = excerptContentNode.getAttribute("path");
|
|
||||||
System.out.println(exPath);
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* Iterator<OutputFile> iterator = xhtmlResult.iterator();
|
|
||||||
* while(iterator.hasNext()) { OutputFile of = iterator.next();
|
|
||||||
* System.out.println(of.getFileName()); }
|
|
||||||
*/
|
|
||||||
return createPackage(xhtmlResult,sTargetFileName);
|
return createPackage(xhtmlResult,sTargetFileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override public ConverterResult convert(org.w3c.dom.Document dom, String sTargetFileName, boolean bDestructive) throws IOException {
|
@Override public ConverterResult convert(org.w3c.dom.Document dom, String sTargetFileName, boolean bDestructive) throws IOException {
|
||||||
ConverterResult xhtmlResult = super.convert(dom, "chapter", bDestructive);
|
ConverterResult xhtmlResult = super.convert(dom, "chapter", bDestructive);
|
||||||
//System.out.println(this.outFiles.size());
|
|
||||||
/*
|
|
||||||
* Iterator<OutputFile> iterator = xhtmlResult.iterator();
|
|
||||||
* while(iterator.hasNext()) { OutputFile outfile = iterator.next();
|
|
||||||
* System.out.println(outfile.isMasterDocument() + outfile.getFileName()); }
|
|
||||||
*/
|
|
||||||
return createPackage(xhtmlResult,sTargetFileName);
|
return createPackage(xhtmlResult,sTargetFileName);
|
||||||
}
|
}
|
||||||
|
|
||||||
private ConverterResult createPackage(ConverterResult xhtmlResult, String sTargetFileName) {
|
private ConverterResult createPackage() {
|
||||||
|
createRDF();
|
||||||
|
return converterResult;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private void createRDF() {
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Resource root = m.createResource(ts + "elenphArticle");
|
||||||
|
*
|
||||||
|
* Property itemNumber = m.createProperty(ts + "itemNumber"); Property hasItem =
|
||||||
|
* m.createProperty(ts + "hasItem"); Property hasText = m.createProperty(ts +
|
||||||
|
* "hasText");
|
||||||
|
*
|
||||||
|
* elements.put("root", root);
|
||||||
|
*/
|
||||||
|
|
||||||
|
DocumentStructure structure = new DocumentStructure(this.outFiles);
|
||||||
|
|
||||||
|
structure.createTree();
|
||||||
|
structure.printModel();
|
||||||
|
/*
|
||||||
|
* if (elements.containsKey(path)) { element = elements.get(path); } else {
|
||||||
|
* element = m.createResource(ts + "Element/" + path); }
|
||||||
|
* element.addProperty(hasText, body.toString());
|
||||||
|
*
|
||||||
|
* tocItem = m.createResource(ts + "TOCItem/" + path);
|
||||||
|
* tocItem.addProperty(itemNumber, order); String parentPath =
|
||||||
|
* calculateParentPath(path); System.out.println("parentPath " + parentPath);
|
||||||
|
* System.out.println("exPath " + path); Resource parent; if
|
||||||
|
* (elements.containsKey(parentPath)) { parent = elements.get(parentPath); }
|
||||||
|
* else { parent = m.createResource(ts + "Element/" + parentPath); }
|
||||||
|
* m.add(parent, hasItem, tocItem)
|
||||||
|
*/;
|
||||||
|
|
||||||
|
/* Resource root = m.createResource(ts + "Element");
|
||||||
|
Property P = m.createProperty(ts + "TOCItem");
|
||||||
|
Property Q = m.createProperty(nsB + "Q");
|
||||||
|
Resource y = m.createResource(ts + "Excerpt");
|
||||||
|
Resource z = m.createResource(ts + "z");*/
|
||||||
|
|
||||||
|
/* m.add(root, P, y);
|
||||||
|
m.add(y, Q, z);
|
||||||
|
m.setNsPrefix("nsA", ts);
|
||||||
|
m.write(System.out, "RDF/XML-ABBREV");*/
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private ConverterResult createPackage(ConverterResult xhtmlResult, String sTargetFileName) {
|
||||||
ConverterResultImpl rdfResult = new ConverterResultImpl();
|
ConverterResultImpl rdfResult = new ConverterResultImpl();
|
||||||
RDFWriter rdfWriter = new RDFWriter(xhtmlResult,sTargetFileName,3,getXhtmlConfig());
|
RDFWriter rdfWriter = new RDFWriter(xhtmlResult,sTargetFileName,3,getXhtmlConfig());
|
||||||
rdfResult.addDocument(rdfWriter);
|
rdfResult.addDocument(rdfWriter);
|
||||||
|
|
Loading…
Add table
Reference in a new issue