w2phtml/src/main/java/writer2latex/rdf/DocumentStructure.java

297 lines
8.3 KiB
Java

package writer2latex.rdf;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Set;
import java.util.Vector;
import org.apache.jena.ontology.OntClass;
import org.apache.jena.ontology.OntModel;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.vocabulary.*;
import writer2latex.xhtml.XhtmlDocument;
/**
 * Collects the parts of a converted document and describes them as an RDF
 * model in the {@code text_structures} namespace.
 *
 * <p>For every collected {@link DocumentPart} the model receives one element
 * resource (an {@code elenphArticle} for the part whose path is empty, a
 * {@code TOCElement} for every other part), optionally an
 * {@code elenphExcerpt} linked through {@code hasText}, and, for parts that
 * have a number, a {@code TOCItem} that points to the element and is attached
 * to the parent element through {@code hasTOCItem}.
 *
 * <p>The individuals are only created by {@link #createTree()}; until that
 * method has been called the model contains nothing but the four class
 * declarations.
 */
public class DocumentStructure {

    /** Namespace prefix of every class, property and individual created here. */
    private static final String TS = "https://iph.ras.ru/text_structures#";
    /** Parser name embedded in the URI of every generated individual. */
    private static final String PARSERNAME = "w2phtml";
    private static final String EXCERPT = "elenphExcerpt";
    private static final String TOC_ELEMENT = "TOCElement";
    private static final String TOCITEM = "TOCItem";
    private static final String ELENPHARTICLE = "elenphArticle";
    /** Serialization syntax handed to Jena by both {@code printModel} variants. */
    private static final String RDF_SYNTAX = "RDF/XML-ABBREV";

    /** Element resources created so far, keyed by the path of their part. */
    private final HashMap<String, Resource> elements;
    /** All known document parts, keyed by their path. */
    private final HashMap<String, DocumentPart> inputParts;
    private final OntModel model;
    private final OntClass excerptClass;
    private final OntClass elementClass;
    private final OntClass itemClass;
    private final OntClass elenphClass;

    /** Identifier embedded in generated URIs; replaced by the public constructor. */
    private String documentID = "DOC_ID";

    /** Creates the empty lookup tables, the ontology model and its four classes. */
    private DocumentStructure() {
        this.elements = new HashMap<String, Resource>();
        this.inputParts = new HashMap<String, DocumentPart>();
        this.model = ModelFactory.createOntologyModel();
        this.excerptClass = model.createClass(TS + EXCERPT);
        this.elementClass = model.createClass(TS + TOC_ELEMENT);
        this.itemClass = model.createClass(TS + TOCITEM);
        this.elenphClass = model.createClass(TS + ELENPHARTICLE);
    }

    /**
     * Wraps every input document in a {@link DocumentPart}, registers it under
     * its path and then adds placeholder parts for "0" levels that have no
     * part of their own.
     *
     * @param files    the converted documents; the index of each document in
     *                 this vector becomes the order of its part
     * @param fileName identifier used in the URIs of all generated individuals
     */
    public DocumentStructure(Vector<XhtmlDocument> files, String fileName) {
        this();
        this.documentID = fileName;
        for (int index = 0; index < files.size(); index++) {
            DocumentPart part = new DocumentPart(files.get(index));
            part.setOrder(Integer.toString(index));
            addPart(part);
        }
        addEmptyParts();
    }

    /**
     * Registers a placeholder part for every path prefix that ends in a "0"
     * level and is not registered yet.
     */
    private void addEmptyParts() {
        // Work on a snapshot of the keys: addPart() modifies inputParts while
        // the paths are being scanned.
        String[] knownPaths = inputParts.keySet().toArray(new String[0]);
        for (String path : knownPaths) {
            String[] levels = path.split(" ");
            for (int i = 0; i < levels.length; i++) {
                if (!levels[i].equals("0")) {
                    continue;
                }
                String emptyPath = createEmptyPath(levels, i);
                if (!inputParts.containsKey(emptyPath)) {
                    addPart(new DocumentPart(emptyPath));
                }
            }
        }
    }

    /**
     * Joins {@code levels[0..i]} (inclusive) with single spaces.
     *
     * @param levels the individual levels of a path
     * @param i      index of the last level to include
     * @return the path prefix made of the first {@code i + 1} levels
     */
    private String createEmptyPath(String[] levels, int i) {
        StringBuilder prefix = new StringBuilder();
        for (int j = 0; j <= i; j++) {
            if (j != 0) {
                prefix.append(" ");
            }
            prefix.append(levels[j]);
        }
        return prefix.toString();
    }

    /**
     * Writes the model as RDF/XML-ABBREV to {@code fileName + ".rdf"}.
     *
     * <p>I/O failures are reported on the console and not propagated.
     *
     * @param fileName target file name without the {@code .rdf} extension
     */
    public void printModel(String fileName) {
        File outFile = new File(fileName + ".rdf");
        // try-with-resources closes the stream on every path and, unlike a
        // manual close in a finally block, cannot fail with a
        // NullPointerException when the file could not be opened at all.
        // A byte stream is used instead of a FileWriter so that the encoding
        // is chosen by Jena and not by the platform default charset.
        try (OutputStream out = new FileOutputStream(outFile)) {
            model.write(out, RDF_SYNTAX);
        } catch (IOException e) {
            System.out.println("File couldn't be created");
            e.printStackTrace();
        }
    }

    /**
     * Writes the model as RDF/XML-ABBREV to the given stream. The stream is
     * not closed by this method.
     *
     * @param os destination stream
     */
    public void printModel(OutputStream os) {
        model.write(os, RDF_SYNTAX);
    }

    /** Registers a part under its path, replacing any part with the same path. */
    private void addPart(DocumentPart docExcerpt) {
        inputParts.put(docExcerpt.getPath(), docExcerpt);
    }

    /**
     * Creates the excerpt individual of a part and, when the part has a
     * non-empty body, stores that body in its {@code htmlExcerpt} property.
     *
     * @param docExcerpt the part to describe
     * @return the new excerpt individual
     */
    private Resource createExcerpt(DocumentPart docExcerpt) {
        String name = TS + EXCERPT + "/" + PARSERNAME + "_" + documentID + docExcerpt.getSafePath();
        Resource excerpt = model.createIndividual(name, excerptClass);
        if (!docExcerpt.getBody().isEmpty()) {
            Property htmlExcerpt = model.createProperty(TS + "htmlExcerpt");
            excerpt.addLiteral(htmlExcerpt, docExcerpt.getBody());
        }
        return excerpt;
    }

    /**
     * Creates the {@code TOCElement} individual of a part, labels it with the
     * part name, remembers it under the part path and attaches its excerpt.
     */
    private void createElement(DocumentPart docPart) {
        String elementName = TS + TOC_ELEMENT + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
        Resource element = model.createIndividual(elementName, elementClass);
        element.addProperty(RDFS.label, docPart.getName());
        elements.put(docPart.getPath(), element);
        attachExcerpt(docPart, element);
    }

    /**
     * Creates the {@code elenphArticle} resource that stands for the whole
     * document. Unlike the other elements it also receives the metadata of
     * its part directly.
     */
    private void createDocumentElement(DocumentPart docPart) {
        String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID;
        Resource element = model.createResource(elementName, elenphClass);
        element.addProperty(RDFS.label, docPart.getName());
        addMetadataProperties(element, docPart.getMetadata());
        elements.put(docPart.getPath(), element);
        attachExcerpt(docPart, element);
    }

    /**
     * Adds every value of every metadata entry to the resource, subject to
     * the filtering done in {@link #addMetadata}. Entries whose value set is
     * {@code null} are skipped.
     */
    private void addMetadataProperties(Resource resource, HashMap<String, Set<String>> metadata) {
        for (String name : metadata.keySet()) {
            Set<String> values = metadata.get(name);
            if (values == null) {
                continue;
            }
            for (String value : values) {
                addMetadata(resource, name, value);
            }
        }
    }

    /**
     * Adds a single metadata value as a property of the resource, unless the
     * name is blacklisted or its converted form is not accepted for the kind
     * of resource.
     */
    private void addMetadata(Resource resource, String name, String value) {
        if (!isNotBlacklisted(name)) {
            return;
        }
        String propertyName = convertName(name);
        if (isDefinedInOntology(resource, propertyName)) {
            Property property = model.createProperty(TS + propertyName);
            resource.addProperty(property, value);
        }
    }

    /**
     * Tells whether a property name is accepted for the given resource. The
     * accepted names depend on whether the namespace of the resource contains
     * the excerpt prefix or the article prefix; any other resource accepts
     * nothing. Rejected combinations are reported on standard output.
     */
    private boolean isDefinedInOntology(Resource resource, String name) {
        String nameSpace = resource.getNameSpace();
        boolean defined = false;
        if (nameSpace.contains(TS + EXCERPT)) {
            defined = isExcerptProperty(name);
        } else if (nameSpace.contains(TS + ELENPHARTICLE)) {
            defined = isArticleProperty(name);
        }
        if (!defined) {
            // Diagnostic output for metadata that is dropped.
            System.out.println(resource.getNameSpace() + " " + name);
            System.out.println("rightNamespace" + TS + EXCERPT);
        }
        return defined;
    }

    /** Property names accepted on excerpt resources. */
    private boolean isExcerptProperty(String name) {
        switch (name) {
            case "author":
            case "bibliography":
            case "keywords":
            case "works":
            case "affiliation":
                return true;
            default:
                return false;
        }
    }

    /** Property names accepted on the article resource. */
    private boolean isArticleProperty(String name) {
        switch (name) {
            case "doi":
            case "firstPublication":
            case "year":
            case "issue":
                return true;
            default:
                return false;
        }
    }

    /**
     * Maps a metadata name from the input to the property name used in the
     * model; names without a mapping are returned unchanged.
     */
    private String convertName(String name) {
        switch (name) {
            case "Affiliation":
                return "affiliation";
            case "DOI":
                return "doi";
            case "1st-edition":
                return "firstPublication";
            case "year":
                // NOTE(review): "yearAndMonth" is not among the names accepted
                // by isDefinedInOntology, so this value is currently dropped.
                return "yearAndMonth";
            case "year.short":
                return "year";
            default:
                return name;
        }
    }

    /** Returns {@code false} for metadata names that must never be exported. */
    private boolean isNotBlacklisted(String name) {
        switch (name) {
            case "Filename":
            case "Section":
            case "dc.Title":
            case "subtitle":
            case "dc.Identifier":
                return false;
            default:
                return true;
        }
    }

    /**
     * Creates the {@code TOCItem} individual of a part, lets it point to the
     * element of the same path and, unless the part path is empty, attaches
     * it to the element registered under the parent path.
     */
    private void createTOCItem(DocumentPart docPart) {
        String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
        Resource tocItem = model.createIndividual(tocItemName, itemClass);
        tocItem.addProperty(RDFS.label, docPart.getName());
        Property pointsTo = model.createProperty(TS + "pointsTo");
        Property itemNumber = model.createProperty(TS + "itemNumber");
        Property hasTOCItem = model.createProperty(TS + "hasTOCItem");
        tocItem.addLiteral(itemNumber, docPart.getNumber());
        model.add(tocItem, pointsTo, elements.get(docPart.getPath()));
        if (!docPart.getPath().isEmpty()) {
            // NOTE(review): assumes an element exists for the parent path;
            // a missing parent yields null here - confirm that the input
            // always contains (or addEmptyParts always creates) the parent.
            Resource parent = elements.get(docPart.getParentPath());
            model.add(parent, hasTOCItem, tocItem);
        }
    }

    /**
     * Creates the excerpt of a part and links it to the element through
     * {@code hasText}. Nothing is attached when the part reports itself as
     * empty, or when the element is the article resource and the part has no
     * body. Metadata is copied to the excerpt for all parts except the master
     * part.
     */
    private void attachExcerpt(DocumentPart docPart, Resource element) {
        if (docPart.isEmpty()) {
            return;
        }
        if (docPart.getBody().isEmpty() && isMaster(element)) {
            return;
        }
        Resource excerpt = createExcerpt(docPart);
        excerpt.addProperty(RDFS.label, docPart.getName());
        Property hasText = model.createProperty(TS + "hasText");
        element.addProperty(hasText, excerpt);
        if (!docPart.isMasterPart()) {
            addMetadataProperties(excerpt, docPart.getMetadata());
        }
    }

    /** Tells whether the namespace of the element contains the article prefix. */
    private boolean isMaster(Resource element) {
        return element.getNameSpace().contains(TS + ELENPHARTICLE);
    }

    /**
     * Populates the model: first all elements (with their excerpts), then the
     * TOC items that link them. The order matters because TOC items look up
     * the elements created in the first step.
     */
    public void createTree() {
        createElements();
        createTOCItems();
    }

    /** Creates a TOC item for every part whose number is not the empty string. */
    private void createTOCItems() {
        for (DocumentPart part : inputParts.values()) {
            if (!part.getNumber().equals("")) {
                createTOCItem(part);
            }
        }
    }

    /**
     * Creates the element of every part: the article resource for the part
     * with the empty path, a {@code TOCElement} for all others.
     */
    private void createElements() {
        for (DocumentPart part : inputParts.values()) {
            if (part.getPath().isEmpty()) {
                createDocumentElement(part);
            } else {
                createElement(part);
            }
        }
    }

    /**
     * Applies the given metadata to every registered part.
     *
     * @param metadata the metadata source; its {@code apply} method is called
     *                 once per part
     */
    public void applyMetadata(Metadata metadata) {
        for (DocumentPart part : inputParts.values()) {
            metadata.apply(part);
        }
    }
}