291 lines
8.1 KiB
Java
291 lines
8.1 KiB
Java
package writer2latex.rdf;
|
|
|
|
import java.io.File;
|
|
import java.io.FileWriter;
|
|
import java.io.IOException;
|
|
import java.io.OutputStream;
|
|
import java.util.HashMap;
|
|
import java.util.Set;
|
|
import java.util.Vector;
|
|
|
|
import org.apache.jena.ontology.OntClass;
|
|
import org.apache.jena.ontology.OntModel;
|
|
import org.apache.jena.rdf.model.ModelFactory;
|
|
import org.apache.jena.rdf.model.Resource;
|
|
import org.apache.jena.vocabulary.*;
|
|
|
|
import writer2latex.xhtml.XhtmlDocument;
|
|
|
|
import org.apache.jena.rdf.model.Property;
|
|
|
|
|
|
public class DocumentStructure {
|
|
private static final String TOCITEM = "TOCItem";
|
|
private HashMap<String, Resource> elements;
|
|
private HashMap<String, DocumentPart> inputParts;
|
|
private final String TS = "https://iph.ras.ru/text_structures#";
|
|
private final String PARSERNAME = "w2phtml";
|
|
private final String EXCERPT = "elenphExcerpt";
|
|
private final String TOC_ELEMENT = "TOCElement";
|
|
private final String ELENPHARTICLE = "elenphArticle";
|
|
private String documentID = "DOC_ID";
|
|
|
|
|
|
private OntModel m;
|
|
private OntClass excerptClass;
|
|
private OntClass elementClass;
|
|
private OntClass itemClass;
|
|
private OntClass elenphClass;
|
|
|
|
private DocumentStructure() {
|
|
this.elements = new HashMap<String, Resource>();
|
|
this.inputParts = new HashMap<String, DocumentPart>();
|
|
this.m = ModelFactory.createOntologyModel();
|
|
this.excerptClass = m.createClass(TS + EXCERPT);
|
|
this.elementClass = m.createClass(TS + TOC_ELEMENT);
|
|
this.itemClass = m.createClass(TS + TOCITEM);
|
|
this.elenphClass = m.createClass(TS + ELENPHARTICLE);
|
|
|
|
|
|
|
|
}
|
|
public DocumentStructure(Vector<XhtmlDocument> files,String fileName) {
|
|
this();
|
|
this.documentID = fileName;
|
|
for(int i = 0 ; i< files.size();i++) {
|
|
XhtmlDocument inputDoc = files.get(i);
|
|
DocumentPart part = new DocumentPart(inputDoc);
|
|
part.setOrder(Integer.toString(i));
|
|
addPart(part);
|
|
}
|
|
addEmptyParts();
|
|
}
|
|
|
|
private void addEmptyParts() {
|
|
Set<String> paths = inputParts.keySet();
|
|
String[] array = new String[paths.size()];
|
|
paths.toArray(array);
|
|
for (int k = 0; k < array.length;k++) {
|
|
String[] levels = array[k].split(" ");
|
|
for (int i = 0; i < levels.length; i++) {
|
|
if (levels[i].equals("0")) {
|
|
String emptyPath = createEmptyPath(levels, i);
|
|
if (!inputParts.containsKey(emptyPath)) {
|
|
DocumentPart emptyPart = new DocumentPart(emptyPath);
|
|
addPart(emptyPart);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
private String createEmptyPath(String[] levels, int i) {
|
|
StringBuilder emptyPath = new StringBuilder();
|
|
for (int j = 0; j<= i;j++) {
|
|
if (j != 0) {
|
|
emptyPath.append(" ");
|
|
}
|
|
emptyPath.append(levels[j]);
|
|
}
|
|
return emptyPath.toString();
|
|
}
|
|
|
|
public void printModel(String fileName) {
|
|
File outFile = new File(fileName + ".rdf");
|
|
FileWriter fw = null;
|
|
try {
|
|
outFile.createNewFile();
|
|
fw = new FileWriter(outFile);
|
|
m.write(fw,"RDF/XML-ABBREV");
|
|
|
|
} catch (IOException e) {
|
|
System.out.println("File couldn't be created");
|
|
e.printStackTrace();
|
|
} finally {
|
|
try {
|
|
fw.close();
|
|
} catch (IOException e) {
|
|
// TODO Auto-generated catch block
|
|
e.printStackTrace();
|
|
}
|
|
}
|
|
|
|
}
|
|
public void printModel(OutputStream os) {
|
|
m.write(os,"RDF/XML-ABBREV");
|
|
}
|
|
|
|
|
|
private void addPart(DocumentPart docExcerpt) {
|
|
inputParts.put(docExcerpt.getPath(), docExcerpt);
|
|
}
|
|
|
|
private Resource createExcerpt(DocumentPart docExcerpt) {
|
|
String name = TS + EXCERPT + "/" + PARSERNAME + "_" + documentID + docExcerpt.getSafePath();
|
|
Resource excerpt = m.createIndividual(name, excerptClass);
|
|
if (!docExcerpt.getBody().isEmpty()) {
|
|
Property htmlExcerpt = m.createProperty(TS + "htmlExcerpt");
|
|
excerpt.addLiteral(htmlExcerpt, docExcerpt.getBody());
|
|
}
|
|
|
|
|
|
return excerpt;
|
|
}
|
|
|
|
private void createElement(DocumentPart docPart) {
|
|
String elementName = TS + TOC_ELEMENT + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
|
|
Resource element = m.createIndividual(elementName,elementClass);
|
|
element.addProperty( RDFS.label, docPart.getName());
|
|
elements.put(docPart.getPath(), element);
|
|
attachExcerpt(docPart, element);
|
|
}
|
|
|
|
private void createDocumentElement(DocumentPart docPart) {
|
|
String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID ;
|
|
Resource element = m.createResource(elementName,elenphClass);
|
|
element.addProperty( RDFS.label, docPart.getName());
|
|
addMetadataProperties(element,docPart.getMetadata());
|
|
elements.put(docPart.getPath(), element);
|
|
attachExcerpt(docPart, element);
|
|
|
|
}
|
|
|
|
private void addMetadataProperties(Resource resource, HashMap<String, Set<String>> metadata) {
|
|
Set<String> names = metadata.keySet();
|
|
for (String name : names) {
|
|
Set<String> values = metadata.get(name);
|
|
if (values != null) {
|
|
for (String value : values) {
|
|
addMetadata(resource,name,value);
|
|
}
|
|
}
|
|
}
|
|
|
|
}
|
|
private void addMetadata(Resource resource, String name, String value) {
|
|
if (isNotBlacklisted(name)) {
|
|
name = convertName(name);
|
|
if (isDefinedInOntology(resource,name)) {
|
|
Property property = m.createProperty(TS + name);
|
|
resource.addProperty( property, value);
|
|
}
|
|
}
|
|
}
|
|
private boolean isDefinedInOntology(Resource resource, String name) {
|
|
String nameSpace = resource.getNameSpace();
|
|
if (nameSpace.contains(TS + EXCERPT)) {
|
|
if (name.equals("author") ||
|
|
name.equals("bibliography") ||
|
|
name.equals("keywords") ||
|
|
name.equals("works") ||
|
|
name.equals("affiliation")
|
|
) {
|
|
return true;
|
|
}
|
|
} else
|
|
if (nameSpace.contains(TS + ELENPHARTICLE)) {
|
|
if (name.equals("doi") ||
|
|
name.equals("firstPublication") ||
|
|
//name.equals("yearAndMonth") ||
|
|
name.equals("year") ||
|
|
name.equals("issue")
|
|
) {
|
|
return true;
|
|
}
|
|
}
|
|
System.out.println(resource.getNameSpace() + " " + name);
|
|
System.out.println("rightNamespace" + TS + EXCERPT );
|
|
return false;
|
|
}
|
|
private String convertName(String name) {
|
|
if (name.equals("Affiliation")){
|
|
return "affiliation";
|
|
} else
|
|
if (name.equals("DOI")){
|
|
return "doi";
|
|
} else
|
|
if (name.equals("1st-edition")){
|
|
return "firstPublication";
|
|
} else
|
|
if (name.equals("year")){
|
|
return "yearAndMonth";
|
|
} else
|
|
if (name.equals("year.short")){
|
|
return "year";
|
|
}else
|
|
return name;
|
|
}
|
|
private boolean isNotBlacklisted(String name) {
|
|
if (name.equals("Filename") ||
|
|
name.equals("Section") ||
|
|
name.equals("dc.Title") ||
|
|
name.equals("subtitle") ||
|
|
name.equals("dc.Identifier")
|
|
) {
|
|
return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
private void createTOCItem(DocumentPart docPart) {
|
|
String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
|
|
Resource tocItem = m.createIndividual(tocItemName,itemClass);
|
|
tocItem.addProperty( RDFS.label, docPart.getName());
|
|
Property pointsTo = m.createProperty(TS + "pointsTo");
|
|
Property itemNumber = m.createProperty(TS + "itemNumber");
|
|
Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
|
|
tocItem.addLiteral(itemNumber, docPart.getNumber());
|
|
m.add(tocItem, pointsTo, elements.get(docPart.getPath()));
|
|
if (!docPart.getPath().isEmpty()) {
|
|
Resource parent = elements.get(docPart.getParentPath());
|
|
m.add(parent, hasTOCItem, tocItem);
|
|
}
|
|
|
|
}
|
|
|
|
private void attachExcerpt(DocumentPart docPart, Resource element) {
|
|
if (docPart.isEmpty()) {
|
|
return;
|
|
}
|
|
Resource excerpt = createExcerpt(docPart);
|
|
excerpt.addProperty( RDFS.label, docPart.getName());
|
|
Property hasText = m.createProperty(TS + "hasText");
|
|
element.addProperty(hasText, excerpt);
|
|
if (!docPart.isMasterPart()) {
|
|
addMetadataProperties(excerpt,docPart.getMetadata());
|
|
}
|
|
}
|
|
|
|
public void createTree() {
|
|
createElements();
|
|
createTOCItems();
|
|
}
|
|
|
|
private void createTOCItems() {
|
|
Set<String> paths = inputParts.keySet();
|
|
for (String path : paths) {
|
|
DocumentPart part = inputParts.get(path);
|
|
if (!part.getNumber().equals("")) {
|
|
createTOCItem(part);
|
|
}
|
|
}
|
|
}
|
|
|
|
private void createElements() {
|
|
Set<String> paths = inputParts.keySet();
|
|
for (String path : paths) {
|
|
DocumentPart part = inputParts.get(path);
|
|
if (part.getPath().isEmpty()) {
|
|
createDocumentElement(part);
|
|
} else {
|
|
createElement(part);
|
|
}
|
|
}
|
|
}
|
|
public void applyMetadata(Metadata metadata) {
|
|
for (DocumentPart part: inputParts.values()) {
|
|
metadata.apply(part);
|
|
}
|
|
}
|
|
|
|
}
|