w2phtml/src/main/java/writer2latex/rdf/DocumentStructure.java

package writer2latex.rdf;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.Vector;

import org.apache.jena.ontology.OntClass;
import org.apache.jena.ontology.OntModel;
import org.apache.jena.rdf.model.ModelFactory;
import org.apache.jena.rdf.model.Property;
import org.apache.jena.rdf.model.Resource;
import org.apache.jena.vocabulary.*;

import writer2latex.xhtml.XhtmlDocument;


/**
 * Collects the parts of a converted document and describes them as an RDF
 * model (a Jena {@link OntModel}) in the {@code text_structures} namespace.
 *
 * <p>Each {@link DocumentPart} is keyed by its path. {@link #createTree()}
 * creates one element resource per part (the part with the empty path becomes
 * the article resource), attaches an excerpt resource to every non-empty part,
 * and then creates the table-of-contents items that point at those elements.
 * The resulting model can be serialised with one of the {@code printModel}
 * overloads.
 */
public class DocumentStructure {
	/** Namespace prefix of every class, property and individual created here. */
	private static final String TS = "https://iph.ras.ru/text_structures#";
	private static final String PARSERNAME = "w2phtml";
	private static final String EXCERPT = "elenphExcerpt";
	private static final String TOC_ELEMENT = "TOCElement";
	private static final String TOCITEM = "TOCItem";
	private static final String ELENPHARTICLE = "elenphArticle";
	/** Serialisation syntax used by both {@code printModel} overloads. */
	private static final String RDF_SYNTAX = "RDF/XML-ABBREV";

	/** Element resources created so far, keyed by the path of their part. */
	private final Map<String, Resource> elements;
	/** All known document parts, keyed by {@link DocumentPart#getPath()}. */
	private final Map<String, DocumentPart> inputParts;
	/** Identifier embedded in every generated resource URI. */
	private String documentID = "DOC_ID";

	private final OntModel m;
	private final OntClass excerptClass;
	private final OntClass elementClass;
	private final OntClass itemClass;
	private final OntClass elenphClass;

	/** Creates the empty model and declares the four ontology classes used. */
	private DocumentStructure() {
		this.elements = new HashMap<String, Resource>();
		this.inputParts = new HashMap<String, DocumentPart>();
		this.m = ModelFactory.createOntologyModel();
		this.excerptClass = m.createClass(TS + EXCERPT);
		this.elementClass = m.createClass(TS + TOC_ELEMENT);
		this.itemClass = m.createClass(TS + TOCITEM);
		this.elenphClass = m.createClass(TS + ELENPHARTICLE);
	}

	/**
	 * Registers one part per input document, then adds placeholder parts for
	 * intermediate paths that have no document of their own.
	 *
	 * @param files    converted documents; the index in this list is stored as
	 *                 the order of the corresponding part
	 * @param fileName used as the document identifier in generated URIs
	 */
	public DocumentStructure(Vector<XhtmlDocument> files,String fileName) {
		this();
		this.documentID = fileName;
		for (int i = 0; i < files.size(); i++) {
			DocumentPart part = new DocumentPart(files.get(i));
			part.setOrder(Integer.toString(i));
			addPart(part);
		}
		addEmptyParts();
	}

	/**
	 * For every known path, looks at each level equal to {@code "0"} and makes
	 * sure a part exists for the prefix ending at that level, creating an empty
	 * {@link DocumentPart} when it is missing.
	 */
	private void addEmptyParts() {
		// Work on a snapshot of the keys: addPart() modifies inputParts.
		String[] knownPaths = inputParts.keySet().toArray(new String[0]);
		for (String path : knownPaths) {
			String[] levels = path.split(" ");
			for (int i = 0; i < levels.length; i++) {
				if (!levels[i].equals("0")) {
					continue;
				}
				String emptyPath = createEmptyPath(levels, i);
				if (!inputParts.containsKey(emptyPath)) {
					addPart(new DocumentPart(emptyPath));
				}
			}
		}
	}

	/**
	 * Joins {@code levels[0..i]} (inclusive) with single spaces.
	 *
	 * @param levels path split into its levels
	 * @param i      index of the last level to include
	 * @return the space-separated prefix path
	 */
	private String createEmptyPath(String[] levels, int i) {
		StringBuilder emptyPath = new StringBuilder();
		for (int j = 0; j <= i; j++) {
			if (j != 0) {
				emptyPath.append(" ");
			}
			emptyPath.append(levels[j]);
		}
		return emptyPath.toString();
	}

	/**
	 * Writes the model to {@code fileName + ".rdf"}.
	 *
	 * <p>I/O failures are reported on standard output together with a stack
	 * trace; they are not propagated to the caller.
	 *
	 * @param fileName target file name without the {@code .rdf} extension
	 */
	public void printModel(String fileName) {
		File outFile = new File(fileName + ".rdf");
		// An OutputStream (not a FileWriter) is used so the bytes written are
		// not subject to the platform default charset. try-with-resources also
		// closes the stream safely when opening the file itself fails.
		try (OutputStream out = new FileOutputStream(outFile)) {
			m.write(out, RDF_SYNTAX);
		} catch (IOException e) {
			System.out.println("File couldn't be created");
			e.printStackTrace();
		}
	}

	/**
	 * Writes the model to the given stream. The stream is not closed here.
	 *
	 * @param os destination stream
	 */
	public void printModel(OutputStream os) {
		m.write(os, RDF_SYNTAX);
	}

	/** Registers a part under its path, replacing any part with the same path. */
	private void addPart(DocumentPart docExcerpt) {
		inputParts.put(docExcerpt.getPath(), docExcerpt);
	}

	/**
	 * Creates the excerpt individual for a part and, when the part's body is
	 * not empty, stores the body as its {@code htmlExcerpt} literal.
	 *
	 * @param docExcerpt part to describe
	 * @return the new excerpt resource
	 */
	private Resource createExcerpt(DocumentPart docExcerpt) {
		String name = TS + EXCERPT + "/" + PARSERNAME + "_" + documentID + docExcerpt.getSafePath();
		Resource excerpt = m.createIndividual(name, excerptClass);
		if (!docExcerpt.getBody().isEmpty()) {
			Property htmlExcerpt = m.createProperty(TS + "htmlExcerpt");
			excerpt.addLiteral(htmlExcerpt, docExcerpt.getBody());
		}
		return excerpt;
	}

	/**
	 * Creates the TOC element individual for a non-root part, labels it with
	 * the part's name, remembers it under the part's path and attaches the
	 * part's excerpt (if any).
	 */
	private void createElement(DocumentPart docPart) {
		String elementName = TS + TOC_ELEMENT + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
		Resource element = m.createIndividual(elementName, elementClass);
		element.addProperty(RDFS.label, docPart.getName());
		elements.put(docPart.getPath(), element);
		attachExcerpt(docPart, element);
	}

	/**
	 * Creates the article resource for the part with the empty path. Unlike
	 * {@link #createElement}, the URI carries no path suffix and the part's
	 * metadata is added to the element itself.
	 */
	private void createDocumentElement(DocumentPart docPart) {
		String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID;
		Resource element = m.createResource(elementName, elenphClass);
		element.addProperty(RDFS.label, docPart.getName());
		addMetadataProperties(element, docPart.getMetadata());
		elements.put(docPart.getPath(), element);
		attachExcerpt(docPart, element);
	}

	/**
	 * Adds every (name, value) pair of {@code metadata} to {@code resource},
	 * subject to the filtering done in {@link #addMetadata}.
	 *
	 * @param resource resource receiving the properties
	 * @param metadata metadata values grouped by name; {@code null} maps and
	 *                 {@code null} value sets are skipped
	 */
	private void addMetadataProperties(Resource resource, HashMap<String, Set<String>> metadata) {
		if (metadata == null) {
			return;
		}
		for (Map.Entry<String, Set<String>> entry : metadata.entrySet()) {
			Set<String> values = entry.getValue();
			if (values == null) {
				continue;
			}
			for (String value : values) {
				addMetadata(resource, entry.getKey(), value);
			}
		}
	}

	/**
	 * Adds a single metadata value as a property in the {@code TS} namespace.
	 * Nothing is added when the name is blacklisted or when the converted name
	 * is not accepted by {@link #isDefinedInOntology} for this resource.
	 */
	private void addMetadata(Resource resource, String name, String value) {
		if (!isNotBlacklisted(name)) {
			return;
		}
		String propertyName = convertName(name);
		if (isDefinedInOntology(resource, propertyName)) {
			Property property = m.createProperty(TS + propertyName);
			resource.addProperty(property, value);
		}
	}

	/**
	 * Tells whether {@code name} is an accepted property for the kind of
	 * resource, which is determined from the resource's namespace (excerpt or
	 * article). Rejected combinations are reported on standard output.
	 */
	private boolean isDefinedInOntology(Resource resource, String name) {
		String nameSpace = resource.getNameSpace();
		if (nameSpace.contains(TS + EXCERPT)) {
			if (name.equals("author")
					|| name.equals("bibliography")
					|| name.equals("keywords")
					|| name.equals("works")
					|| name.equals("affiliation")) {
				return true;
			}
		} else if (nameSpace.contains(TS + ELENPHARTICLE)) {
			// "yearAndMonth" is deliberately not in this list (it was
			// commented out in the original condition).
			if (name.equals("doi")
					|| name.equals("firstPublication")
					|| name.equals("year")
					|| name.equals("issue")) {
				return true;
			}
		}
		System.out.println(resource.getNameSpace() + " " + name);
		System.out.println("rightNamespace" +   TS + EXCERPT );
		return false;
	}

	/**
	 * Maps a metadata name from the input document to the property name used
	 * in the model. Names without a mapping are returned unchanged.
	 */
	private String convertName(String name) {
		switch (name) {
			case "Affiliation":
				return "affiliation";
			case "DOI":
				return "doi";
			case "1st-edition":
				return "firstPublication";
			case "year":
				return "yearAndMonth";
			case "year.short":
				return "year";
			default:
				return name;
		}
	}

	/** Returns {@code false} for metadata names that must never be exported. */
	private boolean isNotBlacklisted(String name) {
		return !(name.equals("Filename")
				|| name.equals("Section")
				|| name.equals("dc.Title")
				|| name.equals("subtitle")
				|| name.equals("dc.Identifier"));
	}

	/**
	 * Creates the TOC item for a part: labels it, stores the part's number as
	 * {@code itemNumber}, links it to the part's element via {@code pointsTo}
	 * and, for parts with a non-empty path, links the parent element to the
	 * item via {@code hasTOCItem}.
	 *
	 * <p>The elements must already exist, i.e. {@link #createElements()} has
	 * to run first.
	 */
	private void createTOCItem(DocumentPart docPart) {
		String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
		Resource tocItem = m.createIndividual(tocItemName, itemClass);
		tocItem.addProperty(RDFS.label, docPart.getName());
		Property pointsTo = m.createProperty(TS + "pointsTo");
		Property itemNumber = m.createProperty(TS + "itemNumber");
		Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
		tocItem.addLiteral(itemNumber, docPart.getNumber());
		m.add(tocItem, pointsTo, elements.get(docPart.getPath()));
		if (!docPart.getPath().isEmpty()) {
			Resource parent = elements.get(docPart.getParentPath());
			m.add(parent, hasTOCItem, tocItem);
		}
	}

	/**
	 * Creates the excerpt for a non-empty part and links it to {@code element}
	 * via {@code hasText}. Metadata is copied onto the excerpt unless the part
	 * is the master part.
	 */
	private void attachExcerpt(DocumentPart docPart, Resource element) {
		if (docPart.isEmpty()) {
			return;
		}
		Resource excerpt = createExcerpt(docPart);
		excerpt.addProperty(RDFS.label, docPart.getName());
		Property hasText = m.createProperty(TS + "hasText");
		element.addProperty(hasText, excerpt);
		if (!docPart.isMasterPart()) {
			addMetadataProperties(excerpt, docPart.getMetadata());
		}
	}

	/**
	 * Populates the model: first the element resources for all parts, then the
	 * TOC items that reference them.
	 */
	public void createTree() {
		createElements();
		createTOCItems();
	}

	/** Creates a TOC item for every part whose number is not the empty string. */
	private void createTOCItems() {
		for (DocumentPart part : inputParts.values()) {
			if (!part.getNumber().equals("")) {
				createTOCItem(part);
			}
		}
	}

	/**
	 * Creates the element resource of every part; the part with the empty path
	 * becomes the article resource.
	 */
	private void createElements() {
		for (DocumentPart part : inputParts.values()) {
			if (part.getPath().isEmpty()) {
				createDocumentElement(part);
			} else {
				createElement(part);
			}
		}
	}

	/**
	 * Applies the given metadata to every registered part.
	 *
	 * @param metadata metadata source; its {@code apply} method is called once
	 *                 per part
	 */
	public void applyMetadata(Metadata metadata) {
		for (DocumentPart part : inputParts.values()) {
			metadata.apply(part);
		}
	}

}