From 296c8a28bde1fb6aab9bad2274e160964133ee5a Mon Sep 17 00:00:00 2001 From: Georgy Litvinov Date: Mon, 2 Mar 2020 19:55:27 +0100 Subject: [PATCH] Convert metadata from csv to rdf --- .../java/writer2latex/rdf/DocumentPart.java | 42 +++++++- .../writer2latex/rdf/DocumentStructure.java | 95 ++++++++++++++++--- 2 files changed, 123 insertions(+), 14 deletions(-) diff --git a/src/main/java/writer2latex/rdf/DocumentPart.java b/src/main/java/writer2latex/rdf/DocumentPart.java index 3a871de..2b62594 100644 --- a/src/main/java/writer2latex/rdf/DocumentPart.java +++ b/src/main/java/writer2latex/rdf/DocumentPart.java @@ -1,6 +1,8 @@ package writer2latex.rdf; import java.io.StringWriter; +import java.util.HashMap; +import java.util.Set; import javax.xml.transform.OutputKeys; import javax.xml.transform.Result; @@ -22,13 +24,15 @@ import writer2latex.xhtml.XhtmlDocument; public class DocumentPart { + private static final String DC_TITLE = "dc.Title"; private XhtmlDocument excerptDoc; private String path; private String itemNumber; private String body; private String parentPath; - private String name; - private String order; + private String name = null; + private String order = null; + private HashMap> metadata = null; public DocumentPart(XhtmlDocument document) { this.excerptDoc = document; @@ -67,19 +71,45 @@ public class DocumentPart { public void setOrder(String order) { this.order = order; } - public String getOrder() { + public String getSequentionalNumber() { return order; } public String getParentPath() { return parentPath; } + public boolean isMasterPart() { + if (path.isEmpty()) { + return true; + } + return false; + } public String getName() { + if (metadata != null) { + Set nameSet = metadata.get(DC_TITLE); + if (nameSet != null && !nameSet.isEmpty()) { + for (String value : nameSet) { + //return first one + return value; + } + } + } if (name.isEmpty()) { return "NONAME"; } return name; } + public void setMetadata(HashMap> metadata) { + this.metadata = metadata; + } + public HashMap> getMetadata(){ + if (metadata == null) { + return new HashMap>(); + } else { + return metadata; + } + } + private void extractPath() { Element excerptContentNode = excerptDoc.getContentNode(); this.path = excerptContentNode.getAttribute("path"); @@ -126,4 +156,10 @@ public class DocumentPart { } this.body = bodyBuilder.toString(); } + public boolean isEmpty() { + if (metadata == null && body.isEmpty()) { + return true; + } + return false; + } } diff --git a/src/main/java/writer2latex/rdf/DocumentStructure.java b/src/main/java/writer2latex/rdf/DocumentStructure.java index 4e47cf9..11107bf 100644 --- a/src/main/java/writer2latex/rdf/DocumentStructure.java +++ b/src/main/java/writer2latex/rdf/DocumentStructure.java @@ -25,7 +25,7 @@ public class DocumentStructure { private HashMap inputParts; private final String TS = "https://iph.ras.ru/text_structures#"; private final String PARSERNAME = "w2phtml"; - private final String EXCERPT = "textExcerpt"; + private final String EXCERPT = "elenphExcerpt"; private final String TOC_ELEMENT = "TOCElement"; private final String ELENPHARTICLE = "elenphArticle"; private String documentID = "DOC_ID"; @@ -52,17 +52,12 @@ public class DocumentStructure { public DocumentStructure(Vector files,String fileName) { this(); this.documentID = fileName; - //Iterator filesIterator = files.iterator(); - //while (filesIterator.hasNext()) { - //XhtmlDocument inputDoc = filesIterator.next(); for(int i = 0 ; i< files.size();i++) { XhtmlDocument inputDoc = files.get(i); DocumentPart part = new DocumentPart(inputDoc); part.setOrder(Integer.toString(i)); addPart(part); } - - //} addEmptyParts(); } @@ -146,11 +141,89 @@ public class DocumentStructure { String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID ; Resource element = m.createResource(elementName,elenphClass); element.addProperty( RDFS.label, docPart.getName()); - + addMetadataProperties(element,docPart.getMetadata()); elements.put(docPart.getPath(), element); attachExcerpt(docPart, element); + } + private void addMetadataProperties(Resource resource, HashMap> metadata) { + Set names = metadata.keySet(); + for (String name : names) { + Set values = metadata.get(name); + if (values != null) { + for (String value : values) { + addMetadata(resource,name,value); + } + } + } + + } + private void addMetadata(Resource resource, String name, String value) { + if (isNotBlacklisted(name)) { + name = convertName(name); + if (isDefinedInOntology(resource,name)) { + Property property = m.createProperty(TS + name); + resource.addProperty( property, value); + } + } + } + private boolean isDefinedInOntology(Resource resource, String name) { + String nameSpace = resource.getNameSpace(); + if (nameSpace.contains(TS + EXCERPT)) { + if (name.equals("author") || + name.equals("bibliography") || + name.equals("keywords") || + name.equals("works") || + name.equals("affiliation") + ) { + return true; + } + } else + if (nameSpace.contains(TS + ELENPHARTICLE)) { + if (name.equals("doi") || + name.equals("firstPublication") || + //name.equals("yearAndMonth") || + name.equals("year") || + name.equals("issue") + ) { + return true; + } + } + System.out.println(resource.getNameSpace() + " " + name); + System.out.println("rightNamespace" + TS + EXCERPT ); + return false; + } + private String convertName(String name) { + if (name.equals("Affiliation")){ + return "affiliation"; + } else + if (name.equals("DOI")){ + return "doi"; + } else + if (name.equals("1st-edition")){ + return "firstPublication"; + } else + if (name.equals("year")){ + return "yearAndMonth"; + } else + if (name.equals("year.short")){ + return "year"; + }else + return name; + } + private boolean isNotBlacklisted(String name) { + if (name.equals("Filename") || + name.equals("Section") || + name.equals("dc.Title") || + name.equals("subtitle") || + name.equals("dc.Identifier") + ) { + return false; + } + + return true; + } private void createTOCItem(DocumentPart docPart) { String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath(); Resource tocItem = m.createIndividual(tocItemName,itemClass); @@ -159,8 +232,6 @@ public class DocumentStructure { Property itemNumber = m.createProperty(TS + "itemNumber"); Property hasTOCItem = m.createProperty(TS + "hasTOCItem"); tocItem.addLiteral(itemNumber, docPart.getNumber()); - - m.add(tocItem, pointsTo, elements.get(docPart.getPath())); if (!docPart.getPath().isEmpty()) { Resource parent = elements.get(docPart.getParentPath()); @@ -170,14 +241,16 @@ public class DocumentStructure { } private void attachExcerpt(DocumentPart docPart, Resource element) { - if (docPart.getBody().isEmpty()) { + if (docPart.isEmpty()) { return; } Resource excerpt = createExcerpt(docPart); excerpt.addProperty( RDFS.label, docPart.getName()); - Property hasText = m.createProperty(TS + "hasText"); element.addProperty(hasText, excerpt); + if (!docPart.isMasterPart()) { + addMetadataProperties(excerpt,docPart.getMetadata()); + } } public void createTree() {