Convert metadata from csv to rdf

This commit is contained in:
Georgy Litvinov 2020-03-02 19:55:27 +01:00
parent dc46b0c3ad
commit 296c8a28bd
2 changed files with 123 additions and 14 deletions

View file

@ -1,6 +1,8 @@
package writer2latex.rdf; package writer2latex.rdf;
import java.io.StringWriter; import java.io.StringWriter;
import java.util.HashMap;
import java.util.Set;
import javax.xml.transform.OutputKeys; import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result; import javax.xml.transform.Result;
@ -22,13 +24,15 @@ import writer2latex.xhtml.XhtmlDocument;
public class DocumentPart { public class DocumentPart {
private static final String DC_TITLE = "dc.Title";
private XhtmlDocument excerptDoc; private XhtmlDocument excerptDoc;
private String path; private String path;
private String itemNumber; private String itemNumber;
private String body; private String body;
private String parentPath; private String parentPath;
private String name; private String name = null;
private String order; private String order = null;
private HashMap<String,Set<String>> metadata = null;
public DocumentPart(XhtmlDocument document) { public DocumentPart(XhtmlDocument document) {
this.excerptDoc = document; this.excerptDoc = document;
@ -67,19 +71,45 @@ public class DocumentPart {
public void setOrder(String order) { public void setOrder(String order) {
this.order = order; this.order = order;
} }
public String getOrder() { public String getSequentionalNumber() {
return order; return order;
} }
public String getParentPath() { public String getParentPath() {
return parentPath; return parentPath;
} }
/**
 * Tells whether this part is the master part.
 *
 * @return {@code true} exactly when the part's {@code path} is the empty string
 */
public boolean isMasterPart() {
    return path.isEmpty();
}
public String getName() { public String getName() {
if (metadata != null) {
Set<String> nameSet = metadata.get(DC_TITLE);
if (nameSet != null && !nameSet.isEmpty()) {
for (String value : nameSet) {
//return first one
return value;
}
}
}
if (name.isEmpty()) { if (name.isEmpty()) {
return "NONAME"; return "NONAME";
} }
return name; return name;
} }
/**
 * Attaches a metadata map to this part.
 *
 * <p>The reference is stored as given (no copy is made). Passing {@code null}
 * leaves the part without metadata.</p>
 *
 * @param metadata map from metadata name to its set of values
 */
public void setMetadata(HashMap<String,Set<String>> metadata) {
this.metadata = metadata;
}
/**
 * Gives access to the metadata of this part without ever returning {@code null}.
 *
 * @return the stored metadata map, or a newly created empty map when none has
 *         been set (the new map is not retained by this part)
 */
public HashMap<String,Set<String>> getMetadata(){
    return metadata != null ? metadata : new HashMap<String,Set<String>>();
}
private void extractPath() { private void extractPath() {
Element excerptContentNode = excerptDoc.getContentNode(); Element excerptContentNode = excerptDoc.getContentNode();
this.path = excerptContentNode.getAttribute("path"); this.path = excerptContentNode.getAttribute("path");
@ -126,4 +156,10 @@ public class DocumentPart {
} }
this.body = bodyBuilder.toString(); this.body = bodyBuilder.toString();
} }
/**
 * Tells whether this part carries no content at all.
 *
 * @return {@code true} only when no metadata map has been set and the body
 *         string is empty
 */
public boolean isEmpty() {
    return metadata == null && body.isEmpty();
}
} }

View file

@ -25,7 +25,7 @@ public class DocumentStructure {
private HashMap<String, DocumentPart> inputParts; private HashMap<String, DocumentPart> inputParts;
private final String TS = "https://iph.ras.ru/text_structures#"; private final String TS = "https://iph.ras.ru/text_structures#";
private final String PARSERNAME = "w2phtml"; private final String PARSERNAME = "w2phtml";
private final String EXCERPT = "textExcerpt"; private final String EXCERPT = "elenphExcerpt";
private final String TOC_ELEMENT = "TOCElement"; private final String TOC_ELEMENT = "TOCElement";
private final String ELENPHARTICLE = "elenphArticle"; private final String ELENPHARTICLE = "elenphArticle";
private String documentID = "DOC_ID"; private String documentID = "DOC_ID";
@ -52,17 +52,12 @@ public class DocumentStructure {
public DocumentStructure(Vector<XhtmlDocument> files,String fileName) { public DocumentStructure(Vector<XhtmlDocument> files,String fileName) {
this(); this();
this.documentID = fileName; this.documentID = fileName;
//Iterator<XhtmlDocument> filesIterator = files.iterator();
//while (filesIterator.hasNext()) {
//XhtmlDocument inputDoc = filesIterator.next();
for(int i = 0 ; i< files.size();i++) { for(int i = 0 ; i< files.size();i++) {
XhtmlDocument inputDoc = files.get(i); XhtmlDocument inputDoc = files.get(i);
DocumentPart part = new DocumentPart(inputDoc); DocumentPart part = new DocumentPart(inputDoc);
part.setOrder(Integer.toString(i)); part.setOrder(Integer.toString(i));
addPart(part); addPart(part);
} }
//}
addEmptyParts(); addEmptyParts();
} }
@ -146,11 +141,89 @@ public class DocumentStructure {
String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID ; String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID ;
Resource element = m.createResource(elementName,elenphClass); Resource element = m.createResource(elementName,elenphClass);
element.addProperty( RDFS.label, docPart.getName()); element.addProperty( RDFS.label, docPart.getName());
addMetadataProperties(element,docPart.getMetadata());
elements.put(docPart.getPath(), element); elements.put(docPart.getPath(), element);
attachExcerpt(docPart, element); attachExcerpt(docPart, element);
} }
/**
 * Passes every (name, value) pair of the metadata map to
 * {@code addMetadata} for the given resource.
 *
 * <p>Names that map to {@code null} are skipped.</p>
 *
 * @param resource resource the metadata belongs to
 * @param metadata map from metadata name to its set of values; must not be {@code null}
 */
private void addMetadataProperties(Resource resource, HashMap<String, Set<String>> metadata) {
    for (String key : metadata.keySet()) {
        Set<String> entries = metadata.get(key);
        if (entries == null) {
            continue;
        }
        for (String entry : entries) {
            addMetadata(resource, key, entry);
        }
    }
}
/**
 * Adds one metadata value to the resource as a property in the {@code TS}
 * namespace, provided the name passes both filters.
 *
 * <p>The name is first checked against the blacklist, then translated with
 * {@code convertName}, and the translated name is finally checked with
 * {@code isDefinedInOntology}. If either check fails, nothing is added.</p>
 *
 * @param resource resource that receives the property
 * @param name     metadata name as found in the input
 * @param value    literal value of the property
 */
private void addMetadata(Resource resource, String name, String value) {
    if (!isNotBlacklisted(name)) {
        return;
    }
    String ontologyName = convertName(name);
    if (!isDefinedInOntology(resource, ontologyName)) {
        return;
    }
    Property property = m.createProperty(TS + ontologyName);
    resource.addProperty(property, value);
}
/**
 * Decides whether a (converted) property name is accepted for the kind of
 * resource it would be attached to.
 *
 * <p>The kind is derived from the resource's namespace: a namespace containing
 * {@code TS + EXCERPT} selects the excerpt list, otherwise one containing
 * {@code TS + ELENPHARTICLE} selects the article list. A rejected name is
 * reported on standard output together with the resource namespace.</p>
 *
 * @param resource resource the property is meant for
 * @param name     property name after conversion
 * @return {@code true} when the name is in the list selected by the namespace
 */
private boolean isDefinedInOntology(Resource resource, String name) {
    String nameSpace = resource.getNameSpace();
    boolean accepted = false;
    if (nameSpace.contains(TS + EXCERPT)) {
        accepted = name.equals("author")
                || name.equals("bibliography")
                || name.equals("keywords")
                || name.equals("works")
                || name.equals("affiliation");
    } else if (nameSpace.contains(TS + ELENPHARTICLE)) {
        // "yearAndMonth" is not accepted here (its check is disabled).
        accepted = name.equals("doi")
                || name.equals("firstPublication")
                || name.equals("year")
                || name.equals("issue");
    }
    if (accepted) {
        return true;
    }
    // Diagnostic output for names that were rejected.
    System.out.println(nameSpace + " " + name);
    System.out.println("rightNamespace" + TS + EXCERPT );
    return false;
}
/**
 * Translates a metadata name from the input into the name used for the
 * RDF property.
 *
 * @param name metadata name as found in the input; must not be {@code null}
 * @return the mapped name, or {@code name} itself when no mapping exists
 */
private String convertName(String name) {
    // Each row is { input name, property name }; rows are tried in order.
    String[][] renames = {
        { "Affiliation", "affiliation" },
        { "DOI", "doi" },
        { "1st-edition", "firstPublication" },
        { "year", "yearAndMonth" },
        { "year.short", "year" }
    };
    for (String[] rename : renames) {
        if (name.equals(rename[0])) {
            return rename[1];
        }
    }
    return name;
}
/**
 * Checks that a metadata name is allowed to be turned into a property.
 *
 * @param name metadata name as found in the input; must not be {@code null}
 * @return {@code false} for the names listed below, {@code true} for any other name
 */
private boolean isNotBlacklisted(String name) {
    String[] blacklisted = {
        "Filename",
        "Section",
        "dc.Title",
        "subtitle",
        "dc.Identifier"
    };
    for (String forbidden : blacklisted) {
        if (name.equals(forbidden)) {
            return false;
        }
    }
    return true;
}
private void createTOCItem(DocumentPart docPart) { private void createTOCItem(DocumentPart docPart) {
String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath(); String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
Resource tocItem = m.createIndividual(tocItemName,itemClass); Resource tocItem = m.createIndividual(tocItemName,itemClass);
@ -159,8 +232,6 @@ public class DocumentStructure {
Property itemNumber = m.createProperty(TS + "itemNumber"); Property itemNumber = m.createProperty(TS + "itemNumber");
Property hasTOCItem = m.createProperty(TS + "hasTOCItem"); Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
tocItem.addLiteral(itemNumber, docPart.getNumber()); tocItem.addLiteral(itemNumber, docPart.getNumber());
m.add(tocItem, pointsTo, elements.get(docPart.getPath())); m.add(tocItem, pointsTo, elements.get(docPart.getPath()));
if (!docPart.getPath().isEmpty()) { if (!docPart.getPath().isEmpty()) {
Resource parent = elements.get(docPart.getParentPath()); Resource parent = elements.get(docPart.getParentPath());
@ -170,14 +241,16 @@ public class DocumentStructure {
} }
private void attachExcerpt(DocumentPart docPart, Resource element) { private void attachExcerpt(DocumentPart docPart, Resource element) {
if (docPart.getBody().isEmpty()) { if (docPart.isEmpty()) {
return; return;
} }
Resource excerpt = createExcerpt(docPart); Resource excerpt = createExcerpt(docPart);
excerpt.addProperty( RDFS.label, docPart.getName()); excerpt.addProperty( RDFS.label, docPart.getName());
Property hasText = m.createProperty(TS + "hasText"); Property hasText = m.createProperty(TS + "hasText");
element.addProperty(hasText, excerpt); element.addProperty(hasText, excerpt);
if (!docPart.isMasterPart()) {
addMetadataProperties(excerpt,docPart.getMetadata());
}
} }
public void createTree() { public void createTree() {