Convert metadata from CSV to RDF

This commit is contained in:
Georgy Litvinov 2020-03-02 19:55:27 +01:00
parent dc46b0c3ad
commit 296c8a28bd
2 changed files with 123 additions and 14 deletions

View file

@ -1,6 +1,8 @@
package writer2latex.rdf;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Set;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
@ -22,13 +24,15 @@ import writer2latex.xhtml.XhtmlDocument;
public class DocumentPart {
private static final String DC_TITLE = "dc.Title";
private XhtmlDocument excerptDoc;
private String path;
private String itemNumber;
private String body;
private String parentPath;
private String name;
private String order;
private String name = null;
private String order = null;
private HashMap<String,Set<String>> metadata = null;
public DocumentPart(XhtmlDocument document) {
this.excerptDoc = document;
@ -67,19 +71,45 @@ public class DocumentPart {
/**
 * Stores the order value of this part.
 *
 * @param order value to keep in the {@code order} field; it is stored as given,
 *              without validation (presumably a position index rendered as text — confirm against callers)
 */
public void setOrder(String order) {
this.order = order;
}
public String getOrder() {
public String getSequentionalNumber() {
return order;
}
/**
 * Returns the value of the {@code parentPath} field.
 *
 * @return the stored parent path; may be {@code null} if it has not been assigned yet
 *         (the assignment is not visible in this part of the file)
 */
public String getParentPath() {
return parentPath;
}
/**
 * Tells whether this part is the master part, i.e. whether its path is the empty string.
 *
 * @return {@code true} when {@code path} is empty, {@code false} otherwise
 */
public boolean isMasterPart() {
    return path.isEmpty();
}
/**
 * Returns the display name of this part.
 *
 * <p>Lookup order:
 * <ol>
 *   <li>the first value found under the {@code DC_TITLE} key in the metadata, if any;</li>
 *   <li>the {@code name} field, if it is set and not empty;</li>
 *   <li>the literal {@code "NONAME"}.</li>
 * </ol>
 *
 * @return the resolved name; {@code "NONAME"} when neither metadata nor the
 *         {@code name} field provides one
 */
public String getName() {
    if (metadata != null) {
        Set<String> titles = metadata.get(DC_TITLE);
        if (titles != null && !titles.isEmpty()) {
            // Only one title is used: whichever the set yields first.
            return titles.iterator().next();
        }
    }
    // The name field may still be null (it has no non-null default), so guard
    // against it instead of failing with a NullPointerException.
    if (name == null || name.isEmpty()) {
        return "NONAME";
    }
    return name;
}
/**
 * Replaces the metadata of this part.
 *
 * @param metadata map from metadata field name to the set of values for that field;
 *                 the reference is stored as-is (no copy is made) and may be {@code null}
 */
public void setMetadata(HashMap<String,Set<String>> metadata) {
this.metadata = metadata;
}
/**
 * Returns the metadata of this part, never {@code null}.
 *
 * @return the stored map when one has been set; otherwise a new empty map,
 *         which is not stored back into this part
 */
public HashMap<String,Set<String>> getMetadata(){
    return metadata != null ? metadata : new HashMap<String,Set<String>>();
}
private void extractPath() {
Element excerptContentNode = excerptDoc.getContentNode();
this.path = excerptContentNode.getAttribute("path");
@ -126,4 +156,10 @@ public class DocumentPart {
}
this.body = bodyBuilder.toString();
}
/**
 * Tells whether this part carries neither metadata nor body text.
 *
 * @return {@code true} only when no metadata map has been set and the body is
 *         the empty string; a non-null metadata map makes the part non-empty
 *         even if that map has no entries
 */
public boolean isEmpty() {
    return metadata == null && body.isEmpty();
}
}

View file

@ -25,7 +25,7 @@ public class DocumentStructure {
private HashMap<String, DocumentPart> inputParts;
private final String TS = "https://iph.ras.ru/text_structures#";
private final String PARSERNAME = "w2phtml";
private final String EXCERPT = "textExcerpt";
private final String EXCERPT = "elenphExcerpt";
private final String TOC_ELEMENT = "TOCElement";
private final String ELENPHARTICLE = "elenphArticle";
private String documentID = "DOC_ID";
@ -52,17 +52,12 @@ public class DocumentStructure {
/**
 * Builds a document structure from a list of converted documents.
 *
 * <p>Each input document is wrapped in a {@code DocumentPart}, tagged with its
 * zero-based position in {@code files} (as a decimal string) and registered via
 * {@code addPart}. After all inputs are registered, {@code addEmptyParts} is called.
 *
 * @param files    input documents, processed in list order
 * @param fileName value used as the document ID
 */
public DocumentStructure(Vector<XhtmlDocument> files,String fileName) {
    this();
    this.documentID = fileName;
    int position = 0;
    for (XhtmlDocument inputDoc : files) {
        DocumentPart part = new DocumentPart(inputDoc);
        part.setOrder(Integer.toString(position));
        addPart(part);
        position++;
    }
    addEmptyParts();
}
@ -146,11 +141,89 @@ public class DocumentStructure {
String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID ;
Resource element = m.createResource(elementName,elenphClass);
element.addProperty( RDFS.label, docPart.getName());
addMetadataProperties(element,docPart.getMetadata());
elements.put(docPart.getPath(), element);
attachExcerpt(docPart, element);
}
/**
 * Adds every metadata value of a part to the given resource.
 *
 * <p>Each (field name, value) pair is handed to {@code addMetadata}, which
 * decides whether the pair actually becomes a property.
 *
 * @param resource resource that receives the properties
 * @param metadata map from field name to its values; must not be {@code null},
 *                 entries whose value set is {@code null} are skipped
 */
private void addMetadataProperties(Resource resource, HashMap<String, Set<String>> metadata) {
    for (String key : metadata.keySet()) {
        Set<String> fieldValues = metadata.get(key);
        if (fieldValues == null) {
            continue;
        }
        for (String fieldValue : fieldValues) {
            addMetadata(resource, key, fieldValue);
        }
    }
}
/**
 * Adds a single metadata value to a resource, if the field is acceptable.
 *
 * <p>The field is dropped when it is blacklisted, or when its converted name is
 * not accepted by {@code isDefinedInOntology} for this resource. Otherwise a
 * property named {@code TS + convertedName} is created and added with the value.
 *
 * @param resource resource that receives the property
 * @param name     metadata field name as found in the input
 * @param value    metadata value to attach
 */
private void addMetadata(Resource resource, String name, String value) {
    if (!isNotBlacklisted(name)) {
        return;
    }
    String ontologyName = convertName(name);
    if (!isDefinedInOntology(resource, ontologyName)) {
        return;
    }
    Property property = m.createProperty(TS + ontologyName);
    resource.addProperty(property, value);
}
/**
 * Checks whether a (converted) property name is accepted for the kind of
 * resource it would be attached to.
 *
 * <p>The kind is derived from the resource namespace: a namespace containing
 * {@code TS + EXCERPT} accepts the excerpt properties, otherwise a namespace
 * containing {@code TS + ELENPHARTICLE} accepts the article properties.
 * Rejected names are reported on standard output.
 *
 * @param resource resource whose namespace selects the accepted property set
 * @param name     property name, already converted to its ontology form
 * @return {@code true} when the name is accepted for this resource
 */
private boolean isDefinedInOntology(Resource resource, String name) {
    String nameSpace = resource.getNameSpace();
    boolean defined = false;
    if (nameSpace.contains(TS + EXCERPT)) {
        defined = name.equals("author")
                || name.equals("bibliography")
                || name.equals("keywords")
                || name.equals("works")
                || name.equals("affiliation");
    } else if (nameSpace.contains(TS + ELENPHARTICLE)) {
        // "yearAndMonth" is intentionally absent here (it was commented out in the original list).
        defined = name.equals("doi")
                || name.equals("firstPublication")
                || name.equals("year")
                || name.equals("issue");
    }
    if (defined) {
        return true;
    }
    // NOTE(review): these look like leftover debugging prints; confirm whether
    // they should go through a logger or be removed.
    System.out.println(resource.getNameSpace() + " " + name);
    System.out.println("rightNamespace" + TS + EXCERPT );
    return false;
}
/**
 * Translates a metadata field name from the input into the property name used
 * when building the RDF property.
 *
 * <p>Known names are mapped as follows; any other name is returned unchanged:
 * {@code "Affiliation" -> "affiliation"}, {@code "DOI" -> "doi"},
 * {@code "1st-edition" -> "firstPublication"}, {@code "year" -> "yearAndMonth"},
 * {@code "year.short" -> "year"}. Matching is case-sensitive.
 *
 * @param name input field name; must not be {@code null}
 * @return the mapped property name, or {@code name} itself when no mapping exists
 */
private String convertName(String name) {
    if (name.equals("Affiliation")) {
        return "affiliation";
    }
    if (name.equals("DOI")) {
        return "doi";
    }
    if (name.equals("1st-edition")) {
        return "firstPublication";
    }
    if (name.equals("year")) {
        return "yearAndMonth";
    }
    if (name.equals("year.short")) {
        return "year";
    }
    return name;
}
/**
 * Tells whether a metadata field may be considered for export as a property.
 *
 * <p>The fields {@code "Filename"}, {@code "Section"}, {@code "dc.Title"},
 * {@code "subtitle"} and {@code "dc.Identifier"} are blacklisted; matching is
 * case-sensitive.
 *
 * @param name input field name; must not be {@code null}
 * @return {@code false} for a blacklisted field, {@code true} for any other name
 */
private boolean isNotBlacklisted(String name) {
    boolean blacklisted = name.equals("Filename")
            || name.equals("Section")
            || name.equals("dc.Title")
            || name.equals("subtitle")
            || name.equals("dc.Identifier");
    return !blacklisted;
}
private void createTOCItem(DocumentPart docPart) {
String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
Resource tocItem = m.createIndividual(tocItemName,itemClass);
@ -159,8 +232,6 @@ public class DocumentStructure {
Property itemNumber = m.createProperty(TS + "itemNumber");
Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
tocItem.addLiteral(itemNumber, docPart.getNumber());
m.add(tocItem, pointsTo, elements.get(docPart.getPath()));
if (!docPart.getPath().isEmpty()) {
Resource parent = elements.get(docPart.getParentPath());
@ -170,14 +241,16 @@ public class DocumentStructure {
}
private void attachExcerpt(DocumentPart docPart, Resource element) {
if (docPart.getBody().isEmpty()) {
if (docPart.isEmpty()) {
return;
}
Resource excerpt = createExcerpt(docPart);
excerpt.addProperty( RDFS.label, docPart.getName());
Property hasText = m.createProperty(TS + "hasText");
element.addProperty(hasText, excerpt);
if (!docPart.isMasterPart()) {
addMetadataProperties(excerpt,docPart.getMetadata());
}
}
public void createTree() {