Convert metadata from CSV to RDF
This commit is contained in:
parent
dc46b0c3ad
commit
296c8a28bd
2 changed files with 123 additions and 14 deletions
|
@ -1,6 +1,8 @@
|
||||||
package writer2latex.rdf;
|
package writer2latex.rdf;
|
||||||
|
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
import javax.xml.transform.OutputKeys;
|
import javax.xml.transform.OutputKeys;
|
||||||
import javax.xml.transform.Result;
|
import javax.xml.transform.Result;
|
||||||
|
@ -22,13 +24,15 @@ import writer2latex.xhtml.XhtmlDocument;
|
||||||
|
|
||||||
public class DocumentPart {
|
public class DocumentPart {
|
||||||
|
|
||||||
|
private static final String DC_TITLE = "dc.Title";
|
||||||
private XhtmlDocument excerptDoc;
|
private XhtmlDocument excerptDoc;
|
||||||
private String path;
|
private String path;
|
||||||
private String itemNumber;
|
private String itemNumber;
|
||||||
private String body;
|
private String body;
|
||||||
private String parentPath;
|
private String parentPath;
|
||||||
private String name;
|
private String name = null;
|
||||||
private String order;
|
private String order = null;
|
||||||
|
private HashMap<String,Set<String>> metadata = null;
|
||||||
|
|
||||||
public DocumentPart(XhtmlDocument document) {
|
public DocumentPart(XhtmlDocument document) {
|
||||||
this.excerptDoc = document;
|
this.excerptDoc = document;
|
||||||
|
@ -67,19 +71,45 @@ public class DocumentPart {
|
||||||
public void setOrder(String order) {
|
public void setOrder(String order) {
|
||||||
this.order = order;
|
this.order = order;
|
||||||
}
|
}
|
||||||
public String getOrder() {
|
public String getSequentionalNumber() {
|
||||||
return order;
|
return order;
|
||||||
}
|
}
|
||||||
public String getParentPath() {
|
public String getParentPath() {
|
||||||
return parentPath;
|
return parentPath;
|
||||||
}
|
}
|
||||||
|
public boolean isMasterPart() {
|
||||||
|
if (path.isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
public String getName() {
|
public String getName() {
|
||||||
|
if (metadata != null) {
|
||||||
|
Set<String> nameSet = metadata.get(DC_TITLE);
|
||||||
|
if (nameSet != null && !nameSet.isEmpty()) {
|
||||||
|
for (String value : nameSet) {
|
||||||
|
//return first one
|
||||||
|
return value;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if (name.isEmpty()) {
|
if (name.isEmpty()) {
|
||||||
return "NONAME";
|
return "NONAME";
|
||||||
}
|
}
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setMetadata(HashMap<String,Set<String>> metadata) {
|
||||||
|
this.metadata = metadata;
|
||||||
|
}
|
||||||
|
public HashMap<String,Set<String>> getMetadata(){
|
||||||
|
if (metadata == null) {
|
||||||
|
return new HashMap<String,Set<String>>();
|
||||||
|
} else {
|
||||||
|
return metadata;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private void extractPath() {
|
private void extractPath() {
|
||||||
Element excerptContentNode = excerptDoc.getContentNode();
|
Element excerptContentNode = excerptDoc.getContentNode();
|
||||||
this.path = excerptContentNode.getAttribute("path");
|
this.path = excerptContentNode.getAttribute("path");
|
||||||
|
@ -126,4 +156,10 @@ public class DocumentPart {
|
||||||
}
|
}
|
||||||
this.body = bodyBuilder.toString();
|
this.body = bodyBuilder.toString();
|
||||||
}
|
}
|
||||||
|
public boolean isEmpty() {
|
||||||
|
if (metadata == null && body.isEmpty()) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,7 +25,7 @@ public class DocumentStructure {
|
||||||
private HashMap<String, DocumentPart> inputParts;
|
private HashMap<String, DocumentPart> inputParts;
|
||||||
private final String TS = "https://iph.ras.ru/text_structures#";
|
private final String TS = "https://iph.ras.ru/text_structures#";
|
||||||
private final String PARSERNAME = "w2phtml";
|
private final String PARSERNAME = "w2phtml";
|
||||||
private final String EXCERPT = "textExcerpt";
|
private final String EXCERPT = "elenphExcerpt";
|
||||||
private final String TOC_ELEMENT = "TOCElement";
|
private final String TOC_ELEMENT = "TOCElement";
|
||||||
private final String ELENPHARTICLE = "elenphArticle";
|
private final String ELENPHARTICLE = "elenphArticle";
|
||||||
private String documentID = "DOC_ID";
|
private String documentID = "DOC_ID";
|
||||||
|
@ -52,17 +52,12 @@ public class DocumentStructure {
|
||||||
public DocumentStructure(Vector<XhtmlDocument> files,String fileName) {
|
public DocumentStructure(Vector<XhtmlDocument> files,String fileName) {
|
||||||
this();
|
this();
|
||||||
this.documentID = fileName;
|
this.documentID = fileName;
|
||||||
//Iterator<XhtmlDocument> filesIterator = files.iterator();
|
|
||||||
//while (filesIterator.hasNext()) {
|
|
||||||
//XhtmlDocument inputDoc = filesIterator.next();
|
|
||||||
for(int i = 0 ; i< files.size();i++) {
|
for(int i = 0 ; i< files.size();i++) {
|
||||||
XhtmlDocument inputDoc = files.get(i);
|
XhtmlDocument inputDoc = files.get(i);
|
||||||
DocumentPart part = new DocumentPart(inputDoc);
|
DocumentPart part = new DocumentPart(inputDoc);
|
||||||
part.setOrder(Integer.toString(i));
|
part.setOrder(Integer.toString(i));
|
||||||
addPart(part);
|
addPart(part);
|
||||||
}
|
}
|
||||||
|
|
||||||
//}
|
|
||||||
addEmptyParts();
|
addEmptyParts();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -146,11 +141,89 @@ public class DocumentStructure {
|
||||||
String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID ;
|
String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID ;
|
||||||
Resource element = m.createResource(elementName,elenphClass);
|
Resource element = m.createResource(elementName,elenphClass);
|
||||||
element.addProperty( RDFS.label, docPart.getName());
|
element.addProperty( RDFS.label, docPart.getName());
|
||||||
|
addMetadataProperties(element,docPart.getMetadata());
|
||||||
elements.put(docPart.getPath(), element);
|
elements.put(docPart.getPath(), element);
|
||||||
attachExcerpt(docPart, element);
|
attachExcerpt(docPart, element);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private void addMetadataProperties(Resource resource, HashMap<String, Set<String>> metadata) {
|
||||||
|
Set<String> names = metadata.keySet();
|
||||||
|
for (String name : names) {
|
||||||
|
Set<String> values = metadata.get(name);
|
||||||
|
if (values != null) {
|
||||||
|
for (String value : values) {
|
||||||
|
addMetadata(resource,name,value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
private void addMetadata(Resource resource, String name, String value) {
|
||||||
|
if (isNotBlacklisted(name)) {
|
||||||
|
name = convertName(name);
|
||||||
|
if (isDefinedInOntology(resource,name)) {
|
||||||
|
Property property = m.createProperty(TS + name);
|
||||||
|
resource.addProperty( property, value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
private boolean isDefinedInOntology(Resource resource, String name) {
|
||||||
|
String nameSpace = resource.getNameSpace();
|
||||||
|
if (nameSpace.contains(TS + EXCERPT)) {
|
||||||
|
if (name.equals("author") ||
|
||||||
|
name.equals("bibliography") ||
|
||||||
|
name.equals("keywords") ||
|
||||||
|
name.equals("works") ||
|
||||||
|
name.equals("affiliation")
|
||||||
|
) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} else
|
||||||
|
if (nameSpace.contains(TS + ELENPHARTICLE)) {
|
||||||
|
if (name.equals("doi") ||
|
||||||
|
name.equals("firstPublication") ||
|
||||||
|
//name.equals("yearAndMonth") ||
|
||||||
|
name.equals("year") ||
|
||||||
|
name.equals("issue")
|
||||||
|
) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
System.out.println(resource.getNameSpace() + " " + name);
|
||||||
|
System.out.println("rightNamespace" + TS + EXCERPT );
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
private String convertName(String name) {
|
||||||
|
if (name.equals("Affiliation")){
|
||||||
|
return "affiliation";
|
||||||
|
} else
|
||||||
|
if (name.equals("DOI")){
|
||||||
|
return "doi";
|
||||||
|
} else
|
||||||
|
if (name.equals("1st-edition")){
|
||||||
|
return "firstPublication";
|
||||||
|
} else
|
||||||
|
if (name.equals("year")){
|
||||||
|
return "yearAndMonth";
|
||||||
|
} else
|
||||||
|
if (name.equals("year.short")){
|
||||||
|
return "year";
|
||||||
|
}else
|
||||||
|
return name;
|
||||||
|
}
|
||||||
|
private boolean isNotBlacklisted(String name) {
|
||||||
|
if (name.equals("Filename") ||
|
||||||
|
name.equals("Section") ||
|
||||||
|
name.equals("dc.Title") ||
|
||||||
|
name.equals("subtitle") ||
|
||||||
|
name.equals("dc.Identifier")
|
||||||
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
private void createTOCItem(DocumentPart docPart) {
|
private void createTOCItem(DocumentPart docPart) {
|
||||||
String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
|
String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
|
||||||
Resource tocItem = m.createIndividual(tocItemName,itemClass);
|
Resource tocItem = m.createIndividual(tocItemName,itemClass);
|
||||||
|
@ -159,8 +232,6 @@ public class DocumentStructure {
|
||||||
Property itemNumber = m.createProperty(TS + "itemNumber");
|
Property itemNumber = m.createProperty(TS + "itemNumber");
|
||||||
Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
|
Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
|
||||||
tocItem.addLiteral(itemNumber, docPart.getNumber());
|
tocItem.addLiteral(itemNumber, docPart.getNumber());
|
||||||
|
|
||||||
|
|
||||||
m.add(tocItem, pointsTo, elements.get(docPart.getPath()));
|
m.add(tocItem, pointsTo, elements.get(docPart.getPath()));
|
||||||
if (!docPart.getPath().isEmpty()) {
|
if (!docPart.getPath().isEmpty()) {
|
||||||
Resource parent = elements.get(docPart.getParentPath());
|
Resource parent = elements.get(docPart.getParentPath());
|
||||||
|
@ -170,14 +241,16 @@ public class DocumentStructure {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void attachExcerpt(DocumentPart docPart, Resource element) {
|
private void attachExcerpt(DocumentPart docPart, Resource element) {
|
||||||
if (docPart.getBody().isEmpty()) {
|
if (docPart.isEmpty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
Resource excerpt = createExcerpt(docPart);
|
Resource excerpt = createExcerpt(docPart);
|
||||||
excerpt.addProperty( RDFS.label, docPart.getName());
|
excerpt.addProperty( RDFS.label, docPart.getName());
|
||||||
|
|
||||||
Property hasText = m.createProperty(TS + "hasText");
|
Property hasText = m.createProperty(TS + "hasText");
|
||||||
element.addProperty(hasText, excerpt);
|
element.addProperty(hasText, excerpt);
|
||||||
|
if (!docPart.isMasterPart()) {
|
||||||
|
addMetadataProperties(excerpt,docPart.getMetadata());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void createTree() {
|
public void createTree() {
|
||||||
|
|
Loading…
Add table
Reference in a new issue