Convert metadata from csv to rdf
This commit is contained in:
parent
dc46b0c3ad
commit
296c8a28bd
2 changed files with 123 additions and 14 deletions
|
@ -1,6 +1,8 @@
|
|||
package writer2latex.rdf;
|
||||
|
||||
import java.io.StringWriter;
|
||||
import java.util.HashMap;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.Result;
|
||||
|
@ -22,13 +24,15 @@ import writer2latex.xhtml.XhtmlDocument;
|
|||
|
||||
public class DocumentPart {
|
||||
|
||||
private static final String DC_TITLE = "dc.Title";
|
||||
private XhtmlDocument excerptDoc;
|
||||
private String path;
|
||||
private String itemNumber;
|
||||
private String body;
|
||||
private String parentPath;
|
||||
private String name;
|
||||
private String order;
|
||||
private String name = null;
|
||||
private String order = null;
|
||||
private HashMap<String,Set<String>> metadata = null;
|
||||
|
||||
public DocumentPart(XhtmlDocument document) {
|
||||
this.excerptDoc = document;
|
||||
|
@ -67,19 +71,45 @@ public class DocumentPart {
|
|||
public void setOrder(String order) {
|
||||
this.order = order;
|
||||
}
|
||||
public String getOrder() {
|
||||
public String getSequentionalNumber() {
|
||||
return order;
|
||||
}
|
||||
public String getParentPath() {
|
||||
return parentPath;
|
||||
}
|
||||
public boolean isMasterPart() {
|
||||
if (path.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
public String getName() {
|
||||
if (metadata != null) {
|
||||
Set<String> nameSet = metadata.get(DC_TITLE);
|
||||
if (nameSet != null && !nameSet.isEmpty()) {
|
||||
for (String value : nameSet) {
|
||||
//return first one
|
||||
return value;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (name.isEmpty()) {
|
||||
return "NONAME";
|
||||
}
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setMetadata(HashMap<String,Set<String>> metadata) {
|
||||
this.metadata = metadata;
|
||||
}
|
||||
public HashMap<String,Set<String>> getMetadata(){
|
||||
if (metadata == null) {
|
||||
return new HashMap<String,Set<String>>();
|
||||
} else {
|
||||
return metadata;
|
||||
}
|
||||
}
|
||||
|
||||
private void extractPath() {
|
||||
Element excerptContentNode = excerptDoc.getContentNode();
|
||||
this.path = excerptContentNode.getAttribute("path");
|
||||
|
@ -126,4 +156,10 @@ public class DocumentPart {
|
|||
}
|
||||
this.body = bodyBuilder.toString();
|
||||
}
|
||||
public boolean isEmpty() {
|
||||
if (metadata == null && body.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -25,7 +25,7 @@ public class DocumentStructure {
|
|||
private HashMap<String, DocumentPart> inputParts;
|
||||
private final String TS = "https://iph.ras.ru/text_structures#";
|
||||
private final String PARSERNAME = "w2phtml";
|
||||
private final String EXCERPT = "textExcerpt";
|
||||
private final String EXCERPT = "elenphExcerpt";
|
||||
private final String TOC_ELEMENT = "TOCElement";
|
||||
private final String ELENPHARTICLE = "elenphArticle";
|
||||
private String documentID = "DOC_ID";
|
||||
|
@ -52,17 +52,12 @@ public class DocumentStructure {
|
|||
public DocumentStructure(Vector<XhtmlDocument> files,String fileName) {
|
||||
this();
|
||||
this.documentID = fileName;
|
||||
//Iterator<XhtmlDocument> filesIterator = files.iterator();
|
||||
//while (filesIterator.hasNext()) {
|
||||
//XhtmlDocument inputDoc = filesIterator.next();
|
||||
for(int i = 0 ; i< files.size();i++) {
|
||||
XhtmlDocument inputDoc = files.get(i);
|
||||
DocumentPart part = new DocumentPart(inputDoc);
|
||||
part.setOrder(Integer.toString(i));
|
||||
addPart(part);
|
||||
}
|
||||
|
||||
//}
|
||||
addEmptyParts();
|
||||
}
|
||||
|
||||
|
@ -146,11 +141,89 @@ public class DocumentStructure {
|
|||
String elementName = TS + ELENPHARTICLE + "/" + PARSERNAME + "_" + documentID ;
|
||||
Resource element = m.createResource(elementName,elenphClass);
|
||||
element.addProperty( RDFS.label, docPart.getName());
|
||||
|
||||
addMetadataProperties(element,docPart.getMetadata());
|
||||
elements.put(docPart.getPath(), element);
|
||||
attachExcerpt(docPart, element);
|
||||
|
||||
}
|
||||
|
||||
private void addMetadataProperties(Resource resource, HashMap<String, Set<String>> metadata) {
|
||||
Set<String> names = metadata.keySet();
|
||||
for (String name : names) {
|
||||
Set<String> values = metadata.get(name);
|
||||
if (values != null) {
|
||||
for (String value : values) {
|
||||
addMetadata(resource,name,value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
private void addMetadata(Resource resource, String name, String value) {
|
||||
if (isNotBlacklisted(name)) {
|
||||
name = convertName(name);
|
||||
if (isDefinedInOntology(resource,name)) {
|
||||
Property property = m.createProperty(TS + name);
|
||||
resource.addProperty( property, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
private boolean isDefinedInOntology(Resource resource, String name) {
|
||||
String nameSpace = resource.getNameSpace();
|
||||
if (nameSpace.contains(TS + EXCERPT)) {
|
||||
if (name.equals("author") ||
|
||||
name.equals("bibliography") ||
|
||||
name.equals("keywords") ||
|
||||
name.equals("works") ||
|
||||
name.equals("affiliation")
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
} else
|
||||
if (nameSpace.contains(TS + ELENPHARTICLE)) {
|
||||
if (name.equals("doi") ||
|
||||
name.equals("firstPublication") ||
|
||||
//name.equals("yearAndMonth") ||
|
||||
name.equals("year") ||
|
||||
name.equals("issue")
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
System.out.println(resource.getNameSpace() + " " + name);
|
||||
System.out.println("rightNamespace" + TS + EXCERPT );
|
||||
return false;
|
||||
}
|
||||
private String convertName(String name) {
|
||||
if (name.equals("Affiliation")){
|
||||
return "affiliation";
|
||||
} else
|
||||
if (name.equals("DOI")){
|
||||
return "doi";
|
||||
} else
|
||||
if (name.equals("1st-edition")){
|
||||
return "firstPublication";
|
||||
} else
|
||||
if (name.equals("year")){
|
||||
return "yearAndMonth";
|
||||
} else
|
||||
if (name.equals("year.short")){
|
||||
return "year";
|
||||
}else
|
||||
return name;
|
||||
}
|
||||
private boolean isNotBlacklisted(String name) {
|
||||
if (name.equals("Filename") ||
|
||||
name.equals("Section") ||
|
||||
name.equals("dc.Title") ||
|
||||
name.equals("subtitle") ||
|
||||
name.equals("dc.Identifier")
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
private void createTOCItem(DocumentPart docPart) {
|
||||
String tocItemName = TS + TOCITEM + "/" + PARSERNAME + "_" + documentID + docPart.getSafePath();
|
||||
Resource tocItem = m.createIndividual(tocItemName,itemClass);
|
||||
|
@ -159,8 +232,6 @@ public class DocumentStructure {
|
|||
Property itemNumber = m.createProperty(TS + "itemNumber");
|
||||
Property hasTOCItem = m.createProperty(TS + "hasTOCItem");
|
||||
tocItem.addLiteral(itemNumber, docPart.getNumber());
|
||||
|
||||
|
||||
m.add(tocItem, pointsTo, elements.get(docPart.getPath()));
|
||||
if (!docPart.getPath().isEmpty()) {
|
||||
Resource parent = elements.get(docPart.getParentPath());
|
||||
|
@ -170,14 +241,16 @@ public class DocumentStructure {
|
|||
}
|
||||
|
||||
private void attachExcerpt(DocumentPart docPart, Resource element) {
|
||||
if (docPart.getBody().isEmpty()) {
|
||||
if (docPart.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
Resource excerpt = createExcerpt(docPart);
|
||||
excerpt.addProperty( RDFS.label, docPart.getName());
|
||||
|
||||
Property hasText = m.createProperty(TS + "hasText");
|
||||
element.addProperty(hasText, excerpt);
|
||||
if (!docPart.isMasterPart()) {
|
||||
addMetadataProperties(excerpt,docPart.getMetadata());
|
||||
}
|
||||
}
|
||||
|
||||
public void createTree() {
|
||||
|
|
Loading…
Add table
Reference in a new issue