From 663090318dd783d20bb27d7248e7e255814089ff Mon Sep 17 00:00:00 2001 From: mbarbier Date: Tue, 10 May 2011 20:12:46 +0000 Subject: [PATCH] FileHarvestJob.java: added getScript() and performHarvest() methods Harvester.java: added stringsToArray(), modified API methods to use it TestFileController.java: implemented preliminary harvest --- .../controller/harvester/FileHarvestJob.java | 12 ++ .../controller/harvester/Harvester.java | 148 ++++++++----- .../harvester/TestFileController.java | 202 +++++++++++++++++- 3 files changed, 301 insertions(+), 61 deletions(-) diff --git a/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/FileHarvestJob.java b/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/FileHarvestJob.java index 31622ff0..0f6a2b94 100644 --- a/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/FileHarvestJob.java +++ b/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/FileHarvestJob.java @@ -18,5 +18,17 @@ interface FileHarvestJob { * @return null if success, message to be returned to the user if failure */ String validateUpload(File file); + + /** + * Gets the console script which can be used to run the harvest job. + * @return the console script which can be used to run the harvest job + */ + String getScript(); + + /** + * Runs a harvest on the files in the specified directory. 
+ * @param directory the directory containing files to harvest + */ + void performHarvest(File directory); } diff --git a/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/Harvester.java b/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/Harvester.java index c40d9805..15769d5e 100644 --- a/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/Harvester.java +++ b/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/Harvester.java @@ -2,6 +2,8 @@ package edu.cornell.mannlib.vitro.webapp.controller.harvester; +import java.util.ArrayList; + /* //PLEASE SEE JAVADOC COMMENT FOR CLASS BELOW import org.vivoweb.harvester.diff.Diff; import org.vivoweb.harvester.fetch.CSVtoRDF; @@ -52,99 +54,135 @@ import org.vivoweb.harvester.util.XPathTool; class Harvester { /* // diff - public static void runDiff(String ... args) { - Diff.main(args); + public static void runDiff(Object ... args) { + Diff.main(stringsToArray(args)); } // fetch - public static void runCSVtoRDF(String ... args) { - CSVtoRDF.main(args); + public static void runCSVtoRDF(Object ... args) { + CSVtoRDF.main(stringsToArray(args)); } - public static void runD2RMapFetch(String ... args) { - D2RMapFetch.main(args); + public static void runD2RMapFetch(Object ... args) { + D2RMapFetch.main(stringsToArray(args)); } - public static void runJDBCFetch(String ... args) { - JDBCFetch.main(args); + public static void runJDBCFetch(Object ... args) { + JDBCFetch.main(stringsToArray(args)); } - public static void runNLMJournalFetch(String ... args) { - NLMJournalFetch.main(args); + public static void runNLMJournalFetch(Object ... args) { + NLMJournalFetch.main(stringsToArray(args)); } - public static void runOAIFetch(String ... args) { - OAIFetch.main(args); + public static void runOAIFetch(Object ... args) { + OAIFetch.main(stringsToArray(args)); } - public static void runPubmedFetch(String ... args) { - PubmedFetch.main(args); + public static void runPubmedFetch(Object ... 
args) { + PubmedFetch.main(stringsToArray(args)); } - public static void runPubmedHTTPFetch(String ... args) { - PubmedHTTPFetch.main(args); + public static void runPubmedHTTPFetch(Object ... args) { + PubmedHTTPFetch.main(stringsToArray(args)); } // qualify - public static void runChangeNamespace(String ... args) { - ChangeNamespace.main(args); + public static void runChangeNamespace(Object ... args) { + ChangeNamespace.main(stringsToArray(args)); } - public static void runQualify(String ... args) { - Qualify.main(args); + public static void runQualify(Object ... args) { + Qualify.main(stringsToArray(args)); } - public static void runRenameBlankNodes(String ... args) { - RenameBlankNodes.main(args); + public static void runRenameBlankNodes(Object ... args) { + RenameBlankNodes.main(stringsToArray(args)); } - public static void runRenameResources(String ... args) { - RenameResources.main(args); + public static void runRenameResources(Object ... args) { + RenameResources.main(stringsToArray(args)); } - public static void runSmush(String ... args) { - Smush.main(args); + public static void runSmush(Object ... args) { + Smush.main(stringsToArray(args)); } - public static void runSplitProperty(String ... args) { - SplitProperty.main(args); + public static void runSplitProperty(Object ... args) { + SplitProperty.main(stringsToArray(args)); } // score - public static void runMatch(String ... args) { - Match.main(args); + public static void runMatch(Object ... args) { + Match.main(stringsToArray(args)); } - public static void runPubmedScore(String ... args) { - PubmedScore.main(args); + public static void runPubmedScore(Object ... args) { + PubmedScore.main(stringsToArray(args)); } - public static void runScore(String ... args) { - Score.main(args); + public static void runScore(Object ... args) { + Score.main(stringsToArray(args)); } // transfer - public static void transfer(String ... args) { - Transfer.main(args); + public static void runTransfer(Object ... 
args) { + Transfer.main(stringsToArray(args)); } // translate - public static void runGlozeTranslator(String ... args) { - GlozeTranslator.main(args); + public static void runGlozeTranslator(Object ... args) { + GlozeTranslator.main(stringsToArray(args)); } - public static void runRunBibutils(String ... args) { - RunBibutils.main(args); + public static void runRunBibutils(Object ... args) { + RunBibutils.main(stringsToArray(args)); } - public static void runSanitizeMODSXML(String ... args) { - SanitizeMODSXML.main(args); + public static void runSanitizeMODSXML(Object ... args) { + SanitizeMODSXML.main(stringsToArray(args)); } - public static void runSPARQLTranslator(String ... args) { - SPARQLTranslator.main(args); + public static void runSPARQLTranslator(Object ... args) { + SPARQLTranslator.main(stringsToArray(args)); } - public static void runXSLTranslator(String ... args) { - XSLTranslator.main(args); + public static void runXSLTranslator(Object ... args) { + XSLTranslator.main(stringsToArray(args)); } // util - public static void runCSVtoJDBC(String ... args) { - CSVtoJDBC.main(args); + public static void runCSVtoJDBC(Object ... args) { + CSVtoJDBC.main(stringsToArray(args)); } - public static void runDatabaseClone(String ... args) { - DatabaseClone.main(args); + public static void runDatabaseClone(Object ... args) { + DatabaseClone.main(stringsToArray(args)); } - public static void runMerge(String ... args) { - Merge.main(args); + public static void runMerge(Object ... args) { + Merge.main(stringsToArray(args)); } - public static void runXPathTool(String ... args) { - XPathTool.main(args); + public static void runXPathTool(Object ... args) { + XPathTool.main(stringsToArray(args)); + } + */ + + /** + * Convenience method to expand the ability to use Java's "..." arg list. 
Harvester scripts frequently declare sub-macros, + * so for example you might have: + * + * SCOREINPUT="-i $H2MODEL -ImodelName=$MODELNAME -IdbUrl=$MODELDBURL -IcheckEmpty=$CHECKEMPTY" + * SCOREDATA="-s $H2MODEL -SmodelName=$SCOREDATANAME -SdbUrl=$SCOREDATADBURL -ScheckEmpty=$CHECKEMPTY" + * SCOREMODELS="$SCOREINPUT -v $VIVOCONFIG -VcheckEmpty=$CHECKEMPTY $SCOREDATA -t $TEMPCOPYDIR -b $SCOREBATCHSIZE" + * $Score $SCOREMODELS -AGrantNumber=$EQTEST -WGrantNumber=1.0 -FGrantNumber=$GRANTIDNUM -PGrantNumber=$GRANTIDNUM -n ${BASEURI}grant/ + * + * In order to mimic this functionality for easy use in Java, this method has been created. It takes a "..." arg list of Object + * objects, and returns an array of Strings. For each object, if it's an array of Strings, each String is added to the output + * array. Otherwise, its toString() method is called and that value is added to the output array. + * + * It is intended to be used with a combination of String and String[] values, in any arbitrary order. + * + * All static Harvester methods in this class take an Object arg list rather than a String arg list, and automatically call + * this method. + * + * @param args an array of objects, which ought to be a combination of String and String[] values, in any arbitrary order + * @return all the strings put together as one array + */ + public static String[] stringsToArray(Object ... 
args) { + ArrayList<String> allData = new ArrayList<String>(); + for(int i = 0; i < args.length; i++) { + if(args[i] instanceof String[]) { + String[] array = (String[])(args[i]); + for(int j = 0; j < array.length; j++) { + allData.add(array[j]); + } + } else { + allData.add(args[i].toString()); + } + } + return allData.toArray(new String[allData.size()]); } -*/ } diff --git a/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/TestFileController.java b/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/TestFileController.java index 3a89ab51..6f31ceef 100644 --- a/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/TestFileController.java +++ b/src/edu/cornell/mannlib/vitro/webapp/controller/harvester/TestFileController.java @@ -2,8 +2,11 @@ package edu.cornell.mannlib.vitro.webapp.controller.harvester; +import java.io.BufferedReader; import java.io.File; +import java.io.FileReader; import java.io.IOException; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -235,24 +238,25 @@ public class TestFileController extends FreemarkerHttpServlet { @SuppressWarnings("unused") - private void doHarvest() - { + private void doHarvest() { /* Harvest will entail: - + D2RMapFetch Transfer to local temp model Diffs Transfers - + If this is being done with a script, then we should probably use a templating system. - run-csv.sh + run-csv.sh - */ + */ } + + @@ -361,6 +365,192 @@ class CsvHarvestJob implements FileHarvestJob { } return null; } + + @Override + public String getScript() + { + String path = ""; //todo: complete + File scriptTemplate = new File(path); + + String scriptTemplateContents = readScriptTemplate(scriptTemplate); + String replacements = performScriptTemplateReplacements(scriptTemplateContents); + return replacements; + } + + + private String performScriptTemplateReplacements(String scriptTemplateContents) { + String replacements = scriptTemplateContents; + /* + * What needs to be replaced? 
+ * + * task directory name + */ + //todo: complete + return replacements; + } + + + private String readScriptTemplate(File scriptTemplate) { + String scriptTemplateContents = null; + BufferedReader reader = null; + try { + int fileSize = (int)(scriptTemplate.length()); + char[] buffer = new char[fileSize]; + reader = new BufferedReader(new FileReader(scriptTemplate), fileSize); + reader.read(buffer); + scriptTemplateContents = new String(buffer); + } catch (IOException e) { + log.error(e, e); + } finally { + try { + if(reader != null) + reader.close(); + } catch(IOException e) { + log.error(e, e); + } + } + + return scriptTemplateContents; + } + + + @Override + public void performHarvest(File directory) { + + /* //COMMENTED OUT UNTIL HARVESTER INTEGRATION IS WORKING + String vivoconfig = "config/models/vivo.xml"; + String scorebatchsize = "100"; + String checkempty = "true"; + String namespace = ""; //todo: get namespace + String h2model = "config/models/h2-sdb.xml"; + String prevharvdburlbase = "jdbc:h2:harvested-data/prevHarvs/"; + String tfrh = "config/recordhandlers/h2-jdbc.xml"; + + String harvesterTask = "csv"; + + String basedir = "harvested-data/" + harvesterTask; + + String rawrhdir = basedir + "/rh-raw"; + String rdfrhdir = basedir + "/rh-rdf"; + String modeldir = basedir + "/model"; + String scoredatadir = basedir + "/score-data"; + + String modeldburl = "jdbc:h2:" + modeldir + "/store"; + String scoredatadburl = "jdbc:h2:" + scoredatadir + "/store"; + + String modelname = "csvTempTransfer"; + String scoredataname = "csvScoreData"; + + String tempcopydir = basedir + "/temp-copy"; + + String[] scoreinput = Harvester.stringsToArray("-i", h2model, "-ImodelName=" + modelname, "-IdbUrl=" + modeldburl, "-IcheckEmpty=" + checkempty); + String[] scoredata = Harvester.stringsToArray("-s", h2model, "-SmodelName=" + scoredataname, "-SdbUrl=" + scoredatadburl, "-ScheckEmpty=" + checkempty); + String[] scoremodels = Harvester.stringsToArray(scoreinput, "-v", 
vivoconfig, "-VcheckEmpty=" + checkempty, scoredata, "-t", tempcopydir, "-b", scorebatchsize); + + String[] cnflags = Harvester.stringsToArray(scoreinput, "-v", vivoconfig, "-n", namespace); + + String eqtest = "org.vivoweb.harvester.score.algorithm.EqualityTest"; + + String grantidnum = "http://vivoweb.org/ontology/score#grantID"; + String rdfslabel = "http://www.w3.org/2000/01/rdf-schema#label"; + String personidnum = "http://vivoweb.org/ontology/score#personID"; + String deptidnum = "http://vivoweb.org/ontology/score#deptID"; + String rolein = "http://vivoweb.org/ontology/core#roleIn"; + String piroleof = "http://vivoweb.org/ontology/core#principalInvestigatorRoleOf"; + String copiroleof = "http://vivoweb.org/ontology/core#co-PrincipalInvestigatorRoleOf"; + String datetime = "http://vivoweb.org/ontology/core#dateTime"; + String baseuri = "http://vivoweb.org/harvest/csvfile/"; + + + + //execute fetch + Harvester.runCSVtoRDF("-o", tfrh, "-O", "fileDir=" + rawrhdir, "-i", "filepath"); + + //execute translate + Harvester.runXSLTranslator("-i", tfrh, "-IfileDir=" + rawrhdir, "-o", tfrh, "-OfileDir=" + rdfrhdir, "-x", "config/datamaps/csv-grant-to-vivo.xsl"); + + //execute transfer to import from record handler into local temp model + Harvester.runTransfer("-o", h2model, "-OmodelName=" + modelname, "-OdbUrl=" + modeldburl, "-h", tfrh, "-HfileDir=" + rdfrhdir, "-n", namespace); + + //smushes in-place(-r) on the Grant id THEN on the person ID then deptID + Harvester.runSmush(scoreinput, "-P", grantidnum, "-P", personidnum, "-P", deptidnum, "-P", datetime, "-n", baseuri, "-r"); + + //scoring of Grants on GrantNumber + Harvester.runScore(scoremodels, "-AGrantNumber=" + eqtest, "-WGrantNumber=1.0", "-FGrantNumber=" + grantidnum, "-PGrantNumber=" + grantidnum, "-n", baseuri + "grant/"); + + //scoring of people on PERSONIDNUM + Harvester.runScore(scoremodels, "-Aufid=" + eqtest, "-Wufid=1.0", "-Fufid=" + personidnum, "-Pufid=" + personidnum, "-n", baseuri + "person/"); + + 
Harvester.runSmush(scoreinput, "-P", deptidnum, "-n", baseuri + "org/", "-r"); + + //scoring of orgs on DeptID + Harvester.runScore(scoremodels, "-AdeptID=" + eqtest, "-WdeptID=1.0", "-FdeptID=" + deptidnum, "-PdeptID=" + deptidnum, "-n", baseuri + "org/"); + + + Harvester.runSmush(scoreinput, "-P", rdfslabel, "-n", baseuri + "sponsor/", "-r"); + + //scoring sponsors by labels + Harvester.runScore(scoremodels, "-Alabel=" + eqtest, "-Wlabel=1.0", "-Flabel=" + rdfslabel, "-Plabel=" + rdfslabel, "-n", baseuri + "sponsor/"); + + //scoring of PI Roles + String[] piuri = Harvester.stringsToArray("-Aperson=" + eqtest, "-Wperson=0.5", "-Fperson=" + piroleof, "-Pperson=" + piroleof); + String[] granturi = Harvester.stringsToArray("-Agrant=" + eqtest, "-Wgrant=0.5", "-Fgrant=" + rolein, "-Pgrant=" + rolein); + Harvester.runScore(scoremodels, piuri, granturi, "-n", baseuri + "piRole/"); + + //scoring of coPI Roles + String[] copiuri = Harvester.stringsToArray("-Aperson=" + eqtest, "-Wperson=0.5", "-Fperson=" + copiroleof, "-Pperson=" + copiroleof); + Harvester.runScore(scoremodels, copiuri, granturi, "-n", baseuri + "coPiRole/"); + + //find matches using scores and rename nodes to matching uri + Harvester.runMatch(scoreinput, scoredata, "-b", scorebatchsize, "-t", "1.0", "-r"); + + //execute ChangeNamespace to get grants into current namespace + Harvester.runChangeNamespace(cnflags, "-u", baseuri + "grant/"); + + //execute ChangeNamespace to get orgs into current namespace + Harvester.runChangeNamespace(cnflags, "-u", baseuri + "org/"); + + //execute ChangeNamespace to get sponsors into current namespace + Harvester.runChangeNamespace(cnflags, "-u", baseuri + "sponsor/"); + + //execute ChangeNamespace to get people into current namespace + Harvester.runChangeNamespace(cnflags, "-u", baseuri + "person/"); + + //execute ChangeNamespace to get PI roles into current namespace + Harvester.runChangeNamespace(cnflags, "-u", baseuri + "piRole/"); + + //execute ChangeNamespace to get 
co-PI roles into current namespace + Harvester.runChangeNamespace(cnflags, "-u", baseuri + "coPiRole/"); + + //execute ChangeNamespace to get time intervals into current namespace + Harvester.runChangeNamespace(cnflags, "-u", baseuri + "timeInterval"); + + + //todo: we probably don't want to do prev harvest stuff for this + String prevharvestmodel = "http://vivoweb.org/ingest/dsr"; + String addfile = basedir + "/additions.rdf.xml"; + String subfile = basedir + "/subtractions.rdf.xml"; + + //find Subtractions + Harvester.runDiff("-m", h2model, "-MdbUrl=" + prevharvdburlbase + harvesterTask + "/store", "-McheckEmpty=" + checkempty, "-MmodelName=" + prevharvestmodel, "-s", h2model, "-ScheckEmpty=" + checkempty, "-SdbUrl=" + modeldburl, "-SmodelName=" + modelname, "-d", subfile); + + //find Additions + Harvester.runDiff("-m", h2model, "-McheckEmpty=" + checkempty, "-MdbUrl=" + modeldburl, "-MmodelName=" + modelname, "-s", h2model, "-ScheckEmpty=" + checkempty, "-SdbUrl=" + prevharvdburlbase + harvesterTask + "/store", "-SmodelName=" + prevharvestmodel, "-d", addfile); + + //apply Subtractions to Previous model + Harvester.runTransfer("-o", h2model, "-OdbUrl=" + prevharvdburlbase + harvesterTask + "/store", "-OcheckEmpty=" + checkempty, "-OmodelName=" + prevharvestmodel, "-r", subfile, "-m"); + + //apply Additions to Previous model + Harvester.runTransfer("-o", h2model, "-OdbUrl=" + prevharvdburlbase + harvesterTask + "/store", "-OcheckEmpty=" + checkempty, "-OmodelName=" + prevharvestmodel, "-r", addfile); + + //apply Subtractions to VIVO + Harvester.runTransfer("-o", vivoconfig, "-OcheckEmpty=" + checkempty, "-r", subfile, "-m"); + + //apply Additions to VIVO + Harvester.runTransfer("-o", vivoconfig, "-OcheckEmpty=" + checkempty, "-r", addfile); + */ + } + }