From 27d3141bb64f0d0c21aa21380d3159c9b94d6330 Mon Sep 17 00:00:00 2001 From: Jim Blake Date: Fri, 16 Jan 2015 11:34:31 -0500 Subject: [PATCH] VIVO-871 Break out more tasks to improve the timings UpdateDocumentWorkUnit contains a list of required DocumentModifiers. They are required but because they are in the list of DocumentModifiers, they are timed like the optional ones. Similarly, UpdateUrisTask contains a required SearchIndexExcluder. --- .../webapp/searchindex/SearchIndexerImpl.java | 16 +- .../DocumentModifierListDeveloper.java | 2 +- .../SearchIndexExcluderListDeveloper.java | 2 +- .../IndexingUriFinderListDeveloper.java | 4 +- .../tasks/UpdateDocumentWorkUnit.java | 302 +++++++++++------- .../searchindex/tasks/UpdateUrisTask.java | 33 +- 6 files changed, 228 insertions(+), 131 deletions(-) diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerImpl.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerImpl.java index e1d780844..66f7a55c6 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerImpl.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerImpl.java @@ -54,6 +54,7 @@ import edu.cornell.mannlib.vitro.webapp.searchindex.indexing.IndexingUriFinderLi import edu.cornell.mannlib.vitro.webapp.searchindex.indexing.IndexingUriFinderListBasic; import edu.cornell.mannlib.vitro.webapp.searchindex.indexing.IndexingUriFinderListDeveloper; import edu.cornell.mannlib.vitro.webapp.searchindex.tasks.RebuildIndexTask; +import edu.cornell.mannlib.vitro.webapp.searchindex.tasks.UpdateDocumentWorkUnit; import edu.cornell.mannlib.vitro.webapp.searchindex.tasks.UpdateStatementsTask; import edu.cornell.mannlib.vitro.webapp.searchindex.tasks.UpdateUrisTask; import edu.cornell.mannlib.vitro.webapp.utils.configuration.ConfigurationBeanLoader; @@ -82,8 +83,8 @@ public class SearchIndexerImpl implements SearchIndexer { private final WorkerThreadPool pool = new WorkerThreadPool(); private ServletContext ctx; - private Set excluders; - private Set modifiers; + private List excluders; + private List modifiers; private Set uriFinders; private WebappDaoFactory wadf; @@ -107,9 +108,16 @@ public class SearchIndexerImpl implements SearchIndexer { private void loadConfiguration() throws ConfigurationBeanLoaderException { ConfigurationBeanLoader beanLoader = new ConfigurationBeanLoader( ModelAccess.on(ctx).getOntModel(DISPLAY), ctx); - excluders = beanLoader.loadAll(SearchIndexExcluder.class); - modifiers = beanLoader.loadAll(DocumentModifier.class); uriFinders = beanLoader.loadAll(IndexingUriFinder.class); + + excluders = new ArrayList<>(); + excluders.add(new UpdateUrisTask.ExcludeIfNoVClasses()); + excluders.addAll(beanLoader.loadAll(SearchIndexExcluder.class)); + + modifiers = new ArrayList<>(); + modifiers.addAll(new UpdateDocumentWorkUnit.MinimalDocumentModifiers() + .getList()); + modifiers.addAll(beanLoader.loadAll(DocumentModifier.class)); } /** diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/documentBuilding/DocumentModifierListDeveloper.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/documentBuilding/DocumentModifierListDeveloper.java index 1c4082231..43aa1fcc4 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/documentBuilding/DocumentModifierListDeveloper.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/documentBuilding/DocumentModifierListDeveloper.java @@ -67,7 +67,7 @@ public class DocumentModifierListDeveloper implements DocumentModifierList { for (ModifierTiming timing : timings) { int totalMillis = timing.getTotal(); float totalSeconds = totalMillis / 1000.0F; - int average = totalMillis / count.get(); + int average = (count.get() == 0) ? 0 : totalMillis / count.get(); message += String .format("\n count: %7d, total: %9.3fsec, average: %4dms-- %1.200s", count.get(), totalSeconds, average, diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/exclusions/SearchIndexExcluderListDeveloper.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/exclusions/SearchIndexExcluderListDeveloper.java index 259b01573..0e56ac048 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/exclusions/SearchIndexExcluderListDeveloper.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/exclusions/SearchIndexExcluderListDeveloper.java @@ -80,7 +80,7 @@ public class SearchIndexExcluderListDeveloper implements int thisCount = timing.getCount(); int totalMillis = timing.getTotal(); float totalSeconds = totalMillis / 1000.0F; - int average = totalMillis / thisCount; + int average = (thisCount == 0) ? 0 : totalMillis / thisCount; message += String .format("\n count: %7d, total: %9.3fsec, average: %4dms-- %1.200s", thisCount, totalSeconds, average, diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/indexing/IndexingUriFinderListDeveloper.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/indexing/IndexingUriFinderListDeveloper.java index 807987abb..563935dc3 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/indexing/IndexingUriFinderListDeveloper.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/indexing/IndexingUriFinderListDeveloper.java @@ -71,12 +71,12 @@ public class IndexingUriFinderListDeveloper implements IndexingUriFinderList { } String message = String.format( - "Timings for %d modifiers after %d calls:", timings.size(), + "Timings for %d URI finders after %d calls:", timings.size(), count.get()); for (FinderTiming timing : timings) { int totalMillis = timing.getTotal(); float totalSeconds = totalMillis / 1000.0F; - int average = totalMillis / count.get(); + int average = (count.get() == 0) ? 0 : totalMillis / count.get(); message += String .format("\n count: %7d, total: %9.3fsec, average: %4dms-- %1.200s", count.get(), totalSeconds, average, diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateDocumentWorkUnit.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateDocumentWorkUnit.java index 4367bee9a..ffaf6dcb6 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateDocumentWorkUnit.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateDocumentWorkUnit.java @@ -13,6 +13,7 @@ import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.NAME_ import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.RDFTYPE; import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.URI; +import java.util.Arrays; import java.util.List; import org.apache.commons.lang.StringUtils; @@ -32,6 +33,7 @@ import edu.cornell.mannlib.vitro.webapp.beans.VClass; import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine; import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputDocument; import edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerUtils; +import edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifier; import edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifierList; public class UpdateDocumentWorkUnit implements Runnable { @@ -60,131 +62,205 @@ public class UpdateDocumentWorkUnit implements Runnable { public void run() { try { SearchInputDocument doc = searchEngine.createInputDocument(); - - addIdAndUri(doc); - addLabel(doc); - addClasses(doc); - addMostSpecificTypes(doc); - addObjectPropertyText(doc); - addDataPropertyText(doc); - addEntityBoost(doc); - modifiers.modifyDocument(ind, doc); - addIndexedTime(doc); - searchEngine.add(doc); } catch (Exception e) { log.warn("Failed to add '" + ind + "' to the search index.", e); } } - private void addIdAndUri(SearchInputDocument doc) { - doc.addField(DOCID, SearchIndexerUtils.getIdForUri(ind.getURI())); - doc.addField(URI, ind.getURI()); - } - - private void addLabel(SearchInputDocument doc) { - String name = ind.getRdfsLabel(); - if (name == null) { - name = ind.getLocalName(); - } - - doc.addField(NAME_RAW, name); - doc.addField(NAME_LOWERCASE_SINGLE_VALUED, name); - } - - /** - * For each class that the individual belongs to, record the class URI, the - * class group URI, the class Name, and the class boost. - */ - private void addClasses(SearchInputDocument doc) { - List vclasses = ind.getVClasses(false); - if (vclasses == null) { - return; - } - - for (VClass clz : vclasses) { - String classUri = clz.getURI(); - if (classUri == null || URI_OWL_THING.equals(classUri)) { - continue; - } - doc.addField(RDFTYPE, classUri); - - String classGroupUri = clz.getGroupURI(); - if (classGroupUri != null) { - doc.addField(CLASSGROUP_URI, classGroupUri); - } - - addToAlltext(doc, clz.getName()); - - Float boost = clz.getSearchBoost(); - if (boost != null) { - doc.setDocumentBoost(doc.getDocumentBoost() + boost); - } - } - } - - private void addMostSpecificTypes(SearchInputDocument doc) { - List mstURIs = ind.getMostSpecificTypeURIs(); - if (mstURIs != null) { - for (String typeURI : mstURIs) { - if (StringUtils.isNotBlank(typeURI)) { - doc.addField(MOST_SPECIFIC_TYPE_URIS, typeURI); - } - } - } - } - - private void addObjectPropertyText(SearchInputDocument doc) { - List stmts = ind.getObjectPropertyStatements(); - if (stmts == null) { - return; - } - - for (ObjectPropertyStatement stmt : stmts) { - if (URI_DIFFERENT_FROM.equals(stmt.getPropertyURI())) { - continue; - } - addToAlltext(doc, stmt.getObject().getRdfsLabel()); - } - } - - private void addDataPropertyText(SearchInputDocument doc) { - List stmts = ind.getDataPropertyStatements(); - if (stmts == null) { - return; - } - - for (DataPropertyStatement stmt : stmts) { - if (stmt.getDatapropURI().equals(URI_RDFS_LABEL)) { - continue; - } - addToAlltext(doc, stmt.getData()); - } - } - - private void addEntityBoost(SearchInputDocument doc) { - Float boost = ind.getSearchBoost(); - if (boost != null && !boost.equals(0.0F)) { - doc.setDocumentBoost(boost); - } - } - private void addIndexedTime(SearchInputDocument doc) { doc.addField(INDEXEDTIME, (Object) new DateTime().getMillis()); } - private void addToAlltext(SearchInputDocument doc, String raw) { - if (StringUtils.isBlank(raw)) { - return; - } - String clean = Jsoup.parse(raw).text(); - if (StringUtils.isBlank(clean)) { - return; - } - doc.addField(ALLTEXT, clean); - doc.addField(ALLTEXTUNSTEMMED, clean); + // ---------------------------------------------------------------------- + // Helper classes + // ---------------------------------------------------------------------- + /** + * These will be hardcoded at the beginning of the list of + * DocumentModifiers. + */ + public static class MinimalDocumentModifiers { + private final List list; + + public MinimalDocumentModifiers() { + this.list = Arrays.asList(new DocumentModifier[] { + new IdUriLabel(), new AddClasses(), + new AddMostSpecificTypes(), new AddObjectPropertyText(), + new AddDataPropertyText(), new AddEntityBoost() }); + } + + public List getList() { + return list; + } } + + private abstract static class BaseDocumentModifier implements + DocumentModifier { + protected void addToAlltext(SearchInputDocument doc, String raw) { + if (StringUtils.isBlank(raw)) { + return; + } + String clean = Jsoup.parse(raw).text(); + if (StringUtils.isBlank(clean)) { + return; + } + doc.addField(ALLTEXT, clean); + doc.addField(ALLTEXTUNSTEMMED, clean); + } + + @Override + public void shutdown() { + // Nothing to do. + } + } + + private static class IdUriLabel extends BaseDocumentModifier { + @Override + public void modifyDocument(Individual ind, SearchInputDocument doc) { + addIdAndUri(ind, doc); + addLabel(ind, doc); + } + + private void addIdAndUri(Individual ind, SearchInputDocument doc) { + doc.addField(DOCID, SearchIndexerUtils.getIdForUri(ind.getURI())); + doc.addField(URI, ind.getURI()); + } + + private void addLabel(Individual ind, SearchInputDocument doc) { + String name = ind.getRdfsLabel(); + if (name == null) { + name = ind.getLocalName(); + } + + doc.addField(NAME_RAW, name); + doc.addField(NAME_LOWERCASE_SINGLE_VALUED, name); + } + + @Override + public String toString() { + return "REQUIRED: IdUriLabel"; + } + } + + /** + * For each class that the individual belongs to, record the class URI, the + * class group URI, the class Name, and the class boost. + */ + private static class AddClasses extends BaseDocumentModifier { + @Override + public void modifyDocument(Individual ind, SearchInputDocument doc) { + List vclasses = ind.getVClasses(false); + if (vclasses == null) { + return; + } + + for (VClass clz : vclasses) { + String classUri = clz.getURI(); + if (classUri == null || URI_OWL_THING.equals(classUri)) { + continue; + } + doc.addField(RDFTYPE, classUri); + + String classGroupUri = clz.getGroupURI(); + if (classGroupUri != null) { + doc.addField(CLASSGROUP_URI, classGroupUri); + } + + addToAlltext(doc, clz.getName()); + + Float boost = clz.getSearchBoost(); + if (boost != null) { + doc.setDocumentBoost(doc.getDocumentBoost() + boost); + } + } + } + + @Override + public String toString() { + return "REQUIRED: AddClasses"; + } + } + + private static class AddMostSpecificTypes extends BaseDocumentModifier { + @Override + public void modifyDocument(Individual ind, SearchInputDocument doc) { + List mstURIs = ind.getMostSpecificTypeURIs(); + if (mstURIs != null) { + for (String typeURI : mstURIs) { + if (StringUtils.isNotBlank(typeURI)) { + doc.addField(MOST_SPECIFIC_TYPE_URIS, typeURI); + } + } + } + } + + @Override + public String toString() { + return "REQUIRED: AddMostSpecificTypes"; + } + } + + private static class AddObjectPropertyText extends BaseDocumentModifier { + @Override + public void modifyDocument(Individual ind, SearchInputDocument doc) { + List stmts = ind + .getObjectPropertyStatements(); + if (stmts == null) { + return; + } + + for (ObjectPropertyStatement stmt : stmts) { + if (URI_DIFFERENT_FROM.equals(stmt.getPropertyURI())) { + continue; + } + addToAlltext(doc, stmt.getObject().getRdfsLabel()); + } + } + + @Override + public String toString() { + return "REQUIRED: AddObjectPropertyText"; + } + } + + private static class AddDataPropertyText extends BaseDocumentModifier { + @Override + public void modifyDocument(Individual ind, SearchInputDocument doc) { + List stmts = ind.getDataPropertyStatements(); + if (stmts == null) { + return; + } + + for (DataPropertyStatement stmt : stmts) { + if (stmt.getDatapropURI().equals(URI_RDFS_LABEL)) { + continue; + } + addToAlltext(doc, stmt.getData()); + } + } + + @Override + public String toString() { + return "REQUIRED: AddDataPropertyText"; + } + } + + private static class AddEntityBoost extends BaseDocumentModifier { + @Override + public void modifyDocument(Individual ind, SearchInputDocument doc) { + Float boost = ind.getSearchBoost(); + if (boost != null && !boost.equals(0.0F)) { + doc.setDocumentBoost(boost); + } + } + + @Override + public String toString() { + return "REQUIRED: AddEntityBoost"; + } + } + } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateUrisTask.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateUrisTask.java index 2743f84dc..ad8bd3eba 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateUrisTask.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateUrisTask.java @@ -29,6 +29,7 @@ import edu.cornell.mannlib.vitro.webapp.searchindex.SearchIndexerImpl.ListenerLi import edu.cornell.mannlib.vitro.webapp.searchindex.SearchIndexerImpl.Task; import edu.cornell.mannlib.vitro.webapp.searchindex.SearchIndexerImpl.WorkerThreadPool; import edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifierList; +import edu.cornell.mannlib.vitro.webapp.searchindex.exclusions.SearchIndexExcluder; import edu.cornell.mannlib.vitro.webapp.searchindex.exclusions.SearchIndexExcluderList; /** @@ -68,6 +69,8 @@ public class UpdateUrisTask implements Task { this.status = new Status(uris.size(), 200, listeners); this.searchEngine = ApplicationUtils.instance().getSearchEngine(); + + } @Override @@ -83,7 +86,7 @@ public class UpdateUrisTask implements Task { break; } else { Individual ind = getIndividual(uri); - if (ind == null || hasNoClass(ind) || isExcluded(ind)) { + if (ind == null || isExcluded(ind)) { deleteDocument(uri); } else { updateDocument(ind); @@ -124,15 +127,6 @@ public class UpdateUrisTask implements Task { return ind; } - private boolean hasNoClass(Individual ind) { - List vclasses = ind.getVClasses(false); - if (vclasses == null || vclasses.isEmpty()) { - log.debug("Individual " + ind + " has no classes."); - return true; - } - return false; - } - private boolean isExcluded(Individual ind) { return excluders.isExcluded(ind); } @@ -214,4 +208,23 @@ public class UpdateUrisTask implements Task { } + /** + * This will be first in the list of SearchIndexExcluders. + */ + public static class ExcludeIfNoVClasses implements SearchIndexExcluder { + @Override + public String checkForExclusion(Individual ind) { + List vclasses = ind.getVClasses(false); + if (vclasses == null || vclasses.isEmpty()) { + return "Individual " + ind + " has no classes."; + } + return null; + } + + @Override + public String toString() { + return "ExcludeIfNoVClasses"; + } + + } }