diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexer.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexer.java index 9cfc43cd2..e616b347f 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexer.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexer.java @@ -110,23 +110,15 @@ public interface SearchIndexer extends Application.Module { */ public static class Event { public enum Type { - STARTUP, - - START_PROCESSING_URIS, + STARTUP, PROGRESS, - PROGRESS_PROCESSING_URIS, + START_PROCESSING_URIS, STOP_PROCESSING_URIS, - STOP_PROCESSING_URIS, + START_PROCESSING_STATEMENTS, STOP_PROCESSING_STATEMENTS, - START_PROCESSING_STATEMENTS, + REBUILD_REQUESTED, REBUILD_COMPLETE, - PROGRESS_PROCESSING_STATEMENTS, - - STOP_PROCESSING_STATEMENTS, - - REBUILD_REQUESTED, - - SHUTDOWN_REQUESTED + SHUTDOWN_REQUESTED, SHUTDOWN_COMPLETE } private final Type type; diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexerStatus.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexerStatus.java index afc5e1c16..59f7e8f1b 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexerStatus.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexerStatus.java @@ -5,23 +5,34 @@ package edu.cornell.mannlib.vitro.webapp.modules.searchIndexer; import java.util.Date; /** - * An immutable summary of the status of the SearchIndexer, at some point in - * time.Contains the current state, and some counts. + * An immutable summary of the status of the SearchIndexer, at a fixed point in + * time. Contains the current state, the time, and some counts. * * If the indexer is processing URIs, processing statements, or preparing a - * rebuild, the counts are URI_COUNTS, STATEMENT_COUNTS, or REBUILD_COUNTS. 
+ * rebuild, the counts are URI_COUNTS, STATEMENT_COUNTS, or REBUILD_COUNTS, + * respectively. * - * When the indexer starts up, and when it is is shut down, the counts are + * When the indexer starts up, becomes idle, or shuts down, the counts are * NO_COUNTS. - * - * If the indexer is idle, the counts are carried over from the previous - * operation. */ public class SearchIndexerStatus { - public enum State { - IDLE, PROCESSING_URIS, PROCESSING_STMTS, PREPARING_REBUILD, SHUTDOWN + // ---------------------------------------------------------------------- + // factory methods + // ---------------------------------------------------------------------- + + public static SearchIndexerStatus idle() { + return new SearchIndexerStatus(State.IDLE, new Date(), new NoCounts()); } + public static SearchIndexerStatus shutdown() { + return new SearchIndexerStatus(State.SHUTDOWN, new Date(), + new NoCounts()); + } + + // ---------------------------------------------------------------------- + // the instance + // ---------------------------------------------------------------------- + private final State state; private final Date since; private final Counts counts; @@ -39,11 +50,19 @@ public class SearchIndexerStatus { public Date getSince() { return since; } - + public Counts getCounts() { return counts; } + // ---------------------------------------------------------------------- + // helper classes + // ---------------------------------------------------------------------- + + public enum State { + IDLE, PROCESSING_URIS, PROCESSING_STMTS, PREPARING_REBUILD, SHUTDOWN + } + public abstract static class Counts { public enum Type { URI_COUNTS, STATEMENT_COUNTS, REBUILD_COUNTS, NO_COUNTS diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexerUtils.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexerUtils.java new file mode 100644 index 000000000..65784d665 --- /dev/null +++ 
b/webapp/src/edu/cornell/mannlib/vitro/webapp/modules/searchIndexer/SearchIndexerUtils.java @@ -0,0 +1,20 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.modules.searchIndexer; + +/** + * Some handy methods for dealing with the search index. + */ +public class SearchIndexerUtils { + + /** + * The document ID in the search index is derived from the individual's URI. + */ + public static String getIdForUri(String uri){ + if( uri != null ){ + return "vitroIndividual:" + uri; + }else{ + return null; + } + } +} diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/SearchIndexer.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/SearchIndexer.java index 8ab34a82e..7f9e14ba8 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/SearchIndexer.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/SearchIndexer.java @@ -18,7 +18,6 @@ import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResponse; import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResultDocumentList; import edu.cornell.mannlib.vitro.webapp.search.IndexingException; import edu.cornell.mannlib.vitro.webapp.search.beans.IndexerIface; -import edu.cornell.mannlib.vitro.webapp.search.documentBuilding.IndividualToSearchDocument; public class SearchIndexer implements IndexerIface { @@ -27,7 +26,6 @@ public class SearchIndexer implements IndexerIface { protected SearchEngine server; protected boolean indexing; protected HashSet urisIndexed; - protected IndividualToSearchDocument individualToSearchDoc; /** * System is shutting down if true. 
@@ -48,9 +46,8 @@ public class SearchIndexer implements IndexerIface { */ protected boolean doingFullIndexRebuild = false; - public SearchIndexer( SearchEngine server, IndividualToSearchDocument indToDoc){ + public SearchIndexer( SearchEngine server){ this.server = server; - this.individualToSearchDoc = indToDoc; } @Override @@ -74,7 +71,7 @@ public class SearchIndexer implements IndexerIface { urisIndexed.add(ind.getURI()); } log.debug("indexing " + ind.getURI()); - doc = individualToSearchDoc.translate(ind); +// doc = individualToSearchDoc.translate(ind); if( doc != null){ if( log.isDebugEnabled()){ @@ -109,9 +106,9 @@ public class SearchIndexer implements IndexerIface { public void removeFromIndex(String uri) throws IndexingException { if( uri != null ){ try { - server.deleteById(individualToSearchDoc.getIdForUri(uri)); +// server.deleteById(individualToSearchDoc.getIdForUri(uri)); log.debug("deleted " + " " + uri); - } catch (SearchEngineException e) { + } catch (Exception e) { log.error( "could not delete individual " + uri, e); } } @@ -136,7 +133,7 @@ public class SearchIndexer implements IndexerIface { public void abortIndexingAndCleanUp() { shutdownRequested = true; try{ - individualToSearchDoc.shutdown(); +// individualToSearchDoc.shutdown(); }catch(Exception e){ if( log != null) log.debug(e,e); diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/documentBuilding/IndividualToSearchDocument.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/documentBuilding/IndividualToSearchDocument.java deleted file mode 100644 index 923e1ad56..000000000 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/documentBuilding/IndividualToSearchDocument.java +++ /dev/null @@ -1,356 +0,0 @@ - -/* $This file is distributed under the terms of the license in /doc/license.txt$ */ - -package edu.cornell.mannlib.vitro.webapp.search.documentBuilding; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.ALLTEXT; -import static 
edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.ALLTEXTUNSTEMMED; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.CLASSGROUP_URI; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.DOCID; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.INDEXEDTIME; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.MOST_SPECIFIC_TYPE_URIS; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.NAME_LOWERCASE_SINGLE_VALUED; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.NAME_RAW; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.RDFTYPE; -import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.URI; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.joda.time.DateTime; -import org.jsoup.Jsoup; - -import com.hp.hpl.jena.shared.JenaException; -import com.hp.hpl.jena.vocabulary.OWL; - -import edu.cornell.mannlib.vitro.webapp.application.ApplicationUtils; -import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement; -import edu.cornell.mannlib.vitro.webapp.beans.Individual; -import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl; -import edu.cornell.mannlib.vitro.webapp.beans.ObjectPropertyStatement; -import edu.cornell.mannlib.vitro.webapp.beans.VClass; -import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputDocument; -import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResultDocument; -import edu.cornell.mannlib.vitro.webapp.search.IndexingException; -import edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifier; -import edu.cornell.mannlib.vitro.webapp.searchindex.exclusions.SearchIndexExcluder; - 
-public class IndividualToSearchDocument { - - public static final Log log = LogFactory.getLog(IndividualToSearchDocument.class.getName()); - - protected final String label = "http://www.w3.org/2000/01/rdf-schema#label"; - - protected List documentModifiers = new ArrayList(); - - protected List excludes; - - public IndividualToSearchDocument(List excludes, List docModifiers){ - this.excludes = excludes; - this.documentModifiers = docModifiers; - } - - public SearchInputDocument translate(Individual ind) throws IndexingException{ - try{ - String excludeMsg = checkExcludes( ind ); - if( excludeMsg != DONT_EXCLUDE){ - log.debug(ind.getURI() + " " + excludeMsg); - return null; - } - - SearchInputDocument doc = ApplicationUtils.instance().getSearchEngine().createInputDocument(); - - //DocID - doc.addField(DOCID, getIdForUri( ind.getURI() ) ); - - //vitro id - doc.addField(URI, ind.getURI()); - log.debug(ind.getURI() + " init boost: " + doc.getDocumentBoost()); - - //get label from ind - addLabel(ind, doc); - - //add classes, classgroups get if prohibited because of its class - StringBuffer classPublicNames = new StringBuffer(""); - addClasses(ind, doc, classPublicNames); - addMostSpecificTypeUris( ind, doc ); - - log.debug(ind.getURI() + " post class boost: " + doc.getDocumentBoost()); - - // collecting URIs and rdfs:labels of objects of statements - StringBuffer objectNames = new StringBuffer(""); - StringBuffer addUri = new StringBuffer(""); - addObjectPropertyText(ind, doc, objectNames, addUri); - - //time of index in msec past epoch - doc.addField(INDEXEDTIME, (Object) new DateTime().getMillis() ); - - addAllText( ind, doc, classPublicNames, objectNames ); - - //boost for entity - if(ind.getSearchBoost() != null && ind.getSearchBoost() != 0) { - doc.setDocumentBoost(ind.getSearchBoost()); - } - - log.debug(ind.getURI() + " pre mod boost: " + doc.getDocumentBoost()); - - runAdditionalDocModifers(ind,doc); - - log.debug(ind.getURI() + " post mod boost: " + 
doc.getDocumentBoost()); - - return doc; - }catch(SkipIndividualException ex){ - //indicates that this individual should not be indexed by returning null - log.debug(ex); - return null; - }catch(Exception th){ - log.error(th,th); - return null; - } - } - - - protected String checkExcludes(Individual ind) { - for( SearchIndexExcluder excluder : excludes){ - try{ - String msg = excluder.checkForExclusion(ind); - log.debug("individual=" + ind.getURI() + " (" + ind.getLabel() - + "), excluder=" + excluder + ", types=" - + ind.getMostSpecificTypeURIs() + ", msg=" + msg); - if( msg != DONT_EXCLUDE) - return msg; - }catch (Exception e) { - return e.getMessage(); - } - } - return DONT_EXCLUDE; - } - - protected Map docModClassToTime = new HashMap(); - protected long docModCount =0; - - protected void runAdditionalDocModifers( Individual ind, SearchInputDocument doc ) - throws SkipIndividualException{ - //run the document modifiers - if( documentModifiers != null && !documentModifiers.isEmpty()){ - docModCount++; - for(DocumentModifier modifier: documentModifiers){ - - long start = System.currentTimeMillis(); - - modifier.modifyDocument(ind, doc); - - if( log.isDebugEnabled()){ - long delta = System.currentTimeMillis() - start; - synchronized(docModClassToTime){ - Class clz = modifier.getClass(); - if( docModClassToTime.containsKey( clz.getName() )){ - Long time = docModClassToTime.get(clz.getName() ); - docModClassToTime.put(clz.getName(), time + delta); - }else{ - docModClassToTime.put(clz.getName(), delta); - } - } - if( docModCount % 200 == 0 ){ - log.debug("DocumentModifier timings"); - for( Entry entry: docModClassToTime.entrySet()){ - log.debug("average msec to run " + entry.getKey() + ": " + (entry.getValue()/docModCount)); - } - } - } - } - } - } - - protected void addAllText(Individual ind, SearchInputDocument doc, StringBuffer classPublicNames, StringBuffer objectNames) { - String t=null; - //ALLTEXT, all of the 'full text' - StringBuffer allTextValue = new 
StringBuffer(); - - try{ - //collecting data property statements - List dataPropertyStatements = ind.getDataPropertyStatements(); - if (dataPropertyStatements != null) { - Iterator dataPropertyStmtIter = dataPropertyStatements.iterator(); - while (dataPropertyStmtIter.hasNext()) { - DataPropertyStatement dataPropertyStmt = dataPropertyStmtIter.next(); - if(dataPropertyStmt.getDatapropURI().equals(label)){ // we don't want label to be added to alltext - continue; - } - allTextValue.append(" "); - allTextValue.append(((t=dataPropertyStmt.getData()) == null)?"":t); - } - } - }catch(JenaException je){ - //VIVO-15 Trap for characters that cause search indexing to abort - log.error(String.format("Continuing to index %s but could not get all dataproperties because %s",ind.getURI(),je.getMessage())); - } - - allTextValue.append(objectNames.toString()); - - allTextValue.append(' '); - allTextValue.append(classPublicNames); - - try { - String stripped = Jsoup.parse(allTextValue.toString()).text(); - allTextValue.setLength(0); - allTextValue.append(stripped); - } catch(Exception e) { - log.debug("Could not strip HTML during search indexing. " + e); - } - - String alltext = allTextValue.toString(); - - doc.addField(ALLTEXT, alltext); - doc.addField(ALLTEXTUNSTEMMED, alltext); - } - - - - - - /** - * Get the rdfs:labes for objects of statements and put in objectNames. - * Get the URIs for objects of statements and put in addUri. 
- */ - protected void addObjectPropertyText(Individual ind, SearchInputDocument doc, - StringBuffer objectNames, StringBuffer addUri) { - - try{ - List objectPropertyStatements = ind.getObjectPropertyStatements(); - if (objectPropertyStatements != null) { - Iterator objectPropertyStmtIter = objectPropertyStatements.iterator(); - while (objectPropertyStmtIter.hasNext()) { - ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next(); - if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) ){ - continue; - } - try { - objectNames.append(" "); - String t=null; - objectNames.append(((t=objectPropertyStmt.getObject().getRdfsLabel()) == null)?"":t); - addUri.append(" "); - addUri.append(((t=objectPropertyStmt.getObject().getURI()) == null)?"":t); - } catch (Exception e) { - log.debug("could not index name of related object: " + e.getMessage()); - } - } - } - }catch(JenaException je){ - //VIVO-15 Trap for characters that cause search indexing to abort - log.error(String.format("Continuing to index %s but could not get all object properties because %s",ind.getURI(),je.getMessage())); - } - } - - /** - * Adds the info about the classes that the individual is a member - * of, classgroups and checks if prohibited. 
- * @param classPublicNames - * @returns true if prohibited from search - * @throws SkipIndividualException - */ - protected void addClasses(Individual ind, SearchInputDocument doc, StringBuffer classPublicNames) throws SkipIndividualException{ - List vclasses = ind.getVClasses(false); - if( vclasses == null || vclasses.isEmpty() ){ - throw new SkipIndividualException("Not indexing because individual has no classes"); - } - - for(VClass clz : vclasses){ - if(clz.getURI() == null){ - continue; - }else if(OWL.Thing.getURI().equals(clz.getURI())){ - //don't add owl:Thing as the type in the index - continue; - } else { - if( clz.getSearchBoost() != null){ - doc.setDocumentBoost(doc.getDocumentBoost() + clz.getSearchBoost()); - } - - doc.addField(RDFTYPE, clz.getURI()); - - if(clz.getName() != null){ - classPublicNames.append(" "); - classPublicNames.append(clz.getName()); - } - - //Add the Classgroup URI to a field - if(clz.getGroupURI() != null){ - doc.addField(CLASSGROUP_URI,clz.getGroupURI()); - } - } - } - } - - protected void addMostSpecificTypeUris(Individual ind, SearchInputDocument doc){ - List mstURIs = ind.getMostSpecificTypeURIs(); - if( mstURIs != null ){ - for( String typeURI : mstURIs ){ - if( typeURI != null && ! 
typeURI.trim().isEmpty() ) - doc.addField(MOST_SPECIFIC_TYPE_URIS, typeURI); - } - } - } - - protected void addLabel(Individual ind, SearchInputDocument doc) { - String value = ""; - String label = ind.getRdfsLabel(); - if (label != null) { - value = label; - } else { - value = ind.getLocalName(); - } - - doc.addField(NAME_RAW, value); - doc.addField(NAME_LOWERCASE_SINGLE_VALUED,value); - - // NAME_RAW will be copied by the search engine into the following fields: - // NAME_LOWERCASE, NAME_UNSTEMMED, NAME_STEMMED, NAME_PHONETIC, AC_NAME_UNTOKENIZED, AC_NAME_STEMMED - } - - public Object getIndexId(Object obj) { - throw new Error("IndiviudalToSearchDocument.getIndexId() is unimplemented"); - } - - public String getIdForUri(String uri){ - if( uri != null ){ - return "vitroIndividual:" + uri; - }else{ - return null; - } - } - - public String getQueryForId(String uri ){ - return DOCID + ':' + getIdForUri(uri); - } - - public Individual unTranslate(Object result) { - Individual ent = null; - - if( result instanceof SearchResultDocument){ - SearchResultDocument hit = (SearchResultDocument) result; - String uri= (String) hit.getFirstValue(URI); - - ent = new IndividualImpl(); - ent.setURI(uri); - } - return ent; - } - - public void shutdown(){ - for(DocumentModifier dm: documentModifiers){ - try{ - dm.shutdown(); - }catch(Exception e){ - if( log != null) - log.debug(e,e); - } - } - } - - public static final String DONT_EXCLUDE =null; -} diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerImpl.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerImpl.java index 343df3a57..df5877ff8 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerImpl.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerImpl.java @@ -3,79 +3,90 @@ package edu.cornell.mannlib.vitro.webapp.searchindex; import static edu.cornell.mannlib.vitro.webapp.modelaccess.ModelNames.DISPLAY; -import static 
edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event.Type.REBUILD_REQUESTED; -import static edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event.Type.*; -import static edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event.Type.STOP_PROCESSING_STATEMENTS; +import static edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event.Type.SHUTDOWN_COMPLETE; +import static edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event.Type.SHUTDOWN_REQUESTED; +import static edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event.Type.STARTUP; +import static java.util.concurrent.TimeUnit.MINUTES; import java.util.ArrayList; import java.util.Collection; -import java.util.Collections; -import java.util.Date; import java.util.List; +import java.util.Objects; import java.util.Set; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.RejectedExecutionException; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicReference; import javax.servlet.ServletContext; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import edu.cornell.mannlib.vitro.webapp.dao.WebappDaoFactory; +import edu.cornell.mannlib.vitro.webapp.dao.filtering.WebappDaoFactoryFiltering; +import edu.cornell.mannlib.vitro.webapp.dao.filtering.filters.VitroFilterUtils; +import edu.cornell.mannlib.vitro.webapp.dao.filtering.filters.VitroFilters; import edu.cornell.mannlib.vitro.webapp.modelaccess.ModelAccess; import edu.cornell.mannlib.vitro.webapp.modules.Application; import edu.cornell.mannlib.vitro.webapp.modules.ComponentStartupStatus; import edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer; import 
edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerStatus; import edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerStatus.State; -import edu.cornell.mannlib.vitro.webapp.search.indexing.IndexBuilder; -import edu.cornell.mannlib.vitro.webapp.search.indexing.IndexingEventListener; import edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifier; import edu.cornell.mannlib.vitro.webapp.searchindex.exclusions.SearchIndexExcluder; import edu.cornell.mannlib.vitro.webapp.searchindex.indexing.IndexingUriFinder; +import edu.cornell.mannlib.vitro.webapp.searchindex.tasks.RebuildIndexTask; +import edu.cornell.mannlib.vitro.webapp.searchindex.tasks.UpdateUrisTask; import edu.cornell.mannlib.vitro.webapp.utils.configuration.ConfigurationBeanLoader; import edu.cornell.mannlib.vitro.webapp.utils.configuration.ConfigurationBeanLoaderException; -import edu.cornell.mannlib.vitro.webapp.utils.threads.VitroBackgroundThread.WorkLevel; -import edu.cornell.mannlib.vitro.webapp.utils.threads.VitroBackgroundThread.WorkLevelStamp; +import edu.cornell.mannlib.vitro.webapp.utils.threads.VitroBackgroundThread; /** - * TODO A silly implementation that just wraps the old IndexBuilder. + * An implementation of the SearchIndexer interface. + * + * This implementation uses a single-threaded task queue to permit indexing to + * run one at a time in a "background" thread. The task queue is controlled by a + * scheduler that allows us to suspend incoming tasks (pause). + * + * A thread pool is available so the tasks can create small units of work to be + * run in parallel. Each task should block until all of its work units are + * complete, to preserve the pattern of running one task at a time. 
*/ public class SearchIndexerImpl implements SearchIndexer { private static final Log log = LogFactory.getLog(SearchIndexerImpl.class); private final ListenerList listeners = new ListenerList(); + private final TaskQueue taskQueue = new TaskQueue(); + private final Scheduler scheduler = new Scheduler(taskQueue); + private final WorkerThreadPool pool = new WorkerThreadPool(); private ServletContext ctx; private Set excluders; private Set modifiers; private Set uriFinders; - - // TODO - private IndexBuilder indexBuilder; + private WebappDaoFactory wadf; @Override public void startup(Application application, ComponentStartupStatus ss) { try { this.ctx = application.getServletContext(); + loadConfiguration(); + + this.wadf = getFilteredWebappDaoFactory(); + + listeners.fireEvent(new Event(STARTUP, getStatus())); ss.info("Configured SearchIndexer: excluders=" + excluders + ", modifiers=" + modifiers + ", uriFinders=" + uriFinders); - - { // >>>>>>> TODO - this.indexBuilder = (IndexBuilder) ctx - .getAttribute(IndexBuilder.class.getName()); - - this.indexBuilder.addIndexBuilderListener(new BridgeListener()); - } - - createAndFire(STARTUP); } catch (Exception e) { ss.fatal("Failed to configure the SearchIndexer", e); } } - private void createAndFire(Event.Type type) { - listeners.fireEvent(new Event(type, getStatus())); - } - private void loadConfiguration() throws ConfigurationBeanLoaderException { ConfigurationBeanLoader beanLoader = new ConfigurationBeanLoader( ModelAccess.on(ctx).getOntModel(DISPLAY), ctx); @@ -84,80 +95,36 @@ public class SearchIndexerImpl implements SearchIndexer { uriFinders = beanLoader.loadAll(IndexingUriFinder.class); } - /* - * (non-Javadoc) - * - * @see - * edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer# - * scheduleUpdatesForUris(java.util.Collection) + /** + * Use a filtered DAO factory, so only public data goes into the search + * index. 
*/ + private WebappDaoFactory getFilteredWebappDaoFactory() { + WebappDaoFactory rawWadf = ModelAccess.on(ctx).getWebappDaoFactory(); + VitroFilters vf = VitroFilterUtils.getPublicFilter(ctx); + return new WebappDaoFactoryFiltering(rawWadf, vf); + } + @Override public void scheduleUpdatesForUris(Collection uris) { - // TODO - for (String uri : uris) { - indexBuilder.addToChanged(uri); - } + log.debug("Schedule updates for " + uris.size() + " uris."); + scheduler.scheduleTask(new UpdateUrisTask(uris, excluders, modifiers, + wadf.getIndividualDao(), listeners, pool)); } - /* - * (non-Javadoc) - * - * @see - * edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer# - * rebuildIndex() - */ @Override public void rebuildIndex() { - // TODO - indexBuilder.doIndexRebuild(); + scheduler.scheduleTask(new RebuildIndexTask()); } - /* - * (non-Javadoc) - * - * @see - * edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer# - * pause() - */ @Override public void pause() { - // TODO - indexBuilder.pause(); + scheduler.pause(); } - /* - * (non-Javadoc) - * - * @see - * edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer# - * unpause() - */ @Override public void unpause() { - // TODO - indexBuilder.unpause(); - } - - /* - * (non-Javadoc) - * - * @see - * edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer# - * getStatus() - */ - @Override - public SearchIndexerStatus getStatus() { - // TODO - WorkLevelStamp workLevel = indexBuilder.getWorkLevel(); - WorkLevel level = workLevel.getLevel(); - Date since = workLevel.getSince(); - if (level == WorkLevel.IDLE) { - return new SearchIndexerStatus(State.IDLE, since, - new SearchIndexerStatus.NoCounts()); - } else { - return new SearchIndexerStatus(State.PROCESSING_URIS, since, - new SearchIndexerStatus.UriCounts(1, 2, 3, 6)); - } + scheduler.unpause(); } @Override @@ -170,63 +137,264 @@ public class SearchIndexerImpl implements SearchIndexer { listeners.remove(listener); } - /* 
- * (non-Javadoc) - * - * @see - * edu.cornell.mannlib.vitro.webapp.modules.Application.Component#shutdown - * (edu.cornell.mannlib.vitro.webapp.modules.Application) - */ @Override - public void shutdown(Application application) { - // TODO + public SearchIndexerStatus getStatus() { + return taskQueue.getStatus(); } + @Override + public synchronized void shutdown(Application application) { + SearchIndexerStatus status = taskQueue.getStatus(); + if (status.getState() != State.SHUTDOWN) { + listeners.fireEvent(new Event(SHUTDOWN_REQUESTED, status)); + + taskQueue.shutdown(); + + for (DocumentModifier dm : modifiers) { + try { + dm.shutdown(); + } catch (Exception e) { + log.warn("Failed to shut down document modifier " + dm, e); + } + } + + listeners.fireEvent(new Event(SHUTDOWN_COMPLETE, taskQueue + .getStatus())); + } + } + + // ---------------------------------------------------------------------- + // Helper classes + // ---------------------------------------------------------------------- + /** - * A simple thread-safe list of event listeners. + * A simple thread-safe list of event listeners. All methods are + * synchronized. 
*/ - private static class ListenerList { + public static class ListenerList { private final List list; public ListenerList() { - list = Collections.synchronizedList(new ArrayList()); + list = new ArrayList(); } - public void add(Listener l) { + public synchronized void add(Listener l) { list.add(l); } - public void remove(Listener l) { + public synchronized void remove(Listener l) { list.remove(l); } - public void fireEvent(Event e) { - synchronized (list) { - for (Listener l : list) { - l.receiveSearchIndexerEvent(e); + public synchronized void fireEvent(Event event) { + for (Listener l : list) { + try { + l.receiveSearchIndexerEvent(event); + } catch (Exception e) { + log.warn("Failed to deliver event '" + event + + "' to listener '" + l + "'", e); } } } } - private class BridgeListener implements IndexingEventListener { - @Override - public void notifyOfIndexingEvent(EventTypes ie) { - switch (ie) { - case START_UPDATE: - createAndFire(START_PROCESSING_STATEMENTS); - break; - case FINISHED_UPDATE: - createAndFire(STOP_PROCESSING_STATEMENTS); - break; - case START_FULL_REBUILD: - createAndFire(REBUILD_REQUESTED); - createAndFire(START_PROCESSING_STATEMENTS); - break; - default: // FINISH_FULL_REBUILD - createAndFire(STOP_PROCESSING_STATEMENTS); - break; + /** + * A scheduler either collects tasks (if paused), or passes them to the + * queue (if not paused). All methods are synchronized. 
+ */ + private static class Scheduler { + private final TaskQueue taskQueue; + private final List deferredQueue; + private volatile boolean paused; + + public Scheduler(TaskQueue taskQueue) { + this.taskQueue = taskQueue; + this.deferredQueue = new ArrayList(); + } + + public synchronized void scheduleTask(Task task) { + if (paused) { + deferredQueue.add(task); + log.debug("added task to deferred queue: " + task); + } else { + taskQueue.scheduleTask(task); + log.debug("added task to task queue: " + task); + } + } + + public synchronized void pause() { + paused = true; + } + + public synchronized void unpause() { + paused = false; + for (Task task : deferredQueue) { + taskQueue.scheduleTask(task); + log.debug("moved task from deferred queue to task queue: " + task); } } } + + /** + * A single-threaded task queue that can tell us the status of the current + * task. + * + * If no current task, it can return a status of IDLE or SHUTDOWN. + */ + private static class TaskQueue { + private final ExecutorService queue = Executors + .newSingleThreadExecutor(new VitroBackgroundThread.Factory( + "SearchIndexer_TaskQueue")); + + private AtomicReference current = new AtomicReference<>( + new QueueStatus(SearchIndexerStatus.idle())); + + public void scheduleTask(Task task) { + try { + queue.execute(new TaskWrapper(task)); + } catch (RejectedExecutionException e) { + log.warn("Search Indexer task was rejected: " + e); + } + } + + public SearchIndexerStatus getStatus() { + return current.get().getStatus(); + } + + public void shutdown() { + try { + queue.shutdownNow(); + boolean terminated = queue.awaitTermination(1, MINUTES); + if (!terminated) { + log.warn("SearchIndexer task queue did not shut down " + + "within 1 minute."); + } + current.set(new QueueStatus(SearchIndexerStatus.shutdown())); + } catch (InterruptedException e) { + log.warn("call to 'awaitTermination' was interrupted."); + } + } + + /** When this wrapper is run, we will know the current task and status. 
*/ + private class TaskWrapper implements Runnable { + private final Task task; + + public TaskWrapper(Task task) { + this.task = task; + } + + @Override + public void run() { + current.set(new QueueStatus(task)); + log.debug("starting task: " + task); + task.run(); + current.set(new QueueStatus(SearchIndexerStatus.idle())); + log.debug("ended task: " + task); + } + } + + /** Either a specific status or a task to interrogate. */ + private class QueueStatus { + private final Task task; + private final SearchIndexerStatus status; + + public QueueStatus(Task task) { + this.task = Objects.requireNonNull(task); + this.status = null; + } + + public QueueStatus(SearchIndexerStatus status) { + this.task = null; + this.status = Objects.requireNonNull(status); + } + + public SearchIndexerStatus getStatus() { + if (task != null) { + return task.getStatus(); + } else { + return status; + } + } + } + } + + public static interface Task extends Runnable { + public SearchIndexerStatus getStatus(); + + public void notifyWorkUnitCompletion(Runnable workUnit); + } + + /** + * A thread pool for handling many small units of work submitted by a task. + * + * The task is notified as each unit completes. + * + * Only one task is active at a time, so the task can simply wait until this + * pool is idle to know that all of its units have completed. + * + * When shutting down, no attempt is made to interrupt the currently + * executing work units, since they are assumed to be small. 
+ */ + public static class WorkerThreadPool { + private final ThreadPoolExecutor pool; + + public WorkerThreadPool() { + this.pool = new ThreadPoolExecutor(10, 10, 10, TimeUnit.SECONDS, + new ArrayBlockingQueue(50), + new VitroBackgroundThread.Factory( + "SearchIndexer_ThreadPool")); + } + + public void submit(Runnable workUnit, Task task) { + pool.execute(new WorkUnitWrapper(workUnit, task)); + } + + public void waitUntilIdle() { + for (int i = 0; i < 60; i++) { + if (pool.getActiveCount() == 0) { + return; + } else { + try { + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + } + } + + public void shutdown() { + pool.shutdown(); + try { + boolean terminated = pool.awaitTermination(1, MINUTES); + if (!terminated) { + log.warn("SearchIndexer thread pool did not shut down " + + "within 1 minute."); + } + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + + private static class WorkUnitWrapper implements Runnable { + private final Runnable workUnit; + private final Task task; + + public WorkUnitWrapper(Runnable workUnit, Task task) { + this.workUnit = workUnit; + this.task = task; + } + + @Override + public void run() { + try { + workUnit.run(); + } finally { + task.notifyWorkUnitCompletion(workUnit); + } + } + + } + + } } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerSetup.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerSetup.java index 8784ccb2e..30da194f1 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerSetup.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/SearchIndexerSetup.java @@ -6,7 +6,6 @@ import static edu.cornell.mannlib.vitro.webapp.modelaccess.ModelNames.DISPLAY; import java.util.ArrayList; import java.util.List; -import java.util.Set; import javax.servlet.ServletContext; import javax.servlet.ServletContextEvent; @@ -26,11 +25,8 @@ import 
edu.cornell.mannlib.vitro.webapp.dao.jena.ModelContext; import edu.cornell.mannlib.vitro.webapp.modelaccess.ModelAccess; import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine; import edu.cornell.mannlib.vitro.webapp.search.SearchIndexer; -import edu.cornell.mannlib.vitro.webapp.search.documentBuilding.IndividualToSearchDocument; import edu.cornell.mannlib.vitro.webapp.search.indexing.IndexBuilder; import edu.cornell.mannlib.vitro.webapp.search.indexing.SearchReindexingListener; -import edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifier; -import edu.cornell.mannlib.vitro.webapp.searchindex.exclusions.SearchIndexExcluder; import edu.cornell.mannlib.vitro.webapp.searchindex.indexing.IndexingUriFinder; import edu.cornell.mannlib.vitro.webapp.startup.ComponentStartupStatusImpl; import edu.cornell.mannlib.vitro.webapp.startup.StartupStatus; @@ -40,12 +36,12 @@ import edu.cornell.mannlib.vitro.webapp.utils.developer.Key; import edu.cornell.mannlib.vitro.webapp.utils.developer.listeners.DeveloperDisabledModelChangeListener; /** - * TODO - * A silly implementation that just wraps the old IndexBuilder with a new SearchIndexerImpl. + * TODO A silly implementation that just wraps the old IndexBuilder with a new + * SearchIndexerImpl. 
*/ public class SearchIndexerSetup implements ServletContextListener { private static final Log log = LogFactory.getLog(SearchIndexerSetup.class); - + private ServletContext ctx; private OntModel displayModel; private ConfigurationBeanLoader beanLoader; @@ -55,79 +51,55 @@ public class SearchIndexerSetup implements ServletContextListener { this.ctx = sce.getServletContext(); this.displayModel = ModelAccess.on(ctx).getOntModel(DISPLAY); this.beanLoader = new ConfigurationBeanLoader(displayModel, ctx); - + ServletContext context = sce.getServletContext(); StartupStatus ss = StartupStatus.getBean(context); - SearchEngine searchEngine = ApplicationUtils.instance().getSearchEngine(); + SearchEngine searchEngine = ApplicationUtils.instance() + .getSearchEngine(); - try { - IndividualToSearchDocument indToSearchDoc = setupTranslation(); - - /* setup search indexer */ - SearchIndexer searchIndexer = new SearchIndexer(searchEngine, indToSearchDoc); - - // This is where the builder gets the list of places to try to - // get objects to index. It is filtered so that non-public text - // does not get into the search index. - WebappDaoFactory wadf = ModelAccess.on(context) - .getWebappDaoFactory(); - VitroFilters vf = VitroFilterUtils.getPublicFilter(context); - wadf = new WebappDaoFactoryFiltering(wadf, vf); - - // make objects that will find additional URIs for context nodes etc - List uriFinders = loadUriFinders(); - - // Make the IndexBuilder - IndexBuilder builder = new IndexBuilder(searchIndexer, wadf, - uriFinders); - // Save it to the servlet context so we can access it later in the - // webapp. - context.setAttribute(IndexBuilder.class.getName(), builder); - - // Create listener to notify index builder of changes to model - // (can be disabled by developer setting.) 
- ModelContext.registerListenerForChanges(context, - new DeveloperDisabledModelChangeListener( - new SearchReindexingListener(builder), - Key.SEARCH_INDEX_SUPPRESS_MODEL_CHANGE_LISTENER)); - - ss.info(this, "Setup of search indexer completed."); - - ApplicationUtils.instance().getSearchIndexer().startup(ApplicationUtils.instance(), new ComponentStartupStatusImpl(this, ss)); - } catch (Throwable e) { - ss.fatal(this, "could not setup search engine", e); + { // >>>>> TODO + try { +// /* setup search indexer */ +// SearchIndexer searchIndexer = new SearchIndexer(searchEngine, +// indToSearchDoc); +// +// // Make the IndexBuilder +// IndexBuilder builder = new IndexBuilder(searchIndexer, wadf, +// uriFinders); +// +// // Create listener to notify index builder of changes to model +// // (can be disabled by developer setting.) +// ModelContext +// .registerListenerForChanges( +// context, +// new DeveloperDisabledModelChangeListener( +// new SearchReindexingListener(builder), +// Key.SEARCH_INDEX_SUPPRESS_MODEL_CHANGE_LISTENER)); +// +// ss.info(this, "Setup of search indexer completed."); +// + } catch (Throwable e) { + ss.fatal(this, "could not setup search engine", e); + } } - + ApplicationUtils + .instance() + .getSearchIndexer() + .startup(ApplicationUtils.instance(), + new ComponentStartupStatusImpl(this, ss)); } @Override public void contextDestroyed(ServletContextEvent sce) { - IndexBuilder builder = (IndexBuilder) sce.getServletContext() - .getAttribute(IndexBuilder.class.getName()); - if (builder != null) - builder.stopIndexingThread(); + ApplicationUtils.instance().getSearchIndexer() + .shutdown(ApplicationUtils.instance()); - } - - private IndividualToSearchDocument setupTranslation() { - try { - Set excluders = beanLoader.loadAll(SearchIndexExcluder.class); - log.debug("Excludes: (" + excluders.size() + ") " + excluders); - - Set modifiers = beanLoader.loadAll(DocumentModifier.class); - log.debug("Modifiers: (" + modifiers.size() + ") " + modifiers); - - 
return new IndividualToSearchDocument(new ArrayList<>(excluders), new ArrayList<>(modifiers)); - } catch (ConfigurationBeanLoaderException e) { - throw new RuntimeException("Failed to configure the SearchIndexer", e); - } - } - - private List loadUriFinders() { - try { - return new ArrayList<>(beanLoader.loadAll(IndexingUriFinder.class)); - } catch (ConfigurationBeanLoaderException e) { - throw new RuntimeException("Failed to configure the SearchIndexer", e); + { // >>>>> TODO + IndexBuilder builder = (IndexBuilder) sce.getServletContext() + .getAttribute(IndexBuilder.class.getName()); + if (builder != null) + builder.stopIndexingThread(); } } + } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/exclusions/ExcludeBasedOnType.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/exclusions/ExcludeBasedOnType.java index ee668eccc..15eda7f25 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/exclusions/ExcludeBasedOnType.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/exclusions/ExcludeBasedOnType.java @@ -20,7 +20,7 @@ import edu.cornell.mannlib.vitro.webapp.utils.configuration.Property; */ public class ExcludeBasedOnType implements SearchIndexExcluder { - private static final String SKIP_MSG = "skipping due to type."; + private static final String SKIP_MSG = "skipping due to type: "; private final Set typeURIs = new HashSet<>(); @@ -35,7 +35,7 @@ public class ExcludeBasedOnType implements SearchIndexExcluder { return DONT_EXCLUDE; } if (typeURIinExcludeList(ind.getVClass())) { - return SKIP_MSG; + return SKIP_MSG + ind.getVClass(); } List vclasses = new ArrayList<>(); @@ -44,7 +44,7 @@ public class ExcludeBasedOnType implements SearchIndexExcluder { for (VClass vclz : vclasses) { if (typeURIinExcludeList(vclz)) - return SKIP_MSG; + return SKIP_MSG + vclz; } return DONT_EXCLUDE; diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/RebuildIndexTask.java 
b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/RebuildIndexTask.java new file mode 100644 index 000000000..752abdda7 --- /dev/null +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/RebuildIndexTask.java @@ -0,0 +1,41 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchindex.tasks; + +import edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerStatus; +import edu.cornell.mannlib.vitro.webapp.searchindex.SearchIndexerImpl.Task;
+
+/**
+ * TODO Placeholder for the task that rebuilds the search index.
+ *
+ * Nothing is implemented yet: every method throws a RuntimeException that
+ * names the method.
+ */
+public class RebuildIndexTask implements Task {
+
+    /** TODO Not implemented; always throws. */
+    @Override
+    public void run() {
+        throw notImplemented("run");
+    }
+
+    /** TODO Not implemented; always throws. */
+    @Override
+    public SearchIndexerStatus getStatus() {
+        throw notImplemented("getStatus");
+    }
+
+    /** TODO Not implemented; always throws. */
+    @Override
+    public void notifyWorkUnitCompletion(Runnable workUnit) {
+        throw notImplemented("notifyWorkUnitCompletion");
+    }
+
+    /** Build the exception that reports an unimplemented method. */
+    private static RuntimeException notImplemented(String methodName) {
+        return new RuntimeException("RebuildIndexTask." + methodName
+                + "() not implemented.");
+    }
+
+}
diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateDocumentWorkUnit.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateDocumentWorkUnit.java new file mode 100644 index 000000000..715e1717d --- /dev/null +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateDocumentWorkUnit.java @@ -0,0 +1,195 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchindex.tasks; + +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.ALLTEXT; +import static
edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.ALLTEXTUNSTEMMED; +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.CLASSGROUP_URI; +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.DOCID; +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.INDEXEDTIME; +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.MOST_SPECIFIC_TYPE_URIS; +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.NAME_LOWERCASE_SINGLE_VALUED; +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.NAME_RAW; +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.RDFTYPE; +import static edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames.URI; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.joda.time.DateTime; +import org.jsoup.Jsoup; + +import com.hp.hpl.jena.vocabulary.OWL; +import com.hp.hpl.jena.vocabulary.RDFS; + +import edu.cornell.mannlib.vitro.webapp.application.ApplicationUtils; +import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement; +import edu.cornell.mannlib.vitro.webapp.beans.Individual; +import edu.cornell.mannlib.vitro.webapp.beans.ObjectPropertyStatement; +import edu.cornell.mannlib.vitro.webapp.beans.VClass; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputDocument; +import edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerUtils; +import edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifier;
+
+/**
+ * A unit of work that builds the search document for one individual and adds
+ * it to the search index.
+ *
+ * The document gets the ID and URI, the label, the classes, the most specific
+ * types, the text of the object properties and data properties, and the
+ * boost. Then each DocumentModifier is applied, the indexing time is
+ * recorded, and the document is handed to the search engine.
+ *
+ * Any exception is logged as a warning and is not propagated.
+ */
+public class UpdateDocumentWorkUnit implements Runnable {
+    private static final Log log = LogFactory
+            .getLog(UpdateDocumentWorkUnit.class);
+
+    private static final String URI_OWL_THING = OWL.Thing.getURI();
+    private static final String URI_DIFFERENT_FROM = OWL.differentFrom.getURI();
+    private static final String URI_RDFS_LABEL = RDFS.label.getURI();
+
+    private final Individual ind;
+    private final List<DocumentModifier> modifiers;
+    private final SearchEngine searchEngine;
+
+    /**
+     * @param ind
+     *            the individual to be indexed
+     * @param modifiers
+     *            applied to the document in iteration order; the collection
+     *            is copied
+     */
+    public UpdateDocumentWorkUnit(Individual ind,
+            Collection<DocumentModifier> modifiers) {
+        this.ind = ind;
+        this.modifiers = new ArrayList<>(modifiers);
+        this.searchEngine = ApplicationUtils.instance().getSearchEngine();
+    }
+
+    public Individual getInd() {
+        return ind;
+    }
+
+    @Override
+    public void run() {
+        try {
+            SearchInputDocument doc = searchEngine.createInputDocument();
+
+            addIdAndUri(doc);
+            addLabel(doc);
+            addClasses(doc);
+            addMostSpecificTypes(doc);
+            addObjectPropertyText(doc);
+            addDataPropertyText(doc);
+            addEntityBoost(doc);
+
+            for (DocumentModifier modifier : modifiers) {
+                modifier.modifyDocument(ind, doc);
+            }
+
+            addIndexedTime(doc);
+
+            searchEngine.add(doc);
+        } catch (Exception e) {
+            log.warn("Failed to add '" + ind + "' to the search index.", e);
+        }
+    }
+
+    private void addIdAndUri(SearchInputDocument doc) {
+        doc.addField(DOCID, SearchIndexerUtils.getIdForUri(ind.getURI()));
+        doc.addField(URI, ind.getURI());
+    }
+
+    /** Use the rdfs:label as the name, or the local name if no label. */
+    private void addLabel(SearchInputDocument doc) {
+        String name = ind.getRdfsLabel();
+        if (name == null) {
+            name = ind.getLocalName();
+        }
+
+        doc.addField(NAME_RAW, name);
+        doc.addField(NAME_LOWERCASE_SINGLE_VALUED, name);
+    }
+
+    /**
+     * For each class that the individual belongs to, record the class URI, the
+     * class group URI, the class Name, and the class boost.
+     *
+     * Classes with no URI, and owl:Thing, are skipped.
+     */
+    private void addClasses(SearchInputDocument doc) {
+        List<VClass> vclasses = ind.getVClasses(false);
+        if (vclasses == null) {
+            return;
+        }
+
+        for (VClass clz : vclasses) {
+            String classUri = clz.getURI();
+            if (classUri == null || URI_OWL_THING.equals(classUri)) {
+                continue;
+            }
+            doc.addField(RDFTYPE, classUri);
+
+            String classGroupUri = clz.getGroupURI();
+            if (classGroupUri != null) {
+                doc.addField(CLASSGROUP_URI, classGroupUri);
+            }
+
+            addToAlltext(doc, clz.getName());
+
+            Float boost = clz.getSearchBoost();
+            if (boost != null) {
+                doc.setDocumentBoost(doc.getDocumentBoost() + boost);
+            }
+        }
+    }
+
+    private void addMostSpecificTypes(SearchInputDocument doc) {
+        List<String> mstURIs = ind.getMostSpecificTypeURIs();
+        if (mstURIs != null) {
+            for (String typeURI : mstURIs) {
+                if (StringUtils.isNotBlank(typeURI)) {
+                    doc.addField(MOST_SPECIFIC_TYPE_URIS, typeURI);
+                }
+            }
+        }
+    }
+
+    /**
+     * Add the label of the object of each object property statement, except
+     * for owl:differentFrom statements.
+     */
+    private void addObjectPropertyText(SearchInputDocument doc) {
+        List<ObjectPropertyStatement> stmts = ind.getObjectPropertyStatements();
+        if (stmts == null) {
+            return;
+        }
+
+        for (ObjectPropertyStatement stmt : stmts) {
+            if (URI_DIFFERENT_FROM.equals(stmt.getPropertyURI())) {
+                continue;
+            }
+            // A statement without an object must not abort the whole
+            // document with a NullPointerException.
+            Individual object = stmt.getObject();
+            if (object != null) {
+                addToAlltext(doc, object.getRdfsLabel());
+            }
+        }
+    }
+
+    /**
+     * Add the value of each data property statement, except for rdfs:label
+     * statements.
+     */
+    private void addDataPropertyText(SearchInputDocument doc) {
+        List<DataPropertyStatement> stmts = ind.getDataPropertyStatements();
+        if (stmts == null) {
+            return;
+        }
+
+        for (DataPropertyStatement stmt : stmts) {
+            // Constant first, so a statement with no property URI is safe.
+            if (URI_RDFS_LABEL.equals(stmt.getDatapropURI())) {
+                continue;
+            }
+            addToAlltext(doc, stmt.getData());
+        }
+    }
+
+    /**
+     * If the individual has a non-zero boost of its own, it replaces the
+     * document boost, including anything accumulated from the classes.
+     */
+    private void addEntityBoost(SearchInputDocument doc) {
+        Float boost = ind.getSearchBoost();
+        if (boost != null && !boost.equals(0.0F)) {
+            doc.setDocumentBoost(boost);
+        }
+    }
+
+    private void addIndexedTime(SearchInputDocument doc) {
+        doc.addField(INDEXEDTIME, (Object) new DateTime().getMillis());
+    }
+
+    /**
+     * Strip any markup from the text, and add what remains to the ALLTEXT
+     * and ALLTEXTUNSTEMMED fields. Blank text is ignored.
+     */
+    private void addToAlltext(SearchInputDocument doc, String raw) {
+        if (StringUtils.isBlank(raw)) {
+            return;
+        }
+        String clean = Jsoup.parse(raw).text();
+        if (StringUtils.isBlank(clean)) {
+            return;
+        }
+        doc.addField(ALLTEXT, clean);
+        doc.addField(ALLTEXTUNSTEMMED, clean);
+
+    }
+}
diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateUrisTask.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateUrisTask.java new file mode 100644 index 000000000..766cfc695 --- /dev/null +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/searchindex/tasks/UpdateUrisTask.java @@ -0,0 +1,199 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchindex.tasks; + +import static edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event.Type.START_PROCESSING_URIS; +import static edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event.Type.STOP_PROCESSING_URIS; +import static edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerStatus.State.PROCESSING_URIS; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import edu.cornell.mannlib.vitro.webapp.application.ApplicationUtils; +import edu.cornell.mannlib.vitro.webapp.beans.Individual; +import edu.cornell.mannlib.vitro.webapp.beans.VClass; +import edu.cornell.mannlib.vitro.webapp.dao.IndividualDao; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException; +import
edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexer.Event; +import edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerStatus; +import edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerStatus.UriCounts; +import edu.cornell.mannlib.vitro.webapp.modules.searchIndexer.SearchIndexerUtils; +import edu.cornell.mannlib.vitro.webapp.searchindex.SearchIndexerImpl.ListenerList; +import edu.cornell.mannlib.vitro.webapp.searchindex.SearchIndexerImpl.Task; +import edu.cornell.mannlib.vitro.webapp.searchindex.SearchIndexerImpl.WorkerThreadPool; +import edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifier; +import edu.cornell.mannlib.vitro.webapp.searchindex.exclusions.SearchIndexExcluder; + +/** + * Given a list of URIs, remove the ones that don't belong in the index and + * update the ones that do belong. + * + * A URI doesn't belong in the index if there is no individual with that URI, or + * if the individual has no VClasses assigned to it, or if the individual is + * excluded by one of the excluders. + * + * Deletions are done synchronously, but updates are scheduled to run on the + * thread pool. 
+ */ +public class UpdateUrisTask implements Task { + private static final Log log = LogFactory.getLog(UpdateUrisTask.class); + + private final Set uris; + private final IndividualDao indDao; + private final List excluders; + private final List modifiers; + private final ListenerList listeners; + private final WorkerThreadPool pool; + + private final Status status; + private final SearchEngine searchEngine; + + public UpdateUrisTask(Collection uris, + Collection excluders, + Collection modifiers, IndividualDao indDao, + ListenerList listeners, WorkerThreadPool pool) { + this.uris = new HashSet<>(uris); + this.excluders = new ArrayList<>(excluders); + this.modifiers = new ArrayList<>(modifiers); + this.indDao = indDao; + this.listeners = listeners; + this.pool = pool; + + this.status = new Status(uris.size()); + + this.searchEngine = ApplicationUtils.instance().getSearchEngine(); + } + + @Override + public void run() { + listeners.fireEvent(new Event(START_PROCESSING_URIS, status + .getSearchIndexerStatus())); + for (String uri : uris) { + if (isInterrupted()) { + log.info("Interrupted: " + status.getSearchIndexerStatus()); + return; + } else { + Individual ind = getIndividual(uri); + if (ind == null || hasNoClass(ind) || isExcluded(ind)) { + deleteDocument(uri); + } else { + updateDocument(ind); + } + } + } + pool.waitUntilIdle(); + listeners.fireEvent(new Event(STOP_PROCESSING_URIS, status + .getSearchIndexerStatus())); + } + + private boolean isInterrupted() { + if (Thread.interrupted()) { + Thread.currentThread().interrupt(); + return true; + } else { + return false; + } + } + + private Individual getIndividual(String uri) { + Individual ind = indDao.getIndividualByURI(uri); + if (ind == null) { + log.debug("Found no individual for '" + uri + "'"); + } + return ind; + } + + private boolean hasNoClass(Individual ind) { + List vclasses = ind.getVClasses(false); + if (vclasses == null || vclasses.isEmpty()) { + log.debug("Individual " + ind + " has no classes."); + 
return true; + } + return false; + } + + private boolean isExcluded(Individual ind) { + for (SearchIndexExcluder excluder : excluders) { + String message = excluder.checkForExclusion(ind); + if (message != SearchIndexExcluder.DONT_EXCLUDE) { + log.debug("Excluded " + ind + " because " + message); + return true; + } + } + return false; + } + + /** A delete is fast enough to be done synchronously. */ + private void deleteDocument(String uri) { + try { + searchEngine.deleteById(SearchIndexerUtils.getIdForUri(uri)); + status.incrementDeletes(); + log.debug("deleted '" + uri + "' from search index."); + } catch (SearchEngineException e) { + log.warn("Failed to delete '" + uri + "' from search index", e); + } + } + + private void updateDocument(Individual ind) { + Runnable workUnit = new UpdateDocumentWorkUnit(ind, modifiers); + pool.submit(workUnit, this); + log.debug("scheduled update to " + ind); + } + + @Override + public void notifyWorkUnitCompletion(Runnable workUnit) { + log.debug("completed update to " + + ((UpdateDocumentWorkUnit) workUnit).getInd()); + status.incrementUpdates(); + } + + @Override + public SearchIndexerStatus getStatus() { + return status.getSearchIndexerStatus(); + } + + // ---------------------------------------------------------------------- + // helper classes + // ---------------------------------------------------------------------- + + /** + * A thread-safe collection of status information. All methods are + * synchronized. 
+ */ + private static class Status { + private final int total; + private int updated = 0; + private int deleted = 0; + private Date since = new Date(); + + public Status(int total) { + this.total = total; + } + + public synchronized void incrementUpdates() { + updated++; + since = new Date(); + } + + public synchronized void incrementDeletes() { + deleted++; + since = new Date(); + } + + public synchronized SearchIndexerStatus getSearchIndexerStatus() { + int remaining = total - updated - deleted; + return new SearchIndexerStatus(PROCESSING_URIS, since, + new UriCounts(deleted, updated, remaining, total)); + } + + } + +}