From b8f2118d4bac05a889b0065199f03f12c6bbb79d Mon Sep 17 00:00:00 2001 From: briancaruso Date: Thu, 30 Jun 2011 21:40:59 +0000 Subject: [PATCH] refactoring IndividualToSolrDocument.java. Fix for NIHVIVO-2806 --- .../webapp/search/indexing/IndexBuilder.java | 13 +- .../search/indexing/IndexWorkerThread.java | 3 + .../webapp/search/solr/DocumentModifier.java | 2 +- .../search/solr/IndividualToSolrDocument.java | 395 +++++++++--------- .../vitro/webapp/search/solr/NameBoost.java | 34 ++ .../search/solr/SkipIndividualException.java | 11 + .../vitro/webapp/search/solr/SolrIndexer.java | 12 +- .../vitro/webapp/search/solr/SolrSetup.java | 2 +- 8 files changed, 273 insertions(+), 199 deletions(-) create mode 100644 webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/NameBoost.java create mode 100644 webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SkipIndividualException.java diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/indexing/IndexBuilder.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/indexing/IndexBuilder.java index 0569582f3..785bbcb4a 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/indexing/IndexBuilder.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/indexing/IndexBuilder.java @@ -117,6 +117,10 @@ public class IndexBuilder extends Thread { return isReindexRequested() || ! changedUris.isEmpty() ; } + + /** + * This is called when the system shuts down. + */ public synchronized void stopIndexingThread() { stopRequested = true; this.notifyAll(); @@ -238,7 +242,12 @@ public class IndexBuilder extends Thread { if( ! forceNewIndex ){ for(String deleteMe : deletes ){ - indexer.removeFromIndex(deleteMe); + try{ + indexer.removeFromIndex(deleteMe); + }catch(Exception ex){ + log.debug("could not remove individual " + deleteMe + + " from index, usually this is harmless",ex); + } } } @@ -251,7 +260,7 @@ public class IndexBuilder extends Thread { log.error(e,e); } - if( aborted && forceNewIndex ){ + if( aborted ){ indexer.abortIndexingAndCleanUp(); }else{ indexer.endIndexing(); diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/indexing/IndexWorkerThread.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/indexing/IndexWorkerThread.java index 8b0cc69e5..aa3dd7698 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/indexing/IndexWorkerThread.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/indexing/IndexWorkerThread.java @@ -60,6 +60,9 @@ class IndexWorkerThread extends Thread{ ind = individualsToIndex.next(); indexer.index( ind ); } catch (IndexingException e) { + if( stopRequested ) + return; + if( ind != null ) log.error("Could not index individual " + ind.getURI() , e ); else diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/DocumentModifier.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/DocumentModifier.java index f1d97270a..b3b960451 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/DocumentModifier.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/DocumentModifier.java @@ -10,7 +10,7 @@ import edu.cornell.mannlib.vitro.webapp.beans.Individual; * This interface represents an object that can add to a SolrInputDocument. */ public interface DocumentModifier { - public void modifyDocument(Individual individual, SolrInputDocument doc, StringBuffer addUri); + public void modifyDocument(Individual individual, SolrInputDocument doc, StringBuffer addUri) throws SkipIndividualException; //called to inform the DocumentModifier that the system is shutting down public void shutdown(); diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java index b133cfea2..58a2e5712 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java @@ -63,116 +63,157 @@ public class IndividualToSolrDocument { @SuppressWarnings("static-access") public SolrInputDocument translate(Individual ind) throws IndexingException{ - long tProhibited = System.currentTimeMillis(); - ArrayList superClassNames = null; - StringBuffer addUri = null; - String value; - StringBuffer classPublicNames = new StringBuffer(); - classPublicNames.append(""); - SolrInputDocument doc = new SolrInputDocument(); - - String id = ind.getURI(); - log.debug("translating " + id); - - if(id == null){ - log.debug("cannot add individuals without URIs to lucene Index"); - return null; - }else if( id.startsWith(VitroVocabulary.vitroURI) || - id.startsWith(VitroVocabulary.VITRO_PUBLIC) || - id.startsWith(VitroVocabulary.PSEUDO_BNODE_NS) || - id.startsWith(OWL.NS)){ - log.debug("not indexing because of namespace:" + id); - return null; - } - - //filter out class groups, owl:ObjectProperties etc.. - if(individualProhibitedFromSearch.isIndividualProhibited(id)){ - return null; - } - - log.debug("time to check if individual is prohibited:" + Long.toString(System.currentTimeMillis() - tProhibited)); - - // Types and classgroups - boolean prohibited = false; - List vclasses = ind.getVClasses(false); - superClassNames = new ArrayList(); - String superLclName = null; - long tClassgroup = System.currentTimeMillis(); - for(VClass clz : vclasses){ - superLclName = clz.getLocalName(); - superClassNames.add(superLclName); - if(clz.getURI() == null){ - continue; - }else if(OWL.Thing.getURI().equals(clz.getURI())){ - //index individuals of type owl:Thing, just don't add owl:Thing as the type field in the index - continue; - } else if(clz.getURI().startsWith(OWL.NS)){ - log.debug("not indexing " + id + " because of type " + clz.getURI()); - return null; - } else if(contextNodeClassNames.contains(superLclName)) { // check to see if context node is being indexed. - return null; - } - else { - if( !prohibited && classesProhibitedFromSearch.isClassProhibitedFromSearch(clz.getURI())) - prohibited = true; - if( clz.getSearchBoost() != null) - doc.setDocumentBoost(doc.getDocumentBoost() + clz.getSearchBoost()); - - doc.addField(term.RDFTYPE, clz.getURI()); - - if(clz.getLocalName() != null){ - doc.addField(term.CLASSLOCALNAME, clz.getLocalName()); - doc.addField(term.CLASSLOCALNAMELOWERCASE, clz.getLocalName().toLowerCase()); - } - - if(clz.getName() != null){ - classPublicNames.append(" "); - classPublicNames.append(clz.getName()); - } - - //Classgroup URI - if(clz.getGroupURI() != null){ - doc.addField(term.CLASSGROUP_URI,clz.getGroupURI()); - } - - } - } - - if(superClassNames.isEmpty()){ - return null; - } - - log.debug("time to check if class is prohibited and adding classes, classgroups and type to the index: " + Long.toString(System.currentTimeMillis() - tClassgroup)); + try{ + log.debug("translating " + ind.getURI()); + checkForSkipBasedOnNS( ind ); + + SolrInputDocument doc = new SolrInputDocument(); + + //DocID + doc.addField(term.DOCID, getIdForUri( ind.getURI() ) ); + + //vitro id + doc.addField(term.URI, ind.getURI()); + + //java class + doc.addField(term.JCLASS, entClassName); - - doc.addField(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0"); - - //DocID - doc.addField(term.DOCID, getIdForUri( ind.getURI() ) ); - - //vitro id - doc.addField(term.URI, id); - - //java class - doc.addField(term.JCLASS, entClassName); - - //Individual Label - if(ind.getRdfsLabel() != null) - value = ind.getRdfsLabel(); - else{ - log.debug("Using local name for individual with rdfs:label " + ind.getURI()); - value = ind.getLocalName(); - } - - // collecting object property statements - - String uri = ind.getURI(); - StringBuffer objectNames = new StringBuffer(); - objectNames.append(""); - String t=null; - addUri = new StringBuffer(); - addUri.append(""); - List objectPropertyStatements = ind.getObjectPropertyStatements(); + //Individual Label + addLabel( ind, doc ); + + //add classes, classgroups get if prohibied becasue of its class + StringBuffer classPublicNames = new StringBuffer(""); + boolean prohibited = addClasses(ind, doc, classPublicNames); + + //filter out class groups, owl:ObjectProperties etc.. + if(individualProhibitedFromSearch.isIndividualProhibited( ind.getURI() )){ + return null; + } + + // collecting URIs and rdfs:labels of objects of statements + StringBuffer objectNames = new StringBuffer(""); + StringBuffer addUri = new StringBuffer(""); + addObjectPropertyText(ind, doc, objectNames, addUri); + + //add if the individual has a thumbnail or not. + addThumbnailExistance(ind, doc); + + //time of index in millis past epoc + doc.addField(term.INDEXEDTIME,(new DateTime()).getMillis()); + + if(!prohibited){ + addAllText( ind, doc, classPublicNames, objectNames ); + + runAdditionalDocModifers(ind,doc,addUri); + + //boost for entity + if(documentModifiers == null || documentModifiers.isEmpty() && + (ind.getSearchBoost() != null && ind.getSearchBoost() != 0)) { + doc.setDocumentBoost(ind.getSearchBoost()); + } + } + + return doc; + }catch(SkipIndividualException ex){ + //indicates that this individual should not be indexed + log.debug(ex); + return null; + } + } + + + private void runAdditionalDocModifers( Individual ind, SolrInputDocument doc, StringBuffer addUri ) + throws SkipIndividualException{ + //run the document modifiers + if( documentModifiers != null && !documentModifiers.isEmpty()){ + for(DocumentModifier modifier: documentModifiers){ + modifier.modifyDocument(ind, doc, addUri); + } + } + } + + private void checkForSkipBasedOnNS(Individual ind) throws SkipIndividualException { + String id = ind.getURI(); + if(id == null){ + throw new SkipIndividualException("cannot add individuals without URIs to lucene Index"); + }else if( id.startsWith(VitroVocabulary.vitroURI) || + id.startsWith(VitroVocabulary.VITRO_PUBLIC) || + id.startsWith(VitroVocabulary.PSEUDO_BNODE_NS) || + id.startsWith(OWL.NS)){ + throw new SkipIndividualException("not indexing because of namespace:" + id); + } + } + + private void addAllText(Individual ind, SolrInputDocument doc, StringBuffer classPublicNames, StringBuffer objectNames) { + String t=null; + + //ALLTEXT, all of the 'full text' + StringBuffer allTextValue = new StringBuffer(); + allTextValue.append(""); + allTextValue.append(" "); + allTextValue.append(((t=ind.getName()) == null)?"":t); + allTextValue.append(" "); + allTextValue.append(((t=ind.getAnchor()) == null)?"":t); + allTextValue.append(" "); + allTextValue.append(classPublicNames); + + //collecting data property statements + List dataPropertyStatements = ind.getDataPropertyStatements(); + if (dataPropertyStatements != null) { + Iterator dataPropertyStmtIter = dataPropertyStatements.iterator(); + while (dataPropertyStmtIter.hasNext()) { + DataPropertyStatement dataPropertyStmt = dataPropertyStmtIter.next(); + allTextValue.append(" "); + allTextValue.append(((t=dataPropertyStmt.getData()) == null)?"":t); + } + } + + allTextValue.append(objectNames.toString()); + + String alltext = allTextValue.toString(); + doc.addField(term.ALLTEXT, alltext); + doc.addField(term.ALLTEXTUNSTEMMED, alltext); + doc.addField(term.ALLTEXT_PHONETIC, alltext); + } + + private void addLabel(Individual ind, SolrInputDocument doc) { + String value = ""; + if(ind.getRdfsLabel() != null) + value = ind.getRdfsLabel(); + else{ + value = ind.getLocalName(); + } + doc.addField(term.NAME_RAW, value); + doc.addField(term.NAME_LOWERCASE, value); + doc.addField(term.NAME_UNSTEMMED, value); + doc.addField(term.NAME_STEMMED, value); + doc.addField(term.NAME_PHONETIC, value); + doc.addField(term.AC_NAME_UNTOKENIZED, value); + doc.addField(term.AC_NAME_STEMMED, value); + // doc.addField(term.AC_NAME_TOKENIZED, value); + } + + /** + * Adds if the individual has a thumbnail image or not. + */ + private void addThumbnailExistance(Individual ind, SolrInputDocument doc) { + try{ + if(ind.hasThumb()) + doc.addField(term.THUMBNAIL, "1"); + else + doc.addField(term.THUMBNAIL, "0"); + }catch(Exception ex){ + log.debug("could not index thumbnail: " + ex); + } + } + + /** + * Get the rdfs:labes for objects of statements and put in objectNames. + * Get the URIs for objects of statements and put in addUri. + */ + private void addObjectPropertyText(Individual ind, SolrInputDocument doc, + StringBuffer objectNames, StringBuffer addUri) { + List objectPropertyStatements = ind.getObjectPropertyStatements(); if (objectPropertyStatements != null) { Iterator objectPropertyStmtIter = objectPropertyStatements.iterator(); while (objectPropertyStmtIter.hasNext()) { @@ -180,7 +221,8 @@ public class IndividualToSolrDocument { if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) ) continue; try { - objectNames.append(" "); + objectNames.append(" "); + String t=null; objectNames.append(((t=objectPropertyStmt.getObject().getName()) == null)?"":t); addUri.append(" "); addUri.append(((t=objectPropertyStmt.getObject().getURI()) == null)?"":t); @@ -188,92 +230,68 @@ public class IndividualToSolrDocument { log.debug("could not index name of related object: " + e.getMessage()); } } - } - - doc.addField(term.NAME_RAW, value, NAME_BOOST); - doc.addField(term.NAME_LOWERCASE, value, NAME_BOOST); - doc.addField(term.NAME_UNSTEMMED, value, NAME_BOOST); - doc.addField(term.NAME_STEMMED, value, NAME_BOOST); - doc.addField(term.NAME_PHONETIC, value); - doc.addField(term.AC_NAME_UNTOKENIZED, value); - doc.addField(term.AC_NAME_STEMMED, value); - // doc.addField(term.AC_NAME_TOKENIZED, value); - - - long tMoniker = System.currentTimeMillis(); - - if(documentModifiers == null || documentModifiers.isEmpty()){ - //boost for entity - if(ind.getSearchBoost() != null && ind.getSearchBoost() != 0) { - doc.setDocumentBoost(ind.getSearchBoost()); - } - } - - //thumbnail - try{ - value = null; - if(ind.hasThumb()) - doc.addField(term.THUMBNAIL, "1"); - else - doc.addField(term.THUMBNAIL, "0"); - }catch(Exception ex){ - log.debug("could not index thumbnail: " + ex); - } - - - //time of index in millis past epoc - // Object anon[] = { new Long((new DateTime() ).getMillis()) }; - // doc.addField(term.INDEXEDTIME, String.format("%019d", anon)); - doc.addField(term.INDEXEDTIME,(new DateTime()).getMillis()); - - log.debug("time to include thumbnail and indexedtime in the index: " + Long.toString(System.currentTimeMillis() - tMoniker)); + } + } - long tPropertyStatements = System.currentTimeMillis(); + /** + * Adds the info about the classes that the individual is a member + * of, classgroups and checks if prohibited. + * @param classPublicNames + * @returns true if prohibited from search + * @throws SkipIndividualException + */ + protected boolean addClasses(Individual ind, SolrInputDocument doc, StringBuffer classPublicNames) throws SkipIndividualException{ + ArrayList superClassNames = null; - //collecting data property statements - - if(!prohibited){ - //ALLTEXT, all of the 'full text' - StringBuffer allTextValue = new StringBuffer(); - allTextValue.append(""); - allTextValue.append(" "); - allTextValue.append(((t=ind.getName()) == null)?"":t); - allTextValue.append(" "); - allTextValue.append(((t=ind.getAnchor()) == null)?"":t); - allTextValue.append(" "); - allTextValue.append(classPublicNames.toString()); - - List dataPropertyStatements = ind.getDataPropertyStatements(); - if (dataPropertyStatements != null) { - Iterator dataPropertyStmtIter = dataPropertyStatements.iterator(); - while (dataPropertyStmtIter.hasNext()) { - DataPropertyStatement dataPropertyStmt = dataPropertyStmtIter.next(); - allTextValue.append(" "); - allTextValue.append(((t=dataPropertyStmt.getData()) == null)?"":t); + // Types and classgroups + boolean prohibited = false; + List vclasses = ind.getVClasses(false); + superClassNames = new ArrayList(); + for(VClass clz : vclasses){ + String superLclName = clz.getLocalName(); + superClassNames.add(superLclName); + if(clz.getURI() == null){ + continue; + }else if(OWL.Thing.getURI().equals(clz.getURI())){ + //index individuals of type owl:Thing, just don't add owl:Thing as the type field in the index + continue; + } else if(clz.getURI().startsWith(OWL.NS)){ + throw new SkipIndividualException("not indexing " + ind.getURI() + " because of type " + clz.getURI() ); + } else if(contextNodeClassNames.contains(superLclName)) { // check to see if context node is being indexed. + throw new SkipIndividualException("not indexing " + ind.getURI() + " because of context node type " + clz.getURI() ); + } else { + if( !prohibited && classesProhibitedFromSearch.isClassProhibitedFromSearch(clz.getURI())) + prohibited = true; + if( clz.getSearchBoost() != null) + doc.setDocumentBoost(doc.getDocumentBoost() + clz.getSearchBoost()); + + doc.addField(term.RDFTYPE, clz.getURI()); + + if(clz.getLocalName() != null){ + doc.addField(term.CLASSLOCALNAME, clz.getLocalName()); + doc.addField(term.CLASSLOCALNAMELOWERCASE, clz.getLocalName().toLowerCase()); } - } - - allTextValue.append(objectNames.toString()); - - log.debug("time to include data property statements, object property statements in the index: " + Long.toString(System.currentTimeMillis() - tPropertyStatements)); - - String alltext = allTextValue.toString(); - doc.addField(term.ALLTEXT, alltext); - doc.addField(term.ALLTEXTUNSTEMMED, alltext); - doc.addField(term.ALLTEXT_PHONETIC, alltext); - - //run the document modifiers - if( documentModifiers != null && !documentModifiers.isEmpty()){ - for(DocumentModifier modifier: documentModifiers){ - modifier.modifyDocument(ind, doc, addUri); + + if(clz.getName() != null){ + classPublicNames.append(" "); + classPublicNames.append(clz.getName()); } + + //Add the Classgroup URI to a field + if(clz.getGroupURI() != null){ + doc.addField(term.CLASSGROUP_URI,clz.getGroupURI()); + } } - } + } - return doc; + if(superClassNames.isEmpty()){ + throw new SkipIndividualException("Not indexing because individual has no super classes"); + } + + doc.addField(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0"); + return prohibited; } - public Object getIndexId(Object obj) { throw new Error("IndiviudalToSolrDocument.getIndexId() is unimplemented"); } @@ -343,5 +361,4 @@ public class IndividualToSolrDocument { public static float NAME_BOOST = 1.2F; - } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/NameBoost.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/NameBoost.java new file mode 100644 index 000000000..66ebc6ce2 --- /dev/null +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/NameBoost.java @@ -0,0 +1,34 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.search.solr; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.SolrInputField; + +import edu.cornell.mannlib.vitro.webapp.beans.Individual; +import edu.cornell.mannlib.vitro.webapp.search.VitroSearchTermNames; + +public class NameBoost implements DocumentModifier { + + static VitroSearchTermNames term = new VitroSearchTermNames(); + String[] fieldsToBoost = {term.NAME_RAW,term.NAME_LOWERCASE,term.NAME_UNSTEMMED,term.NAME_STEMMED}; + + static final float NAME_BOOST = (float) 1.2; + + @Override + public void modifyDocument(Individual individual, SolrInputDocument doc, + StringBuffer addUri) { + + for( String fieldName : fieldsToBoost){ + SolrInputField field = doc.getField(fieldName); + if( field != null ) + field.setBoost(field.getBoost() * NAME_BOOST); + } + } + + @Override + public void shutdown() { + // do nothing. + } + +} diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SkipIndividualException.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SkipIndividualException.java new file mode 100644 index 000000000..5d2848ce6 --- /dev/null +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SkipIndividualException.java @@ -0,0 +1,11 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.search.solr; + +class SkipIndividualException extends Exception{ + + public SkipIndividualException(String string) { + super(string); + } + +} \ No newline at end of file diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrIndexer.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrIndexer.java index b32a08b05..bf7837413 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrIndexer.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrIndexer.java @@ -173,7 +173,7 @@ public class SolrIndexer implements IndexerIface { individualToSolrDoc.shutdown(); }catch(Exception e){ if( log != null) - log.warn(e,e); + log.debug(e,e); } } @@ -187,11 +187,11 @@ public class SolrIndexer implements IndexerIface { } catch(IOException e){ log.error("Could not commit to solr server", e); } - try { - server.optimize(); - } catch (Exception e) { - log.error("Could not optimize solr server", e); - } +// try { +// server.optimize(); +// } catch (Exception e) { +// log.error("Could not optimize solr server", e); +// } indexing = false; notifyAll(); } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrSetup.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrSetup.java index 03cb4754c..f08b63f32 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrSetup.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrSetup.java @@ -78,9 +78,9 @@ public class SolrSetup implements javax.servlet.ServletContextListener{ OntModel jenaOntModel = ModelContext.getJenaOntModel(context); List modifiers = new ArrayList(); - // modifiers.add(new CalculateParameters(ModelContext.getJenaOntModel(context))); modifiers.add(new CalculateParameters(dataset)); modifiers.add(new ContextNodeFields(jenaOntModel)); + modifiers.add(new NameBoost()); IndividualToSolrDocument indToSolrDoc = new IndividualToSolrDocument( new ProhibitedFromSearch(DisplayVocabulary.PRIMARY_LUCENE_INDEX_URI, displayOntModel),