From 3843886b87bf45705b7332e50c037a7d08505713 Mon Sep 17 00:00:00 2001 From: bdc34 Date: Tue, 8 Feb 2011 00:43:19 +0000 Subject: [PATCH] Changes to Entity2LuceneDoc --- .../search/lucene/Entity2LuceneDoc.java | 255 ++++++++---------- 1 file changed, 105 insertions(+), 150 deletions(-) diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java index a4409c526..059d4c5c7 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java @@ -15,11 +15,14 @@ import org.apache.lucene.document.Field; import org.apache.lucene.index.Term; import org.joda.time.DateTime; +import com.hp.hpl.jena.vocabulary.OWL; + import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement; import edu.cornell.mannlib.vitro.webapp.beans.Individual; import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl; import edu.cornell.mannlib.vitro.webapp.beans.ObjectPropertyStatement; import edu.cornell.mannlib.vitro.webapp.beans.VClass; +import edu.cornell.mannlib.vitro.webapp.dao.VitroVocabulary; import edu.cornell.mannlib.vitro.webapp.search.IndexingException; import edu.cornell.mannlib.vitro.webapp.search.beans.ProhibitedFromSearch; import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface; @@ -55,8 +58,6 @@ public class Entity2LuceneDoc implements Obj2DocIface{ public static String NAMERAW = "nameraw"; /** portal ( 2 ^ portalId ) */ public static String PORTAL = "portal"; - /** Flag 2 (legacy, only used at Cornell) */ - public static String FLAG2 = "FLAG2"; /** time of index in msec since epoc */ public static String INDEXEDTIME= "indexedTime"; /** timekey of entity in yyyymmddhhmm */ @@ -74,6 +75,8 @@ public class Entity2LuceneDoc implements Obj2DocIface{ public static final String KEYWORDS = "KEYWORDS"; /** Does the individual have a thumbnail 
image? 1=yes 0=no */ public static final String THUMBNAIL = "THUMBNAIL"; + /** Is the individual prohibited from full text search results? 1=yes 0=no */ + public static final String PROHIBITED_FROM_TEXT_RESULTS = "PROHIBITED_FROM_TEXT_RESULTS"; } private static final Log log = LogFactory.getLog(Entity2LuceneDoc.class.getName()); @@ -102,24 +105,64 @@ public class Entity2LuceneDoc implements Obj2DocIface{ //DocId String id = ent.getURI(); + log.debug("translating " + id); + if( id == null ){ - log.debug("cannot translate bnodes"); - throw new IndexingException("Not indexing bnodes"); + log.debug("cannot add individuals without URIs to lucene index"); + return null; + }else if( id.startsWith( VitroVocabulary.vitroURI ) + || id.startsWith( VitroVocabulary.VITRO_PUBLIC ) + || id.startsWith( VitroVocabulary.PSEUDO_BNODE_NS) + || id.startsWith( OWL.NS ) ){ + log.debug("not indxing because of namespace:" + id ); + return null; } - List vclasses = ent.getVClasses(false); - for( VClass vclass : vclasses ){ - if( classesProhibitedFromSearch.isClassProhibited(vclass.getURI()) ){ + /* Types and ClassGroup */ + boolean prohibited = false; + List vclasses = ent.getVClasses(false); + for( VClass clz : vclasses){ + if( clz.getURI() == null ){ + continue; + }else if( OWL.Thing.getURI().equals( clz.getURI()) ){ + //index individuals of type owl:Thing, just don't add owl:Thing to the type field in the index + continue; + } else if ( clz.getURI().startsWith( OWL.NS ) ){ + log.debug("not indexing " + id + " because of type " + clz.getURI()); return null; + }else if( clz.getURI().startsWith( VitroVocabulary.vitroURI ) + || clz.getURI().startsWith( VitroVocabulary.VITRO_PUBLIC ) + || clz.getURI().startsWith( VitroVocabulary.PSEUDO_BNODE_NS) ){ + log.debug("not indexing " + id + " because of type " + clz.getURI()); + return null; + }else{ + if( !prohibited && classesProhibitedFromSearch.isClassProhibited(clz.getURI()) ) + prohibited = true; + + if( clz.getSearchBoost() != null ) +
doc.setBoost( doc.getBoost() + clz.getSearchBoost() ); + + doc.add( new Field(term.RDFTYPE, clz.getURI(), + Field.Store.YES, Field.Index.NOT_ANALYZED)); + + if( clz.getName() != null ) + classPublicNames = classPublicNames + " " + clz.getName(); + + //Classgroup URI + if( clz.getGroupURI() != null ) + doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(), + Field.Store.YES, Field.Index.NOT_ANALYZED)); } - } + } + doc.add( new Field(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0", + Field.Store.NO,Field.Index.NOT_ANALYZED_NO_NORMS) ); + /* lucene DOCID */ doc.add( new Field(term.DOCID, entClassName + id, Field.Store.YES, Field.Index.NOT_ANALYZED)); //vitro Id - doc.add( new Field(term.URI, id, Field.Store.YES, Field.Index.NOT_ANALYZED)); - log.debug( id ); + doc.add( new Field(term.URI, id, Field.Store.YES, Field.Index.NOT_ANALYZED)); //java class doc.add( new Field(term.JCLASS, entClassName, Field.Store.YES, Field.Index.NOT_ANALYZED)); @@ -152,24 +195,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{ //boost for entity if( ent.getSearchBoost() != null && ent.getSearchBoost() != 0 ) doc.setBoost(ent.getSearchBoost()); - - //rdf:type and ClassGroup - for( VClass clz : vclasses){ - //document boost for given classes - if( clz.getSearchBoost() != null ) - doc.setBoost( doc.getBoost() + clz.getSearchBoost() ); - doc.add( new Field(term.RDFTYPE, clz.getURI(), - Field.Store.YES, Field.Index.NOT_ANALYZED)); - - if( clz.getName() != null ) - classPublicNames = classPublicNames + " " + clz.getName(); - - //Classgroup URI - if( clz.getGroupURI() != null ) - doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(), - Field.Store.YES, Field.Index.NOT_ANALYZED)); - } - + //Modification time if( ent.getModTime() != null){ value = (new DateTime(ent.getModTime().getTime())) @@ -191,13 +217,12 @@ public class Entity2LuceneDoc implements Obj2DocIface{ }catch (Exception ex){ value = null; } - - if( value != null ){ + if( value != null ) doc.add( new 
Field(term.SUNRISE, value, Field.Store.YES, Field.Index.NOT_ANALYZED)); - }else{ + else doc.add(new Field(term.SUNRISE, earliestTime, Field.Store.YES, Field.Index.NOT_ANALYZED)); - } - + + /* Sunset */ try{ value = null; if( ent.getSunset() != null ){ @@ -207,13 +232,12 @@ public class Entity2LuceneDoc implements Obj2DocIface{ }catch (Exception ex){ value = null; } - - if( value != null ){ + if( value != null ) doc.add( new Field(term.SUNSET, value, Field.Store.YES, Field.Index.NOT_ANALYZED)); - }else{ + else doc.add(new Field(term.SUNSET, latestTime, Field.Store.YES, Field.Index.NOT_ANALYZED)); - } - + + /* timekey */ try{ value = null; if( ent.getTimekey() != null ){ @@ -224,6 +248,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{ log.error("could not save timekey " + ex); } + /* thumbnail */ try{ value = null; if( ent.hasThumb() ) @@ -237,123 +262,53 @@ public class Entity2LuceneDoc implements Obj2DocIface{ //time of index in millis past epoc Object anon[] = { new Long((new DateTime() ).getMillis()) }; doc.add( new Field(term.INDEXEDTIME, String.format( "%019d", anon ), - Field.Store.YES, Field.Index.NOT_ANALYZED)); - - //portal Flags - doPortalFlags(ent, doc); - - //do flag 2 legacy, only used at Cornell - //doFlag2( ent, doc ); + Field.Store.YES, Field.Index.NOT_ANALYZED)); - //ALLTEXT, all of the 'full text' - String t=null; - value =""; - value+= " "+( ((t=ent.getName()) == null)?"":t ); - value+= " "+( ((t=ent.getAnchor()) == null)?"":t); - value+= " "+ ( ((t=ent.getMoniker()) == null)?"":t ); - value+= " "+ ( ((t=ent.getDescription()) == null)?"":t ); - value+= " "+ ( ((t=ent.getBlurb()) == null)?"":t ); - value+= " "+ getKeyterms(ent); - - value+= " " + classPublicNames; - - List dataPropertyStatements = ent.getDataPropertyStatements(); - if (dataPropertyStatements != null) { - Iterator dataPropertyStmtIter = dataPropertyStatements.iterator(); - while (dataPropertyStmtIter.hasNext()) { - DataPropertyStatement dataPropertyStmt = 
dataPropertyStmtIter.next(); - value+= " "+ ( ((t=dataPropertyStmt.getData()) == null)?"":t ); - } - } - - List objectPropertyStatements = ent.getObjectPropertyStatements(); - if (objectPropertyStatements != null) { - Iterator objectPropertyStmtIter = objectPropertyStatements.iterator(); - while (objectPropertyStmtIter.hasNext()) { - ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next(); - if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) ) - continue; - try { - value+= " "+ ( ((t=objectPropertyStmt.getObject().getName()) == null)?"":t ); - } catch (Exception e) { - log.debug("could not index name of related object: " + e.getMessage()); + if( ! prohibited ){ + //ALLTEXT, all of the 'full text' + String t=null; + value =""; + value+= " "+( ((t=ent.getName()) == null)?"":t ); + value+= " "+( ((t=ent.getAnchor()) == null)?"":t); + value+= " "+ ( ((t=ent.getMoniker()) == null)?"":t ); + value+= " "+ ( ((t=ent.getDescription()) == null)?"":t ); + value+= " "+ ( ((t=ent.getBlurb()) == null)?"":t ); + value+= " "+ getKeyterms(ent); + + value+= " " + classPublicNames; + + List dataPropertyStatements = ent.getDataPropertyStatements(); + if (dataPropertyStatements != null) { + Iterator dataPropertyStmtIter = dataPropertyStatements.iterator(); + while (dataPropertyStmtIter.hasNext()) { + DataPropertyStatement dataPropertyStmt = dataPropertyStmtIter.next(); + value+= " "+ ( ((t=dataPropertyStmt.getData()) == null)?"":t ); } } + + List objectPropertyStatements = ent.getObjectPropertyStatements(); + if (objectPropertyStatements != null) { + Iterator objectPropertyStmtIter = objectPropertyStatements.iterator(); + while (objectPropertyStmtIter.hasNext()) { + ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next(); + if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) ) + continue; + try { + value+= " "+ ( ((t=objectPropertyStmt.getObject().getName()) == 
null)?"":t ); + } catch (Exception e) { + log.debug("could not index name of related object: " + e.getMessage()); + } + } + } + //stemmed terms + doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED)); + //unstemmed terms + doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED)); } - - //what else? linkAnchors? externalIds? - - //stemmed terms - doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED)); - //unstemmed terms - doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED)); - + + //flagX and portal flags are no longer indexed. return doc; - } - - /** - * Flag two is a legacy field that is used only by Cornell. - * It is related to the old portal filtering. - */ - private void doFlag2(Individual ent, Document doc) { - String flag2Set = ent.getFlag2Set(); - if( flag2Set != null && ! "".equals(flag2Set)){ - for( String flag2Value : flag2Set.split(",")){ - if( flag2Value != null ){ - String value = flag2Value.replace(",", ""); - if(!value.isEmpty()) - doc.add( new Field(term.FLAG2, value, Field.Store.NO, Field.Index.ANALYZED)); - } - } - } - } - - /** - * Splits up the entity's flag1 value into portal id and then - * adds the id to the doc. - * - * This should work fine with blank portal id and entities with - * the portal set to NULL. - * - * @param ent - * @param doc - */ - @SuppressWarnings("static-access") - private void doPortalFlags(Individual ent, Document doc){ - /* this is the code to add the portal names, we don't use this - * now but since there is no consistant way to store flags you - * might want this in the future. 
- String portalIdsInCommaSeperatedList = ent.getFlag1Set(); - - if(portalIdsInCommaSeperatedList == null) return; - String[] portalNames = portalIdsInCommaSeperatedList.split(","); - for( String name : portalNames){ - doc.add( new Field(term.PORTAL,name,Field.Store.NO,Field.Index.NOT_ANALYZED)); - } - */ - - /* this is the code to store portal ids to the lucene index */ - if( ent.getFlag1Numeric() == 0 ) - return; - Long[] portalIds = FlagMathUtils.numeric2numerics( ent.getFlag1Numeric() ); - if( portalIds == null || portalIds.length == 0) - return; - - log.debug("Flag 1 numeric: " + ent.getFlag1Numeric() + " for " + ent.getURI()); - - long id = -1; - for( Long idLong : portalIds){ - if( idLong != null ){ - id = idLong.longValue(); - String numericPortal = Long.toString(id); - if( numericPortal != null ){ - doc.add( new Field(term.PORTAL,numericPortal, - Field.Store.NO, Field.Index.NOT_ANALYZED)); - log.debug("adding portal " + numericPortal + " to " + ent.getURI()); - } - } - } - } + } @SuppressWarnings("static-access") public boolean canUnTranslate(Object result) {