Changes to Entity2LuceneDoc

This commit is contained in:
bdc34 2011-02-08 00:43:19 +00:00
parent 54fa664a24
commit 3843886b87

View file

@ -15,11 +15,14 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.joda.time.DateTime; import org.joda.time.DateTime;
import com.hp.hpl.jena.vocabulary.OWL;
import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement; import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
import edu.cornell.mannlib.vitro.webapp.beans.Individual; import edu.cornell.mannlib.vitro.webapp.beans.Individual;
import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl; import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl;
import edu.cornell.mannlib.vitro.webapp.beans.ObjectPropertyStatement; import edu.cornell.mannlib.vitro.webapp.beans.ObjectPropertyStatement;
import edu.cornell.mannlib.vitro.webapp.beans.VClass; import edu.cornell.mannlib.vitro.webapp.beans.VClass;
import edu.cornell.mannlib.vitro.webapp.dao.VitroVocabulary;
import edu.cornell.mannlib.vitro.webapp.search.IndexingException; import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
import edu.cornell.mannlib.vitro.webapp.search.beans.ProhibitedFromSearch; import edu.cornell.mannlib.vitro.webapp.search.beans.ProhibitedFromSearch;
import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface; import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
@ -55,8 +58,6 @@ public class Entity2LuceneDoc implements Obj2DocIface{
public static String NAMERAW = "nameraw"; public static String NAMERAW = "nameraw";
/** portal ( 2 ^ portalId ) */ /** portal ( 2 ^ portalId ) */
public static String PORTAL = "portal"; public static String PORTAL = "portal";
/** Flag 2 (legacy, only used at Cornell) */
public static String FLAG2 = "FLAG2";
/** time of index in msec since epoch */ /** time of index in msec since epoch */
public static String INDEXEDTIME= "indexedTime"; public static String INDEXEDTIME= "indexedTime";
/** timekey of entity in yyyymmddhhmm */ /** timekey of entity in yyyymmddhhmm */
@ -74,6 +75,8 @@ public class Entity2LuceneDoc implements Obj2DocIface{
public static final String KEYWORDS = "KEYWORDS"; public static final String KEYWORDS = "KEYWORDS";
/** Does the individual have a thumbnail image? 1=yes 0=no */ /** Does the individual have a thumbnail image? 1=yes 0=no */
public static final String THUMBNAIL = "THUMBNAIL"; public static final String THUMBNAIL = "THUMBNAIL";
/** Should individual be included in full text search results? 1=yes 0=no */
public static final String PROHIBITED_FROM_TEXT_RESULTS = "PROHIBITED_FROM_TEXT_RESULTS";
} }
private static final Log log = LogFactory.getLog(Entity2LuceneDoc.class.getName()); private static final Log log = LogFactory.getLog(Entity2LuceneDoc.class.getName());
@ -102,24 +105,64 @@ public class Entity2LuceneDoc implements Obj2DocIface{
//DocId //DocId
String id = ent.getURI(); String id = ent.getURI();
log.debug("translating " + id);
if( id == null ){ if( id == null ){
log.debug("cannot translate bnodes"); log.debug("cannot add individuals without URIs to lucene index");
throw new IndexingException("Not indexing bnodes"); return null;
}else if( id.startsWith( VitroVocabulary.vitroURI )
|| id.startsWith( VitroVocabulary.VITRO_PUBLIC )
|| id.startsWith( VitroVocabulary.PSEUDO_BNODE_NS)
|| id.startsWith( OWL.NS ) ){
log.debug("not indxing because of namespace:" + id );
return null;
} }
List<VClass> vclasses = ent.getVClasses(false); /* Types and ClassGroup */
for( VClass vclass : vclasses ){ boolean prohibited = false;
if( classesProhibitedFromSearch.isClassProhibited(vclass.getURI()) ){ List<VClass> vclasses = ent.getVClasses(false);
for( VClass clz : vclasses){
if( clz.getURI() == null ){
continue;
}else if( OWL.Thing.getURI().equals( clz.getURI()) ){
//index individuals of type owl:Thing, just don't add owl:Thing to the type field in the index
continue;
} else if ( clz.getURI().startsWith( OWL.NS ) ){
log.debug("not indexing " + id + " because of type " + clz.getURI());
return null; return null;
}else if( clz.getURI().startsWith( VitroVocabulary.vitroURI )
|| clz.getURI().startsWith( VitroVocabulary.VITRO_PUBLIC )
|| clz.getURI().startsWith( VitroVocabulary.PSEUDO_BNODE_NS) ){
log.debug("not indexing " + id + " because of type " + clz.getURI());
return null;
}else{
if( !prohibited && classesProhibitedFromSearch.isClassProhibited(clz.getURI()) )
prohibited = true;
if( clz.getSearchBoost() != null )
doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
doc.add( new Field(term.RDFTYPE, clz.getURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
if( clz.getName() != null )
classPublicNames = classPublicNames + " " + clz.getName();
//Classgroup URI
if( clz.getGroupURI() != null )
doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
} }
} }
doc.add( new Field(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0",
Field.Store.NO,Field.Index.NOT_ANALYZED_NO_NORMS) );
/* lucene DOCID */
doc.add( new Field(term.DOCID, entClassName + id, doc.add( new Field(term.DOCID, entClassName + id,
Field.Store.YES, Field.Index.NOT_ANALYZED)); Field.Store.YES, Field.Index.NOT_ANALYZED));
//vitro Id //vitro Id
doc.add( new Field(term.URI, id, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add( new Field(term.URI, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
log.debug( id );
//java class //java class
doc.add( new Field(term.JCLASS, entClassName, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add( new Field(term.JCLASS, entClassName, Field.Store.YES, Field.Index.NOT_ANALYZED));
@ -152,24 +195,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
//boost for entity //boost for entity
if( ent.getSearchBoost() != null && ent.getSearchBoost() != 0 ) if( ent.getSearchBoost() != null && ent.getSearchBoost() != 0 )
doc.setBoost(ent.getSearchBoost()); doc.setBoost(ent.getSearchBoost());
//rdf:type and ClassGroup
for( VClass clz : vclasses){
//document boost for given classes
if( clz.getSearchBoost() != null )
doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
doc.add( new Field(term.RDFTYPE, clz.getURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
if( clz.getName() != null )
classPublicNames = classPublicNames + " " + clz.getName();
//Classgroup URI
if( clz.getGroupURI() != null )
doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
}
//Modification time //Modification time
if( ent.getModTime() != null){ if( ent.getModTime() != null){
value = (new DateTime(ent.getModTime().getTime())) value = (new DateTime(ent.getModTime().getTime()))
@ -191,13 +217,12 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}catch (Exception ex){ }catch (Exception ex){
value = null; value = null;
} }
if( value != null )
if( value != null ){
doc.add( new Field(term.SUNRISE, value, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add( new Field(term.SUNRISE, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
}else{ else
doc.add(new Field(term.SUNRISE, earliestTime, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(term.SUNRISE, earliestTime, Field.Store.YES, Field.Index.NOT_ANALYZED));
}
/* Sunset */
try{ try{
value = null; value = null;
if( ent.getSunset() != null ){ if( ent.getSunset() != null ){
@ -207,13 +232,12 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}catch (Exception ex){ }catch (Exception ex){
value = null; value = null;
} }
if( value != null )
if( value != null ){
doc.add( new Field(term.SUNSET, value, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add( new Field(term.SUNSET, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
}else{ else
doc.add(new Field(term.SUNSET, latestTime, Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field(term.SUNSET, latestTime, Field.Store.YES, Field.Index.NOT_ANALYZED));
}
/* timekey */
try{ try{
value = null; value = null;
if( ent.getTimekey() != null ){ if( ent.getTimekey() != null ){
@ -224,6 +248,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
log.error("could not save timekey " + ex); log.error("could not save timekey " + ex);
} }
/* thumbnail */
try{ try{
value = null; value = null;
if( ent.hasThumb() ) if( ent.hasThumb() )
@ -237,123 +262,53 @@ public class Entity2LuceneDoc implements Obj2DocIface{
//time of index in millis past epoch //time of index in millis past epoch
Object anon[] = { new Long((new DateTime() ).getMillis()) }; Object anon[] = { new Long((new DateTime() ).getMillis()) };
doc.add( new Field(term.INDEXEDTIME, String.format( "%019d", anon ), doc.add( new Field(term.INDEXEDTIME, String.format( "%019d", anon ),
Field.Store.YES, Field.Index.NOT_ANALYZED)); Field.Store.YES, Field.Index.NOT_ANALYZED));
//portal Flags
doPortalFlags(ent, doc);
//do flag 2 legacy, only used at Cornell
//doFlag2( ent, doc );
//ALLTEXT, all of the 'full text' if( ! prohibited ){
String t=null; //ALLTEXT, all of the 'full text'
value =""; String t=null;
value+= " "+( ((t=ent.getName()) == null)?"":t ); value ="";
value+= " "+( ((t=ent.getAnchor()) == null)?"":t); value+= " "+( ((t=ent.getName()) == null)?"":t );
value+= " "+ ( ((t=ent.getMoniker()) == null)?"":t ); value+= " "+( ((t=ent.getAnchor()) == null)?"":t);
value+= " "+ ( ((t=ent.getDescription()) == null)?"":t ); value+= " "+ ( ((t=ent.getMoniker()) == null)?"":t );
value+= " "+ ( ((t=ent.getBlurb()) == null)?"":t ); value+= " "+ ( ((t=ent.getDescription()) == null)?"":t );
value+= " "+ getKeyterms(ent); value+= " "+ ( ((t=ent.getBlurb()) == null)?"":t );
value+= " "+ getKeyterms(ent);
value+= " " + classPublicNames;
value+= " " + classPublicNames;
List<DataPropertyStatement> dataPropertyStatements = ent.getDataPropertyStatements();
if (dataPropertyStatements != null) { List<DataPropertyStatement> dataPropertyStatements = ent.getDataPropertyStatements();
Iterator<DataPropertyStatement> dataPropertyStmtIter = dataPropertyStatements.iterator(); if (dataPropertyStatements != null) {
while (dataPropertyStmtIter.hasNext()) { Iterator<DataPropertyStatement> dataPropertyStmtIter = dataPropertyStatements.iterator();
DataPropertyStatement dataPropertyStmt = dataPropertyStmtIter.next(); while (dataPropertyStmtIter.hasNext()) {
value+= " "+ ( ((t=dataPropertyStmt.getData()) == null)?"":t ); DataPropertyStatement dataPropertyStmt = dataPropertyStmtIter.next();
} value+= " "+ ( ((t=dataPropertyStmt.getData()) == null)?"":t );
}
List<ObjectPropertyStatement> objectPropertyStatements = ent.getObjectPropertyStatements();
if (objectPropertyStatements != null) {
Iterator<ObjectPropertyStatement> objectPropertyStmtIter = objectPropertyStatements.iterator();
while (objectPropertyStmtIter.hasNext()) {
ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next();
if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) )
continue;
try {
value+= " "+ ( ((t=objectPropertyStmt.getObject().getName()) == null)?"":t );
} catch (Exception e) {
log.debug("could not index name of related object: " + e.getMessage());
} }
} }
List<ObjectPropertyStatement> objectPropertyStatements = ent.getObjectPropertyStatements();
if (objectPropertyStatements != null) {
Iterator<ObjectPropertyStatement> objectPropertyStmtIter = objectPropertyStatements.iterator();
while (objectPropertyStmtIter.hasNext()) {
ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next();
if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) )
continue;
try {
value+= " "+ ( ((t=objectPropertyStmt.getObject().getName()) == null)?"":t );
} catch (Exception e) {
log.debug("could not index name of related object: " + e.getMessage());
}
}
}
//stemmed terms
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED));
//unstemmed terms
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED));
} }
//what else? linkAnchors? externalIds? //flagX and portal flags are no longer indexed.
//stemmed terms
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED));
//unstemmed terms
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED));
return doc; return doc;
} }
/**
 * Indexes the individual's flag 2 values.
 *
 * Flag 2 is a legacy field that is used only by Cornell; it is related
 * to the old portal filtering. The value returned by
 * {@code ent.getFlag2Set()} is treated as a comma-separated list and each
 * non-empty entry is added to the document as its own FLAG2 field
 * (not stored, analyzed).
 *
 * Nothing is added when the flag 2 set is null or empty.
 *
 * @param ent individual whose flag 2 set is read
 * @param doc lucene document that receives the FLAG2 fields
 */
private void doFlag2(Individual ent, Document doc) {
    final String flag2Set = ent.getFlag2Set();
    if( flag2Set == null || flag2Set.isEmpty() ){
        return;
    }

    // String.split never produces null tokens, and a token produced by
    // splitting on "," cannot itself contain a comma, so only empty
    // tokens need to be filtered out here.
    for( String token : flag2Set.split(",") ){
        if( token.isEmpty() ){
            continue;
        }
        doc.add( new Field(term.FLAG2, token, Field.Store.NO, Field.Index.ANALYZED));
    }
}
/**
 * Adds the individual's portal ids to the document.
 *
 * The numeric flag 1 value of the individual is handed to
 * {@code FlagMathUtils.numeric2numerics} and every non-null id that comes
 * back is added to the document as its own PORTAL field (not stored,
 * not analyzed).
 *
 * Nothing is added when the numeric flag 1 value is 0, or when no ids
 * are returned for it.
 *
 * @param ent individual whose numeric flag 1 value is read
 * @param doc lucene document that receives the PORTAL fields
 */
@SuppressWarnings("static-access")
private void doPortalFlags(Individual ent, Document doc){
    /* This is the code to add the portal names. We don't use this
     * now, but since there is no consistent way to store flags you
     * might want this in the future.
    String portalIdsInCommaSeperatedList = ent.getFlag1Set();
    if(portalIdsInCommaSeperatedList == null) return;
    String[] portalNames = portalIdsInCommaSeperatedList.split(",");
    for( String name : portalNames){
        doc.add( new Field(term.PORTAL,name,Field.Store.NO,Field.Index.NOT_ANALYZED));
    }
     */

    /* this is the code to store portal ids to the lucene index */
    if( ent.getFlag1Numeric() == 0 ){
        return;
    }

    Long[] portalIds = FlagMathUtils.numeric2numerics( ent.getFlag1Numeric() );
    boolean nothingToIndex = (portalIds == null) || (portalIds.length == 0);
    if( nothingToIndex ){
        return;
    }

    log.debug("Flag 1 numeric: " + ent.getFlag1Numeric() + " for " + ent.getURI());

    for( Long portalId : portalIds ){
        if( portalId == null ){
            continue;
        }
        // Long.toString() never returns null, so the text can be indexed directly
        String portalIdText = portalId.toString();
        doc.add( new Field(term.PORTAL, portalIdText,
                Field.Store.NO, Field.Index.NOT_ANALYZED));
        log.debug("adding portal " + portalIdText + " to " + ent.getURI());
    }
}
@SuppressWarnings("static-access") @SuppressWarnings("static-access")
public boolean canUnTranslate(Object result) { public boolean canUnTranslate(Object result) {