Changes to Entity2LuceneDoc

This commit is contained in:
bdc34 2011-02-08 00:43:19 +00:00
parent 54fa664a24
commit 3843886b87

View file

@ -15,11 +15,14 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.index.Term;
import org.joda.time.DateTime;
import com.hp.hpl.jena.vocabulary.OWL;
import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl;
import edu.cornell.mannlib.vitro.webapp.beans.ObjectPropertyStatement;
import edu.cornell.mannlib.vitro.webapp.beans.VClass;
import edu.cornell.mannlib.vitro.webapp.dao.VitroVocabulary;
import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
import edu.cornell.mannlib.vitro.webapp.search.beans.ProhibitedFromSearch;
import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
@ -55,8 +58,6 @@ public class Entity2LuceneDoc implements Obj2DocIface{
public static String NAMERAW = "nameraw";
/** portal ( 2 ^ portalId ) */
public static String PORTAL = "portal";
/** Flag 2 (legacy, only used at Cornell) */
public static String FLAG2 = "FLAG2";
/** time of index in msec since epoch */
public static String INDEXEDTIME= "indexedTime";
/** timekey of entity in yyyymmddhhmm */
@ -74,6 +75,8 @@ public class Entity2LuceneDoc implements Obj2DocIface{
public static final String KEYWORDS = "KEYWORDS";
/** Does the individual have a thumbnail image? 1=yes 0=no */
public static final String THUMBNAIL = "THUMBNAIL";
/** Is the individual prohibited from full text search results? 1=yes (prohibited) 0=no */
public static final String PROHIBITED_FROM_TEXT_RESULTS = "PROHIBITED_FROM_TEXT_RESULTS";
}
private static final Log log = LogFactory.getLog(Entity2LuceneDoc.class.getName());
@ -102,24 +105,64 @@ public class Entity2LuceneDoc implements Obj2DocIface{
//DocId
String id = ent.getURI();
if( id == null ){
log.debug("cannot translate bnodes");
throw new IndexingException("Not indexing bnodes");
}
log.debug("translating " + id);
List<VClass> vclasses = ent.getVClasses(false);
for( VClass vclass : vclasses ){
if( classesProhibitedFromSearch.isClassProhibited(vclass.getURI()) ){
if( id == null ){
log.debug("cannot add individuals without URIs to lucene index");
return null;
}else if( id.startsWith( VitroVocabulary.vitroURI )
|| id.startsWith( VitroVocabulary.VITRO_PUBLIC )
|| id.startsWith( VitroVocabulary.PSEUDO_BNODE_NS)
|| id.startsWith( OWL.NS ) ){
log.debug("not indxing because of namespace:" + id );
return null;
}
}
/* Types and ClassGroup */
boolean prohibited = false;
List<VClass> vclasses = ent.getVClasses(false);
for( VClass clz : vclasses){
if( clz.getURI() == null ){
continue;
}else if( OWL.Thing.getURI().equals( clz.getURI()) ){
//index individuals of type owl:Thing, just don't add owl:Thing to the type field in the index
continue;
} else if ( clz.getURI().startsWith( OWL.NS ) ){
log.debug("not indexing " + id + " because of type " + clz.getURI());
return null;
}else if( clz.getURI().startsWith( VitroVocabulary.vitroURI )
|| clz.getURI().startsWith( VitroVocabulary.VITRO_PUBLIC )
|| clz.getURI().startsWith( VitroVocabulary.PSEUDO_BNODE_NS) ){
log.debug("not indexing " + id + " because of type " + clz.getURI());
return null;
}else{
if( !prohibited && classesProhibitedFromSearch.isClassProhibited(clz.getURI()) )
prohibited = true;
if( clz.getSearchBoost() != null )
doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
doc.add( new Field(term.RDFTYPE, clz.getURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
if( clz.getName() != null )
classPublicNames = classPublicNames + " " + clz.getName();
//Classgroup URI
if( clz.getGroupURI() != null )
doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
}
}
doc.add( new Field(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0",
Field.Store.NO,Field.Index.NOT_ANALYZED_NO_NORMS) );
/* lucene DOCID */
doc.add( new Field(term.DOCID, entClassName + id,
Field.Store.YES, Field.Index.NOT_ANALYZED));
//vitro Id
doc.add( new Field(term.URI, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
log.debug( id );
//java class
doc.add( new Field(term.JCLASS, entClassName, Field.Store.YES, Field.Index.NOT_ANALYZED));
@ -153,23 +196,6 @@ public class Entity2LuceneDoc implements Obj2DocIface{
if( ent.getSearchBoost() != null && ent.getSearchBoost() != 0 )
doc.setBoost(ent.getSearchBoost());
//rdf:type and ClassGroup
for( VClass clz : vclasses){
//document boost for given classes
if( clz.getSearchBoost() != null )
doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
doc.add( new Field(term.RDFTYPE, clz.getURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
if( clz.getName() != null )
classPublicNames = classPublicNames + " " + clz.getName();
//Classgroup URI
if( clz.getGroupURI() != null )
doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
}
//Modification time
if( ent.getModTime() != null){
value = (new DateTime(ent.getModTime().getTime()))
@ -191,13 +217,12 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}catch (Exception ex){
value = null;
}
if( value != null ){
if( value != null )
doc.add( new Field(term.SUNRISE, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
}else{
else
doc.add(new Field(term.SUNRISE, earliestTime, Field.Store.YES, Field.Index.NOT_ANALYZED));
}
/* Sunset */
try{
value = null;
if( ent.getSunset() != null ){
@ -207,13 +232,12 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}catch (Exception ex){
value = null;
}
if( value != null ){
if( value != null )
doc.add( new Field(term.SUNSET, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
}else{
else
doc.add(new Field(term.SUNSET, latestTime, Field.Store.YES, Field.Index.NOT_ANALYZED));
}
/* timekey */
try{
value = null;
if( ent.getTimekey() != null ){
@ -224,6 +248,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
log.error("could not save timekey " + ex);
}
/* thumbnail */
try{
value = null;
if( ent.hasThumb() )
@ -239,12 +264,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
doc.add( new Field(term.INDEXEDTIME, String.format( "%019d", anon ),
Field.Store.YES, Field.Index.NOT_ANALYZED));
//portal Flags
doPortalFlags(ent, doc);
//do flag 2 legacy, only used at Cornell
//doFlag2( ent, doc );
if( ! prohibited ){
//ALLTEXT, all of the 'full text'
String t=null;
value ="";
@ -280,81 +300,16 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}
}
}
//what else? linkAnchors? externalIds?
//stemmed terms
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED));
//unstemmed terms
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED));
}
//flagX and portal flags are no longer indexed.
return doc;
}
/**
 * Indexes the legacy "flag 2" values of an individual. Flag two is only
 * used by Cornell and is related to the old portal filtering.
 *
 * The comma-separated string returned by ent.getFlag2Set() is split on
 * commas, and every non-empty piece is added to the document as a
 * term.FLAG2 field (not stored, analyzed). A null or empty flag2 string
 * adds nothing.
 *
 * @param ent individual whose flag2 set is read
 * @param doc lucene document that receives the FLAG2 fields
 */
private void doFlag2(Individual ent, Document doc) {
    final String flag2Csv = ent.getFlag2Set();
    if( flag2Csv == null || flag2Csv.isEmpty() ){
        // nothing to index
        return;
    }

    // Pieces produced by split(",") are never null and never contain a
    // comma, so only empty pieces need to be filtered out.
    final String[] pieces = flag2Csv.split(",");
    for( int i = 0; i < pieces.length; i++ ){
        final String piece = pieces[i];
        if( piece.isEmpty() ){
            continue;
        }
        doc.add( new Field(term.FLAG2, piece, Field.Store.NO, Field.Index.ANALYZED));
    }
}
/**
 * Adds one term.PORTAL field to the document for every portal id decoded
 * from the individual's numeric flag1 value.
 *
 * Nothing is added when ent.getFlag1Numeric() is 0, or when
 * FlagMathUtils.numeric2numerics() returns null or an empty array; null
 * entries inside the returned array are skipped. The original author's
 * intent is that blank or NULL portal values are tolerated.
 *
 * @param ent individual whose flag1 value is read
 * @param doc lucene document that receives the PORTAL fields
 */
@SuppressWarnings("static-access")
private void doPortalFlags(Individual ent, Document doc){
    /* Alternative kept for reference: index the portal names instead of
     * the numeric ids. It is not used now, but since there is no
     * consistent way to store flags it might be wanted in the future.

     String portalIdsInCommaSeperatedList = ent.getFlag1Set();
     if(portalIdsInCommaSeperatedList == null) return;
     String[] portalNames = portalIdsInCommaSeperatedList.split(",");
     for( String name : portalNames){
         doc.add( new Field(term.PORTAL,name,Field.Store.NO,Field.Index.NOT_ANALYZED));
     }
     */

    /* store the numeric portal ids in the lucene index */
    if( ent.getFlag1Numeric() == 0 ){
        return;
    }

    Long[] decodedIds = FlagMathUtils.numeric2numerics( ent.getFlag1Numeric() );
    boolean nothingDecoded = ( decodedIds == null ) || ( decodedIds.length == 0 );
    if( nothingDecoded ){
        return;
    }

    log.debug("Flag 1 numeric: " + ent.getFlag1Numeric() + " for " + ent.getURI());

    for( int i = 0; i < decodedIds.length; i++ ){
        Long boxedId = decodedIds[i];
        if( boxedId == null ){
            continue;
        }
        // Long.toString never returns null, so the text can be indexed directly
        String portalText = Long.toString( boxedId.longValue() );
        doc.add( new Field(term.PORTAL,portalText,
                Field.Store.NO, Field.Index.NOT_ANALYZED));
        log.debug("adding portal " + portalText + " to " + ent.getURI());
    }
}
@SuppressWarnings("static-access")
public boolean canUnTranslate(Object result) {
if( result != null && result instanceof Document){