Changes to Entity2LuceneDoc
This commit is contained in:
parent
54fa664a24
commit
3843886b87
1 changed file with 105 additions and 150 deletions
|
@ -15,11 +15,14 @@ import org.apache.lucene.document.Field;
|
|||
import org.apache.lucene.index.Term;
|
||||
import org.joda.time.DateTime;
|
||||
|
||||
import com.hp.hpl.jena.vocabulary.OWL;
|
||||
|
||||
import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
|
||||
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
|
||||
import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl;
|
||||
import edu.cornell.mannlib.vitro.webapp.beans.ObjectPropertyStatement;
|
||||
import edu.cornell.mannlib.vitro.webapp.beans.VClass;
|
||||
import edu.cornell.mannlib.vitro.webapp.dao.VitroVocabulary;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.beans.ProhibitedFromSearch;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
|
||||
|
@ -55,8 +58,6 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
public static String NAMERAW = "nameraw";
|
||||
/** portal ( 2 ^ portalId ) */
|
||||
public static String PORTAL = "portal";
|
||||
/** Flag 2 (legacy, only used at Cornell) */
|
||||
public static String FLAG2 = "FLAG2";
|
||||
/** time of index in msec since epoch */
|
||||
public static String INDEXEDTIME= "indexedTime";
|
||||
/** timekey of entity in yyyymmddhhmm */
|
||||
|
@ -74,6 +75,8 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
public static final String KEYWORDS = "KEYWORDS";
|
||||
/** Does the individual have a thumbnail image? 1=yes 0=no */
|
||||
public static final String THUMBNAIL = "THUMBNAIL";
|
||||
/** Should individual be included in full text search results? 1=yes 0=no */
|
||||
public static final String PROHIBITED_FROM_TEXT_RESULTS = "PROHIBITED_FROM_TEXT_RESULTS";
|
||||
}
|
||||
|
||||
private static final Log log = LogFactory.getLog(Entity2LuceneDoc.class.getName());
|
||||
|
@ -102,24 +105,64 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
|
||||
//DocId
|
||||
String id = ent.getURI();
|
||||
log.debug("translating " + id);
|
||||
|
||||
if( id == null ){
|
||||
log.debug("cannot translate bnodes");
|
||||
throw new IndexingException("Not indexing bnodes");
|
||||
log.debug("cannot add individuals without URIs to lucene index");
|
||||
return null;
|
||||
}else if( id.startsWith( VitroVocabulary.vitroURI )
|
||||
|| id.startsWith( VitroVocabulary.VITRO_PUBLIC )
|
||||
|| id.startsWith( VitroVocabulary.PSEUDO_BNODE_NS)
|
||||
|| id.startsWith( OWL.NS ) ){
|
||||
log.debug("not indxing because of namespace:" + id );
|
||||
return null;
|
||||
}
|
||||
|
||||
List<VClass> vclasses = ent.getVClasses(false);
|
||||
for( VClass vclass : vclasses ){
|
||||
if( classesProhibitedFromSearch.isClassProhibited(vclass.getURI()) ){
|
||||
/* Types and ClassGroup */
|
||||
boolean prohibited = false;
|
||||
List<VClass> vclasses = ent.getVClasses(false);
|
||||
for( VClass clz : vclasses){
|
||||
if( clz.getURI() == null ){
|
||||
continue;
|
||||
}else if( OWL.Thing.getURI().equals( clz.getURI()) ){
|
||||
//index individuals of type owl:Thing, just don't add owl:Thing to the type field in the index
|
||||
continue;
|
||||
} else if ( clz.getURI().startsWith( OWL.NS ) ){
|
||||
log.debug("not indexing " + id + " because of type " + clz.getURI());
|
||||
return null;
|
||||
}else if( clz.getURI().startsWith( VitroVocabulary.vitroURI )
|
||||
|| clz.getURI().startsWith( VitroVocabulary.VITRO_PUBLIC )
|
||||
|| clz.getURI().startsWith( VitroVocabulary.PSEUDO_BNODE_NS) ){
|
||||
log.debug("not indexing " + id + " because of type " + clz.getURI());
|
||||
return null;
|
||||
}else{
|
||||
if( !prohibited && classesProhibitedFromSearch.isClassProhibited(clz.getURI()) )
|
||||
prohibited = true;
|
||||
|
||||
if( clz.getSearchBoost() != null )
|
||||
doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
|
||||
|
||||
doc.add( new Field(term.RDFTYPE, clz.getURI(),
|
||||
Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
|
||||
if( clz.getName() != null )
|
||||
classPublicNames = classPublicNames + " " + clz.getName();
|
||||
|
||||
//Classgroup URI
|
||||
if( clz.getGroupURI() != null )
|
||||
doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
|
||||
Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
}
|
||||
}
|
||||
}
|
||||
doc.add( new Field(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0",
|
||||
Field.Store.NO,Field.Index.NOT_ANALYZED_NO_NORMS) );
|
||||
|
||||
/* lucene DOCID */
|
||||
doc.add( new Field(term.DOCID, entClassName + id,
|
||||
Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
|
||||
//vitro Id
|
||||
doc.add( new Field(term.URI, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
log.debug( id );
|
||||
doc.add( new Field(term.URI, id, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
|
||||
//java class
|
||||
doc.add( new Field(term.JCLASS, entClassName, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
|
@ -152,24 +195,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
//boost for entity
|
||||
if( ent.getSearchBoost() != null && ent.getSearchBoost() != 0 )
|
||||
doc.setBoost(ent.getSearchBoost());
|
||||
|
||||
//rdf:type and ClassGroup
|
||||
for( VClass clz : vclasses){
|
||||
//document boost for given classes
|
||||
if( clz.getSearchBoost() != null )
|
||||
doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
|
||||
doc.add( new Field(term.RDFTYPE, clz.getURI(),
|
||||
Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
|
||||
if( clz.getName() != null )
|
||||
classPublicNames = classPublicNames + " " + clz.getName();
|
||||
|
||||
//Classgroup URI
|
||||
if( clz.getGroupURI() != null )
|
||||
doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
|
||||
Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
}
|
||||
|
||||
|
||||
//Modification time
|
||||
if( ent.getModTime() != null){
|
||||
value = (new DateTime(ent.getModTime().getTime()))
|
||||
|
@ -191,13 +217,12 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
}catch (Exception ex){
|
||||
value = null;
|
||||
}
|
||||
|
||||
if( value != null ){
|
||||
if( value != null )
|
||||
doc.add( new Field(term.SUNRISE, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
}else{
|
||||
else
|
||||
doc.add(new Field(term.SUNRISE, earliestTime, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
}
|
||||
|
||||
|
||||
/* Sunset */
|
||||
try{
|
||||
value = null;
|
||||
if( ent.getSunset() != null ){
|
||||
|
@ -207,13 +232,12 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
}catch (Exception ex){
|
||||
value = null;
|
||||
}
|
||||
|
||||
if( value != null ){
|
||||
if( value != null )
|
||||
doc.add( new Field(term.SUNSET, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
}else{
|
||||
else
|
||||
doc.add(new Field(term.SUNSET, latestTime, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
}
|
||||
|
||||
|
||||
/* timekey */
|
||||
try{
|
||||
value = null;
|
||||
if( ent.getTimekey() != null ){
|
||||
|
@ -224,6 +248,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
log.error("could not save timekey " + ex);
|
||||
}
|
||||
|
||||
/* thumbnail */
|
||||
try{
|
||||
value = null;
|
||||
if( ent.hasThumb() )
|
||||
|
@ -237,123 +262,53 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
//time of index in millis past epoch
|
||||
Object anon[] = { new Long((new DateTime() ).getMillis()) };
|
||||
doc.add( new Field(term.INDEXEDTIME, String.format( "%019d", anon ),
|
||||
Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
|
||||
//portal Flags
|
||||
doPortalFlags(ent, doc);
|
||||
|
||||
//do flag 2 legacy, only used at Cornell
|
||||
//doFlag2( ent, doc );
|
||||
Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
|
||||
//ALLTEXT, all of the 'full text'
|
||||
String t=null;
|
||||
value ="";
|
||||
value+= " "+( ((t=ent.getName()) == null)?"":t );
|
||||
value+= " "+( ((t=ent.getAnchor()) == null)?"":t);
|
||||
value+= " "+ ( ((t=ent.getMoniker()) == null)?"":t );
|
||||
value+= " "+ ( ((t=ent.getDescription()) == null)?"":t );
|
||||
value+= " "+ ( ((t=ent.getBlurb()) == null)?"":t );
|
||||
value+= " "+ getKeyterms(ent);
|
||||
|
||||
value+= " " + classPublicNames;
|
||||
|
||||
List<DataPropertyStatement> dataPropertyStatements = ent.getDataPropertyStatements();
|
||||
if (dataPropertyStatements != null) {
|
||||
Iterator<DataPropertyStatement> dataPropertyStmtIter = dataPropertyStatements.iterator();
|
||||
while (dataPropertyStmtIter.hasNext()) {
|
||||
DataPropertyStatement dataPropertyStmt = dataPropertyStmtIter.next();
|
||||
value+= " "+ ( ((t=dataPropertyStmt.getData()) == null)?"":t );
|
||||
}
|
||||
}
|
||||
|
||||
List<ObjectPropertyStatement> objectPropertyStatements = ent.getObjectPropertyStatements();
|
||||
if (objectPropertyStatements != null) {
|
||||
Iterator<ObjectPropertyStatement> objectPropertyStmtIter = objectPropertyStatements.iterator();
|
||||
while (objectPropertyStmtIter.hasNext()) {
|
||||
ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next();
|
||||
if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) )
|
||||
continue;
|
||||
try {
|
||||
value+= " "+ ( ((t=objectPropertyStmt.getObject().getName()) == null)?"":t );
|
||||
} catch (Exception e) {
|
||||
log.debug("could not index name of related object: " + e.getMessage());
|
||||
if( ! prohibited ){
|
||||
//ALLTEXT, all of the 'full text'
|
||||
String t=null;
|
||||
value ="";
|
||||
value+= " "+( ((t=ent.getName()) == null)?"":t );
|
||||
value+= " "+( ((t=ent.getAnchor()) == null)?"":t);
|
||||
value+= " "+ ( ((t=ent.getMoniker()) == null)?"":t );
|
||||
value+= " "+ ( ((t=ent.getDescription()) == null)?"":t );
|
||||
value+= " "+ ( ((t=ent.getBlurb()) == null)?"":t );
|
||||
value+= " "+ getKeyterms(ent);
|
||||
|
||||
value+= " " + classPublicNames;
|
||||
|
||||
List<DataPropertyStatement> dataPropertyStatements = ent.getDataPropertyStatements();
|
||||
if (dataPropertyStatements != null) {
|
||||
Iterator<DataPropertyStatement> dataPropertyStmtIter = dataPropertyStatements.iterator();
|
||||
while (dataPropertyStmtIter.hasNext()) {
|
||||
DataPropertyStatement dataPropertyStmt = dataPropertyStmtIter.next();
|
||||
value+= " "+ ( ((t=dataPropertyStmt.getData()) == null)?"":t );
|
||||
}
|
||||
}
|
||||
|
||||
List<ObjectPropertyStatement> objectPropertyStatements = ent.getObjectPropertyStatements();
|
||||
if (objectPropertyStatements != null) {
|
||||
Iterator<ObjectPropertyStatement> objectPropertyStmtIter = objectPropertyStatements.iterator();
|
||||
while (objectPropertyStmtIter.hasNext()) {
|
||||
ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next();
|
||||
if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) )
|
||||
continue;
|
||||
try {
|
||||
value+= " "+ ( ((t=objectPropertyStmt.getObject().getName()) == null)?"":t );
|
||||
} catch (Exception e) {
|
||||
log.debug("could not index name of related object: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
//stemmed terms
|
||||
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED));
|
||||
//unstemmed terms
|
||||
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED));
|
||||
}
|
||||
|
||||
//what else? linkAnchors? externalIds?
|
||||
|
||||
//stemmed terms
|
||||
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED));
|
||||
//unstemmed terms
|
||||
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED));
|
||||
|
||||
|
||||
//flagX and portal flags are no longer indexed.
|
||||
return doc;
|
||||
}
|
||||
|
||||
/**
|
||||
* Flag two is a legacy field that is used only by Cornell.
|
||||
* It is related to the old portal filtering.
*
* Reads the comma-separated string returned by {@code ent.getFlag2Set()} and
* adds one {@code term.FLAG2} field to {@code doc} for every non-empty piece.
* Nothing is added when the flag set is null or the empty string.
*
* @param ent individual whose flag 2 set is read
* @param doc Lucene document that receives the FLAG2 fields
|
||||
*/
|
||||
private void doFlag2(Individual ent, Document doc) {
|
||||
String flag2Set = ent.getFlag2Set();
|
||||
// skip individuals that carry no flag 2 values at all
if( flag2Set != null && ! "".equals(flag2Set)){
|
||||
for( String flag2Value : flag2Set.split(",")){
|
||||
if( flag2Value != null ){
|
||||
// split(",") has already consumed the commas, so this replace is
// effectively a no-op kept as a safeguard
String value = flag2Value.replace(",", "");
|
||||
// empty pieces (e.g. from consecutive commas) are not indexed
if(!value.isEmpty())
|
||||
// indexed and analyzed for searching, but not stored in the document
doc.add( new Field(term.FLAG2, value, Field.Store.NO, Field.Index.ANALYZED));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Splits up the entity's numeric flag1 value into portal ids and then
|
||||
* adds each id to the doc as a {@code term.PORTAL} field.
|
||||
*
|
||||
* This should work fine with blank portal id and entities with
|
||||
* the portal set to NULL: a flag1 value of 0, or a null/empty id array,
* leaves the document untouched.
|
||||
*
|
||||
* @param ent individual whose flag1 value is read via {@code getFlag1Numeric()}
|
||||
* @param doc Lucene document that receives one PORTAL field per portal id
|
||||
*/
|
||||
@SuppressWarnings("static-access")
|
||||
private void doPortalFlags(Individual ent, Document doc){
|
||||
/* this is the code to add the portal names, we don't use this
|
||||
* now but since there is no consistent way to store flags you
|
||||
* might want this in the future.
|
||||
String portalIdsInCommaSeperatedList = ent.getFlag1Set();
|
||||
|
||||
if(portalIdsInCommaSeperatedList == null) return;
|
||||
String[] portalNames = portalIdsInCommaSeperatedList.split(",");
|
||||
for( String name : portalNames){
|
||||
doc.add( new Field(term.PORTAL,name,Field.Store.NO,Field.Index.NOT_ANALYZED));
|
||||
}
|
||||
*/
|
||||
|
||||
/* this is the code to store portal ids to the lucene index */
|
||||
// a numeric flag of 0 means there is nothing to index
if( ent.getFlag1Numeric() == 0 )
|
||||
return;
|
||||
// NOTE(review): numeric2numerics presumably breaks the combined flag into
// its individual values (one per portal) -- confirm against FlagMathUtils
Long[] portalIds = FlagMathUtils.numeric2numerics( ent.getFlag1Numeric() );
|
||||
if( portalIds == null || portalIds.length == 0)
|
||||
return;
|
||||
|
||||
log.debug("Flag 1 numeric: " + ent.getFlag1Numeric() + " for " + ent.getURI());
|
||||
|
||||
long id = -1;
|
||||
for( Long idLong : portalIds){
|
||||
// null entries in the array are skipped
if( idLong != null ){
|
||||
id = idLong.longValue();
|
||||
String numericPortal = Long.toString(id);
|
||||
// Long.toString never returns null, so this guard always passes
if( numericPortal != null ){
|
||||
// indexed verbatim (not analyzed) and not stored in the document
doc.add( new Field(term.PORTAL,numericPortal,
|
||||
Field.Store.NO, Field.Index.NOT_ANALYZED));
|
||||
log.debug("adding portal " + numericPortal + " to " + ent.getURI());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("static-access")
|
||||
public boolean canUnTranslate(Object result) {
|
||||
|
|
Loading…
Add table
Reference in a new issue