Removing union graph from application graph. Adding new field nameUnstemmed to lucene index. Minor refactoring of lucene objects.

This commit is contained in:
bdc34 2010-06-25 20:05:05 +00:00
parent 444d37bd5a
commit b6d0c61e86
9 changed files with 51 additions and 121 deletions

View file

@ -503,13 +503,13 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
//indicated in the query string. //indicated in the query string.
//The analyzer is needed so that we use the same analyzer on the search queries as //The analyzer is needed so that we use the same analyzer on the search queries as
//was used on the text that was indexed. //was used on the text that was indexed.
VitroQueryParser qp = new VitroQueryParser(defaultSearchField,analyzer); QueryParser qp = new QueryParser(defaultSearchField,analyzer);
//this sets the query parser to AND all of the query terms it finds. //this sets the query parser to AND all of the query terms it finds.
qp.setDefaultOperator(QueryParser.AND_OPERATOR); qp.setDefaultOperator(QueryParser.AND_OPERATOR);
//set up the map of stemmed field names -> unstemmed field names //set up the map of stemmed field names -> unstemmed field names
HashMap<String,String> map = new HashMap<String, String>(); // HashMap<String,String> map = new HashMap<String, String>();
map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED); // map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
qp.setStemmedToUnstemmed(map); // qp.setStemmedToUnstemmed(map);
return qp; return qp;
} }

View file

@ -47,6 +47,8 @@ public class Entity2LuceneDoc implements Obj2DocIface{
public static String MODTIME = "modTime"; public static String MODTIME = "modTime";
/** Name of entity, tab or vclass */ /** Name of entity, tab or vclass */
public static String NAME = "name"; public static String NAME = "name";
/** Name of entity, unstemmed */
public static String NAMEUNSTEMMED = "nameunstemmed";
/** Name of portal */ /** Name of portal */
public static String PORTAL = "portal"; public static String PORTAL = "portal";
/** time of index in msec since epoch */ /** time of index in msec since epoch */
@ -109,6 +111,9 @@ public class Entity2LuceneDoc implements Obj2DocIface{
Field.Store.YES, Field.Index.ANALYZED); Field.Store.YES, Field.Index.ANALYZED);
name.setBoost( NAME_BOOST ); name.setBoost( NAME_BOOST );
doc.add( name ); doc.add( name );
Field nameUn = new Field(term.NAMEUNSTEMMED, value,
Field.Store.YES, Field.Index.ANALYZED);
nameUn.setBoost( NAME_BOOST );
//boost for entity //boost for entity
if( ent.getSearchBoost() != null && ent.getSearchBoost() != 0 ) if( ent.getSearchBoost() != null && ent.getSearchBoost() != 0 )

View file

@ -100,10 +100,10 @@ public class HtmlLowerStopAnalyzer extends Analyzer {
// //
TokenStream result = new StandardTokenizer(arg0); TokenStream result = new StandardTokenizer(arg0);
result = new StandardFilter(result); result = new StandardFilter(result); //break into tokens
result = new LowerCaseFilter(result); result = new LowerCaseFilter(result); //lower case
result = new StopFilter(result, _stopWords, IGNORE_CASE); result = new StopFilter(result, _stopWords, IGNORE_CASE); //remove stop words
result = new ISOLatin1AccentFilter(result); result = new ISOLatin1AccentFilter(result); //ISO-8859-1 accented chars are replaced by unaccented equivalents
return result; return result;
} }

View file

@ -36,7 +36,8 @@ import edu.cornell.mannlib.vitro.webapp.utils.FlagMathUtils;
* QueryParser see: * QueryParser see:
* http://lucene.apache.org/java/docs/queryparsersyntax.html * http://lucene.apache.org/java/docs/queryparsersyntax.html
* http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html * http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html
* *
* This class is not thread safe, use one instance per request.
* @author bdc34 * @author bdc34
* *
*/ */
@ -47,48 +48,30 @@ public class LuceneQuery extends VitroQuery {
private final int ADVANCED =2; private final int ADVANCED =2;
private int queryType = SIMPLE; private int queryType = SIMPLE;
public boolean defaultTimeWindow = true;
private Query query = null; private Query query = null;
private Analyzer analyzer = null; private Analyzer analyzer = null;
private static final Log log = LogFactory.getLog(LuceneQuery.class.getName()); private static final Log log = LogFactory.getLog(LuceneQuery.class.getName());
//private IndexReader indexReader;
public LuceneQuery(VitroRequest request, PortalFlag portalState, public LuceneQuery(VitroRequest request, PortalFlag portalState,
Analyzer analyzer, String indexDir ){ Analyzer analyzer, String defualtField ){
super(request,portalState); //the super class will stash the parameters for us. super(request,portalState); //the super class will stash the parameters for us.
this.analyzer = analyzer; this.analyzer = analyzer;
// if( indexReader == null ){
// try {
// indexReader = IndexReader.open( indexDir );
// } catch (IOException e) {
// System.out.println("LuceneQuery: could not create IndexReader"+e);
// e.printStackTrace();
// }
// }
if( isAdvancedQuery( request ) ){ if( isAdvancedQuery( request ) ){
queryType = ADVANCED; queryType = ADVANCED;
} }
} }
@SuppressWarnings("static-access") @SuppressWarnings("static-access")
private QueryParser getQueryParser(){ private QueryParser getQueryParser(){
//defaultSearchField indicates which field search against when there is no term //defaultSearchField indicates which field search against when there is no term
//indicated in the query string. //indicated in the query string.
//The analyzer is needed so that we use the same analyzer on the search queries as //The analyzer is needed so that we use the same analyzer on the search queries as
//was used on the text that was indexed. //was used on the text that was indexed.
VitroQueryParser qp = new VitroQueryParser(defaultSearchField,analyzer); QueryParser qp = new QueryParser(defaultSearchField,analyzer);
//this sets the query parser to AND all of the query terms it finds. //this sets the query parser to AND all of the query terms it finds.
qp.setDefaultOperator(QueryParser.AND_OPERATOR); qp.setDefaultOperator(QueryParser.AND_OPERATOR);
//set up the map of stemmed field names -> unstemmed field names
HashMap<String,String> map = new HashMap<String, String>();
map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
qp.setStemmedToUnstemmed(map);
return qp; return qp;
} }
@ -132,67 +115,7 @@ public class LuceneQuery extends VitroQuery {
return this.query; return this.query;
} }
/**
* Adds a Query that will get doc where the
* SUNSET is > NOW and SUNRISE <= NOW. We'll do
* this by creating two RangeQueries, one to
* check that SUNRISE is between [BEGINNING_OF_TIME, NOW]
* and that SUNSET is between [NOW, END_OF_TIME]
* There don't seem to be any GraterThanQuery
* or LessThanQuery classes in lucene.
*/
// private BooleanQuery makeDefaultTimeWindowQuery(){
// String nowStr = new DateTime().toString(LuceneIndexer.DATE_FORMAT);
// Term BEGINNING_OF_TIME = null;
// Term now = new Term(Entity2LuceneDoc.term.SUNRISE,nowStr );
// RangeQuery sunriseBeforeNow = new RangeQuery(BEGINNING_OF_TIME,now, true);
// Term END_OF_TIME = null;
// now = new Term(Entity2LuceneDoc.term.SUNSET,nowStr);
// RangeQuery sunsetAfterNow = new RangeQuery(now,END_OF_TIME, false);
// BooleanQuery qRv = new BooleanQuery();
// qRv.add( sunriseBeforeNow, BooleanClause.Occur.MUST);
// qRv.add( sunsetAfterNow, BooleanClause.Occur.MUST);
// return qRv;
// }
/**
* Makes queries to return only things between the given times and adds
* them as BooleanQuery objects.
*
* If earliest is null then the query include anything that existed before latest.
* If latest is null then the query will include anthing that existes after earliest.
* If both earliest and latest are null then NO restrictions will be added to the query.
*/
private Query addTimeWindowedQuery( Query query, DateTime earliest, DateTime latest){
Query returnQuery = null;
if( earliest ==null && latest == null ) return query;
if( earliest != null && latest != null ){
//we work with the SUNSET here since that is the last time the
//object will be seen.
Term earliestTerm = new Term(Entity2LuceneDoc.term.SUNSET,
earliest.toString(LuceneIndexer.DATE_FORMAT));
Term latestTerm = new Term(Entity2LuceneDoc.term.SUNRISE,
latest.toString(LuceneIndexer.DATE_FORMAT));
RangeQuery timeWindowQuery = new RangeQuery(earliestTerm,latestTerm, true);
BooleanQuery bQuery = new BooleanQuery();
bQuery.add( query, BooleanClause.Occur.MUST);
bQuery.add( timeWindowQuery, BooleanClause.Occur.MUST);
returnQuery = bQuery;
}
return returnQuery;
}
// Term beginning_of_time = new Term(Entity2LuceneDoc.term.SUNSET,
// BEGINNING_OF_TIME);
// Term end_of_time = new Term(Entity2LuceneDoc.term.SUNRISE,
// END_OF_TIME);
/** /**
* Makes a flag based query clause. This is where searches can filter by portal. * Makes a flag based query clause. This is where searches can filter by portal.
* *
@ -250,8 +173,6 @@ public class LuceneQuery extends VitroQuery {
return false; return false;
} }
@Override @Override
public String getTerms() { public String getTerms() {
if( getParameters() != null && if( getParameters() != null &&

View file

@ -2,27 +2,24 @@
package edu.cornell.mannlib.vitro.webapp.search.lucene; package edu.cornell.mannlib.vitro.webapp.search.lucene;
import javax.servlet.http.HttpServletRequest; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Analyzer; import edu.cornell.mannlib.vitro.webapp.controller.VitroRequest;
import edu.cornell.mannlib.vitro.webapp.flags.PortalFlag;
import edu.cornell.mannlib.vitro.webapp.controller.VitroRequest; import edu.cornell.mannlib.vitro.webapp.search.SearchException;
import edu.cornell.mannlib.vitro.webapp.flags.PortalFlag; import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQuery;
import edu.cornell.mannlib.vitro.webapp.search.SearchException; import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQuery;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
public class LuceneQueryFactory implements VitroQueryFactory { public class LuceneQueryFactory implements VitroQueryFactory {
public static final int MAX_QUERY_LENGTH = 500; public static final int MAX_QUERY_LENGTH = 500;
private String indexDir; private String defaultField;
private Analyzer analyzer = null;
public LuceneQueryFactory(Analyzer analyzer, String indexDir ){
this.analyzer = analyzer; public LuceneQueryFactory(Analyzer analyzer, String defaultField ){
this.indexDir = indexDir; this.analyzer = analyzer;
} this.defaultField = defaultField;
}
private Analyzer analyzer = null;
public VitroQuery getQuery(VitroRequest request, PortalFlag portalState) throws SearchException { public VitroQuery getQuery(VitroRequest request, PortalFlag portalState) throws SearchException {
//there should be a better way to integrate this with LuceneQuery //there should be a better way to integrate this with LuceneQuery
@ -34,7 +31,7 @@ public class LuceneQueryFactory implements VitroQueryFactory {
if( txt.length() > MAX_QUERY_LENGTH ) if( txt.length() > MAX_QUERY_LENGTH )
throw new SearchException("The search was too long. The maximum " + throw new SearchException("The search was too long. The maximum " +
"query length is " + MAX_QUERY_LENGTH ); "query length is " + MAX_QUERY_LENGTH );
LuceneQuery query = new LuceneQuery(request, portalState, analyzer, indexDir); LuceneQuery query = new LuceneQuery(request, portalState, analyzer, defaultField );
return query; return query;
} }

View file

@ -4,6 +4,7 @@ package edu.cornell.mannlib.vitro.webapp.search.lucene;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
@ -14,6 +15,9 @@ import javax.servlet.ServletContextEvent;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import com.hp.hpl.jena.ontology.OntModel; import com.hp.hpl.jena.ontology.OntModel;
@ -88,7 +92,7 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
// the queries need to know the analyzer to use so that the same one can be used // the queries need to know the analyzer to use so that the same one can be used
// to analyze the fields in the incoming user query terms. // to analyze the fields in the incoming user query terms.
LuceneSearcher searcher = new LuceneSearcher( LuceneSearcher searcher = new LuceneSearcher(
new LuceneQueryFactory(getAnalyzer(), indexDir), new LuceneQueryFactory(getAnalyzer(), Entity2LuceneDoc.term.ALLTEXT),
indexDir); indexDir);
searcher.addObj2Doc(new Entity2LuceneDoc()); searcher.addObj2Doc(new Entity2LuceneDoc());
context.setAttribute(Searcher.class.getName(), searcher); context.setAttribute(Searcher.class.getName(), searcher);
@ -186,11 +190,15 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
* *
* @return * @return
*/ */
private Analyzer getAnalyzer() { public Analyzer getAnalyzer() {
return new VitroAnalyzer(); PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new KeywordAnalyzer());
} analyzer.addAnalyzer(Entity2LuceneDoc.term.ALLTEXT, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(Entity2LuceneDoc.term.NAME, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(Entity2LuceneDoc.term.ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(Entity2LuceneDoc.term.NAME, new HtmlLowerStopAnalyzer());
return analyzer;
}
public static final String ANALYZER= "lucene.analyzer"; public static final String ANALYZER= "lucene.analyzer";
public static final String INDEX_DIR = "lucene.indexDir"; public static final String INDEX_DIR = "lucene.indexDir";
public static final String SEARCH_DATAPROPERTY_BLACKLIST = public static final String SEARCH_DATAPROPERTY_BLACKLIST =

View file

@ -79,7 +79,7 @@ public class LuceneSetupCJK implements javax.servlet.ServletContextListener {
// the queries need to know the analyzer to use so that the same one can be used // the queries need to know the analyzer to use so that the same one can be used
// to analyze the fields in the incoming user query terms. // to analyze the fields in the incoming user query terms.
LuceneSearcher searcher = new LuceneSearcher( LuceneSearcher searcher = new LuceneSearcher(
new LuceneQueryFactory(getAnalyzer(), indexDir), new LuceneQueryFactory(getAnalyzer(), Entity2LuceneDoc.term.ALLTEXT),
indexDir); indexDir);
searcher.addObj2Doc(new Entity2LuceneDoc()); searcher.addObj2Doc(new Entity2LuceneDoc());
context.setAttribute(Searcher.class.getName(), searcher); context.setAttribute(Searcher.class.getName(), searcher);

View file

@ -24,7 +24,8 @@ public class VitroAnalyzer extends Analyzer {
if( Entity2LuceneDoc.term.ALLTEXT.equals(field) || if( Entity2LuceneDoc.term.ALLTEXT.equals(field) ||
Entity2LuceneDoc.term.NAME.equals(field) ) Entity2LuceneDoc.term.NAME.equals(field) )
return stemmingAnalyzer.tokenStream(field, reader); return stemmingAnalyzer.tokenStream(field, reader);
else if( Entity2LuceneDoc.term.ALLTEXTUNSTEMMED.equals(field) ) else if( Entity2LuceneDoc.term.ALLTEXTUNSTEMMED.equals(field) ||
Entity2LuceneDoc.term.NAMEUNSTEMMED.equals(field) )
return nonStemmingAnalyzer.tokenStream(field, reader); return nonStemmingAnalyzer.tokenStream(field, reader);
else{ else{
return keywordAnalyzer.tokenStream(field, reader); return keywordAnalyzer.tokenStream(field, reader);

View file

@ -60,8 +60,6 @@ public class JenaDataSourceSetup extends JenaDataSourceSetupBase implements java
unionOms.setUserAccountsModel(userAccountsModel); unionOms.setUserAccountsModel(userAccountsModel);
OntModel displayModel = ontModelFromContextAttribute(sce.getServletContext(),"displayOntModel"); OntModel displayModel = ontModelFromContextAttribute(sce.getServletContext(),"displayOntModel");
OntModel displayUnionModel = ModelFactory.createOntologyModel(MEM_ONT_MODEL_SPEC,ModelFactory.createUnion(displayModel, unionModel));
sce.getServletContext().setAttribute("displayOntModel", displayUnionModel);
baseOms.setDisplayModel(displayModel); baseOms.setDisplayModel(displayModel);
inferenceOms.setDisplayModel(displayModel); inferenceOms.setDisplayModel(displayModel);
unionOms.setDisplayModel(displayModel); unionOms.setDisplayModel(displayModel);