Removing union graph from application graph. Adding new field nameUnstemmed to lucene index. Minor refactoring of lucene objects.

2010-06-25 20:05:05 +00:00 · 2010-06-25 20:05:05 +00:00 · b6d0c61e86
commit b6d0c61e86
parent 444d37bd5a
9 changed files with 51 additions and 121 deletions
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java
@ -503,13 +503,13 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
        //indicated in the query string.
        //The analyzer is needed so that we use the same analyzer on the search queries as
        //was used on the text that was indexed.
-        VitroQueryParser qp = new VitroQueryParser(defaultSearchField,analyzer);
+        QueryParser qp = new QueryParser(defaultSearchField,analyzer);
        //this sets the query parser to AND all of the query terms it finds.
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        //set up the map of stemmed field names -> unstemmed field names
-        HashMap<String,String> map = new HashMap<String, String>();
-        map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
-        qp.setStemmedToUnstemmed(map);
+//        HashMap<String,String> map = new HashMap<String, String>();
+//        map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
+//        qp.setStemmedToUnstemmed(map);
        return qp;
    }
 
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java
@ -47,6 +47,8 @@ public class Entity2LuceneDoc  implements Obj2DocIface{
        public static String MODTIME    = "modTime";
        /** Name of entity, tab or vclass */
        public static String NAME       = "name";
+        /** Name of entity, unstemmed */
+        public static String NAMEUNSTEMMED       = "nameunstemmed";
        /** Name of portal */
        public static String PORTAL     = "portal";
        /** time of index in msec since epoc */
@ -109,6 +111,9 @@ public class Entity2LuceneDoc  implements Obj2DocIface{
                               Field.Store.YES, Field.Index.ANALYZED);
        name.setBoost( NAME_BOOST );
        doc.add( name );
+        Field nameUn = new Field(term.NAMEUNSTEMMED, value, Field.Store.YES, Field.Index.ANALYZED);
+        nameUn.setBoost( NAME_BOOST ); //same boost as the stemmed name field
+        doc.add( nameUn ); //the field must be added to the document or it is never indexed

        //boost for entity
        if( ent.getSearchBoost() != null && ent.getSearchBoost() != 0 )
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java
@ -100,10 +100,10 @@ public class HtmlLowerStopAnalyzer extends Analyzer {
 //        
        
        TokenStream result = new StandardTokenizer(arg0); 
-        result = new StandardFilter(result); 
-        result = new LowerCaseFilter(result); 
-        result = new StopFilter(result, _stopWords, IGNORE_CASE); 
-        result = new ISOLatin1AccentFilter(result); 
+        result = new StandardFilter(result);  //normalize tokens (the StandardTokenizer above does the splitting)
+        result = new LowerCaseFilter(result);  //lower case
+        result = new StopFilter(result, _stopWords, IGNORE_CASE);  //remove stop words
+        result = new ISOLatin1AccentFilter(result); //ISO-8859-1 accented chars are replaced by unaccented ones
        return result;
    
    }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneQuery.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneQuery.java
@ -37,6 +37,7 @@ import edu.cornell.mannlib.vitro.webapp.utils.FlagMathUtils;
 * http://lucene.apache.org/java/docs/queryparsersyntax.html
 * http://today.java.net/pub/a/today/2003/11/07/QueryParserRules.html
 *
+ * This class is not thread safe; use one instance per request.
 * @author bdc34
 *
 */
@ -47,29 +48,16 @@ public class LuceneQuery extends VitroQuery {
    private final int ADVANCED =2;
    private int queryType = SIMPLE;

-    public boolean defaultTimeWindow = true;
-
    private Query query = null;
    private Analyzer analyzer = null;
    
    private static final Log log = LogFactory.getLog(LuceneQuery.class.getName());

-    //private IndexReader indexReader;
-
    public LuceneQuery(VitroRequest request, PortalFlag portalState,
-                       Analyzer analyzer, String indexDir ){
-    	
+                       Analyzer analyzer, String defualtField ){    	
        super(request,portalState); //the super class will stash the parameters for us.
        this.analyzer = analyzer;

-//        if( indexReader == null ){
-//          try {
-//              indexReader = IndexReader.open( indexDir );
-//          } catch (IOException e) {
-//              System.out.println("LuceneQuery: could not create IndexReader"+e);
-//              e.printStackTrace();
-//          }
-//        }
        if( isAdvancedQuery( request ) ){
            queryType = ADVANCED;
        }
@ -81,14 +69,9 @@ public class LuceneQuery extends VitroQuery {
        //indicated in the query string.
        //The analyzer is needed so that we use the same analyzer on the search queries as
        //was used on the text that was indexed.
-        VitroQueryParser qp = new VitroQueryParser(defaultSearchField,analyzer);
+        QueryParser qp = new QueryParser(defaultSearchField,analyzer);
        //this sets the query parser to AND all of the query terms it finds.
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
-        //set up the map of stemmed field names -> unstemmed field names
-        HashMap<String,String> map = new HashMap<String, String>();
-        map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
-        qp.setStemmedToUnstemmed(map);
-
        return qp;
    }

@ -133,66 +116,6 @@ public class LuceneQuery extends VitroQuery {
        return this.query;
    }
  
-    /**
-     * Adds a Query that will get doc where the
-     * SUNSET is > NOW  and SUNRISE <= NOW. We'll do
-     * this by creating two RangeQueries, one to
-     * check that SUNRISE is between [BEGINNING_OF_TIME, NOW]
-     * and that SUNSET is between [NOW, END_OF_TIME]
-     * There don't seem to be any GraterThanQuery
-     * or LessThanQuery classes in lucene.
-     */
-//     private BooleanQuery makeDefaultTimeWindowQuery(){
-//         String nowStr = new DateTime().toString(LuceneIndexer.DATE_FORMAT);
-
-//         Term BEGINNING_OF_TIME = null;
-//         Term now = new Term(Entity2LuceneDoc.term.SUNRISE,nowStr );
-//         RangeQuery sunriseBeforeNow = new RangeQuery(BEGINNING_OF_TIME,now, true);
-
-//         Term END_OF_TIME = null;
-//         now = new Term(Entity2LuceneDoc.term.SUNSET,nowStr);
-//         RangeQuery sunsetAfterNow = new RangeQuery(now,END_OF_TIME, false);
-
-//         BooleanQuery qRv = new BooleanQuery();
-//         qRv.add( sunriseBeforeNow, BooleanClause.Occur.MUST);
-//         qRv.add( sunsetAfterNow, BooleanClause.Occur.MUST);
-
-//         return qRv;
-//     }
-
-    /**
-     * Makes queries to return only things between the given times and adds
-     * them as BooleanQuery objects.
-     *
-     * If earliest is null then the query include anything that existed before latest.
-     * If latest is null then the query will include anthing that existes after earliest.
-     * If both earliest and latest are null then NO restrictions will be added to the query.
-     */
-    private Query addTimeWindowedQuery( Query query, DateTime earliest, DateTime latest){
-        Query returnQuery = null;
-        if( earliest ==null && latest == null ) return query;
-
-        if( earliest != null && latest != null ){
-            //we work with the SUNSET here since that is the last time the
-            //object will be seen.
-             Term earliestTerm = new Term(Entity2LuceneDoc.term.SUNSET,
-                                          earliest.toString(LuceneIndexer.DATE_FORMAT));
-             Term latestTerm = new Term(Entity2LuceneDoc.term.SUNRISE,
-                                        latest.toString(LuceneIndexer.DATE_FORMAT));
-
-             RangeQuery timeWindowQuery = new RangeQuery(earliestTerm,latestTerm, true);
-             BooleanQuery bQuery = new BooleanQuery();
-             bQuery.add( query, BooleanClause.Occur.MUST);
-             bQuery.add( timeWindowQuery, BooleanClause.Occur.MUST);
-             returnQuery = bQuery;
-        }
-        return returnQuery;
-    }
-
-            // Term beginning_of_time = new Term(Entity2LuceneDoc.term.SUNSET,
-//                                               BEGINNING_OF_TIME);
-//             Term end_of_time = new Term(Entity2LuceneDoc.term.SUNRISE,
-//                                         END_OF_TIME);
    /**
     * Makes a flag based query clause.  This is where searches can filter by portal.
     *
@ -250,8 +173,6 @@ public class LuceneQuery extends VitroQuery {
        return false;
    }

-
-
    @Override
    public String getTerms() {
        if( getParameters() != null &&
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneQueryFactory.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneQueryFactory.java
@ -2,8 +2,6 @@

 package edu.cornell.mannlib.vitro.webapp.search.lucene;

-import javax.servlet.http.HttpServletRequest;
-
 import org.apache.lucene.analysis.Analyzer;

 import edu.cornell.mannlib.vitro.webapp.controller.VitroRequest;
@ -15,15 +13,14 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
 public class LuceneQueryFactory implements VitroQueryFactory {

    public static final int MAX_QUERY_LENGTH = 500;    
-    private String indexDir;
-
-    public LuceneQueryFactory(Analyzer analyzer, String indexDir ){
-        this.analyzer = analyzer;
-        this.indexDir = indexDir;
-    }
-
+    private String defaultField;
    private Analyzer analyzer = null;
    
+    public LuceneQueryFactory(Analyzer analyzer, String defaultField ){
+        this.analyzer = analyzer;
+        this.defaultField = defaultField;
+    }    
+
    public VitroQuery getQuery(VitroRequest request, PortalFlag portalState) throws SearchException {
        //there should be a better way to integrate this with LuceneQuery
        //here we check that the request has the parameters that we need to
@ -34,7 +31,7 @@ public class LuceneQueryFactory implements VitroQueryFactory {
        if( txt.length() > MAX_QUERY_LENGTH )
            throw new SearchException("The search was too long. The maximum " +
            		"query length is " + MAX_QUERY_LENGTH );
-        LuceneQuery query = new LuceneQuery(request, portalState, analyzer, indexDir);
+        LuceneQuery query = new LuceneQuery(request, portalState, analyzer, defaultField );
        return query;
    }

--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java
@ -4,6 +4,7 @@ package edu.cornell.mannlib.vitro.webapp.search.lucene;

 import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
@ -14,6 +15,9 @@ import javax.servlet.ServletContextEvent;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.KeywordAnalyzer;
+import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.search.BooleanQuery;

 import com.hp.hpl.jena.ontology.OntModel;
@ -88,7 +92,7 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
 	            // the queries need to know the analyzer to use so that the same one can be used
 	            // to analyze the fields in the incoming user query terms.
 	            LuceneSearcher searcher = new LuceneSearcher(
-	                    new LuceneQueryFactory(getAnalyzer(), indexDir),
+	                    new LuceneQueryFactory(getAnalyzer(), Entity2LuceneDoc.term.ALLTEXT),
 	                    indexDir);
 	            searcher.addObj2Doc(new Entity2LuceneDoc());
 	            context.setAttribute(Searcher.class.getName(), searcher);		           
@ -186,11 +190,15 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
     *
     * @return
     */
-    private Analyzer getAnalyzer() {
-        return new VitroAnalyzer();
+    public Analyzer getAnalyzer() {
+        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new KeywordAnalyzer()); //fields not registered below fall back to KeywordAnalyzer
+        analyzer.addAnalyzer(Entity2LuceneDoc.term.ALLTEXT, new HtmlLowerStopStemAnalyzer());
+        analyzer.addAnalyzer(Entity2LuceneDoc.term.NAME, new HtmlLowerStopStemAnalyzer());
+        analyzer.addAnalyzer(Entity2LuceneDoc.term.ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer());
+        analyzer.addAnalyzer(Entity2LuceneDoc.term.NAMEUNSTEMMED, new HtmlLowerStopAnalyzer()); //NAMEUNSTEMMED, not NAME: a second NAME entry would override the stemming analyzer above
+        return analyzer;
    }
    
-
    public static final String ANALYZER= "lucene.analyzer";
    public static final String INDEX_DIR = "lucene.indexDir";
    public static final String SEARCH_DATAPROPERTY_BLACKLIST = 
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetupCJK.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetupCJK.java
@ -79,7 +79,7 @@ public class LuceneSetupCJK implements javax.servlet.ServletContextListener {
            // the queries need to know the analyzer to use so that the same one can be used
            // to analyze the fields in the incoming user query terms.
            LuceneSearcher searcher = new LuceneSearcher(
-                    new LuceneQueryFactory(getAnalyzer(), indexDir),
+                    new LuceneQueryFactory(getAnalyzer(), Entity2LuceneDoc.term.ALLTEXT),
                    indexDir);
            searcher.addObj2Doc(new Entity2LuceneDoc());
            context.setAttribute(Searcher.class.getName(), searcher);
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/VitroAnalyzer.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/VitroAnalyzer.java
@ -24,7 +24,8 @@ public class VitroAnalyzer extends Analyzer {
        if( Entity2LuceneDoc.term.ALLTEXT.equals(field) ||
            Entity2LuceneDoc.term.NAME.equals(field) )
            return stemmingAnalyzer.tokenStream(field, reader);
-        else if( Entity2LuceneDoc.term.ALLTEXTUNSTEMMED.equals(field) )
+        else if( Entity2LuceneDoc.term.ALLTEXTUNSTEMMED.equals(field) ||
+        		Entity2LuceneDoc.term.NAMEUNSTEMMED.equals(field) )
            return nonStemmingAnalyzer.tokenStream(field, reader);
        else{
            return keywordAnalyzer.tokenStream(field, reader);
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/servlet/setup/JenaDataSourceSetup.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/servlet/setup/JenaDataSourceSetup.java
@ -60,8 +60,6 @@ public class JenaDataSourceSetup extends JenaDataSourceSetupBase implements java
        	unionOms.setUserAccountsModel(userAccountsModel);       
            
        	OntModel displayModel = ontModelFromContextAttribute(sce.getServletContext(),"displayOntModel");
-        	OntModel displayUnionModel = ModelFactory.createOntologyModel(MEM_ONT_MODEL_SPEC,ModelFactory.createUnion(displayModel, unionModel));
-        	sce.getServletContext().setAttribute("displayOntModel", displayUnionModel);
        	baseOms.setDisplayModel(displayModel);
        	inferenceOms.setDisplayModel(displayModel);
        	unionOms.setDisplayModel(displayModel);