Upgraded lucene-2.4-*.jar to lucene-2.9.3-*.jar. Upgraded solr-1.0.jar to apache-solr-core-1.4.1.jar. Removed LuceneHighlighter and SimpleLuceneHighlighter as they are no longer used. Made minor changes to the way the Query is parsed in PagedSearchController: it is now built with a MultiFieldQueryParser that matches the query text against the ALLTEXT, NAME and TYPE fields in the search index.

This commit is contained in:
bkoniden 2011-03-14 13:41:16 +00:00
parent af8ce43e16
commit 801d789696
15 changed files with 67 additions and 242 deletions

View file

@ -41,7 +41,7 @@ public interface Searcher {
* @param q
* @return
*/
public abstract VitroHighlighter getHighlighter(VitroQuery q);
// public abstract VitroHighlighter getHighlighter(VitroQuery q);
/**
* Used to close the searcher if the index that it was using gets

View file

@ -28,6 +28,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
@ -69,7 +70,6 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
import edu.cornell.mannlib.vitro.webapp.search.lucene.SimpleLuceneHighlighter;
import edu.cornell.mannlib.vitro.webapp.utils.FlagMathUtils;
import edu.cornell.mannlib.vitro.webapp.utils.Html2Text;
import edu.cornell.mannlib.vitro.webapp.utils.StringUtils;
@ -157,6 +157,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
@Override
protected ResponseValues processRequest(VitroRequest vreq) {
log.info("All parameters present in the request: "+ vreq.getParameterMap().toString());
//There may be other non-html formats in the future
Format format = getFormat(vreq);
boolean wasXmlRequested = Format.XML == format;
@ -178,6 +181,10 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
VClassDao vclassDao = vreq.getWebappDaoFactory().getVClassDao();
String alphaFilter = vreq.getParameter("alpha");
log.info("IndividualDao is " + iDao.toString() + " Public classes in the classgroup are " + grpDao.getPublicGroupsWithVClasses().toString());
log.info("VClassDao is "+ vclassDao.toString() );
int startIndex = 0;
try{
startIndex = Integer.parseInt(vreq.getParameter("startIndex"));
@ -206,6 +213,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
String qtxt = vreq.getParameter(VitroQuery.QUERY_PARAMETER_NAME);
Analyzer analyzer = getAnalyzer(getServletContext());
log.info("Query text is "+ qtxt + " Analyzer is "+ analyzer.toString());
Query query = null;
try {
query = getQuery(vreq, portalFlag, analyzer, qtxt);
@ -218,6 +227,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
TopDocs topDocs = null;
try{
log.info("Searching for query term in the Index with maxHitSize "+ maxHitSize);
log.info("Query is "+ query.toString());
topDocs = searcherForRequest.search(query,null,maxHitSize);
}catch(Throwable t){
log.error("in first pass at search: " + t);
@ -241,7 +252,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
return doFailedSearch(msg, qtxt,format);
}
int hitsLength = topDocs.scoreDocs.length;
log.info("No. of hits "+ hitsLength);
if ( hitsLength < 1 ){
return doNoHits(qtxt,format);
}
@ -260,6 +273,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
if( (i >= startIndex) && (i <= lastHitToShow) ){
Document doc = searcherForRequest.doc(topDocs.scoreDocs[i].doc);
String uri = doc.get(Entity2LuceneDoc.term.URI);
log.info("Retrieving entity with uri "+ uri);
Individual ent = new IndividualImpl();
ent.setURI(uri);
ent = iDao.getIndividualByURI(uri);
@ -582,11 +596,20 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
"query length is " + MAX_QUERY_LENGTH );
return null;
}
log.info("Parsing query using QueryParser ");
QueryParser parser = getQueryParser(analyzer);
query = parser.parse(querystr);
String alpha = request.getParameter("alpha");
if( alpha != null && !"".equals(alpha) && alpha.length() == 1){
log.info("Firing alpha query ");
log.info("request.getParameter(alpha) is " + alpha);
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST );
boolQuery.add(
@ -597,7 +620,11 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//check if this is classgroup filtered
Object param = request.getParameter("classgroup");
if( param != null && !"".equals(param)){
if( param != null && !"".equals(param)){
log.info("Firing classgroup query ");
log.info("request.getParameter(classgroup) is "+ param.toString());
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST);
boolQuery.add( new TermQuery(
@ -609,8 +636,11 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//check if this is rdf:type filtered
param = request.getParameter("type");
if( param != null && !"".equals(param)){
BooleanQuery boolQuery = new BooleanQuery();
if( param != null && !"".equals(param)){
log.info("Firing type query ");
log.info("request.getParameter(type) is "+ param.toString());
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST);
boolQuery.add( new TermQuery(
new Term(Entity2LuceneDoc.term.RDFTYPE,
@ -623,6 +653,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//it by making a BooleanQuery.
Query flagQuery = makeFlagQuery( portalState );
if( flagQuery != null ){
log.info("Firing Flag query ");
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST);
boolQuery.add( flagQuery, BooleanClause.Occur.MUST);
@ -646,14 +677,17 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//indicated in the query string.
//The analyzer is needed so that we use the same analyzer on the search queries as
//was used on the text that was indexed.
QueryParser qp = new QueryParser(defaultSearchField,analyzer);
//QueryParser qp = new QueryParser("NAME",analyzer);
//this sets the query parser to AND all of the query terms it finds.
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
//qp.setDefaultOperator(QueryParser.AND_OPERATOR);
//set up the map of stemmed field names -> unstemmed field names
// HashMap<String,String> map = new HashMap<String, String>();
// map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
// qp.setStemmedToUnstemmed(map);
return qp;
MultiFieldQueryParser qp = new MultiFieldQueryParser(new String[]{"ALLTEXT", "name", "type"}, analyzer);
return qp;
}
/**

View file

@ -149,16 +149,21 @@ public class Entity2LuceneDoc implements Obj2DocIface{
if( clz.getSearchBoost() != null )
doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
doc.add( new Field(term.RDFTYPE, clz.getURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
Field typeField = new Field (term.RDFTYPE, clz.getName(), Field.Store.YES, Field.Index.ANALYZED);
typeField.setBoost(2*FIELD_BOOST);
doc.add( typeField);
if( clz.getName() != null )
classPublicNames = classPublicNames + " " + clz.getName();
//Classgroup URI
if( clz.getGroupURI() != null )
doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
if( clz.getGroupURI() != null ){
Field classGroupField = new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
Field.Store.YES, Field.Index.ANALYZED);
classGroupField.setBoost(FIELD_BOOST);
doc.add(classGroupField);
}
}
}
doc.add( new Field(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0",
@ -184,7 +189,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
value = ent.getLocalName();
}
Field name =new Field(term.NAME, value,
Field.Store.NO, Field.Index.ANALYZED);
Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES);
name.setBoost( NAME_BOOST );
doc.add( name );
@ -238,7 +243,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}
}catch (Exception ex){
value = null;
}
}
if( value != null )
doc.add( new Field(term.SUNSET, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
else
@ -308,9 +313,9 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}
}
//stemmed terms
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED));
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
//unstemmed terms
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED));
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
}
//flagX and portal flags are no longer indexed.
@ -359,6 +364,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
this.classesProhibitedFromSearch = classesProhibitedFromSearch;
}
public static float NAME_BOOST = 10;
public static float KEYWORD_BOOST = 2;
public static float NAME_BOOST = 3.0F;
public static float KEYWORD_BOOST = 2.0F;
public static float FIELD_BOOST = 1.0F;
}

View file

@ -1,135 +0,0 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.search.lucene;
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
/**
 * A {@link VitroHighlighter} backed by the Lucene highlight package.
 * Wraps two Lucene {@link Highlighter}s built from the same query/formatter:
 * one that fragments text (for snippet generation) and one that does not
 * (for whole-string highlighting).
 */
public class LuceneHighlighter extends VitroHighlighter{
    /* See VitroHighlighter for prefix tag and postfix tag */
    Highlighter nonFragHighlighter = null;
    Highlighter fragHighlighter = null;
    Analyzer analyzer = null;

    /**
     * Makes a VitroHighlighter that uses lucene highlighters.
     * PreTag and PostTag are from VitroHighlighter.
     *
     * @param query - the query to highlight for.
     * @param a - the Analyzer that was used in the query.
     */
    public LuceneHighlighter(Query query, Analyzer a){
        QueryScorer scorer = new QueryScorer( query );
        /* See VitroHighlighter for prefix tag and postfix tag */
        Formatter formatter =
            new SimpleHTMLFormatter(preTag, postTag);
        this.analyzer = a;
        this.fragHighlighter = new Highlighter(formatter, scorer);
        // here we make a highlighter that doesn't fragment
        this.nonFragHighlighter = new Highlighter(formatter, scorer);
        this.nonFragHighlighter.setTextFragmenter(new NullFragmenter());
    }

    // Alternates between capturing an HTML tag (group 1) and a run of
    // non-tag text (group 2), so highlighting can skip markup.
    // static final: the pattern is immutable and compiling it once is enough.
    private static final Pattern htmlOrNot = Pattern.compile("(<[^>]*>)|([^<]*)");
    private static final int HTML_PATTERN_INDEX = 1;
    private static final int TEXT_PATTERN_INDEX = 2;

    /**
     * Highlights in a string. No Fragmenting. Attempts to avoid some HTML:
     * tag spans are copied through untouched, only text nodes are highlighted.
     *
     * @param in text (possibly containing HTML) to highlight
     * @return the input with query terms wrapped in pre/post tags; the input
     *         unchanged if nothing matched or highlighting failed
     */
    public String highlight( String in){
        Matcher matcher = htmlOrNot.matcher(in);
        StringBuilder output = new StringBuilder();

        boolean found = matcher.find();
        if( ! found )
            return in;

        while( found ){
            String foundHtmlElement = matcher.group( HTML_PATTERN_INDEX );
            if( foundHtmlElement != null ){
                // HTML tag: pass through unmodified
                output.append( foundHtmlElement );
            }else{
                String foundTextNode = matcher.group( TEXT_PATTERN_INDEX );
                String hi = foundTextNode;
                try {
                    hi = nonFragHighlighter.getBestFragment(analyzer,"contents",foundTextNode);
                } catch (IOException e) {
                    // best-effort: on failure return the original, unhighlighted text
                    return in;
                }
                if( hi != null )
                    output.append( hi );
                else
                    // no query term in this text node; keep it as-is
                    output.append( foundTextNode );
            }
            found = matcher.find();
        }
        return output.toString();
    }

    protected boolean WITH_ELLIPSIS = true;
    protected String ellipsis = "...";

    /**
     * Produces highlighted fragments of the input, optionally wrapped in
     * ellipses. Returns "" for blank input or when no fragment matched.
     */
    public String getHighlightFragments(String in ) {
        if( WITH_ELLIPSIS ){
            if( in != null && in.trim().length() > 0){
                String b = doHighlight( in, fragHighlighter);
                if( b != null && b.trim().length() > 0 )
                    return ellipsis + " " + b + " " + ellipsis;
                else
                    return "";
            } else {
                return "";
            }
        } else {
            return doHighlight( in, fragHighlighter);
        }
    }

    /**
     * Runs the given highlighter over the input.
     * Returns the input unchanged on null input or I/O failure.
     */
    private String doHighlight(String in, Highlighter hi ) {
        String result = in;
        if( in != null ){
            TokenStream tokenStream =
                analyzer.tokenStream("contents", new StringReader(in));
            // Get 3 best fragments and separate with a "..."
            try {
                result = hi.getBestFragments(tokenStream, in, 3, "...");
            } catch (IOException e) {
                // was e.printStackTrace(); log instead of writing to stderr
                log.error("could not highlight", e);
            }
        }
        return result;
    }

    private static final Log log = LogFactory.getLog(LuceneHighlighter.class);
}

View file

@ -306,7 +306,7 @@ public class LuceneSearcher implements Searcher {
* we need to 'rewrite' the query. That takes any wild cards
* and replaces them will all terms that are found in the index.
*/
public VitroHighlighter getHighlighter(VitroQuery queryIn){
/* public VitroHighlighter getHighlighter(VitroQuery queryIn){
if( ! (queryIn instanceof LuceneQuery) ){
log.error("LuceneSearcher expects to get a LuceneQuery");
throw new Error("LuceneSearcher expects to get a LuceneQuery");
@ -327,6 +327,6 @@ public class LuceneSearcher implements Searcher {
log.error(e, e);
}
return (VitroHighlighter)highlighter;
}
}*/
}

View file

@ -21,6 +21,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.BooleanQuery;
import com.hp.hpl.jena.ontology.OntModel;
@ -229,10 +230,12 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
*/
private Analyzer getAnalyzer() {
PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new KeywordAnalyzer());
analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer());
// PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer());
analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer());
// analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(NAME, new KeywordAnalyzer());
return analyzer;
}

View file

@ -1,83 +0,0 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.search.lucene;
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
import edu.cornell.mannlib.vitro.webapp.utils.Html2Text;
/**
* This is a highlighter and fragmenter for use with PagedSearchController.
*/
/**
 * Highlighter and fragmenter used by PagedSearchController: strips HTML
 * from the input, then emits the best matching fragments wrapped in
 * ellipses.
 */
public class SimpleLuceneHighlighter extends VitroHighlighter{
    Highlighter fragHighlighter = null;
    Analyzer analyzer = null;

    /**
     * @param query query whose terms should be highlighted
     * @param a     analyzer that was used when the text was indexed
     */
    public SimpleLuceneHighlighter(Query query, Analyzer a){
        Formatter tagFormatter = new SimpleHTMLFormatter(preTag, postTag);
        QueryScorer termScorer = new QueryScorer(query, Entity2LuceneDoc.term.ALLTEXT);
        this.analyzer = a;
        this.fragHighlighter = new Highlighter(tagFormatter, termScorer);
    }

    /** Whole-string highlighting is not supported; input is returned as-is. */
    @Override
    public String highlight( String in){
        //not really implemented.
        return in;
    }

    /**
     * Strips HTML from the input, highlights the plain text, and returns
     * the best fragments wrapped in ellipses; "" when there is nothing
     * to show.
     */
    @Override
    public String getHighlightFragments(String in ) {
        Html2Text stripper = new Html2Text();
        try{
            stripper.parse(in);
        }catch(IOException ioe){
            log.debug("could not strip html from string",ioe);
        }

        String plainText = stripper.getText();
        if( plainText == null || plainText.trim().length() == 0 )
            return "";

        String fragment = doHighlight(plainText, fragHighlighter);
        if( fragment == null || fragment.trim().length() == 0 )
            return "";

        return "..." + " " + fragment + " " + "...";
    }

    /**
     * Runs the highlighter over the input; returns the input unchanged
     * on null input or I/O failure.
     */
    private String doHighlight(String in, Highlighter hi ) {
        if( in == null )
            return in;

        TokenStream tokenStream =
            analyzer.tokenStream(Entity2LuceneDoc.term.ALLTEXT, new StringReader(in));
        try {
            //Get 3 best fragments and separate with a "..."
            return hi.getBestFragments(tokenStream, in, 3, "...");
        } catch (IOException e) {
            log.debug("could not highlight",e);
        }
        return in;
    }

    private static Log log = LogFactory.getLog(SimpleLuceneHighlighter.class);
}