Upgraded lucene-2.4-*.jar to lucene-2.9.3-*.jar and solr-1.0.jar to apache-solr-core-1.4.1.jar. Removed LuceneHighlighter and SimpleLuceneHighlighter, which are no longer used. Made minor changes to the way the query is parsed in PagedSearchController: it now uses a MultiFieldQueryParser that runs the query against the ALLTEXT, NAME and TYPE fields of the search index.

2011-03-14 13:41:16 +00:00 · 2011-03-14 13:41:16 +00:00 · 801d789696
commit 801d789696
parent af8ce43e16
15 changed files with 67 additions and 242 deletions
--- a/webapp/lib/apache-solr-core-1.4.1.jar
+++ b/webapp/lib/apache-solr-core-1.4.1.jar
--- a/webapp/lib/lucene-analyzers-2.4.jar
+++ b/webapp/lib/lucene-analyzers-2.4.jar
--- a/webapp/lib/lucene-analyzers-2.9.3.jar
+++ b/webapp/lib/lucene-analyzers-2.9.3.jar
--- a/webapp/lib/lucene-core-2.4.0.jar
+++ b/webapp/lib/lucene-core-2.4.0.jar
--- a/webapp/lib/lucene-core-2.9.3.jar
+++ b/webapp/lib/lucene-core-2.9.3.jar
--- a/webapp/lib/lucene-highlighter-2.4.jar
+++ b/webapp/lib/lucene-highlighter-2.4.jar
--- a/webapp/lib/lucene-highlighter-2.9.3.jar
+++ b/webapp/lib/lucene-highlighter-2.9.3.jar
--- a/webapp/lib/solr-1.0.jar
+++ b/webapp/lib/solr-1.0.jar
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/beans/Searcher.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/beans/Searcher.java
@ -41,7 +41,7 @@ public interface Searcher {
     * @param q
     * @return
     */
-    public abstract VitroHighlighter getHighlighter(VitroQuery q);
+ //   public abstract VitroHighlighter getHighlighter(VitroQuery q);
    /**
     * Used to close the searcher if the index that it was using gets
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java
@ -28,6 +28,7 @@ import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.CorruptIndexException;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queryParser.MultiFieldQueryParser;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.BooleanClause;
@ -69,7 +70,6 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
 import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
 import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory;
 import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
 import edu.cornell.mannlib.vitro.webapp.search.lucene.SimpleLuceneHighlighter;
 import edu.cornell.mannlib.vitro.webapp.utils.FlagMathUtils;
 import edu.cornell.mannlib.vitro.webapp.utils.Html2Text;
 import edu.cornell.mannlib.vitro.webapp.utils.StringUtils;
@ -157,6 +157,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
    @Override
    protected ResponseValues processRequest(VitroRequest vreq) {
    	log.info("All parameters present in the request: "+ vreq.getParameterMap().toString());
        //There may be other non-html formats in the future
        Format format = getFormat(vreq);            
        boolean wasXmlRequested = Format.XML == format;
@ -178,6 +181,10 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
            VClassDao vclassDao = vreq.getWebappDaoFactory().getVClassDao();
            String alphaFilter = vreq.getParameter("alpha");
            log.info("IndividualDao is " + iDao.toString() + " Public classes in the classgroup are " + grpDao.getPublicGroupsWithVClasses().toString());
            log.info("VClassDao is "+ vclassDao.toString() );
            int startIndex = 0;
            try{ 
                startIndex = Integer.parseInt(vreq.getParameter("startIndex")); 
@ -206,6 +213,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
            String qtxt = vreq.getParameter(VitroQuery.QUERY_PARAMETER_NAME);
            Analyzer analyzer = getAnalyzer(getServletContext());
            log.info("Query text is "+ qtxt + " Analyzer is "+ analyzer.toString());
            Query query = null;
            try {
                query = getQuery(vreq, portalFlag, analyzer, qtxt);
@ -218,6 +227,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
            TopDocs topDocs = null;
            try{
            	log.info("Searching for query term in the Index with maxHitSize "+ maxHitSize);
            	log.info("Query is "+ query.toString());
                topDocs = searcherForRequest.search(query,null,maxHitSize);
            }catch(Throwable t){
                log.error("in first pass at search: " + t);
@ -241,7 +252,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
                return doFailedSearch(msg, qtxt,format);
            }
            int hitsLength = topDocs.scoreDocs.length;
            log.info("No. of hits "+ hitsLength);
            if ( hitsLength < 1 ){                
                return doNoHits(qtxt,format);
            }            
@ -260,6 +273,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
                    if( (i >= startIndex) && (i <= lastHitToShow) ){                        
                        Document doc = searcherForRequest.doc(topDocs.scoreDocs[i].doc);                    
                        String uri = doc.get(Entity2LuceneDoc.term.URI);
                        log.info("Retrieving entity with uri "+ uri);
                        Individual ent = new IndividualImpl();
                        ent.setURI(uri);
                        ent = iDao.getIndividualByURI(uri);
@ -582,11 +596,20 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
                        "query length is " + MAX_QUERY_LENGTH );
                return null;
            }               
            log.info("Parsing query using QueryParser ");
            QueryParser parser = getQueryParser(analyzer);
            query = parser.parse(querystr);
            String alpha = request.getParameter("alpha");
            if( alpha != null && !"".equals(alpha) && alpha.length() == 1){
            	log.info("Firing alpha query ");
            	log.info("request.getParameter(alpha) is " + alpha);
                BooleanQuery boolQuery = new BooleanQuery();
                boolQuery.add( query, BooleanClause.Occur.MUST );
                boolQuery.add( 
@ -598,6 +621,10 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
            //check if this is classgroup filtered
            Object param = request.getParameter("classgroup");
            if( param != null && !"".equals(param)){
            	log.info("Firing classgroup query ");
                log.info("request.getParameter(classgroup) is "+ param.toString());
                  BooleanQuery boolQuery = new BooleanQuery();
                  boolQuery.add( query, BooleanClause.Occur.MUST);
                  boolQuery.add(  new TermQuery(
@ -610,6 +637,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
            //check if this is rdf:type filtered
            param = request.getParameter("type");
            if(  param != null && !"".equals(param)){                         
            	log.info("Firing type query ");
            	log.info("request.getParameter(type) is "+ param.toString());   
            	BooleanQuery boolQuery = new BooleanQuery();
                boolQuery.add( query, BooleanClause.Occur.MUST);
                boolQuery.add(  new TermQuery(
@ -623,6 +653,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
            //it by making a BooelanQuery.
            Query flagQuery = makeFlagQuery( portalState );
            if( flagQuery != null ){
            	log.info("Firing Flag query ");
                BooleanQuery boolQuery = new BooleanQuery();
                boolQuery.add( query, BooleanClause.Occur.MUST);
                boolQuery.add( flagQuery, BooleanClause.Occur.MUST);
@ -646,13 +677,16 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
        //indicated in the query string.
        //The analyzer is needed so that we use the same analyzer on the search queries as
        //was used on the text that was indexed.
-        QueryParser qp = new QueryParser(defaultSearchField,analyzer);
+    	//QueryParser qp = new QueryParser("NAME",analyzer);
        //this sets the query parser to AND all of the query terms it finds.
-        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
+        //qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        //set up the map of stemmed field names -> unstemmed field names
 //        HashMap<String,String> map = new HashMap<String, String>();
 //        map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
 //        qp.setStemmedToUnstemmed(map);
    	MultiFieldQueryParser qp = new MultiFieldQueryParser(new String[]{"ALLTEXT", "name", "type"}, analyzer);
    	return qp;
    }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java
@ -149,16 +149,21 @@ public class Entity2LuceneDoc  implements Obj2DocIface{
                if( clz.getSearchBoost() != null )
                    doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
-                doc.add( new Field(term.RDFTYPE, clz.getURI(), 
+                Field typeField = new Field (term.RDFTYPE, clz.getName(), Field.Store.YES, Field.Index.ANALYZED);
-                                    Field.Store.YES, Field.Index.NOT_ANALYZED));
+                typeField.setBoost(2*FIELD_BOOST);
                doc.add( typeField);
                if( clz.getName() != null )
                    classPublicNames = classPublicNames + " " + clz.getName();
                //Classgroup URI
-                if( clz.getGroupURI() != null )
+                if( clz.getGroupURI() != null ){
-                    doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(), 
+                	Field classGroupField = new Field(term.CLASSGROUP_URI, clz.getGroupURI(), 
-                                        Field.Store.YES, Field.Index.NOT_ANALYZED));
+                            Field.Store.YES, Field.Index.ANALYZED);
                	classGroupField.setBoost(FIELD_BOOST);
                    doc.add(classGroupField);
                }
            }
        }        
        doc.add( new Field(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0", 
@ -184,7 +189,7 @@ public class Entity2LuceneDoc  implements Obj2DocIface{
            value = ent.getLocalName();
        }
        Field name =new Field(term.NAME, value, 
-                               Field.Store.NO, Field.Index.ANALYZED);
+                               Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES);
        name.setBoost( NAME_BOOST );
        doc.add( name );
@ -308,9 +313,9 @@ public class Entity2LuceneDoc  implements Obj2DocIface{
                }
            }
            //stemmed terms
-            doc.add( new  Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED));
+            doc.add( new  Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
            //unstemmed terms
-            doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED));
+            doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
        }
        //flagX and portal flags are no longer indexed.
@ -359,6 +364,7 @@ public class Entity2LuceneDoc  implements Obj2DocIface{
        this.classesProhibitedFromSearch = classesProhibitedFromSearch;
    }
-    public static float NAME_BOOST = 10;
+    public static float NAME_BOOST = 3.0F;
-    public static float KEYWORD_BOOST = 2;
+    public static float KEYWORD_BOOST = 2.0F;
    public static float FIELD_BOOST = 1.0F;
 }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneHighlighter.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneHighlighter.java
@ -1,135 +0,0 @@
 /* $This file is distributed under the terms of the license in /doc/license.txt$ */
 package edu.cornell.mannlib.vitro.webapp.search.lucene;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.highlight.Formatter;
 import org.apache.lucene.search.highlight.Highlighter;
 import org.apache.lucene.search.highlight.NullFragmenter;
 import org.apache.lucene.search.highlight.QueryScorer;
 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
 /**
  * VitroHighlighter implementation backed by the Lucene highlight package.
  * Two Lucene highlighters share one formatter and scorer: one that fragments
  * the text (used by getHighlightFragments) and one that uses a NullFragmenter
  * (used by highlight).
  *
  * NOTE(review): from Lucene 2.9 on, Highlighter.getBestFragment(s) also
  * declares InvalidTokenOffsetsException; this class only handles IOException
  * - confirm against the Lucene version on the classpath.
  */
 public class LuceneHighlighter extends VitroHighlighter{
    /* Matches either one HTML tag (group 1) or a run of text with no '<' in it (group 2). */
    private static final Pattern HTML_OR_NOT = Pattern.compile("(<[^>]*>)|([^<]*)");
    private static final int HTML_PATTERN_INDEX = 1;
    private static final int TEXT_PATTERN_INDEX = 2;
    /* Field name handed to the analyzer and highlighter when tokenizing text. */
    private static final String FIELD_NAME = "contents";
    /* Number of best fragments requested from the fragmenting highlighter. */
    private static final int MAX_FRAGMENTS = 3;
    /* Separator placed between the fragments returned by the highlighter. */
    private static final String FRAGMENT_SEPARATOR = "...";
    Log log = LogFactory.getLog(LuceneHighlighter.class);
    /* See VitroHighlighter for prefix tag and postfix tag */
    Highlighter nonFragHighlighter = null;
    Highlighter fragHighlighter = null;
    Analyzer analyzer = null;
    /* When true, getHighlightFragments wraps its result in the ellipsis string. */
    protected boolean WITH_ELLIPSIS = true;
    protected String ellipsis = "...";
    /**
     * Makes a VitroHighlighter that uses lucene highlighters.
     * PreTag and PostTag are from VitroHighlighter.
     *
     * @param query - the query to highlight for.
     * @param a - the Analyzer that was used in the query.
     */
    public LuceneHighlighter(Query query, Analyzer a){
        QueryScorer scorer = new QueryScorer( query );
        /* See VitroHighlighter for prefix tag and postfix tag */
        Formatter formatter = new SimpleHTMLFormatter(preTag,postTag);
        this.analyzer = a;
        this.fragHighlighter = new Highlighter(formatter, scorer);
        //here we make a highlighter that doesn't fragment
        this.nonFragHighlighter = new Highlighter( formatter, scorer);
        this.nonFragHighlighter.setTextFragmenter(new NullFragmenter());
    }
    /**
     * Highlights in a string. No Fragmenting. Attempts to avoid some HTML:
     * anything that looks like a tag is copied through untouched and only the
     * text between tags is passed to the highlighter.
     *
     * @param in the text to highlight; may be null.
     * @return the highlighted text; null when in is null; in unchanged when
     *         the highlighter fails with an IOException.
     */
    public String highlight( String in){
        if( in == null ){
            // Pattern.matcher(null) would throw; there is nothing to highlight.
            return null;
        }
        Matcher matcher = HTML_OR_NOT.matcher(in);
        if( ! matcher.find() ){
            return in;
        }
        StringBuilder output = new StringBuilder();
        do {
            String foundHtmlElement = matcher.group( HTML_PATTERN_INDEX );
            if( foundHtmlElement != null ){
                output.append( foundHtmlElement );
            }else{
                String foundTextNode = matcher.group( TEXT_PATTERN_INDEX );
                String hi;
                try {
                    hi = nonFragHighlighter.getBestFragment(analyzer, FIELD_NAME, foundTextNode);
                } catch (IOException e) {
                    log.debug("could not highlight, returning the input unchanged", e);
                    return in;
                }
                // a null fragment means the text is kept as it was
                output.append( hi != null ? hi : foundTextNode );
            }
        } while( matcher.find() );
        return output.toString();
    }
    /**
     * Returns the best highlighted fragments of the input.
     *
     * @param in the text to fragment and highlight; may be null.
     * @return with WITH_ELLIPSIS set: the fragments wrapped in the ellipsis
     *         string, or "" when the input or the result is null or blank;
     *         otherwise the raw result of the highlighter (null for null input).
     */
    public String getHighlightFragments(String in ) {
        if( ! WITH_ELLIPSIS ){
            return doHighlight( in , fragHighlighter);
        }
        if( in == null || in.trim().length() == 0 ){
            return "";
        }
        String b = doHighlight( in , fragHighlighter);
        if( b == null || b.trim().length() == 0 ){
            return "";
        }
        return ellipsis + " " + b + " " + ellipsis;
    }
    /**
     * Runs the given highlighter over the input.
     *
     * @param in the text to highlight; may be null.
     * @param hi the highlighter to use.
     * @return the best fragments joined by "...", null for null input, or the
     *         input unchanged when the highlighter fails with an IOException.
     */
    private String doHighlight(String in, Highlighter hi ) {
        if( in == null ){
            return null;
        }
        String result = in;
        TokenStream tokenStream =
            analyzer.tokenStream(FIELD_NAME, new StringReader(in));
        try {
            result = hi.getBestFragments(tokenStream, in , MAX_FRAGMENTS, FRAGMENT_SEPARATOR);
        } catch (IOException e) {
            // best effort: report through the logger and fall back to the plain input
            log.error("could not highlight", e);
        }
        return result;
    }
 }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSearcher.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSearcher.java
@ -306,7 +306,7 @@ public class LuceneSearcher implements Searcher {
     * we need to 'rewrite' the query.  That takes any wild cards
     * and replaces them will all terms that are found in the index.
     */
-    public VitroHighlighter getHighlighter(VitroQuery queryIn){
+/*    public VitroHighlighter getHighlighter(VitroQuery queryIn){
        if( ! (queryIn instanceof LuceneQuery) ){
            log.error("LuceneSearcher expects to get a LuceneQuery");
            throw new Error("LuceneSearcher expects to get a LuceneQuery");
@ -327,6 +327,6 @@ public class LuceneSearcher implements Searcher {
            log.error(e, e);
        }
        return   (VitroHighlighter)highlighter;
-    }
+    }*/
 }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java
@ -21,6 +21,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.KeywordAnalyzer;
 import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.search.BooleanQuery;
 import com.hp.hpl.jena.ontology.OntModel;
@ -229,10 +230,12 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
     */
    private Analyzer getAnalyzer() {
        // Builds the per-field analyzer. KeywordAnalyzer is the wrapper's default,
        // so any field without an explicit registration below is handled by it.
        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new KeywordAnalyzer());
      //  PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer());
    	analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer());
-        analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer());
+      // analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer());
        analyzer.addAnalyzer(ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer());
        analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer());      
        // NOTE(review): NAME is registered with the same analyzer type as the
        // wrapper default, so this call looks redundant - confirm it is intended.
        analyzer.addAnalyzer(NAME, new KeywordAnalyzer());
        return analyzer;
    }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/SimpleLuceneHighlighter.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/SimpleLuceneHighlighter.java
@ -1,83 +0,0 @@
 /* $This file is distributed under the terms of the license in /doc/license.txt$ */
 package edu.cornell.mannlib.vitro.webapp.search.lucene;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.highlight.Formatter;
 import org.apache.lucene.search.highlight.Highlighter;
 import org.apache.lucene.search.highlight.NullFragmenter;
 import org.apache.lucene.search.highlight.QueryScorer;
 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
 import edu.cornell.mannlib.vitro.webapp.utils.Html2Text;
 /**
  * Highlighter and fragmenter used together with PagedSearchController.
  * Input is run through Html2Text before it is highlighted, and scoring is
  * restricted to the Entity2LuceneDoc.term.ALLTEXT field.
  */
 public class SimpleLuceneHighlighter extends VitroHighlighter{    
    private static Log log = LogFactory.getLog(SimpleLuceneHighlighter.class);
    Highlighter fragHighlighter = null;
    Analyzer analyzer = null;
    /**
     * @param query the query whose terms are highlighted.
     * @param a the analyzer used to tokenize the text being highlighted.
     */
    public SimpleLuceneHighlighter(Query query, Analyzer a){
        QueryScorer allTextScorer = new QueryScorer( query ,Entity2LuceneDoc.term.ALLTEXT);
        Formatter tagFormatter = new SimpleHTMLFormatter(preTag,postTag);
        this.fragHighlighter = new Highlighter(tagFormatter, allTextScorer);
        this.analyzer = a;
    }
    /**
     * Not really implemented: hands the input back untouched.
     */
    @Override
    public String highlight( String in){
        return in;
    }
    /**
     * Converts the input with Html2Text, highlights the best fragments of the
     * resulting text and wraps them in "..." on both sides.
     *
     * @return the wrapped fragments, or "" when either the converted text or
     *         the highlighted result is null or blank.
     */
    @Override
    public String getHighlightFragments(String in ) {
        Html2Text stripper = new Html2Text();
        try{
            stripper.parse(in);
        }catch(IOException ioe){
            log.debug("could not strip html from string",ioe);
        }
        String plainText = stripper.getText();
        if( plainText == null || plainText.trim().length() == 0 ){
            return "";
        }
        String fragments = doHighlight( plainText ,fragHighlighter);
        if( fragments == null || fragments.trim().length() == 0 ){
            return "";
        }
        return "..." + " " + fragments + " " + "...";
    }
    /**
     * Asks the given highlighter for the 3 best fragments, separated by "...".
     * Null input yields null; when highlighting fails with an IOException the
     * input text is returned as is.
     */
    private String doHighlight(String in, Highlighter hi ) {
        if( in == null ){
            return null;
        }
        TokenStream tokens =
            analyzer.tokenStream(Entity2LuceneDoc.term.ALLTEXT, new StringReader(in));
        try {
            return hi.getBestFragments(tokens, in , 3, "...");
        } catch (IOException e) {
            log.debug("could not highlight",e);
            return in;
        }
    }
 }