diff --git a/webapp/lib/apache-solr-core-1.4.1.jar b/webapp/lib/apache-solr-core-1.4.1.jar new file mode 100644 index 000000000..2a5d49ed7 Binary files /dev/null and b/webapp/lib/apache-solr-core-1.4.1.jar differ diff --git a/webapp/lib/lucene-analyzers-2.4.jar b/webapp/lib/lucene-analyzers-2.4.jar deleted file mode 100644 index 5cd81f6fa..000000000 Binary files a/webapp/lib/lucene-analyzers-2.4.jar and /dev/null differ diff --git a/webapp/lib/lucene-analyzers-2.9.3.jar b/webapp/lib/lucene-analyzers-2.9.3.jar new file mode 100644 index 000000000..05bb2d576 Binary files /dev/null and b/webapp/lib/lucene-analyzers-2.9.3.jar differ diff --git a/webapp/lib/lucene-core-2.4.0.jar b/webapp/lib/lucene-core-2.4.0.jar deleted file mode 100644 index 440d76865..000000000 Binary files a/webapp/lib/lucene-core-2.4.0.jar and /dev/null differ diff --git a/webapp/lib/lucene-core-2.9.3.jar b/webapp/lib/lucene-core-2.9.3.jar new file mode 100644 index 000000000..4351f976b Binary files /dev/null and b/webapp/lib/lucene-core-2.9.3.jar differ diff --git a/webapp/lib/lucene-highlighter-2.4.jar b/webapp/lib/lucene-highlighter-2.4.jar deleted file mode 100644 index 31332477b..000000000 Binary files a/webapp/lib/lucene-highlighter-2.4.jar and /dev/null differ diff --git a/webapp/lib/lucene-highlighter-2.9.3.jar b/webapp/lib/lucene-highlighter-2.9.3.jar new file mode 100644 index 000000000..1b9cce4ec Binary files /dev/null and b/webapp/lib/lucene-highlighter-2.9.3.jar differ diff --git a/webapp/lib/solr-1.0.jar b/webapp/lib/solr-1.0.jar deleted file mode 100644 index 96bee9482..000000000 Binary files a/webapp/lib/solr-1.0.jar and /dev/null differ diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/beans/Searcher.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/beans/Searcher.java index 6d1fb6e30..fdf5b716a 100755 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/beans/Searcher.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/beans/Searcher.java @@ -41,7 +41,7 @@ public interface Searcher { * @param q * @return */ - public abstract VitroHighlighter getHighlighter(VitroQuery q); + // public abstract VitroHighlighter getHighlighter(VitroQuery q); /** * Used to close the searcher if the index that it was using gets diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java index 379cd1c83..94e7d0750 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java @@ -28,6 +28,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.Term; +import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.BooleanClause; @@ -69,7 +70,6 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory; import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc; import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory; import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup; -import edu.cornell.mannlib.vitro.webapp.search.lucene.SimpleLuceneHighlighter; import edu.cornell.mannlib.vitro.webapp.utils.FlagMathUtils; import edu.cornell.mannlib.vitro.webapp.utils.Html2Text; import edu.cornell.mannlib.vitro.webapp.utils.StringUtils; @@ -157,6 +157,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear @Override protected ResponseValues processRequest(VitroRequest vreq) { + + log.info("All parameters present in the request: "+ vreq.getParameterMap().toString()); + //There may be other non-html formats in the future Format format = getFormat(vreq); boolean wasXmlRequested = Format.XML == format; @@ -178,6 +181,10 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear VClassDao vclassDao = vreq.getWebappDaoFactory().getVClassDao(); String alphaFilter = vreq.getParameter("alpha"); + + log.info("IndividualDao is " + iDao.toString() + " Public classes in the classgroup are " + grpDao.getPublicGroupsWithVClasses().toString()); + log.info("VClassDao is "+ vclassDao.toString() ); + int startIndex = 0; try{ startIndex = Integer.parseInt(vreq.getParameter("startIndex")); @@ -206,6 +213,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear String qtxt = vreq.getParameter(VitroQuery.QUERY_PARAMETER_NAME); Analyzer analyzer = getAnalyzer(getServletContext()); + log.info("Query text is "+ qtxt + " Analyzer is "+ analyzer.toString()); + Query query = null; try { query = getQuery(vreq, portalFlag, analyzer, qtxt); @@ -218,6 +227,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear TopDocs topDocs = null; try{ + log.info("Searching for query term in the Index with maxHitSize "+ maxHitSize); + log.info("Query is "+ query.toString()); topDocs = searcherForRequest.search(query,null,maxHitSize); }catch(Throwable t){ log.error("in first pass at search: " + t); @@ -241,7 +252,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear return doFailedSearch(msg, qtxt,format); } + int hitsLength = topDocs.scoreDocs.length; + log.info("No. of hits "+ hitsLength); if ( hitsLength < 1 ){ return doNoHits(qtxt,format); } @@ -260,6 +273,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear if( (i >= startIndex) && (i <= lastHitToShow) ){ Document doc = searcherForRequest.doc(topDocs.scoreDocs[i].doc); String uri = doc.get(Entity2LuceneDoc.term.URI); + log.info("Retrieving entity with uri "+ uri); Individual ent = new IndividualImpl(); ent.setURI(uri); ent = iDao.getIndividualByURI(uri); @@ -582,11 +596,20 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear "query length is " + MAX_QUERY_LENGTH ); return null; } + + log.info("Parsing query using QueryParser "); + QueryParser parser = getQueryParser(analyzer); query = parser.parse(querystr); String alpha = request.getParameter("alpha"); + + if( alpha != null && !"".equals(alpha) && alpha.length() == 1){ + + log.info("Firing alpha query "); + log.info("request.getParameter(alpha) is " + alpha); + BooleanQuery boolQuery = new BooleanQuery(); boolQuery.add( query, BooleanClause.Occur.MUST ); boolQuery.add( @@ -597,7 +620,11 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear //check if this is classgroup filtered Object param = request.getParameter("classgroup"); - if( param != null && !"".equals(param)){ + if( param != null && !"".equals(param)){ + + log.info("Firing classgroup query "); + log.info("request.getParameter(classgroup) is "+ param.toString()); + BooleanQuery boolQuery = new BooleanQuery(); boolQuery.add( query, BooleanClause.Occur.MUST); boolQuery.add( new TermQuery( @@ -609,8 +636,11 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear //check if this is rdf:type filtered param = request.getParameter("type"); - if( param != null && !"".equals(param)){ - BooleanQuery boolQuery = new BooleanQuery(); + if( param != null && !"".equals(param)){ + log.info("Firing type query "); + log.info("request.getParameter(type) is "+ param.toString()); + + BooleanQuery boolQuery = new BooleanQuery(); boolQuery.add( query, BooleanClause.Occur.MUST); boolQuery.add( new TermQuery( new Term(Entity2LuceneDoc.term.RDFTYPE, @@ -623,6 +653,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear //it by making a BooelanQuery. Query flagQuery = makeFlagQuery( portalState ); if( flagQuery != null ){ + log.info("Firing Flag query "); BooleanQuery boolQuery = new BooleanQuery(); boolQuery.add( query, BooleanClause.Occur.MUST); boolQuery.add( flagQuery, BooleanClause.Occur.MUST); @@ -646,14 +677,17 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear //indicated in the query string. //The analyzer is needed so that we use the same analyzer on the search queries as //was used on the text that was indexed. - QueryParser qp = new QueryParser(defaultSearchField,analyzer); + //QueryParser qp = new QueryParser("NAME",analyzer); //this sets the query parser to AND all of the query terms it finds. - qp.setDefaultOperator(QueryParser.AND_OPERATOR); + //qp.setDefaultOperator(QueryParser.AND_OPERATOR); //set up the map of stemmed field names -> unstemmed field names // HashMap map = new HashMap(); // map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED); // qp.setStemmedToUnstemmed(map); - return qp; + + MultiFieldQueryParser qp = new MultiFieldQueryParser(new String[]{"ALLTEXT", "name", "type"}, analyzer); + + return qp; } /** diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java index d188cbff0..0c228b84f 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java @@ -149,16 +149,21 @@ public class Entity2LuceneDoc implements Obj2DocIface{ if( clz.getSearchBoost() != null ) doc.setBoost( doc.getBoost() + clz.getSearchBoost() ); - doc.add( new Field(term.RDFTYPE, clz.getURI(), - Field.Store.YES, Field.Index.NOT_ANALYZED)); + Field typeField = new Field (term.RDFTYPE, clz.getName(), Field.Store.YES, Field.Index.ANALYZED); + typeField.setBoost(2*FIELD_BOOST); + + doc.add( typeField); if( clz.getName() != null ) classPublicNames = classPublicNames + " " + clz.getName(); //Classgroup URI - if( clz.getGroupURI() != null ) - doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(), - Field.Store.YES, Field.Index.NOT_ANALYZED)); + if( clz.getGroupURI() != null ){ + Field classGroupField = new Field(term.CLASSGROUP_URI, clz.getGroupURI(), + Field.Store.YES, Field.Index.ANALYZED); + classGroupField.setBoost(FIELD_BOOST); + doc.add(classGroupField); + } } } doc.add( new Field(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0", @@ -184,7 +189,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{ value = ent.getLocalName(); } Field name =new Field(term.NAME, value, - Field.Store.NO, Field.Index.ANALYZED); + Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES); name.setBoost( NAME_BOOST ); doc.add( name ); @@ -238,7 +243,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{ } }catch (Exception ex){ value = null; - } + } if( value != null ) doc.add( new Field(term.SUNSET, value, Field.Store.YES, Field.Index.NOT_ANALYZED)); else @@ -308,9 +313,9 @@ public class Entity2LuceneDoc implements Obj2DocIface{ } } //stemmed terms - doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED)); + doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)); //unstemmed terms - doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED)); + doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES)); } //flagX and portal flags are no longer indexed. @@ -359,6 +364,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{ this.classesProhibitedFromSearch = classesProhibitedFromSearch; } - public static float NAME_BOOST = 10; - public static float KEYWORD_BOOST = 2; + public static float NAME_BOOST = 3.0F; + public static float KEYWORD_BOOST = 2.0F; + public static float FIELD_BOOST = 1.0F; } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneHighlighter.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneHighlighter.java deleted file mode 100644 index 5467b5958..000000000 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneHighlighter.java +++ /dev/null @@ -1,135 +0,0 @@ -/* $This file is distributed under the terms of the license in /doc/license.txt$ */ - -package edu.cornell.mannlib.vitro.webapp.search.lucene; - -import java.io.IOException; -import java.io.StringReader; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.highlight.Formatter; -import org.apache.lucene.search.highlight.Highlighter; -import org.apache.lucene.search.highlight.NullFragmenter; -import org.apache.lucene.search.highlight.QueryScorer; -import org.apache.lucene.search.highlight.SimpleHTMLFormatter; - -import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter; - -public class LuceneHighlighter extends VitroHighlighter{ - /* See VitroHighlighter for prefix tag and postfix tag */ - - Highlighter nonFragHighlighter = null; - Highlighter fragHighlighter = null; - - Analyzer analyzer = null; - - /** - * Makes a VitroHighlighter that uses lucene highlighters. - * PreTag and PostTag are from VitroHighlighter. - * - * @param query - the query to highlight for. - * @param a - the Analyzer that was used in the query. - */ - public LuceneHighlighter(Query query, Analyzer a){ - QueryScorer scorer = new QueryScorer( query ); - /* See VitroHighlighter for prefix tag and postfix tag */ - Formatter formatter = - new SimpleHTMLFormatter(preTag,postTag); - this.analyzer = a; - this.fragHighlighter = new Highlighter(formatter, scorer); - - //here we make a highlighter that doesn't fragment - this.nonFragHighlighter = new Highlighter( formatter, scorer); - this.nonFragHighlighter.setTextFragmenter(new NullFragmenter()); - } - - - private Pattern htmlOrNot = Pattern.compile("(<[^>]*>)|([^<]*)"); - private int HTML_PATTERN_INDEX = 1; - private int TEXT_PATTERN_INDEX = 2; - /** - * Highlights in a string. No Fragmenting. Attempts to avoid some HTML. - * @param in - * @return - */ - public String highlight( String in){ - Matcher matcher = htmlOrNot.matcher(in); - StringBuilder output = new StringBuilder(); - - boolean found = matcher.find(); - if( ! found ) - return in; - - while( found ){ - String foundHtmlElement = matcher.group( HTML_PATTERN_INDEX ); - if( foundHtmlElement != null ){ - output.append( foundHtmlElement ); - }else{ - String foundTextNode = matcher.group( TEXT_PATTERN_INDEX ); - String hi = foundTextNode; - try { - hi = nonFragHighlighter.getBestFragment(analyzer,"contents",foundTextNode); - } catch (IOException e) { - return in; - } - if( hi != null ) - output.append( hi ); - else - output.append( foundTextNode ); - } - found = matcher.find(); - } - return output.toString(); - } - - - - - - protected boolean WITH_ELLIPSIS = true; - protected String ellipsis = "..."; - public String getHighlightFragments(String in ) { - - if(WITH_ELLIPSIS ){ - if( in != null && in.trim().length() > 0){ - String b = doHighlight( in ,fragHighlighter); - if( b != null && b.trim().length() > 0 ) - return ellipsis + " " + b + " " + ellipsis; - else - return ""; - } else { - return ""; - } - } else { - return doHighlight( in , fragHighlighter); - } - } - - private String doHighlight(String in, Highlighter hi ) { - String result = in; - - if(in != null ){ - - - TokenStream tokenStream = - analyzer.tokenStream("contents", new StringReader(in)); - // Get 3 best fragments and seperate with a "..." - try { - result = hi.getBestFragments(tokenStream, in , 3, "..."); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - - return result; - } - - private final int maxDocCharsToAnalyze = 4000; - Log log = LogFactory.getLog(LuceneHighlighter.class); -} diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSearcher.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSearcher.java index 8d0a16004..2888ed935 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSearcher.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSearcher.java @@ -306,7 +306,7 @@ public class LuceneSearcher implements Searcher { * we need to 'rewrite' the query. That takes any wild cards * and replaces them will all terms that are found in the index. */ - public VitroHighlighter getHighlighter(VitroQuery queryIn){ +/* public VitroHighlighter getHighlighter(VitroQuery queryIn){ if( ! (queryIn instanceof LuceneQuery) ){ log.error("LuceneSearcher expects to get a LuceneQuery"); throw new Error("LuceneSearcher expects to get a LuceneQuery"); @@ -327,6 +327,6 @@ public class LuceneSearcher implements Searcher { log.error(e, e); } return (VitroHighlighter)highlighter; - } + }*/ } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java index 1d22721c2..3e54c9b14 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java @@ -21,6 +21,7 @@ import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.KeywordAnalyzer; import org.apache.lucene.analysis.PerFieldAnalyzerWrapper; +import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.search.BooleanQuery; import com.hp.hpl.jena.ontology.OntModel; @@ -229,10 +230,12 @@ public class LuceneSetup implements javax.servlet.ServletContextListener { */ private Analyzer getAnalyzer() { PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new KeywordAnalyzer()); - analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer()); - analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer()); + // PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer()); + analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer()); + // analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer()); analyzer.addAnalyzer(ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer()); - analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer()); + analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer()); + analyzer.addAnalyzer(NAME, new KeywordAnalyzer()); return analyzer; } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/SimpleLuceneHighlighter.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/SimpleLuceneHighlighter.java deleted file mode 100644 index 2caa8a551..000000000 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/SimpleLuceneHighlighter.java +++ /dev/null @@ -1,83 +0,0 @@ -/* $This file is distributed under the terms of the license in /doc/license.txt$ */ - -package edu.cornell.mannlib.vitro.webapp.search.lucene; - -import java.io.IOException; -import java.io.StringReader; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.highlight.Formatter; -import org.apache.lucene.search.highlight.Highlighter; -import org.apache.lucene.search.highlight.NullFragmenter; -import org.apache.lucene.search.highlight.QueryScorer; -import org.apache.lucene.search.highlight.SimpleHTMLFormatter; - -import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter; -import edu.cornell.mannlib.vitro.webapp.utils.Html2Text; - -/** - * This is a highlighter and fragmenter for use with PagedSearchController. - */ -public class SimpleLuceneHighlighter extends VitroHighlighter{ - Highlighter fragHighlighter = null; - Analyzer analyzer = null; - - public SimpleLuceneHighlighter(Query query, Analyzer a){ - QueryScorer scorer = new QueryScorer( query ,Entity2LuceneDoc.term.ALLTEXT); - - Formatter formatter = - new SimpleHTMLFormatter(preTag,postTag); - this.analyzer = a; - this.fragHighlighter = new Highlighter(formatter, scorer); - } - - @Override - public String highlight( String in){ - //not really implemented. - return in; - } - - @Override - public String getHighlightFragments(String in ) { - Html2Text h2t = new Html2Text(); - try{ - h2t.parse(in); - }catch(IOException ioe){ - log.debug("could not strip html from string",ioe); - } - String txt = h2t.getText(); - - if( txt != null && txt.trim().length() > 0){ - String b = doHighlight( txt ,fragHighlighter); - if( b != null && b.trim().length() > 0 ) - return "..." + " " + b + " " + "..."; - else - return ""; - } else { - return ""; - } - } - - private String doHighlight(String in, Highlighter hi ) { - String result = in; - if(in != null ){ - TokenStream tokenStream = - analyzer.tokenStream(Entity2LuceneDoc.term.ALLTEXT, new StringReader(in)); - try { - //Get 3 best fragments and seperate with a "..." - result = hi.getBestFragments(tokenStream, in , 3, "..."); - } catch (IOException e) { - log.debug("could not highlight",e); - } - } - return result; - } - - private static Log log = LogFactory.getLog(SimpleLuceneHighlighter.class); -}