Upgraded lucene-2.4-*.jar to lucene-2.9.3-*.jar. Upgraded solr-1.0.jar to apache-solr-core-1.4.1.jar. Removed LuceneHighlighter and SimpleLuceneHighlighter as they are no longer used. Made minor changes to the way the Query is parsed in PagedSearchController: it is now built with a MultiFieldQueryParser that matches the query text against the ALLTEXT, NAME and TYPE fields in the search index.

This commit is contained in:
bkoniden 2011-03-14 13:41:16 +00:00
parent af8ce43e16
commit 801d789696
15 changed files with 67 additions and 242 deletions

View file

@ -41,7 +41,7 @@ public interface Searcher {
* @param q
* @return
*/
public abstract VitroHighlighter getHighlighter(VitroQuery q);
// public abstract VitroHighlighter getHighlighter(VitroQuery q);
/**
* Used to close the searcher if the index that it was using gets

View file

@ -28,6 +28,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause;
@ -69,7 +70,6 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
import edu.cornell.mannlib.vitro.webapp.search.lucene.SimpleLuceneHighlighter;
import edu.cornell.mannlib.vitro.webapp.utils.FlagMathUtils;
import edu.cornell.mannlib.vitro.webapp.utils.Html2Text;
import edu.cornell.mannlib.vitro.webapp.utils.StringUtils;
@ -157,6 +157,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
@Override
protected ResponseValues processRequest(VitroRequest vreq) {
log.info("All parameters present in the request: "+ vreq.getParameterMap().toString());
//There may be other non-html formats in the future
Format format = getFormat(vreq);
boolean wasXmlRequested = Format.XML == format;
@ -178,6 +181,10 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
VClassDao vclassDao = vreq.getWebappDaoFactory().getVClassDao();
String alphaFilter = vreq.getParameter("alpha");
log.info("IndividualDao is " + iDao.toString() + " Public classes in the classgroup are " + grpDao.getPublicGroupsWithVClasses().toString());
log.info("VClassDao is "+ vclassDao.toString() );
int startIndex = 0;
try{
startIndex = Integer.parseInt(vreq.getParameter("startIndex"));
@ -206,6 +213,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
String qtxt = vreq.getParameter(VitroQuery.QUERY_PARAMETER_NAME);
Analyzer analyzer = getAnalyzer(getServletContext());
log.info("Query text is "+ qtxt + " Analyzer is "+ analyzer.toString());
Query query = null;
try {
query = getQuery(vreq, portalFlag, analyzer, qtxt);
@ -218,6 +227,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
TopDocs topDocs = null;
try{
log.info("Searching for query term in the Index with maxHitSize "+ maxHitSize);
log.info("Query is "+ query.toString());
topDocs = searcherForRequest.search(query,null,maxHitSize);
}catch(Throwable t){
log.error("in first pass at search: " + t);
@ -241,7 +252,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
return doFailedSearch(msg, qtxt,format);
}
int hitsLength = topDocs.scoreDocs.length;
log.info("No. of hits "+ hitsLength);
if ( hitsLength < 1 ){
return doNoHits(qtxt,format);
}
@ -260,6 +273,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
if( (i >= startIndex) && (i <= lastHitToShow) ){
Document doc = searcherForRequest.doc(topDocs.scoreDocs[i].doc);
String uri = doc.get(Entity2LuceneDoc.term.URI);
log.info("Retrieving entity with uri "+ uri);
Individual ent = new IndividualImpl();
ent.setURI(uri);
ent = iDao.getIndividualByURI(uri);
@ -582,11 +596,20 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
"query length is " + MAX_QUERY_LENGTH );
return null;
}
log.info("Parsing query using QueryParser ");
QueryParser parser = getQueryParser(analyzer);
query = parser.parse(querystr);
String alpha = request.getParameter("alpha");
if( alpha != null && !"".equals(alpha) && alpha.length() == 1){
log.info("Firing alpha query ");
log.info("request.getParameter(alpha) is " + alpha);
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST );
boolQuery.add(
@ -597,7 +620,11 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//check if this is classgroup filtered
Object param = request.getParameter("classgroup");
if( param != null && !"".equals(param)){
if( param != null && !"".equals(param)){
log.info("Firing classgroup query ");
log.info("request.getParameter(classgroup) is "+ param.toString());
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST);
boolQuery.add( new TermQuery(
@ -609,8 +636,11 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//check if this is rdf:type filtered
param = request.getParameter("type");
if( param != null && !"".equals(param)){
BooleanQuery boolQuery = new BooleanQuery();
if( param != null && !"".equals(param)){
log.info("Firing type query ");
log.info("request.getParameter(type) is "+ param.toString());
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST);
boolQuery.add( new TermQuery(
new Term(Entity2LuceneDoc.term.RDFTYPE,
@ -623,6 +653,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//it by making a BooleanQuery.
Query flagQuery = makeFlagQuery( portalState );
if( flagQuery != null ){
log.info("Firing Flag query ");
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST);
boolQuery.add( flagQuery, BooleanClause.Occur.MUST);
@ -646,14 +677,17 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//indicated in the query string.
//The analyzer is needed so that we use the same analyzer on the search queries as
//was used on the text that was indexed.
QueryParser qp = new QueryParser(defaultSearchField,analyzer);
//QueryParser qp = new QueryParser("NAME",analyzer);
//this sets the query parser to AND all of the query terms it finds.
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
//qp.setDefaultOperator(QueryParser.AND_OPERATOR);
//set up the map of stemmed field names -> unstemmed field names
// HashMap<String,String> map = new HashMap<String, String>();
// map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
// qp.setStemmedToUnstemmed(map);
return qp;
MultiFieldQueryParser qp = new MultiFieldQueryParser(new String[]{"ALLTEXT", "name", "type"}, analyzer);
return qp;
}
/**

View file

@ -149,16 +149,21 @@ public class Entity2LuceneDoc implements Obj2DocIface{
if( clz.getSearchBoost() != null )
doc.setBoost( doc.getBoost() + clz.getSearchBoost() );
doc.add( new Field(term.RDFTYPE, clz.getURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
Field typeField = new Field (term.RDFTYPE, clz.getName(), Field.Store.YES, Field.Index.ANALYZED);
typeField.setBoost(2*FIELD_BOOST);
doc.add( typeField);
if( clz.getName() != null )
classPublicNames = classPublicNames + " " + clz.getName();
//Classgroup URI
if( clz.getGroupURI() != null )
doc.add( new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
Field.Store.YES, Field.Index.NOT_ANALYZED));
if( clz.getGroupURI() != null ){
Field classGroupField = new Field(term.CLASSGROUP_URI, clz.getGroupURI(),
Field.Store.YES, Field.Index.ANALYZED);
classGroupField.setBoost(FIELD_BOOST);
doc.add(classGroupField);
}
}
}
doc.add( new Field(term.PROHIBITED_FROM_TEXT_RESULTS, prohibited?"1":"0",
@ -184,7 +189,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
value = ent.getLocalName();
}
Field name =new Field(term.NAME, value,
Field.Store.NO, Field.Index.ANALYZED);
Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES);
name.setBoost( NAME_BOOST );
doc.add( name );
@ -238,7 +243,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}
}catch (Exception ex){
value = null;
}
}
if( value != null )
doc.add( new Field(term.SUNSET, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
else
@ -308,9 +313,9 @@ public class Entity2LuceneDoc implements Obj2DocIface{
}
}
//stemmed terms
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED));
doc.add( new Field(term.ALLTEXT, value , Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
//unstemmed terms
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED));
doc.add( new Field(term.ALLTEXTUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.YES));
}
//flagX and portal flags are no longer indexed.
@ -359,6 +364,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{
this.classesProhibitedFromSearch = classesProhibitedFromSearch;
}
public static float NAME_BOOST = 10;
public static float KEYWORD_BOOST = 2;
public static float NAME_BOOST = 3.0F;
public static float KEYWORD_BOOST = 2.0F;
public static float FIELD_BOOST = 1.0F;
}

View file

@ -1,135 +0,0 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.search.lucene;
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
/**
 * A {@link VitroHighlighter} backed by the Lucene highlight package.
 * Wraps two Lucene {@link Highlighter}s built from the same query/formatter:
 * one that fragments text (for snippet generation) and one that does not
 * (for whole-string highlighting).
 */
public class LuceneHighlighter extends VitroHighlighter{
    /* See VitroHighlighter for prefix tag and postfix tag */
    Highlighter nonFragHighlighter = null;
    Highlighter fragHighlighter = null;
    Analyzer analyzer = null;

    /**
     * Makes a VitroHighlighter that uses lucene highlighters.
     * PreTag and PostTag are from VitroHighlighter.
     *
     * @param query - the query to highlight for.
     * @param a - the Analyzer that was used in the query.
     */
    public LuceneHighlighter(Query query, Analyzer a){
        QueryScorer scorer = new QueryScorer( query );
        /* See VitroHighlighter for prefix tag and postfix tag */
        Formatter formatter =
            new SimpleHTMLFormatter(preTag, postTag);
        this.analyzer = a;
        this.fragHighlighter = new Highlighter(formatter, scorer);
        // here we make a highlighter that doesn't fragment
        this.nonFragHighlighter = new Highlighter(formatter, scorer);
        this.nonFragHighlighter.setTextFragmenter(new NullFragmenter());
    }

    // Alternates between capturing an HTML tag (group 1) and a run of
    // non-tag text (group 2), so highlighting can skip markup.
    // static final: the pattern is immutable and compiling it once is enough.
    private static final Pattern htmlOrNot = Pattern.compile("(<[^>]*>)|([^<]*)");
    private static final int HTML_PATTERN_INDEX = 1;
    private static final int TEXT_PATTERN_INDEX = 2;

    /**
     * Highlights in a string. No Fragmenting. Attempts to avoid some HTML:
     * tag spans are copied through untouched, only text nodes are highlighted.
     *
     * @param in text (possibly containing HTML) to highlight
     * @return the input with query terms wrapped in pre/post tags; the input
     *         unchanged if nothing matched or highlighting failed
     */
    public String highlight( String in){
        Matcher matcher = htmlOrNot.matcher(in);
        StringBuilder output = new StringBuilder();

        boolean found = matcher.find();
        if( ! found )
            return in;

        while( found ){
            String foundHtmlElement = matcher.group( HTML_PATTERN_INDEX );
            if( foundHtmlElement != null ){
                // HTML tag: pass through unmodified
                output.append( foundHtmlElement );
            }else{
                String foundTextNode = matcher.group( TEXT_PATTERN_INDEX );
                String hi = foundTextNode;
                try {
                    hi = nonFragHighlighter.getBestFragment(analyzer,"contents",foundTextNode);
                } catch (IOException e) {
                    // best-effort: on failure return the original, unhighlighted text
                    return in;
                }
                if( hi != null )
                    output.append( hi );
                else
                    // no query term in this text node; keep it as-is
                    output.append( foundTextNode );
            }
            found = matcher.find();
        }
        return output.toString();
    }

    protected boolean WITH_ELLIPSIS = true;
    protected String ellipsis = "...";

    /**
     * Produces highlighted fragments of the input, optionally wrapped in
     * ellipses. Returns "" for blank input or when no fragment matched.
     */
    public String getHighlightFragments(String in ) {
        if( WITH_ELLIPSIS ){
            if( in != null && in.trim().length() > 0){
                String b = doHighlight( in, fragHighlighter);
                if( b != null && b.trim().length() > 0 )
                    return ellipsis + " " + b + " " + ellipsis;
                else
                    return "";
            } else {
                return "";
            }
        } else {
            return doHighlight( in, fragHighlighter);
        }
    }

    /**
     * Runs the given highlighter over the input.
     * Returns the input unchanged on null input or I/O failure.
     */
    private String doHighlight(String in, Highlighter hi ) {
        String result = in;
        if( in != null ){
            TokenStream tokenStream =
                analyzer.tokenStream("contents", new StringReader(in));
            // Get 3 best fragments and separate with a "..."
            try {
                result = hi.getBestFragments(tokenStream, in, 3, "...");
            } catch (IOException e) {
                // was e.printStackTrace(); log instead of writing to stderr
                log.error("could not highlight", e);
            }
        }
        return result;
    }

    private static final Log log = LogFactory.getLog(LuceneHighlighter.class);
}

View file

@ -306,7 +306,7 @@ public class LuceneSearcher implements Searcher {
* we need to 'rewrite' the query. That takes any wild cards
* and replaces them will all terms that are found in the index.
*/
public VitroHighlighter getHighlighter(VitroQuery queryIn){
/* public VitroHighlighter getHighlighter(VitroQuery queryIn){
if( ! (queryIn instanceof LuceneQuery) ){
log.error("LuceneSearcher expects to get a LuceneQuery");
throw new Error("LuceneSearcher expects to get a LuceneQuery");
@ -327,6 +327,6 @@ public class LuceneSearcher implements Searcher {
log.error(e, e);
}
return (VitroHighlighter)highlighter;
}
}*/
}

View file

@ -21,6 +21,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.KeywordAnalyzer;
import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.search.BooleanQuery;
import com.hp.hpl.jena.ontology.OntModel;
@ -229,10 +230,12 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
*/
private Analyzer getAnalyzer() {
PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new KeywordAnalyzer());
analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer());
// PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer());
analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer());
// analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(NAME, new KeywordAnalyzer());
return analyzer;
}

View file

@ -1,83 +0,0 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.search.lucene;
import java.io.IOException;
import java.io.StringReader;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.NullFragmenter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
import edu.cornell.mannlib.vitro.webapp.utils.Html2Text;
/**
* This is a highlighter and fragmenter for use with PagedSearchController.
*/
/**
 * Highlighter and fragmenter used by PagedSearchController: strips HTML
 * from the input, then emits the best matching fragments wrapped in
 * ellipses.
 */
public class SimpleLuceneHighlighter extends VitroHighlighter{
    Highlighter fragHighlighter = null;
    Analyzer analyzer = null;

    /**
     * @param query query whose terms should be highlighted
     * @param a     analyzer that was used when the text was indexed
     */
    public SimpleLuceneHighlighter(Query query, Analyzer a){
        Formatter tagFormatter = new SimpleHTMLFormatter(preTag, postTag);
        QueryScorer termScorer = new QueryScorer(query, Entity2LuceneDoc.term.ALLTEXT);
        this.analyzer = a;
        this.fragHighlighter = new Highlighter(tagFormatter, termScorer);
    }

    /** Whole-string highlighting is not supported; input is returned as-is. */
    @Override
    public String highlight( String in){
        //not really implemented.
        return in;
    }

    /**
     * Strips HTML from the input, highlights the plain text, and returns
     * the best fragments wrapped in ellipses; "" when there is nothing
     * to show.
     */
    @Override
    public String getHighlightFragments(String in ) {
        Html2Text stripper = new Html2Text();
        try{
            stripper.parse(in);
        }catch(IOException ioe){
            log.debug("could not strip html from string",ioe);
        }

        String plainText = stripper.getText();
        if( plainText == null || plainText.trim().length() == 0 )
            return "";

        String fragment = doHighlight(plainText, fragHighlighter);
        if( fragment == null || fragment.trim().length() == 0 )
            return "";

        return "..." + " " + fragment + " " + "...";
    }

    /**
     * Runs the highlighter over the input; returns the input unchanged
     * on null input or I/O failure.
     */
    private String doHighlight(String in, Highlighter hi ) {
        if( in == null )
            return in;

        TokenStream tokenStream =
            analyzer.tokenStream(Entity2LuceneDoc.term.ALLTEXT, new StringReader(in));
        try {
            //Get 3 best fragments and separate with a "..."
            return hi.getBestFragments(tokenStream, in, 3, "...");
        } catch (IOException e) {
            log.debug("could not highlight",e);
        }
        return in;
    }

    private static Log log = LogFactory.getLog(SimpleLuceneHighlighter.class);
}