NIHVIVO-774 Fine-tune tokenized autocomplete query to get desired results

This commit is contained in:
rjy7 2010-07-13 17:47:34 +00:00
parent 35f60021dd
commit 0f93083a66

View file

@ -26,6 +26,8 @@ import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document; import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term; import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.IndexSearcher;
@ -54,13 +56,13 @@ import freemarker.template.Configuration;
/** /**
* AutocompleteController is used to generate autocomplete and select element content * AutocompleteController is used to generate autocomplete and select element content
* through a Lucene search. The search logic is copied from PagedSearchController. * through a Lucene search. The search logic is copied from AutocompleteController.
*/ */
/* rjy7 We should have a SearchController that is subclassed by both PagedSearchController /* rjy7 We should have a SearchController that is subclassed by both AutocompleteController
* and AjaxSearchController, so the methods don't all have to be copied into both places. * and AjaxSearchController, so the methods don't all have to be copied into both places.
* The parent SearchController should extend FreeMarkerHttpServlet. Can only be done * The parent SearchController should extend FreeMarkerHttpServlet. Can only be done
* once PagedSearchController has been moved to FreeMarker. * once AutocompleteController has been moved to FreeMarker.
*/ */
public class AutocompleteController extends FreeMarkerHttpServlet implements Searcher{ public class AutocompleteController extends FreeMarkerHttpServlet implements Searcher{
@ -219,8 +221,7 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
return null; return null;
} }
query = makeNameQuery(querystr, request); query = makeNameQuery(querystr, analyzer, request);
// Filter by type // Filter by type
{ {
@ -251,45 +252,67 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
return query; return query;
} }
private Query makeNameQuery(String querystr, HttpServletRequest request) { private Query makeNameQuery(String querystr, Analyzer analyzer, HttpServletRequest request) {
String stemParam = (String) request.getParameter("stem");
boolean stem = "true".equals(stemParam);
String tokenizeParam = (String) request.getParameter("stem"); String tokenizeParam = (String) request.getParameter("tokenize");
boolean tokenize = "true".equals(tokenizeParam); boolean tokenize = "true".equals(tokenizeParam);
// The search index is lowercased // Note: Stemming is only relevant if we are tokenizing: an untokenized name
querystr = querystr.toLowerCase(); // query will not be stemmed. So we don't look at the stem parameter until we get to
// makeTokenizedNameQuery().
// If the last token of the query string ends in a word-delimiting character
// it should not get a wildcard query term.
// E.g., "Dickens," should match "Dickens" but not "Dickenson"
// This test might need to be moved to makeNameQuery().
Pattern p = Pattern.compile("\\W$");
Matcher m = p.matcher(querystr);
boolean lastTermIsWildcard = !m.find();
// Stemming is only relevant if we are tokenizing. An untokenized name
// query will not stem.
if (tokenize) { if (tokenize) {
return makeTokenizedNameQuery(querystr, stem, lastTermIsWildcard); return makeTokenizedNameQuery(querystr, analyzer, request);
} else { } else {
return makeUntokenizedNameQuery(querystr); return makeUntokenizedNameQuery(querystr);
} }
} }
private Query makeTokenizedNameQuery(String querystr, boolean stem, boolean lastTermIsWildcard) { private Query makeTokenizedNameQuery(String querystr, Analyzer analyzer, HttpServletRequest request) {
String stemParam = (String) request.getParameter("stem");
boolean stem = "true".equals(stemParam);
String termName = stem ? Entity2LuceneDoc.term.NAME : Entity2LuceneDoc.term.NAMEUNSTEMMED;
BooleanQuery boolQuery = new BooleanQuery();
// Use the query parser to analyze the search term the same way the indexed text was analyzed.
// For example, text is lowercased, and function words are stripped out.
QueryParser parser = getQueryParser(termName, analyzer);
// The wildcard query doesn't play well with stemming. Query term name:tales* doesn't match
// "tales", which is indexed as "tale", while query term name:tales does. Obviously we need
// the wildcard for name:tal*, so the only way to get them all to match is use a disjunction
// of wildcard and non-wildcard queries. The query will have only an implicit disjunction
// operator: e.g., +(name:tales name:tales*)
try {
Query query = parser.parse(querystr);
log.debug("Adding non-wildcard query for " + querystr);
boolQuery.add(query, BooleanClause.Occur.SHOULD);
Query wildcardQuery = parser.parse(querystr + "*");
log.debug("Adding wildcard query for " + querystr);
boolQuery.add(wildcardQuery, BooleanClause.Occur.SHOULD);
log.debug("Name query is: " + boolQuery.toString());
; } catch (ParseException e) {
log.error(e, e);
}
return boolQuery;
/*
Query query = null; Query query = null;
String termName = stem ? Entity2LuceneDoc.term.NAME : Entity2LuceneDoc.term.NAMEUNSTEMMED; // The search index is lowercased
querystr = querystr.toLowerCase();
List<String> terms = Arrays.asList(querystr.split("[, ]+")); List<String> terms = Arrays.asList(querystr.split("[, ]+"));
for (Iterator<String> i = terms.iterator(); i.hasNext(); ) { for (Iterator<String> i = terms.iterator(); i.hasNext(); ) {
String term = (String) i.next(); String term = (String) i.next();
BooleanQuery boolQuery = new BooleanQuery();
// All items but last get a regular term query // All items but last get a regular term query
if (i.hasNext()) { if (i.hasNext()) {
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( boolQuery.add(
new TermQuery(new Term(termName, term)), new TermQuery(new Term(termName, term)),
BooleanClause.Occur.MUST); BooleanClause.Occur.MUST);
@ -298,24 +321,24 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
} }
query = boolQuery; query = boolQuery;
} }
// Last item goes on to next block
else {
querystr = term;
}
}
// Last term // Last term
{ else {
BooleanQuery boolQuery = new BooleanQuery(); // If the last token of the query string ends in a word-delimiting character
// it should not get a wildcard query term.
// E.g., "Dickens," should match "Dickens" but not "Dickenson"
Pattern p = Pattern.compile("\\W$");
Matcher m = p.matcher(querystr);
boolean lastTermIsWildcard = !m.find();
if (lastTermIsWildcard) { if (lastTermIsWildcard) {
log.debug("Adding wildcard query on last term"); log.debug("Adding wildcard query on last term");
boolQuery.add( boolQuery.add(
new WildcardQuery(new Term(termName, querystr + "*")), new WildcardQuery(new Term(termName, term + "*")),
BooleanClause.Occur.MUST); BooleanClause.Occur.MUST);
} else { } else {
log.debug("Adding term query on last term"); log.debug("Adding term query on last term");
boolQuery.add( boolQuery.add(
new TermQuery(new Term(termName, querystr)), new TermQuery(new Term(termName, term)),
BooleanClause.Occur.MUST); BooleanClause.Occur.MUST);
} }
if (query != null) { if (query != null) {
@ -323,8 +346,9 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
} }
query = boolQuery; query = boolQuery;
} }
}
return query; return query;
*/
} }
private Query makeUntokenizedNameQuery(String querystr) { private Query makeUntokenizedNameQuery(String querystr) {
@ -391,6 +415,17 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
} }
} }
/**
 * Builds a query parser for the given default field.
 *
 * @param searchField the field to search against when the query string does
 *                    not itself name a field
 * @param analyzer    the analyzer to apply to search queries; it should be the
 *                    same analyzer that was used on the text that was indexed
 * @return a parser whose default operator is AND, so that all of the query
 *         terms it finds are required
 */
private QueryParser getQueryParser(String searchField, Analyzer analyzer){
    final QueryParser parser = new QueryParser(searchField, analyzer);
    // AND together every term the parser finds in the query string.
    parser.setDefaultOperator(QueryParser.AND_OPERATOR);
    return parser;
}
private synchronized IndexSearcher getIndexSearcher(String indexDir) { private synchronized IndexSearcher getIndexSearcher(String indexDir) {
if( searcher == null ){ if( searcher == null ){
try { try {
@ -463,15 +498,15 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
} }
public VitroHighlighter getHighlighter(VitroQuery q) { public VitroHighlighter getHighlighter(VitroQuery q) {
throw new Error("PagedSearchController.getHighlighter() is unimplemented"); throw new Error("AutocompleteController.getHighlighter() is unimplemented");
} }
public VitroQueryFactory getQueryFactory() { public VitroQueryFactory getQueryFactory() {
throw new Error("PagedSearchController.getQueryFactory() is unimplemented"); throw new Error("AutocompleteController.getQueryFactory() is unimplemented");
} }
public List search(VitroQuery query) throws SearchException { public List search(VitroQuery query) throws SearchException {
throw new Error("PagedSearchController.search() is unimplemented"); throw new Error("AutocompleteController.search() is unimplemented");
} }
} }