NIHVIVO-774 Fine-tune tokenized autocomplete query to get desired results
This commit is contained in:
parent
35f60021dd
commit
0f93083a66
1 changed files with 91 additions and 56 deletions
|
@ -26,6 +26,8 @@ import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.queryParser.ParseException;
|
||||||
|
import org.apache.lucene.queryParser.QueryParser;
|
||||||
import org.apache.lucene.search.BooleanClause;
|
import org.apache.lucene.search.BooleanClause;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
@ -54,13 +56,13 @@ import freemarker.template.Configuration;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* AutocompleteController is used to generate autocomplete and select element content
|
* AutocompleteController is used to generate autocomplete and select element content
|
||||||
* through a Lucene search. The search logic is copied from PagedSearchController.
|
* through a Lucene search. The search logic is copied from PagedSearchController.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/* rjy7 We should have a SearchController that is subclassed by both PagedSearchController
|
/* rjy7 We should have a SearchController that is subclassed by both PagedSearchController
|
||||||
* and AjaxSearchController, so the methods don't all have to be copied into both places.
|
* and AjaxSearchController, so the methods don't all have to be copied into both places.
|
||||||
* The parent SearchController should extend FreeMarkerHttpServlet. Can only be done
|
* The parent SearchController should extend FreeMarkerHttpServlet. Can only be done
|
||||||
* once PagedSearchController has been moved to FreeMarker.
|
* once PagedSearchController has been moved to FreeMarker.
|
||||||
*/
|
*/
|
||||||
public class AutocompleteController extends FreeMarkerHttpServlet implements Searcher{
|
public class AutocompleteController extends FreeMarkerHttpServlet implements Searcher{
|
||||||
|
|
||||||
|
@ -219,8 +221,7 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
query = makeNameQuery(querystr, request);
|
query = makeNameQuery(querystr, analyzer, request);
|
||||||
|
|
||||||
|
|
||||||
// Filter by type
|
// Filter by type
|
||||||
{
|
{
|
||||||
|
@ -251,45 +252,67 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
|
||||||
return query;
|
return query;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Query makeNameQuery(String querystr, HttpServletRequest request) {
|
private Query makeNameQuery(String querystr, Analyzer analyzer, HttpServletRequest request) {
|
||||||
String stemParam = (String) request.getParameter("stem");
|
|
||||||
boolean stem = "true".equals(stemParam);
|
|
||||||
|
|
||||||
String tokenizeParam = (String) request.getParameter("stem");
|
String tokenizeParam = (String) request.getParameter("tokenize");
|
||||||
boolean tokenize = "true".equals(tokenizeParam);
|
boolean tokenize = "true".equals(tokenizeParam);
|
||||||
|
|
||||||
// The search index is lowercased
|
// Note: Stemming is only relevant if we are tokenizing: an untokenized name
|
||||||
querystr = querystr.toLowerCase();
|
// query will not be stemmed. So we don't look at the stem parameter until we get to
|
||||||
|
// makeTokenizedNameQuery().
|
||||||
// If the last token of the query string ends in a word-delimiting character
|
|
||||||
// it should not get a wildcard query term.
|
|
||||||
// E.g., "Dickens," should match "Dickens" but not "Dickenson"
|
|
||||||
// This test might need to be moved to makeNameQuery().
|
|
||||||
Pattern p = Pattern.compile("\\W$");
|
|
||||||
Matcher m = p.matcher(querystr);
|
|
||||||
boolean lastTermIsWildcard = !m.find();
|
|
||||||
|
|
||||||
// Stemming is only relevant if we are tokenizing. An untokenized name
|
|
||||||
// query will not stem.
|
|
||||||
if (tokenize) {
|
if (tokenize) {
|
||||||
return makeTokenizedNameQuery(querystr, stem, lastTermIsWildcard);
|
return makeTokenizedNameQuery(querystr, analyzer, request);
|
||||||
} else {
|
} else {
|
||||||
return makeUntokenizedNameQuery(querystr);
|
return makeUntokenizedNameQuery(querystr);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Query makeTokenizedNameQuery(String querystr, boolean stem, boolean lastTermIsWildcard) {
|
private Query makeTokenizedNameQuery(String querystr, Analyzer analyzer, HttpServletRequest request) {
|
||||||
|
|
||||||
|
String stemParam = (String) request.getParameter("stem");
|
||||||
|
boolean stem = "true".equals(stemParam);
|
||||||
|
String termName = stem ? Entity2LuceneDoc.term.NAME : Entity2LuceneDoc.term.NAMEUNSTEMMED;
|
||||||
|
|
||||||
|
BooleanQuery boolQuery = new BooleanQuery();
|
||||||
|
|
||||||
|
// Use the query parser to analyze the search term the same way the indexed text was analyzed.
|
||||||
|
// For example, text is lowercased, and function words are stripped out.
|
||||||
|
QueryParser parser = getQueryParser(termName, analyzer);
|
||||||
|
|
||||||
|
// The wildcard query doesn't play well with stemming. Query term name:tales* doesn't match
|
||||||
|
// "tales", which is indexed as "tale", while query term name:tales does. Obviously we need
|
||||||
|
// the wildcard for name:tal*, so the only way to get them all to match is use a disjunction
|
||||||
|
// of wildcard and non-wildcard queries. The query will have only an implicit disjunction
|
||||||
|
// operator: e.g., +(name:tales name:tales*)
|
||||||
|
try {
|
||||||
|
Query query = parser.parse(querystr);
|
||||||
|
log.debug("Adding non-wildcard query for " + querystr);
|
||||||
|
boolQuery.add(query, BooleanClause.Occur.SHOULD);
|
||||||
|
|
||||||
|
Query wildcardQuery = parser.parse(querystr + "*");
|
||||||
|
log.debug("Adding wildcard query for " + querystr);
|
||||||
|
boolQuery.add(wildcardQuery, BooleanClause.Occur.SHOULD);
|
||||||
|
|
||||||
|
log.debug("Name query is: " + boolQuery.toString());
|
||||||
|
; } catch (ParseException e) {
|
||||||
|
log.error(e, e);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
return boolQuery;
|
||||||
|
|
||||||
|
/*
|
||||||
Query query = null;
|
Query query = null;
|
||||||
|
|
||||||
String termName = stem ? Entity2LuceneDoc.term.NAME : Entity2LuceneDoc.term.NAMEUNSTEMMED;
|
// The search index is lowercased
|
||||||
|
querystr = querystr.toLowerCase();
|
||||||
|
|
||||||
List<String> terms = Arrays.asList(querystr.split("[, ]+"));
|
List<String> terms = Arrays.asList(querystr.split("[, ]+"));
|
||||||
for (Iterator<String> i = terms.iterator(); i.hasNext(); ) {
|
for (Iterator<String> i = terms.iterator(); i.hasNext(); ) {
|
||||||
String term = (String) i.next();
|
String term = (String) i.next();
|
||||||
|
BooleanQuery boolQuery = new BooleanQuery();
|
||||||
// All items but last get a regular term query
|
// All items but last get a regular term query
|
||||||
if (i.hasNext()) {
|
if (i.hasNext()) {
|
||||||
BooleanQuery boolQuery = new BooleanQuery();
|
|
||||||
boolQuery.add(
|
boolQuery.add(
|
||||||
new TermQuery(new Term(termName, term)),
|
new TermQuery(new Term(termName, term)),
|
||||||
BooleanClause.Occur.MUST);
|
BooleanClause.Occur.MUST);
|
||||||
|
@ -298,24 +321,24 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
|
||||||
}
|
}
|
||||||
query = boolQuery;
|
query = boolQuery;
|
||||||
}
|
}
|
||||||
// Last item goes on to next block
|
|
||||||
else {
|
|
||||||
querystr = term;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Last term
|
// Last term
|
||||||
{
|
else {
|
||||||
BooleanQuery boolQuery = new BooleanQuery();
|
// If the last token of the query string ends in a word-delimiting character
|
||||||
|
// it should not get a wildcard query term.
|
||||||
|
// E.g., "Dickens," should match "Dickens" but not "Dickenson"
|
||||||
|
Pattern p = Pattern.compile("\\W$");
|
||||||
|
Matcher m = p.matcher(querystr);
|
||||||
|
boolean lastTermIsWildcard = !m.find();
|
||||||
|
|
||||||
if (lastTermIsWildcard) {
|
if (lastTermIsWildcard) {
|
||||||
log.debug("Adding wildcard query on last term");
|
log.debug("Adding wildcard query on last term");
|
||||||
boolQuery.add(
|
boolQuery.add(
|
||||||
new WildcardQuery(new Term(termName, querystr + "*")),
|
new WildcardQuery(new Term(termName, term + "*")),
|
||||||
BooleanClause.Occur.MUST);
|
BooleanClause.Occur.MUST);
|
||||||
} else {
|
} else {
|
||||||
log.debug("Adding term query on last term");
|
log.debug("Adding term query on last term");
|
||||||
boolQuery.add(
|
boolQuery.add(
|
||||||
new TermQuery(new Term(termName, querystr)),
|
new TermQuery(new Term(termName, term)),
|
||||||
BooleanClause.Occur.MUST);
|
BooleanClause.Occur.MUST);
|
||||||
}
|
}
|
||||||
if (query != null) {
|
if (query != null) {
|
||||||
|
@ -323,8 +346,9 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
|
||||||
}
|
}
|
||||||
query = boolQuery;
|
query = boolQuery;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return query;
|
return query;
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
private Query makeUntokenizedNameQuery(String querystr) {
|
private Query makeUntokenizedNameQuery(String querystr) {
|
||||||
|
@ -391,6 +415,17 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private QueryParser getQueryParser(String searchField, Analyzer analyzer){
|
||||||
|
// searchField indicates which field to search against when there is no term
|
||||||
|
// indicated in the query string.
|
||||||
|
// The analyzer is needed so that we use the same analyzer on the search queries as
|
||||||
|
// was used on the text that was indexed.
|
||||||
|
QueryParser qp = new QueryParser(searchField,analyzer);
|
||||||
|
//this sets the query parser to AND all of the query terms it finds.
|
||||||
|
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||||
|
return qp;
|
||||||
|
}
|
||||||
|
|
||||||
private synchronized IndexSearcher getIndexSearcher(String indexDir) {
|
private synchronized IndexSearcher getIndexSearcher(String indexDir) {
|
||||||
if( searcher == null ){
|
if( searcher == null ){
|
||||||
try {
|
try {
|
||||||
|
@ -463,15 +498,15 @@ public class AutocompleteController extends FreeMarkerHttpServlet implements Sea
|
||||||
}
|
}
|
||||||
|
|
||||||
public VitroHighlighter getHighlighter(VitroQuery q) {
|
public VitroHighlighter getHighlighter(VitroQuery q) {
|
||||||
throw new Error("PagedSearchController.getHighlighter() is unimplemented");
|
throw new Error("AutocompleteController.getHighlighter() is unimplemented");
|
||||||
}
|
}
|
||||||
|
|
||||||
public VitroQueryFactory getQueryFactory() {
|
public VitroQueryFactory getQueryFactory() {
|
||||||
throw new Error("PagedSearchController.getQueryFactory() is unimplemented");
|
throw new Error("AutocompleteController.getQueryFactory() is unimplemented");
|
||||||
}
|
}
|
||||||
|
|
||||||
public List search(VitroQuery query) throws SearchException {
|
public List search(VitroQuery query) throws SearchException {
|
||||||
throw new Error("PagedSearchController.search() is unimplemented");
|
throw new Error("AutocompleteController.search() is unimplemented");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue