NIHVIVO-2459 Tokenized, stemmed autocomplete search
This commit is contained in:
parent
2bf3e20cd8
commit
997b3ef2cd
4 changed files with 64 additions and 39 deletions
|
@ -479,6 +479,35 @@
|
||||||
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
|
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldtype>
|
</fieldtype>
|
||||||
|
|
||||||
|
<!-- Commenting this fieldtype out for now because we have no use case for
|
||||||
|
a tokenized, unstemmed autocomplete field. It is identical to edgengram_stemmed but without
|
||||||
|
the stemming.
|
||||||
|
<fieldtype name="edgengram_unstemmed" class="solr.TextField">
|
||||||
|
<analyzer type="index">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||||
|
words="stopwords.txt" enablePositionIncrements="true" />
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
|
||||||
|
generateNumberParts="1" catenateWords="0"
|
||||||
|
catenateNumbers="0" catenateAll="0"
|
||||||
|
splitOnCaseChange="1" />
|
||||||
|
<filter class="solr.LowerCaseFilterFactory" />
|
||||||
|
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="25" side="front"/>
|
||||||
|
</analyzer>
|
||||||
|
<analyzer type="query">
|
||||||
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
|
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||||
|
words="stopwords.txt" enablePositionIncrements="true" />
|
||||||
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
|
||||||
|
generateNumberParts="1" catenateWords="0"
|
||||||
|
catenateNumbers="0" catenateAll="0"
|
||||||
|
splitOnCaseChange="1" />
|
||||||
|
<filter class="solr.LowerCaseFilterFactory" />
|
||||||
|
</analyzer>
|
||||||
|
</fieldtype>
|
||||||
|
-->
|
||||||
|
|
||||||
</types>
|
</types>
|
||||||
|
|
||||||
|
|
||||||
|
@ -529,7 +558,7 @@
|
||||||
|
|
||||||
<!-- Autocomplete search fields -->
|
<!-- Autocomplete search fields -->
|
||||||
<field name="acNameUntokenized" type="edgengram_untokenized" indexed="true" stored="false" multiValued="true" />
|
<field name="acNameUntokenized" type="edgengram_untokenized" indexed="true" stored="false" multiValued="true" />
|
||||||
<!-- <field name="acNameTokenized" type="edgengram_tokenized" indexed="true" stored="false" multiValued="true" /> -->
|
<!-- <field name="acNameUnstemmed" type="edgengram_unstemmed" indexed="true" stored="false" multiValued="true" /> -->
|
||||||
<field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
|
<field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
|
||||||
|
|
||||||
<field name="indexedTime" type="long" indexed="true" stored="true"/>
|
<field name="indexedTime" type="long" indexed="true" stored="true"/>
|
||||||
|
|
|
@ -69,7 +69,7 @@ public class VitroSearchTermNames {
|
||||||
public static String AC_NAME_STEMMED = "acNameStemmed";
|
public static String AC_NAME_STEMMED = "acNameStemmed";
|
||||||
|
|
||||||
/* There is currently no use case for an autocomplete search field that is tokenized but not stemmed.
|
/* There is currently no use case for an autocomplete search field that is tokenized but not stemmed.
|
||||||
public static String AC_NAME_TOKENIZED = "acNameTokenized"; */
|
public static String AC_NAME_UNSTEMMED = "acNameUnstemmed"; */
|
||||||
|
|
||||||
/** field for beta values of all documents **/
|
/** field for beta values of all documents **/
|
||||||
public static final String BETA = "BETA";
|
public static final String BETA = "BETA";
|
||||||
|
|
|
@ -179,49 +179,43 @@ public class SolrAutocompleteController extends VitroAjaxController {
|
||||||
|
|
||||||
private void setTokenizedNameQuery(SolrQuery query, String queryStr, HttpServletRequest request) {
|
private void setTokenizedNameQuery(SolrQuery query, String queryStr, HttpServletRequest request) {
|
||||||
|
|
||||||
// RY 5/18/2011 For now, just doing untokenized query, due to the interactions of wildcard
|
/* We currently have no use case for a tokenized, unstemmed autocomplete search field, so the option
|
||||||
// query and stemming described below. Need to find a way to do this in Solr.
|
* has been disabled. If it is needed in the future, we will need to add a new field and field type that
|
||||||
// Should take the same approach if we can figure out how to do a disjunction.
|
* is like AC_NAME_STEMMED but doesn't include the stemmer.
|
||||||
// Probably just add an explicit "OR" between the terms.
|
String stemParam = (String) request.getParameter("stem");
|
||||||
|
boolean stem = "true".equals(stemParam);
|
||||||
|
if (stem) {
|
||||||
|
String acTermName = VitroSearchTermNames.AC_NAME_STEMMED;
|
||||||
|
String nonAcTermName = VitroSearchTermNames.NAME_STEMMED;
|
||||||
|
} else {
|
||||||
|
String acTermName = VitroSearchTermNames.AC_NAME_UNSTEMMED;
|
||||||
|
String nonAcTermName = VitroSearchTermNames.NAME_UNSTEMMED;
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
// String stemParam = (String) request.getParameter("stem");
|
String acTermName = VitroSearchTermNames.AC_NAME_STEMMED;
|
||||||
// boolean stem = "true".equals(stemParam);
|
String nonAcTermName = VitroSearchTermNames.NAME_STEMMED;
|
||||||
// String termName = stem ? VitroSearchTermNames.AC_NAME_STEMMED : VitroSearchTermNames.AC_NAME_UNSTEMMED ;
|
|
||||||
|
|
||||||
// // Use the query parser to analyze the search term the same way the indexed text was analyzed.
|
if (queryStr.endsWith(" ")) {
|
||||||
// // For example, text is lowercased, and function words are stripped out.
|
// Solr wants whitespace to be escaped with a backslash
|
||||||
// QueryParser parser = getQueryParser(termName);
|
queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
|
||||||
//
|
queryStr = nonAcTermName + ":" + queryStr;
|
||||||
// // The wildcard query doesn't play well with stemming. Query term name:tales* doesn't match
|
} else {
|
||||||
// // "tales", which is indexed as "tale", while query term name:tales does. Obviously we need
|
int indexOfLastWord = queryStr.lastIndexOf(" ") + 1;
|
||||||
// // the wildcard for name:tal*, so the only way to get them all to match is use a disjunction
|
String queryStr1 = queryStr.substring(0, indexOfLastWord);
|
||||||
// // of wildcard and non-wildcard queries. The query will have only an implicit disjunction
|
String queryStr2 = queryStr.substring(indexOfLastWord);
|
||||||
// // operator: e.g., +(name:tales name:tales*)
|
queryStr = nonAcTermName + ":\"" + queryStr1 + "\"+" + acTermName + ":" + queryStr2;
|
||||||
// try {
|
}
|
||||||
// log.debug("Adding non-wildcard query for " + querystr);
|
|
||||||
// Query query = parser.parse(querystr);
|
log.debug("Tokenized name query string = " + queryStr);
|
||||||
// boolQuery.add(query, BooleanClause.Occur.SHOULD);
|
query.setQuery(queryStr);
|
||||||
//
|
|
||||||
// // Prevent ParseException here when adding * after a space.
|
|
||||||
// // If there's a space at the end, we don't need the wildcard query.
|
|
||||||
// if (! querystr.endsWith(" ")) {
|
|
||||||
// log.debug("Adding wildcard query for " + querystr);
|
|
||||||
// Query wildcardQuery = parser.parse(querystr + "*");
|
|
||||||
// boolQuery.add(wildcardQuery, BooleanClause.Occur.SHOULD);
|
|
||||||
// }
|
|
||||||
//
|
|
||||||
// log.debug("Name query is: " + boolQuery.toString());
|
|
||||||
// } catch (ParseException e) {
|
|
||||||
// log.warn(e, e);
|
|
||||||
// }
|
|
||||||
|
|
||||||
setUntokenizedNameQuery(query, queryStr);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setUntokenizedNameQuery(SolrQuery query, String queryStr) {
|
private void setUntokenizedNameQuery(SolrQuery query, String queryStr) {
|
||||||
|
|
||||||
|
queryStr = queryStr.trim();
|
||||||
// Solr wants whitespace to be escaped with a backslash
|
// Solr wants whitespace to be escaped with a backslash
|
||||||
// Better: replace \s+
|
|
||||||
queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
|
queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
|
||||||
queryStr = VitroSearchTermNames.AC_NAME_UNTOKENIZED + ":" + queryStr;
|
queryStr = VitroSearchTermNames.AC_NAME_UNTOKENIZED + ":" + queryStr;
|
||||||
query.setQuery(queryStr);
|
query.setQuery(queryStr);
|
||||||
|
|
|
@ -201,6 +201,7 @@ public class IndividualToSolrDocument {
|
||||||
doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST);
|
doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST);
|
||||||
doc.addField(term.AC_NAME_UNTOKENIZED, value);
|
doc.addField(term.AC_NAME_UNTOKENIZED, value);
|
||||||
doc.addField(term.AC_NAME_STEMMED, value);
|
doc.addField(term.AC_NAME_STEMMED, value);
|
||||||
|
// doc.addField(term.AC_NAME_TOKENIZED, value);
|
||||||
}else{
|
}else{
|
||||||
doc.addField(term.NAME_RAW, value);
|
doc.addField(term.NAME_RAW, value);
|
||||||
doc.addField(term.NAME_LOWERCASE, value);
|
doc.addField(term.NAME_LOWERCASE, value);
|
||||||
|
@ -209,6 +210,7 @@ public class IndividualToSolrDocument {
|
||||||
doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST);
|
doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST);
|
||||||
doc.addField(term.AC_NAME_UNTOKENIZED, value);
|
doc.addField(term.AC_NAME_UNTOKENIZED, value);
|
||||||
doc.addField(term.AC_NAME_STEMMED, value);
|
doc.addField(term.AC_NAME_STEMMED, value);
|
||||||
|
// doc.addField(term.AC_NAME_TOKENIZED, value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue