NIHVIVO-2459 Two edgeNGram field definitions for autocomplete. Untokenized autocomplete search.

This commit is contained in:
ryounes 2011-06-28 16:57:47 +00:00
parent 179d2b80d4
commit a7c271a9bd
4 changed files with 102 additions and 105 deletions

View file

@ -220,7 +220,7 @@
NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages. NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
--> -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index"> <analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time --> <!-- in this example, we will only use synonyms at query time -->
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="true"/> <filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="true"/>
@ -229,65 +229,43 @@
add enablePositionIncrements=true in both the index and query add enablePositionIncrements=true in both the index and query
analyzers to leave a 'gap' for more accurate phrase queries. analyzers to leave a 'gap' for more accurate phrase queries.
--> -->
<filter class="solr.StopFilterFactory" <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
ignoreCase="true" <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
words="stopwords.txt" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/> <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer> </analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType> </fieldType>
<!-- Like text, but without synonyms and stemming. Good for autocomplete matching of proper names, where we want to remove <!-- Like text, but without synonyms and stemming. Good for autocomplete where we want to remove
stop words but not stem. --> stop words but not stem. -->
<fieldType name="text_unstemmed" class="solr.TextField" positionIncrementGap="100"> <fieldType name="text_unstemmed" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index"> <analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" <filter class="solr.StopFilterFactory" ignoreCase="true"
ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
words="stopwords.txt" <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
enablePositionIncrements="true" /> generateNumberParts="1" catenateWords="0"
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" catenateAll="0"
generateWordParts="1" splitOnCaseChange="1" />
generateNumberParts="1"
catenateWords="0"
catenateNumbers="0"
catenateAll="0"
splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
</analyzer> </analyzer>
<analyzer type="query"> </fieldType>
<!-- Like text, but without synonyms. Good for autocomplete matching of book/grant titles, etc., where we want to remove
stop words and stem. -->
<fieldType name="text_stemmed" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/> <tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" <filter class="solr.StopFilterFactory" ignoreCase="true"
ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
words="stopwords.txt" <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
enablePositionIncrements="true" /> generateNumberParts="1" catenateWords="0"
<!-- <filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" catenateAll="0"
generateWordParts="1" splitOnCaseChange="1" />
generateNumberParts="1"
catenateWords="0"
catenateNumbers="0"
catenateAll="0"
splitOnCaseChange="1"/> -->
<filter class="solr.LowerCaseFilterFactory"/> <filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer> </analyzer>
</fieldType> </fieldType>
@ -476,6 +454,31 @@
</analyzer> </analyzer>
</fieldtype> </fieldtype>
<!-- Autocomplete field type for tokenized, stemmed text.
     Index time: split on whitespace, drop stop words (stopwords.txt), split tokens with the
     word delimiter filter (word/number parts, case changes; no catenation), lowercase, apply the
     English Snowball stemmer (terms listed in protwords.txt are protected), and finally emit
     front-anchored edge n-grams of 1 to 25 characters for each resulting token.
     Query time: the same chain without the edge n-gram filter.
     NOTE(review): the query text is stemmed before matching, so a partially typed word may stem
     differently from the full indexed word; confirm this gives acceptable autocomplete matches. -->
<fieldtype name="edgengram_stemmed" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<!-- Runs last, so the n-grams are prefixes of the lowercased, stemmed token. -->
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="25" side="front"/>
</analyzer>
<!-- Query analyzer: identical to the index chain above except that no edge n-grams are generated. -->
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldtype>
</types> </types>
@ -522,13 +525,12 @@
<!-- A sortable version of nameLowercase --> <!-- A sortable version of nameLowercase -->
<field name="nameLowercaseSingleValued" type="lowercase" indexed="true" stored="false" multiValued="false" /> <field name="nameLowercaseSingleValued" type="lowercase" indexed="true" stored="false" multiValued="false" />
<field name="nameUnstemmed" type="text_unstemmed" indexed="true" stored="false" multiValued="true"/> <field name="nameUnstemmed" type="text_unstemmed" indexed="true" stored="false" multiValued="true"/>
<field name="nameStemmed" type="text" indexed="true" stored="false" multiValued="true"/> <field name="nameStemmed" type="text_stemmed" indexed="true" stored="false" multiValued="true"/>
<!-- Untokenized autocomplete on name (suitable for person names) -->
<!-- Autocomplete search fields -->
<field name="acNameUntokenized" type="edgengram_untokenized" indexed="true" stored="false" multiValued="true" /> <field name="acNameUntokenized" type="edgengram_untokenized" indexed="true" stored="false" multiValued="true" />
<!-- <!-- <field name="acNameTokenized" type="edgengram_tokenized" indexed="true" stored="false" multiValued="true" /> -->
<field name="acNameTokenized" type="edgengram_tokenized" indexed="true" stored="false" multiValued="true" /> <field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
<field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
-->
<field name="indexedTime" type="long" indexed="true" stored="true"/> <field name="indexedTime" type="long" indexed="true" stored="true"/>
<field name="NAME_PHONETIC" type ="phonetic" indexed="true" stored="false" multiValued="true"/> <field name="NAME_PHONETIC" type ="phonetic" indexed="true" stored="false" multiValued="true"/>

View file

@ -64,6 +64,13 @@ public class VitroSearchTermNames {
/** rdfs:label lowercased, untokenized, edge-n-gram-filtered for autocomplete on people names **/ /** rdfs:label lowercased, untokenized, edge-n-gram-filtered for autocomplete on people names **/
public static String AC_NAME_UNTOKENIZED = "acNameUntokenized"; public static String AC_NAME_UNTOKENIZED = "acNameUntokenized";
/** rdfs:label lowercased, tokenized, stop words removed, stemmed, and edge-n-gram-filtered, for autocomplete
* on non-person labels such as book titles and grant names **/
public static String AC_NAME_STEMMED = "acNameStemmed";
/* There is currently no use case for an autocomplete search field that is tokenized but not stemmed.
public static String AC_NAME_TOKENIZED = "acNameTokenized"; */
/** field for beta values of all documents **/ /** field for beta values of all documents **/
public static final String BETA = "BETA"; public static final String BETA = "BETA";
public static final String PHI = "PHI"; public static final String PHI = "PHI";

View file

@ -142,7 +142,7 @@ public class SolrAutocompleteController extends VitroAjaxController {
query.setStart(0) query.setStart(0)
.setRows(DEFAULT_MAX_HIT_COUNT); .setRows(DEFAULT_MAX_HIT_COUNT);
setQuery(query, queryStr, vreq); setNameQuery(query, queryStr, vreq);
// Filter by type // Filter by type
String typeParam = (String) vreq.getParameter(PARAM_RDFTYPE); String typeParam = (String) vreq.getParameter(PARAM_RDFTYPE);
@ -158,7 +158,7 @@ public class SolrAutocompleteController extends VitroAjaxController {
return query; return query;
} }
private void setQuery(SolrQuery query, String queryStr, HttpServletRequest request) { private void setNameQuery(SolrQuery query, String queryStr, HttpServletRequest request) {
if (StringUtils.isBlank(queryStr)) { if (StringUtils.isBlank(queryStr)) {
log.error("No query string"); log.error("No query string");
@ -171,13 +171,13 @@ public class SolrAutocompleteController extends VitroAjaxController {
// query will not be stemmed. So we don't look at the stem parameter until we get to // query will not be stemmed. So we don't look at the stem parameter until we get to
// setTokenizedNameQuery(). // setTokenizedNameQuery().
if (tokenize) { if (tokenize) {
setTokenizedQuery(query, queryStr, request); setTokenizedNameQuery(query, queryStr, request);
} else { } else {
setUntokenizedQuery(query, queryStr); setUntokenizedNameQuery(query, queryStr);
} }
} }
private void setTokenizedQuery(SolrQuery query, String queryStr, HttpServletRequest request) { private void setTokenizedNameQuery(SolrQuery query, String queryStr, HttpServletRequest request) {
// RY 5/18/2011 For now, just doing untokenized query, due to the interactions of wildcard // RY 5/18/2011 For now, just doing untokenized query, due to the interactions of wildcard
// query and stemming described below. Need to find a way to do this in Solr. // query and stemming described below. Need to find a way to do this in Solr.
@ -215,30 +215,15 @@ public class SolrAutocompleteController extends VitroAjaxController {
// log.warn(e, e); // log.warn(e, e);
// } // }
//setUntokenizedQuery(query, queryStr); setUntokenizedNameQuery(query, queryStr);
String stemParam = (String) request.getParameter("stem");
boolean stem = "true".equals(stemParam);
String termName = stem ? VitroSearchTermNames.NAME_STEMMED : VitroSearchTermNames.NAME_UNSTEMMED;
// We have to lowercase manually, because Solr doesn't do text analysis on wildcard queries
queryStr = queryStr.toLowerCase();
// Solr wants whitespace to be escaped with a backslash
// Better: replace \s+
queryStr = queryStr.replaceAll(" ", "\\\\ ");
queryStr = termName + ":" + queryStr + "*";
query.setQuery(queryStr);
} }
private void setUntokenizedQuery(SolrQuery query, String queryStr) { private void setUntokenizedNameQuery(SolrQuery query, String queryStr) {
// We have to lowercase manually, because Solr doesn't do text analysis on wildcard queries
queryStr = queryStr.toLowerCase();
// Solr wants whitespace to be escaped with a backslash // Solr wants whitespace to be escaped with a backslash
// Better: replace \s+ // Better: replace \s+
queryStr = queryStr.replaceAll(" ", "\\\\ "); queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
queryStr = VitroSearchTermNames.NAME_LOWERCASE + ":" + queryStr + "*"; queryStr = VitroSearchTermNames.AC_NAME_UNTOKENIZED + ":" + queryStr;
query.setQuery(queryStr); query.setQuery(queryStr);
} }

View file

@ -175,47 +175,50 @@ public class IndividualToSolrDocument {
String t=null; String t=null;
addUri = new StringBuffer(); addUri = new StringBuffer();
addUri.append(""); addUri.append("");
List<ObjectPropertyStatement> objectPropertyStatements = ind.getObjectPropertyStatements(); List<ObjectPropertyStatement> objectPropertyStatements = ind.getObjectPropertyStatements();
if (objectPropertyStatements != null) { if (objectPropertyStatements != null) {
Iterator<ObjectPropertyStatement> objectPropertyStmtIter = objectPropertyStatements.iterator(); Iterator<ObjectPropertyStatement> objectPropertyStmtIter = objectPropertyStatements.iterator();
while (objectPropertyStmtIter.hasNext()) { while (objectPropertyStmtIter.hasNext()) {
ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next(); ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next();
if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) ) if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) )
continue; continue;
try { try {
objectNames.append(" "); objectNames.append(" ");
objectNames.append(((t=objectPropertyStmt.getObject().getName()) == null)?"":t); objectNames.append(((t=objectPropertyStmt.getObject().getName()) == null)?"":t);
addUri.append(" "); addUri.append(" ");
addUri.append(((t=objectPropertyStmt.getObject().getURI()) == null)?"":t); addUri.append(((t=objectPropertyStmt.getObject().getURI()) == null)?"":t);
} catch (Exception e) { } catch (Exception e) {
log.debug("could not index name of related object: " + e.getMessage()); log.debug("could not index name of related object: " + e.getMessage());
} }
} }
} }
if(documentModifiers == null || documentModifiers.isEmpty()){ if(documentModifiers == null || documentModifiers.isEmpty()){
doc.addField(term.NAME_RAW, value, NAME_BOOST); doc.addField(term.NAME_RAW, value, NAME_BOOST);
doc.addField(term.NAME_LOWERCASE, value, NAME_BOOST); doc.addField(term.NAME_LOWERCASE, value, NAME_BOOST);
doc.addField(term.NAME_UNSTEMMED, value,NAME_BOOST); doc.addField(term.NAME_UNSTEMMED, value, NAME_BOOST);
doc.addField(term.NAME_STEMMED, value, NAME_BOOST); doc.addField(term.NAME_STEMMED, value, NAME_BOOST);
doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST); doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST);
doc.addField(term.AC_NAME_UNTOKENIZED, value); doc.addField(term.AC_NAME_UNTOKENIZED, value);
}else{ doc.addField(term.AC_NAME_STEMMED, value);
}else{
doc.addField(term.NAME_RAW, value); doc.addField(term.NAME_RAW, value);
doc.addField(term.NAME_LOWERCASE, value); doc.addField(term.NAME_LOWERCASE, value);
doc.addField(term.NAME_UNSTEMMED, value); doc.addField(term.NAME_UNSTEMMED, value);
doc.addField(term.NAME_STEMMED, value); doc.addField(term.NAME_STEMMED, value);
doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST); doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST);
doc.addField(term.AC_NAME_UNTOKENIZED, value); doc.addField(term.AC_NAME_UNTOKENIZED, value);
} doc.addField(term.AC_NAME_STEMMED, value);
}
long tMoniker = System.currentTimeMillis(); long tMoniker = System.currentTimeMillis();
if(documentModifiers == null || documentModifiers.isEmpty()){ if(documentModifiers == null || documentModifiers.isEmpty()){
//boost for entity //boost for entity
if(ind.getSearchBoost() != null && ind.getSearchBoost() != 0) if(ind.getSearchBoost() != null && ind.getSearchBoost() != 0) {
doc.setDocumentBoost(ind.getSearchBoost()); doc.setDocumentBoost(ind.getSearchBoost());
}
} }
//thumbnail //thumbnail