NIHVIVO-2459 Two edgeNGram field definitions for autocomplete. Untokenized autocomplete search.

This commit is contained in:
ryounes 2011-06-28 16:57:47 +00:00
parent 179d2b80d4
commit a7c271a9bd
4 changed files with 102 additions and 105 deletions

View file

@ -220,7 +220,7 @@
NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time -->
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="true"/>
@ -229,65 +229,43 @@
add enablePositionIncrements=true in both the index and query
analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="syn.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
<!-- Like text, but without synonyms and stemming. Good for autocomplete matching of proper names, where we want to remove
stop words but not stem. -->
<!-- Like text, but without synonyms and stemming. Good for autocomplete where we want to remove
stop words but not stem. -->
<fieldType name="text_unstemmed" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- Like text, but without synonyms. Good for autocomplete matching of book/grant titles, etc., where we want to remove
stop words and stem. -->
<fieldType name="text_stemmed" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1"
generateNumberParts="1"
catenateWords="0"
catenateNumbers="0"
catenateAll="0"
splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true" />
<!-- <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1"
generateNumberParts="1"
catenateWords="0"
catenateNumbers="0"
catenateAll="0"
splitOnCaseChange="1"/> -->
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
@ -475,7 +453,32 @@
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldtype>
<fieldtype name="edgengram_stemmed" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="25" side="front"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldtype>
</types>
@ -522,13 +525,12 @@
<!-- A sortable version of nameLowercase -->
<field name="nameLowercaseSingleValued" type="lowercase" indexed="true" stored="false" multiValued="false" />
<field name="nameUnstemmed" type="text_unstemmed" indexed="true" stored="false" multiValued="true"/>
<field name="nameStemmed" type="text" indexed="true" stored="false" multiValued="true"/>
<!-- Untokenized autocomplete on name (suitable for person names) -->
<field name="nameStemmed" type="text_stemmed" indexed="true" stored="false" multiValued="true"/>
<!-- Autocomplete search fields -->
<field name="acNameUntokenized" type="edgengram_untokenized" indexed="true" stored="false" multiValued="true" />
<!--
<field name="acNameTokenized" type="edgengram_tokenized" indexed="true" stored="false" multiValued="true" />
<field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
-->
<!-- <field name="acNameTokenized" type="edgengram_tokenized" indexed="true" stored="false" multiValued="true" /> -->
<field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
<field name="indexedTime" type="long" indexed="true" stored="true"/>
<field name="NAME_PHONETIC" type ="phonetic" indexed="true" stored="false" multiValued="true"/>