vitro/solr/homeDirectoryTemplate/conf/schema.xml
Brian Caruso b9d1b5d48f Remvoing unused Solr fileds ALLTEXT_PHONETIC, PHI, ADJACENT_NODES, modType, JCLASS,
Adding field mostSpecificTypesURIs for VIVO-221
2013-07-25 11:38:40 -04:00

280 lines
15 KiB
XML

<?xml version="1.0" encoding="UTF-8" ?>
<schema name="example" version="1.3">
<types>
<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
<fieldtype name="binary" class="solr.BinaryField"/>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/>
<!-- A Trie based date field for faster date range queries and date faceting. -->
<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/>
<fieldType name="random" class="solr.RandomSortField" indexed="true" />
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1"
catenateWords="0" catenateNumbers="0" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
<fieldType name="text_unstemmed" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<!-- This field intentionally sets solr.WordDelimiterFilterFactory splitOnCaseChange to 0/false because of issue NIHVIVO-3332 -->
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- Like text, but without synonyms. Good for autocomplete matching of book/grant titles, etc.,
where we want to remove stop words and stem. -->
<fieldType name="text_stemmed" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords-name.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" />
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
<!-- A general unstemmed text field - good if one does not know the language of the field -->
<fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.PhoneticFilterFactory" encoder="Metaphone" inject="false"/>
</analyzer>
</fieldtype>
<!-- lowercases the entire field value, keeping it as a single token. -->
<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
<fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
<fieldtype name="edgengram_untokenized" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.EdgeNGramFilterFactory" minGramSize="2" maxGramSize="25" side="front"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldtype>
<fieldtype name="edgengram_stemmed" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords-name.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="25" side="front"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords-name.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldtype>
</types>
<fields>
<!-- Valid attributes for fields:
name: mandatory - the name for the field
type: mandatory - the name of a previously defined type from the
<types> section
indexed: true if this field should be indexed (searchable or sortable)
stored: true if this field should be retrievable
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with
this field (this disables length normalization and index-time
boosting for the field, and saves some memory). Only full-text
fields or fields that need an index-time boost need norms.
termVectors: [false] set to true to store the term vector for a
given field.
When using MoreLikeThis, fields used for similarity should be
stored for best performance.
termPositions: Store position information with the term vector.
This will increase storage costs.
termOffsets: Store offset information with the term vector. This
will increase storage costs.
default: a value that should be used if no value is specified
when adding a document.
-->
<!-- **************************** Vitro Fields *************************** -->
<field name="DocId" type="string" indexed="true" stored="true" required="true" omitNorms="true"/>
<field name="type" type="string" indexed="true" stored="true" omitNorms="true" multiValued="true"/>
<field name="classLocalName" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="classgroup" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="mostSpecificTypeURIs" type="string" indexed="true" stored="true" omitNorms="true" multiValued="true"/>
<field name="PROHIBITED_FROM_TEXT_RESULTS" type="string" indexed="true" stored="false" omitNorms="true" multiValued="true"/>
<field name="URI" type="string" indexed="true" stored="true" multiValued="false" omitNorms="true"/>
<field name="nameRaw" type="string" indexed="false" stored="true" multiValued="true"/>
<!-- nameText added for NIHVIVO-3701 -->
<field name="nameText" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="nameLowercase" type="lowercase" indexed="true" stored="false" multiValued="true"/>
<!-- A sortable version of nameLowercase -->
<field name="nameLowercaseSingleValued" type="lowercase" indexed="true" stored="false" multiValued="false" omitNorms="true" />
<field name="nameUnstemmed" type="text_unstemmed" indexed="true" stored="false" multiValued="true"/>
<field name="nameStemmed" type="text_stemmed" indexed="true" stored="false" multiValued="true"/>
<!-- Autocomplete search fields -->
<field name="acNameUntokenized" type="edgengram_untokenized" indexed="true" stored="false" multiValued="true" />
<field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
<field name="indexedTime" type="long" indexed="true" stored="true"/>
<field name="NAME_PHONETIC" type ="phonetic" indexed="true" stored="false" multiValued="true"/>
<field name="ALLTEXT" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="ALLTEXTUNSTEMMED" type="textgen" indexed="true" stored="false" multiValued="true"/>
<field name="THUMBNAIL" type="string" indexed="true" stored="true"/>
<field name="BETA" type="float" indexed="true" stored="true" multiValued="false"/>
<!-- field for storing locations of thumbnails -->
<field name="THUMBNAIL_URL" type="string" indexed="false" stored="true"/>
<!-- field for source institution's url -->
<field name="siteURL" type="string" indexed="true" stored="true"/>
<!-- field for source institution's name -->
<field name="siteName" type="string" indexed="true" stored="true"/>
<!-- field for preferred title -->
<field name="PREFERRED_TITLE" type="string" indexed="true" stored="true"/>
<!-- Copy nameRaw into several other fields -->
<copyField source="nameRaw" dest="nameStemmed" />
<copyField source="nameRaw" dest="nameUnstemmed" />
<copyField source="nameRaw" dest="nameLowercase" />
<copyField source="nameRaw" dest="NAME_PHONETIC" />
<copyField source="nameRaw" dest="acNameUntokenized" />
<copyField source="nameRaw" dest="acNameStemmed" />
<copyField source="nameRaw" dest="nameText" />
<!-- nameLowercaseSingleValued is not copied from nameRaw becasue nameRaw might have multiple values -->
<!-- field for hash signature, used for comparing to versions from external caches -->
<field name="etag" type="string" stored="true" indexed="false" multiValued="false" />
<!-- **************************** End Vitro Fields *************************** -->
<!-- **************************** Dynamic Fields *************************** -->
<dynamicField name="*_string" type="string" indexed="true" stored="true" multiValued="true"/>
<!-- **************************** End Dynamic Fields *************************** -->
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
</fields>
<!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field -->
<uniqueKey>DocId</uniqueKey>
<!-- field for the QueryParser to use when an explicit fieldname is absent -->
<!-- This is now set in solrconfig.xml -->
<!-- <defaultSearchField>ALLTEXT</defaultSearchField> -->
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="AND"/>
<!-- Similarity is the scoring routine for each document vs. a query.
A custom similarity may be specified here, but the default is fine
for most applications. -->
<similarity class="org.apache.lucene.search.DefaultSimilarity"/>
</schema>