added morphology analyzer

This commit is contained in:
Georgy Litvinov 2020-07-09 11:46:59 +02:00
parent 74665136ea
commit fbd8b1163f
22 changed files with 25768 additions and 570 deletions

Binary file not shown.

BIN
jmorphy-libs/dawg-7.7.2.jar Normal file

Binary file not shown.

BIN
jmorphy-libs/guava-18.0.jar Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
jmorphy-libs/noggit-0.7.jar Normal file

Binary file not shown.

View file

@ -0,0 +1,680 @@
[
[
"POST",
"",
"ЧР",
"часть речи"
],
[
"NOUN",
"POST",
"СУЩ",
"имя существительное"
],
[
"ADJF",
"POST",
"ПРИЛ",
"имя прилагательное (полное)"
],
[
"ADJS",
"POST",
"КР_ПРИЛ",
"имя прилагательное (краткое)"
],
[
"COMP",
"POST",
"КОМП",
"компаратив"
],
[
"VERB",
"POST",
"ГЛ",
"глагол (личная форма)"
],
[
"INFN",
"POST",
"ИНФ",
"глагол (инфинитив)"
],
[
"PRTF",
"POST",
"ПРИЧ",
"причастие (полное)"
],
[
"PRTS",
"POST",
"КР_ПРИЧ",
"причастие (краткое)"
],
[
"GRND",
"POST",
"ДЕЕПР",
"деепричастие"
],
[
"NUMR",
"POST",
"ЧИСЛ",
"числительное"
],
[
"ADVB",
"POST",
"Н",
"наречие"
],
[
"NPRO",
"POST",
"МС",
"местоимение-существительное"
],
[
"PRED",
"POST",
"ПРЕДК",
"предикатив"
],
[
"PREP",
"POST",
"ПР",
"предлог"
],
[
"CONJ",
"POST",
"СОЮЗ",
"союз"
],
[
"PRCL",
"POST",
"ЧАСТ",
"частица"
],
[
"INTJ",
"POST",
"МЕЖД",
"междометие"
],
[
"ANim",
"",
"Од-неод",
"одушевлённость / одушевлённость не выражена"
],
[
"anim",
"ANim",
"од",
"одушевлённое"
],
[
"inan",
"ANim",
"неод",
"неодушевлённое"
],
[
"GNdr",
"",
"хр",
"род / род не выражен"
],
[
"masc",
"GNdr",
"мр",
"мужской род"
],
[
"femn",
"GNdr",
"жр",
"женский род"
],
[
"neut",
"GNdr",
"ср",
"средний род"
],
[
"Ms-f",
"",
"ор",
"общий род"
],
[
"NMbr",
"",
"Число",
"число"
],
[
"sing",
"NMbr",
"ед",
"единственное число"
],
[
"plur",
"NMbr",
"мн",
"множественное число"
],
[
"Sgtm",
"",
"sg",
"singularia tantum"
],
[
"Pltm",
"",
"pl",
"pluralia tantum"
],
[
"Fixd",
"",
"0",
"неизменяемое"
],
[
"CAse",
"",
"Падеж",
"категория падежа"
],
[
"nomn",
"CAse",
"им",
"именительный падеж"
],
[
"gent",
"CAse",
"рд",
"родительный падеж"
],
[
"datv",
"CAse",
"дт",
"дательный падеж"
],
[
"accs",
"CAse",
"вн",
"винительный падеж"
],
[
"ablt",
"CAse",
"тв",
"творительный падеж"
],
[
"loct",
"CAse",
"пр",
"предложный падеж"
],
[
"voct",
"nomn",
"зв",
"звательный падеж"
],
[
"gen1",
"gent",
"рд1",
"первый родительный падеж"
],
[
"gen2",
"gent",
"рд2",
"второй родительный (частичный) падеж"
],
[
"acc2",
"accs",
"вн2",
"второй винительный падеж"
],
[
"loc1",
"loct",
"пр1",
"первый предложный падеж"
],
[
"loc2",
"loct",
"пр2",
"второй предложный (местный) падеж"
],
[
"Abbr",
"",
"аббр",
"аббревиатура"
],
[
"Name",
"",
"имя",
"имя"
],
[
"Surn",
"",
"фам",
"фамилия"
],
[
"Patr",
"",
"отч",
"отчество"
],
[
"Geox",
"",
"гео",
"топоним"
],
[
"Orgn",
"",
"орг",
"организация"
],
[
"Trad",
"",
"tm",
"торговая марка"
],
[
"Subx",
"",
"субст?",
"возможна субстантивация"
],
[
"Supr",
"",
"превосх",
"превосходная степень"
],
[
"Qual",
"",
"кач",
"качественное"
],
[
"Apro",
"",
"мест-п",
"местоименное"
],
[
"Anum",
"",
"числ-п",
"порядковое"
],
[
"Poss",
"",
"притяж",
"притяжательное"
],
[
"V-ey",
"",
"*ею",
"форма на -ею"
],
[
"V-oy",
"",
"*ою",
"форма на -ою"
],
[
"Cmp2",
"",
"сравн2",
"сравнительная степень на по-"
],
[
"V-ej",
"",
"*ей",
"форма компаратива на -ей"
],
[
"ASpc",
"",
"Вид",
"категория вида"
],
[
"perf",
"ASpc",
"сов",
"совершенный вид"
],
[
"impf",
"ASpc",
"несов",
"несовершенный вид"
],
[
"TRns",
"",
"Перех",
"категория переходности"
],
[
"tran",
"TRns",
"перех",
"переходный"
],
[
"intr",
"TRns",
"неперех",
"непереходный"
],
[
"Impe",
"",
"безл",
"безличный"
],
[
"Impx",
"",
"безл?",
"возможно безличное употребление"
],
[
"Mult",
"",
"мног",
"многократный"
],
[
"Refl",
"",
"возвр",
"возвратный"
],
[
"PErs",
"",
"Лицо",
"категория лица"
],
[
"1per",
"PErs",
"1л",
"1 лицо"
],
[
"2per",
"PErs",
"2л",
"2 лицо"
],
[
"3per",
"PErs",
"3л",
"3 лицо"
],
[
"TEns",
"",
"Время",
"категория времени"
],
[
"pres",
"TEns",
"наст",
"настоящее время"
],
[
"past",
"TEns",
"прош",
"прошедшее время"
],
[
"futr",
"TEns",
"буд",
"будущее время"
],
[
"MOod",
"",
"Накл",
"категория наклонения"
],
[
"indc",
"MOod",
"изъяв",
"изъявительное наклонение"
],
[
"impr",
"MOod",
"повел",
"повелительное наклонение"
],
[
"INvl",
"",
"Совм",
"категория совместности"
],
[
"incl",
"INvl",
"вкл",
"говорящий включён (идем, идемте)"
],
[
"excl",
"INvl",
"выкл",
"говорящий не включён в действие (иди, идите)"
],
[
"VOic",
"",
"Залог",
"категория залога"
],
[
"actv",
"VOic",
"действ",
"действительный залог"
],
[
"pssv",
"VOic",
"страд",
"страдательный залог"
],
[
"Infr",
"",
"разг",
"разговорное"
],
[
"Slng",
"",
"жарг",
"жаргонное"
],
[
"Arch",
"",
"арх",
"устаревшее"
],
[
"Litr",
"",
"лит",
"литературный вариант"
],
[
"Erro",
"",
"опеч",
"опечатка"
],
[
"Dist",
"",
"искаж",
"искажение"
],
[
"Ques",
"",
"вопр",
"вопросительное"
],
[
"Dmns",
"",
"указ",
"указательное"
],
[
"Prnt",
"",
"вводн",
"вводное слово"
],
[
"V-be",
"",
"*ье",
"форма на -ье"
],
[
"V-en",
"",
"*енен",
"форма на -енен"
],
[
"V-ie",
"",
"*ие",
"отчество через -ие-"
],
[
"V-bi",
"",
"*ьи",
"форма на -ьи"
],
[
"Fimp",
"",
"*несов",
"деепричастие от глагола несовершенного вида"
],
[
"Prdx",
"",
"предк?",
"может выступать в роли предикатива"
],
[
"Coun",
"",
"счетн",
"счётная форма"
],
[
"Coll",
"",
"собир",
"собирательное числительное"
],
[
"V-sh",
"",
"*ши",
"деепричастие на -ши"
],
[
"Af-p",
"",
"*предл",
"форма после предлога"
],
[
"Inmx",
"",
"не/одуш?",
"может использоваться как одуш. / неодуш."
],
[
"Vpre",
"",
"в_предл",
"Вариант предлога (со, подо, ...)"
],
[
"Anph",
"",
"Анаф",
"Анафорическое (местоимение)"
],
[
"Init",
"",
"иниц",
"Инициал"
],
[
"Adjx",
"",
"прил?",
"может выступать в роли прилагательного"
]
]

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,102 @@
[
[
"language_code",
"ru"
],
[
"format_version",
"2.4"
],
[
"pymorphy2_version",
"0.8"
],
[
"compiled_at",
"2015-03-15T21:59:48.477191"
],
[
"source",
"opencorpora.org"
],
[
"source_version",
"0.92"
],
[
"source_revision",
"393658"
],
[
"source_lexemes_count",
389835
],
[
"source_links_count",
256468
],
[
"gramtab_length",
4753
],
[
"gramtab_formats",
{
"opencorpora-int": "gramtab-opencorpora-int.json",
"opencorpora-ext": "gramtab-opencorpora-ext.json"
}
],
[
"paradigms_length",
3163
],
[
"suffixes_length",
15463
],
[
"words_dawg_length",
5096128
],
[
"compile_options",
{
"max_suffix_length": 5,
"paradigm_prefixes": [
"",
"по",
"наи"
],
"min_ending_freq": 2,
"min_paradigm_popularity": 3
}
],
[
"prediction_suffixes_dawg_lengths",
[
366134,
1929,
21
]
],
[
"P(t|w)",
true
],
[
"P(t|w)_unique_words",
21121
],
[
"P(t|w)_outcomes",
248127
],
[
"P(t|w)_min_word_freq",
1
],
[
"corpus_revision",
"3725883"
]
]

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -1,569 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Solr managed schema - automatically generated - DO NOT EDIT -->
<schema name="example" version="1.5">
<uniqueKey>DocId</uniqueKey>
<fieldType name="alphaOnlySort" class="solr.TextField" omitNorms="true" sortMissingLast="true">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.TrimFilterFactory"/>
<filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replace="all" replacement=""/>
</analyzer>
</fieldType>
<fieldType name="ancestor_path" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.KeywordTokenizerFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
</analyzer>
</fieldType>
<fieldType name="binary" class="solr.BinaryField"/>
<fieldType name="boolean" class="solr.BoolField" omitNorms="true" sortMissingLast="true"/>
<fieldType name="currency" class="solr.CurrencyField" currencyConfig="currency.xml" defaultCurrency="USD" precisionStep="8"/>
<fieldType name="date" class="solr.TrieDateField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="descendent_path" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.PathHierarchyTokenizerFactory" delimiter="/"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.KeywordTokenizerFactory"/>
</analyzer>
</fieldType>
<fieldType name="double" class="solr.TrieDoubleField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="edgengram_stemmed" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords-name.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.EdgeNGramFilterFactory" maxGramSize="25" minGramSize="1"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords-name.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
<fieldType name="edgengram_untokenized" class="solr.TextField">
<analyzer type="index">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EdgeNGramFilterFactory" maxGramSize="25" minGramSize="2"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="float" class="solr.TrieFloatField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="ignored" class="solr.StrField" indexed="false" stored="false" multiValued="true"/>
<fieldType name="int" class="solr.TrieIntField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
<fieldType name="location_rpt" class="solr.SpatialRecursivePrefixTreeFieldType" geo="true" maxDistErr="0.000009" distErrPct="0.025" distanceUnits="degrees"/>
<fieldType name="long" class="solr.TrieLongField" omitNorms="true" positionIncrementGap="0" precisionStep="0"/>
<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="payloads" class="solr.TextField" indexed="true" stored="false">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
</analyzer>
</fieldType>
<fieldType name="phonetic" class="solr.TextField" indexed="true" stored="false">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.PhoneticFilterFactory" encoder="Metaphone" inject="false"/>
</analyzer>
</fieldType>
<fieldType name="point" class="solr.PointType" subFieldSuffix="_d" dimension="2"/>
<fieldType name="random" class="solr.RandomSortField" indexed="true"/>
<fieldType name="string" class="solr.StrField" omitNorms="true" sortMissingLast="true"/>
<fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" positionIncrementGap="0" precisionStep="6"/>
<fieldType name="tdouble" class="solr.TrieDoubleField" positionIncrementGap="0" precisionStep="8"/>
<fieldType name="text" class="solr.TextField" autoGeneratePhraseQueries="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="1" generateNumberParts="1" generateWordParts="1" catenateAll="0" catenateWords="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" generateNumberParts="1" generateWordParts="1" catenateAll="0" catenateWords="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
<fieldType name="text_ar" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_ar.txt" ignoreCase="true"/>
<filter class="solr.ArabicNormalizationFilterFactory"/>
<filter class="solr.ArabicStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_bg" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_bg.txt" ignoreCase="true"/>
<filter class="solr.BulgarianStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ca" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_ca.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_ca.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Catalan"/>
</analyzer>
</fieldType>
<fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.CJKBigramFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SoraniNormalizationFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_ckb.txt" ignoreCase="true"/>
<filter class="solr.SoraniStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_cz.txt" ignoreCase="true"/>
<filter class="solr.CzechStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_da" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_da.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Danish"/>
</analyzer>
</fieldType>
<fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_de.txt" ignoreCase="true"/>
<filter class="solr.GermanNormalizationFilterFactory"/>
<filter class="solr.GermanLightStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_el" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.GreekLowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_el.txt" ignoreCase="false"/>
<filter class="solr.GreekStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.EnglishPossessiveFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_en_splitting" class="solr.TextField" autoGeneratePhraseQueries="true" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="1" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.PorterStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_en_splitting_tight" class="solr.TextField" autoGeneratePhraseQueries="true" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" expand="false" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_en.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="1" generateNumberParts="0" generateWordParts="0" catenateAll="0" catenateWords="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_es" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_es.txt" ignoreCase="true"/>
<filter class="solr.SpanishLightStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_eu" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_eu.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Basque"/>
</analyzer>
</fieldType>
<fieldType name="text_fa" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<charFilter class="solr.PersianCharFilterFactory"/>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ArabicNormalizationFilterFactory"/>
<filter class="solr.PersianNormalizationFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_fa.txt" ignoreCase="true"/>
</analyzer>
</fieldType>
<fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_fi.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Finnish"/>
</analyzer>
</fieldType>
<fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_fr.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_fr.txt" ignoreCase="true"/>
<filter class="solr.FrenchLightStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_ga.txt" ignoreCase="true"/>
<filter class="solr.StopFilterFactory" words="lang/hyphenations_ga.txt" ignoreCase="true"/>
<filter class="solr.IrishLowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_ga.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Irish"/>
</analyzer>
</fieldType>
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.ReversedWildcardFilterFactory" maxPosQuestion="2" maxFractionAsterisk="0.33" maxPosAsterisk="3" withOriginal="true"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.SynonymFilterFactory" expand="true" ignoreCase="true" synonyms="synonyms.txt"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_gl.txt" ignoreCase="true"/>
<filter class="solr.GalicianStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.IndicNormalizationFilterFactory"/>
<filter class="solr.HindiNormalizationFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_hi.txt" ignoreCase="true"/>
<filter class="solr.HindiStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_hu.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Hungarian"/>
</analyzer>
</fieldType>
<fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_hy.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Armenian"/>
</analyzer>
</fieldType>
<fieldType name="text_id" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_id.txt" ignoreCase="true"/>
<filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
</analyzer>
</fieldType>
<fieldType name="text_it" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.ElisionFilterFactory" articles="lang/contractions_it.txt" ignoreCase="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_it.txt" ignoreCase="true"/>
<filter class="solr.ItalianLightStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ja" class="solr.TextField" autoGeneratePhraseQueries="false" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
<filter class="solr.JapaneseBaseFormFilterFactory"/>
<filter class="solr.JapanesePartOfSpeechStopFilterFactory" tags="lang/stoptags_ja.txt"/>
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_ja.txt" ignoreCase="true"/>
<filter class="solr.JapaneseKatakanaStemFilterFactory" minimumLength="4"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_lv.txt" ignoreCase="true"/>
<filter class="solr.LatvianStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_nl.txt" ignoreCase="true"/>
<filter class="solr.StemmerOverrideFilterFactory" dictionary="lang/stemdict_nl.txt" ignoreCase="false"/>
<filter class="solr.SnowballPorterFilterFactory" language="Dutch"/>
</analyzer>
</fieldType>
<fieldType name="text_no" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_no.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Norwegian"/>
</analyzer>
</fieldType>
<fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_pt.txt" ignoreCase="true"/>
<filter class="solr.PortugueseLightStemFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_ro.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Romanian"/>
</analyzer>
</fieldType>
<fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_ru.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Russian"/>
</analyzer>
</fieldType>
<fieldType name="text_stemmed" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords-name.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" generateNumberParts="1" splitOnCaseChange="1" generateWordParts="1" catenateAll="0" catenateWords="0"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
<fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" format="snowball" words="lang/stopwords_sv.txt" ignoreCase="true"/>
<filter class="solr.SnowballPorterFilterFactory" language="Swedish"/>
</analyzer>
</fieldType>
<fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.TurkishLowerCaseFilterFactory"/>
<filter class="solr.StopFilterFactory" words="lang/stopwords_tr.txt" ignoreCase="false"/>
<filter class="solr.SnowballPorterFilterFactory" language="Turkish"/>
</analyzer>
</fieldType>
<fieldType name="text_unstemmed" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
<filter class="solr.WordDelimiterFilterFactory" catenateNumbers="0" generateNumberParts="1" splitOnCaseChange="0" generateWordParts="1" catenateAll="0" catenateWords="0"/>
<filter class="solr.ASCIIFoldingFilterFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- General text with separate index-time and query-time analysis.
     Index time: whitespace tokens, stop words, word-delimiter splitting that
     also catenates word parts and number parts, ASCII folding, lowercasing.
     Query time: the same chain preceded by synonym expansion from
     synonyms.txt, with every catenate option switched off. -->
<fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory" />
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt" />
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="0" />
    <filter class="solr.ASCIIFoldingFilterFactory" />
    <filter class="solr.LowerCaseFilterFactory" />
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory" />
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true" />
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="stopwords.txt" />
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="0"
            catenateNumbers="0"
            catenateAll="0"
            splitOnCaseChange="0" />
    <filter class="solr.ASCIIFoldingFilterFactory" />
    <filter class="solr.LowerCaseFilterFactory" />
  </analyzer>
</fieldType>
<!-- Trie numeric types. All three use precisionStep 8 and
     positionIncrementGap 0; tint additionally sets omitNorms and
     sortMissingLast. -->
<fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0" />
<fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0" omitNorms="true" sortMissingLast="true" />
<fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0" />
<!-- Explicitly declared fields. Attributes are written in the order
     name, type, indexed, stored, multiValued, then any remaining options. -->
<field name="ALLTEXT" type="text" indexed="true" stored="true" multiValued="true" />
<field name="ALLTEXTUNSTEMMED" type="textgen" indexed="true" stored="false" multiValued="true" />
<field name="BETA" type="float" indexed="true" stored="true" multiValued="false" />
<!-- DocId is the only field in this list marked required. -->
<field name="DocId" type="string" indexed="true" stored="true" multiValued="false" required="true" omitNorms="true" />
<field name="NAME_PHONETIC" type="phonetic" indexed="true" stored="false" multiValued="true" />
<field name="PREFERRED_TITLE" type="string" indexed="true" stored="true" multiValued="true" />
<field name="PROHIBITED_FROM_TEXT_RESULTS" type="string" indexed="true" stored="false" multiValued="true" omitNorms="true" />
<field name="THUMBNAIL" type="string" indexed="true" stored="true" />
<field name="THUMBNAIL_URL" type="string" indexed="false" stored="true" />
<field name="URI" type="string" indexed="true" stored="true" multiValued="false" omitNorms="true" />
<field name="_root_" type="string" indexed="true" stored="false" />
<field name="_version_" type="long" indexed="true" stored="true" />
<field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
<field name="acNameUntokenized" type="edgengram_untokenized" indexed="true" stored="false" multiValued="true" />
<field name="cat" type="string" indexed="true" stored="true" multiValued="true" />
<field name="classgroup" type="string" indexed="true" stored="true" multiValued="true" />
<field name="etag" type="string" indexed="false" stored="true" multiValued="false" />
<field name="features" type="text_general" indexed="true" stored="true" multiValued="true" />
<field name="inStock" type="boolean" indexed="true" stored="true" />
<!-- includes also keeps term vectors together with positions and offsets. -->
<field name="includes" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<field name="indexedTime" type="long" indexed="true" stored="true" />
<field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true" />
<field name="mostSpecificTypeURIs" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" />
<field name="name" type="text_general" indexed="true" stored="true" />
<field name="nameLowercase" type="lowercase" indexed="true" stored="false" multiValued="true" omitNorms="false" />
<field name="nameLowercaseSingleValued" type="lowercase" indexed="true" stored="false" multiValued="false" omitNorms="true" />
<field name="nameRaw" type="string" indexed="true" stored="true" multiValued="true" omitNorms="false" />
<field name="nameStemmed" type="text_stemmed" indexed="true" stored="false" multiValued="true" omitNorms="false" />
<field name="nameText" type="text" indexed="true" stored="false" multiValued="true" />
<field name="nameUnstemmed" type="text_unstemmed" indexed="true" stored="false" multiValued="true" omitNorms="false" />
<field name="popularity" type="int" indexed="true" stored="true" />
<field name="price" type="float" indexed="true" stored="true" />
<field name="siteName" type="string" indexed="true" stored="true" />
<field name="siteURL" type="string" indexed="true" stored="true" />
<field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true" />
<field name="store" type="location" indexed="true" stored="true" />
<!-- timestamp declares a default value of NOW. -->
<field name="timestamp" type="date" indexed="true" stored="true" multiValued="false" default="NOW" />
<field name="type" type="string" indexed="true" stored="true" multiValued="true" omitNorms="true" />
<field name="weight" type="float" indexed="true" stored="true" />
<!-- Dynamic field patterns. The entries appear in order of decreasing
     pattern length and that order is kept exactly as it was; only the
     attribute order inside each element has been normalized to
     name, type, indexed, stored, multiValued. -->
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" />
<dynamicField name="ignored_*" type="ignored" multiValued="true" />
<dynamicField name="*_string" type="string" indexed="true" stored="true" multiValued="true" />
<dynamicField name="random_*" type="random" />
<dynamicField name="*_tdate" type="tdate" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_text" type="text" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_tint" type="tint" indexed="true" stored="true" multiValued="true" />
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_tdt" type="tdate" indexed="true" stored="true" />
<dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true" />
<dynamicField name="*_dt" type="date" indexed="true" stored="true" />
<dynamicField name="*_ti" type="tint" indexed="true" stored="true" />
<dynamicField name="*_tl" type="tlong" indexed="true" stored="true" />
<dynamicField name="*_tf" type="tfloat" indexed="true" stored="true" />
<dynamicField name="*_td" type="tdouble" indexed="true" stored="true" />
<dynamicField name="*_i" type="int" indexed="true" stored="true" />
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
<dynamicField name="*_l" type="long" indexed="true" stored="true" />
<dynamicField name="*_t" type="text_general" indexed="true" stored="true" />
<dynamicField name="*_b" type="boolean" indexed="true" stored="true" />
<dynamicField name="*_f" type="float" indexed="true" stored="true" />
<dynamicField name="*_d" type="double" indexed="true" stored="true" />
<dynamicField name="*_p" type="location" indexed="true" stored="true" />
<dynamicField name="*_c" type="currency" indexed="true" stored="true" />
<!-- nameRaw is copied into each of the derived name fields below, so the
     same source value is analyzed once per destination field type. -->
<copyField source="nameRaw" dest="NAME_PHONETIC" />
<copyField source="nameRaw" dest="acNameStemmed" />
<copyField source="nameRaw" dest="acNameUntokenized" />
<copyField source="nameRaw" dest="nameLowercase" />
<copyField source="nameRaw" dest="nameStemmed" />
<copyField source="nameRaw" dest="nameText" />
<copyField source="nameRaw" dest="nameUnstemmed" />
</schema>

View file

@ -145,7 +145,7 @@
<field name="indexedTime" type="long" indexed="true" stored="true"/>
<field name="NAME_PHONETIC" type ="phonetic" indexed="true" stored="false" multiValued="true"/>
<field name="ALLTEXT" type="text_ru_iph" indexed="true" stored="true" multiValued="true"/>
<field name="ALLTEXT" type="text_ru_morph" indexed="true" stored="true" multiValued="true"/>
<field name="ALLTEXTUNSTEMMED" type="textgen" indexed="true" stored="false" multiValued="true"/>
<field name="THUMBNAIL" type="string" indexed="true" stored="true"/>
@ -1311,6 +1311,16 @@
</analyzer>
</fieldType>
<!-- Russian text with morphological analysis: HTML markup is stripped by a
     char filter, then standard tokenization, lowercasing, Snowball-format
     stop words from lang/stopwords_ru.txt, and finally the jmorphy2 stem
     filter, whose dict attribute points at lang/pymorphy2_dicts. -->
<fieldType name="text_ru_morph" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <charFilter class="solr.HTMLStripCharFilterFactory" />
    <tokenizer class="solr.StandardTokenizerFactory" />
    <filter class="solr.LowerCaseFilterFactory" />
    <filter class="solr.StopFilterFactory"
            words="lang/stopwords_ru.txt"
            format="snowball"
            ignoreCase="true" />
    <filter class="company.evo.jmorphy2.lucene.Jmorphy2StemFilterFactory"
            dict="lang/pymorphy2_dicts" />
  </analyzer>
</fieldType>
<!-- Swedish -->
<fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100">