NIHVIVO-2437 Upgrade to Solr 3.1

ryounes 2011-05-16 21:58:24 +00:00
parent ad98e7723c
commit 265a86019c
27 changed files with 7060 additions and 2693 deletions

Binary file not shown.

solr/apache-solr-3.1.0.war (new binary file)

Binary file not shown.


@@ -12,7 +12,7 @@
 <property name="solr.build.dir" location="."/>
 <property name="solr.example.dir" location="${solr.build.dir}/exampleSolr" />
 <property name="solr.context.config.example" location="${solr.build.dir}/exampleSolrContext.xml"/>
-<property name="solr.war" location="${solr.build.dir}/apache-solr-1.4.1.war"/>
+<property name="solr.war" location="${solr.build.dir}/apache-solr-3.1.0.war"/>
 <!-- =================================
      target: describe

File diff suppressed because it is too large.


@@ -45,7 +45,16 @@
      that avoids logging every request
   -->
-<schema name="example" version="1.2">
+<schema name="example" version="1.3">
+  <!-- attribute "name" is the name of this schema and is only used for display purposes.
+       Applications should change this to reflect the nature of the search collection.
+       version="1.2" is Solr's version number for the schema syntax and semantics. It should
+       not normally be changed by applications.
+       1.0: multiValued attribute did not exist, all fields are multiValued by nature
+       1.1: multiValued attribute introduced, false by default
+       1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
+       1.3: removed optional field compress feature
+    -->
   <types>
     <!-- field type definitions. The "name" attribute is
@@ -56,15 +65,12 @@
        org.apache.solr.analysis package.
      -->
-    <!-- The StrField type is not analyzed, but indexed/stored verbatim.
-      StrField and TextField support an optional compressThreshold which
-      limits compression (if enabled in the derived fields) to values which
-      exceed a certain size (in characters).
-    -->
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
     <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
     <!-- boolean type: "true" or "false" -->
     <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
     <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
     <fieldtype name="binary" class="solr.BinaryField"/>
@@ -208,8 +214,12 @@
         words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
         so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
         Synonyms and stopwords are customized by external files, and stemming is enabled.
+        The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to
+        form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
+        to generate text:"pdp 11" rather than (text:PDP OR text:11).
+        NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
         -->
-    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
+    <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <!-- in this example, we will only use synonyms at query time
@@ -224,8 +234,10 @@
                 words="stopwords.txt"
                 enablePositionIncrements="true"
                 />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <!-- <filter class="solr.PorterStemFilterFactory"/> -->
         <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
       </analyzer>
       <analyzer type="query">
@@ -238,6 +250,7 @@
                 />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
         <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
       </analyzer>
     </fieldType>
@@ -266,7 +279,8 @@
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
         <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
              possible with WordDelimiterFilter in conjuncton with stemming. -->
         <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
@@ -397,11 +411,38 @@
       </analyzer>
     </fieldType>
+    <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
     <!-- since fields of this type are by default not stored or indexed,
          any data added to them will be ignored outright. -->
     <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+    <!-- This point type indexes the coordinates as separate fields (subFields)
+      If subFieldType is defined, it references a type, and a dynamic field
+      definition is created matching *___<typename>. Alternately, if
+      subFieldSuffix is defined, that is used to create the subFields.
+      Example: if subFieldType="double", then the coordinates would be
+        indexed in fields myloc_0___double,myloc_1___double.
+      Example: if subFieldSuffix="_d" then the coordinates would be indexed
+        in fields myloc_0_d,myloc_1_d
+      The subFields are an implementation detail of the fieldType, and end
+      users normally should not need to know about them.
+     -->
+    <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
+    <!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. -->
+    <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
+    <!--
+      A Geohash is a compact representation of a latitude longitude pair in a single field.
+      See http://wiki.apache.org/solr/SpatialSearch
+     -->
+    <fieldtype name="geohash" class="solr.GeoHashField"/>
   </types>
@@ -412,9 +453,6 @@
       <types> section
     indexed: true if this field should be indexed (searchable or sortable)
     stored: true if this field should be retrievable
-    compressed: [false] if this field should be stored using gzip compression
-      (this will only apply if the field type is compressable; among
-      the standard field types, only TextField and StrField are)
     multiValued: true if this field may contain multiple values per document
     omitNorms: (expert) set to true to omit the norms associated with
       this field (this disables length normalization and index-time
@@ -432,9 +470,6 @@
      when adding a document.
      -->
   <!-- **************************** Vitro Fields *************************** -->
   <field name="DocId" type="string" indexed="true" stored="true" required="true" />
@@ -460,20 +495,25 @@
   <field name="modType" type="ignored"/>
   <field name="JCLASS" type="ignored"/>
+  <!-- **************************** End Vitro Fields *************************** -->
   <!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema -->
+  <!-- Same as ALLTEXT
   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+  -->
   <!-- catchall text field that indexes tokens both normally and in reverse for efficient
        leading wildcard queries. -->
   <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
+  <!-- Uncommenting the following will create a "timestamp" field using
+       a default value of "NOW" to indicate when each document was indexed.
+    -->
   <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
   <!-- Dynamic field definitions. If a field name is not found, dynamicFields
        will be used if the name matches any of the patterns.
        RESTRICTION: the glob-like pattern in the name attribute must have
@@ -485,10 +525,16 @@
   <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
   <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
   <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
+  <dynamicField name="*_txt" type="text" indexed="true" stored="true" multiValued="true"/>
   <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
   <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
   <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
+  <!-- Type used to index the lat and lon components for the "location" FieldType -->
+  <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/>
   <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+  <dynamicField name="*_p" type="location" indexed="true" stored="true"/>
   <!-- some trie-coded dynamic fields for faster range queries -->
   <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
@@ -527,8 +573,6 @@
   is added to the index. It's used either to index the same field differently,
   or to add multiple fields to the same field for easier/faster searching. -->
-  <!-- <copyField source="name" dest="text"/> -->
   <!-- Above, multiple source fields are copied to the [text] field.
        Another way to map multiple source fields to the same
        destination field is to use the dynamic field syntax.
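
The point, location, and geohash types added above, together with the new *_coordinate and *_p dynamic fields, expose the spatial search support introduced in Solr 3.1: a field matching *_p holds a "lat,lon" string and is backed by hidden *_0_coordinate / *_1_coordinate tdouble subfields. Below is a minimal SolrJ sketch of indexing and filtering on such a field, assuming a local Solr instance at http://localhost:8080/solr and an illustrative field name (myLocation_p) that is not part of the Vitro schema itself.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;

public class SpatialFieldSketch {
    public static void main(String[] args) throws Exception {
        // Assumed local instance; adjust host, port, and webapp path for a real deployment.
        SolrServer solr = new CommonsHttpSolrServer("http://localhost:8080/solr");

        // "myLocation_p" matches the new "*_p" dynamic field of type "location";
        // the value is "latitude,longitude" and is split into the *_coordinate subfields.
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("DocId", "spatial-demo-1");          // DocId is the required key field in this schema
        doc.addField("myLocation_p", "42.4534,-76.4735");
        solr.add(doc);
        solr.commit();

        // Restrict results to documents within 10 km of a point using the
        // geodetic filter query parser that ships with Solr 3.1 spatial search.
        SolrQuery query = new SolrQuery("*:*");
        query.addFilterQuery("{!geofilt sfield=myLocation_p pt=42.4534,-76.4735 d=10}");
        System.out.println(solr.query(query).getResults().getNumFound());
    }
}

The {!geofilt} parser is part of the spatial support described at http://wiki.apache.org/solr/SpatialSearch.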

File diff suppressed because it is too large.


@@ -12,11 +12,9 @@
 #-----------------------------------------------------------------------
 #some test synonym mappings unlikely to appear in real input text
-aaa => aaaa
-bbb => bbbb1 bbbb2
-ccc => cccc1,cccc2
-a\=>a => b\=>b
-a\,a => b\,b
+aaafoo => aaabar
+bbbfoo => bbbfoo bbbbar
+cccfoo => cccbar cccbaz
 fooaaa,baraaa,bazaaa
 # Some synonym groups specific to this example


@@ -24,7 +24,7 @@
   xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
 >
-  <xsl:output media-type="text/html; charset=UTF-8" encoding="UTF-8"/>
+  <xsl:output media-type="text/html" encoding="UTF-8"/>
   <xsl:variable name="title" select="concat('Solr search results (',response/result/@numFound,' documents)')"/>


@@ -27,7 +27,7 @@
   <xsl:output
     method="xml"
     encoding="utf-8"
-    media-type="text/xml; charset=UTF-8"
+    media-type="application/xml"
   />
   <xsl:template match='/'>


@@ -27,7 +27,7 @@
   <xsl:output
     method="xml"
     encoding="utf-8"
-    media-type="text/xml; charset=UTF-8"
+    media-type="application/xml"
   />
   <xsl:template match='/'>
     <rss version="2.0">


@@ -28,7 +28,7 @@
   <xsl:output
     method="html"
     encoding="UTF-8"
-    media-type="text/html; charset=UTF-8"
+    media-type="text/html"
     doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
     doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
   />

solr/exampleSolr/solr.xml (new file, 34 lines)

@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
All (relative) paths are relative to the installation path
persistent: Save changes made via the API to this file
sharedLib: path to a lib directory that will be shared across all cores
-->
<solr persistent="false">
<!--
adminPath: RequestHandler path to manage cores.
If 'null' (or absent), cores will not be manageable via request handler
-->
<cores adminPath="/admin/cores" defaultCoreName="collection1">
<core name="collection1" instanceDir="." />
</cores>
</solr>
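
For context on the solr.xml above: with persistent="false", core changes made through the admin handler are not written back to this file, and defaultCoreName="collection1" means the single core answers both at its own path and at the bare /solr path. A minimal SolrJ 3.1 sketch of that equivalence follows; the host, port, and webapp path are placeholder assumptions.

import java.io.IOException;
import java.net.MalformedURLException;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;

public class SolrCoreSmokeTest {
    public static void main(String[] args)
            throws MalformedURLException, SolrServerException, IOException {
        // Because defaultCoreName="collection1", both URLs reach the same core.
        SolrServer viaDefaultPath = new CommonsHttpSolrServer("http://localhost:8080/solr");
        SolrServer viaCoreName = new CommonsHttpSolrServer("http://localhost:8080/solr/collection1");

        // ping() issues a request to /admin/ping; a status of 0 means the core is healthy.
        System.out.println(viaDefaultPath.ping().getStatus());
        System.out.println(viaCoreName.ping().getStatus());
    }
}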

Binary file not shown.

Binary file not shown.

Binary file not shown.


@@ -14,7 +14,6 @@ import net.sf.jga.fn.UnaryFunctor;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.solr.analysis.HTMLStripReader;
 import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
 import edu.cornell.mannlib.vitro.webapp.beans.Individual;
@@ -141,23 +140,27 @@ public abstract class VitroHighlighter extends UnaryFunctor<String,String> {
     private final String stripHtml(String in){
         /* make a string with html stripped out */
-        Reader stripIn = new HTMLStripReader( new StringReader( in ) );
-        StringWriter stripOut = new StringWriter(in.length());
-
-        char bytes[] = new char[5000];
-        int bytesRead = 0;
-        try {
-            //this is a mess, there must be a better way to do this.
-            while ( true ){
-                bytesRead = stripIn.read( bytes );
-                if( bytesRead == -1 ) break;
-                stripOut.write(bytes, 0, bytesRead );
-            }
-        } catch (IOException e1) {
-            log.error("LuceneHighlighter.getHighlightFragments()" +
-                    " - unable to strip html" + e1);
-        }
-        return stripOut.toString();
+        // ryounes 5/16/2011 Broken with upgrade to Solr 3.1: HTMLStripReader has been removed.
+        // According to change list, should use HTMLStripCharFilter, but it's not immediately clear how
+        // to migrate this code. Will enter Jira issue.
+//        Reader stripIn = new HTMLStripReader( new StringReader( in ) );
+//        StringWriter stripOut = new StringWriter(in.length());
+//
+//        char bytes[] = new char[5000];
+//        int bytesRead = 0;
+//        try {
+//            //this is a mess, there must be a better way to do this.
+//            while ( true ){
+//                bytesRead = stripIn.read( bytes );
+//                if( bytesRead == -1 ) break;
+//                stripOut.write(bytes, 0, bytesRead );
+//            }
+//        } catch (IOException e1) {
+//            log.error("LuceneHighlighter.getHighlightFragments()" +
+//                    " - unable to strip html" + e1);
+//        }
+//        return stripOut.toString();
+        return in;
     }
 }
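
The deferred migration noted in the comments above is probably small: Solr 3.1 replaces HTMLStripReader with HTMLStripCharFilter, which wraps a CharStream instead of a plain Reader but is still itself a Reader. A sketch of what the replacement might look like, assuming org.apache.solr.analysis.HTMLStripCharFilter and org.apache.lucene.analysis.CharReader from the Solr/Lucene 3.1 jars (not verified against this codebase):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;

import org.apache.lucene.analysis.CharReader;
import org.apache.solr.analysis.HTMLStripCharFilter;

final class HtmlStripSketch {

    /** Strip HTML markup from a string, mirroring what the old HTMLStripReader loop did. */
    static String stripHtml(String in) {
        // HTMLStripCharFilter expects a CharStream; CharReader.get() adapts a plain Reader.
        Reader stripIn = new HTMLStripCharFilter(CharReader.get(new StringReader(in)));
        StringWriter stripOut = new StringWriter(in.length());
        char[] buffer = new char[5000];
        try {
            int read;
            while ((read = stripIn.read(buffer)) != -1) {
                stripOut.write(buffer, 0, read);
            }
        } catch (IOException e) {
            // Fall back to the unstripped input, matching the patched method's behavior.
            return in;
        }
        return stripOut.toString();
    }
}

If this works out, the method could go back to returning stripped text instead of the raw input.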


@@ -141,7 +141,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
         //There may be other non-html formats in the future
         Format format = getFormat(vreq);
         boolean wasXmlRequested = Format.XML == format;
-        log.debug("xml was the requested format");
+        log.debug("Requested format was " + (wasXmlRequested ? "xml" : "html"));
         boolean wasHtmlRequested = ! wasXmlRequested;
         try {


@@ -128,7 +128,7 @@ public class SolrPagedSearchController extends FreemarkerHttpServlet {
         //There may be other non-html formats in the future
         Format format = getFormat(vreq);
         boolean wasXmlRequested = Format.XML == format;
-        log.debug("xml was the requested format");
+        log.debug("Requested format was " + (wasXmlRequested ? "xml" : "html"));
         boolean wasHtmlRequested = ! wasXmlRequested;
         try {