NIHVIVO-2437 Upgrade to Solr 3.1

ryounes 2011-05-16 21:58:24 +00:00
parent ad98e7723c
commit 265a86019c
27 changed files with 7060 additions and 2693 deletions


BIN solr/apache-solr-3.1.0.war Normal file
Binary file not shown.

@@ -12,7 +12,7 @@
<property name="solr.build.dir" location="."/>
<property name="solr.example.dir" location="${solr.build.dir}/exampleSolr" />
<property name="solr.context.config.example" location="${solr.build.dir}/exampleSolrContext.xml"/>
<property name="solr.war" location="${solr.build.dir}/apache-solr-1.4.1.war"/>
<property name="solr.war" location="${solr.build.dir}/apache-solr-3.1.0.war"/>
<!-- =================================
target: describe

File diff suppressed because it is too large.

@@ -45,7 +45,16 @@
that avoids logging every request
-->
<schema name="example" version="1.2">
<schema name="example" version="1.3">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.2" is Solr's version number for the schema syntax and semantics. It should
not normally be changed by applications.
1.0: multiValued attribute did not exist, all fields are multiValued by nature
1.1: multiValued attribute introduced, false by default
1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
1.3: removed optional field compress feature
-->
<types>
<!-- field type definitions. The "name" attribute is
@@ -56,15 +65,12 @@
org.apache.solr.analysis package.
-->
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
- StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values which
exceed a certain size (in characters).
-->
<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
<fieldtype name="binary" class="solr.BinaryField"/>
@@ -208,8 +214,12 @@
words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled.
The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to
form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
to generate text:"pdp 11" rather than (text:PDP OR text:11).
NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
@@ -224,8 +234,10 @@
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<!-- <filter class="solr.PorterStemFilterFactory"/> -->
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
<analyzer type="query">
@@ -238,6 +250,7 @@
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
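
The autoGeneratePhraseQueries note above is easiest to check with debugQuery output. The following is a rough, untested SolrJ sketch, not part of this commit; the server URL and the assumption that Vitro's ALLTEXT catchall field uses this "text" type are mine. Because the analyzer splits "wi-fi" into two tokens, the parser should report the phrase query ALLTEXT:"wi fi" rather than a boolean OR.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;

public class PhraseQuerySketch {
    public static void main(String[] args) throws Exception {
        // Assumed URL; adjust to wherever the example Solr webapp is deployed.
        SolrServer solr = new CommonsHttpSolrServer("http://localhost:8983/solr");

        // "wi-fi" is split by WordDelimiterFilter; with autoGeneratePhraseQueries="true"
        // the query parser turns the split tokens into the phrase ALLTEXT:"wi fi".
        SolrQuery query = new SolrQuery("ALLTEXT:wi-fi");
        query.set("debugQuery", "true"); // the parsedquery entry shows the generated query

        QueryResponse response = solr.query(query);
        System.out.println(response.getDebugMap().get("parsedquery"));
    }
}
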
@@ -266,7 +279,8 @@
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
possible with WordDelimiterFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
@@ -397,11 +411,38 @@
</analyzer>
</fieldType>
<fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.PathHierarchyTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
<fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
<!-- This point type indexes the coordinates as separate fields (subFields)
If subFieldType is defined, it references a type, and a dynamic field
definition is created matching *___<typename>. Alternately, if
subFieldSuffix is defined, that is used to create the subFields.
Example: if subFieldType="double", then the coordinates would be
indexed in fields myloc_0___double,myloc_1___double.
Example: if subFieldSuffix="_d" then the coordinates would be indexed
in fields myloc_0_d,myloc_1_d
The subFields are an implementation detail of the fieldType, and end
users normally should not need to know about them.
-->
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
<!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. -->
<fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
<!--
A Geohash is a compact representation of a latitude longitude pair in a single field.
See http://wiki.apache.org/solr/SpatialSearch
-->
<fieldtype name="geohash" class="solr.GeoHashField"/>
</types>
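
The point, location, and geohash types tie into the *_coordinate and *_p dynamic fields added further down in this schema. As a rough illustration only, here is an untested SolrJ sketch; the server URL, the invented store_p field name, and the assumption that DocId is the only required field are mine. It indexes one lat,lon pair and filters by distance with the {!geofilt} parser introduced in Solr 3.1.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrInputDocument;

public class SpatialSketch {
    public static void main(String[] args) throws Exception {
        SolrServer solr = new CommonsHttpSolrServer("http://localhost:8983/solr");

        // "store_p" matches the *_p dynamicField (type "location"); LatLonType stores the
        // two coordinates in store_p_0_coordinate and store_p_1_coordinate via *_coordinate.
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("DocId", "spatial-example-1"); // required field in this schema
        doc.addField("store_p", "42.4440,-76.5019");
        solr.add(doc);
        solr.commit();

        // Restrict to documents within 10 km of a point using the geofilt query parser.
        SolrQuery query = new SolrQuery("*:*");
        query.addFilterQuery("{!geofilt sfield=store_p pt=42.44,-76.50 d=10}");
        QueryResponse response = solr.query(query);
        System.out.println(response.getResults().getNumFound() + " documents within 10 km");
    }
}
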
@@ -412,9 +453,6 @@
<types> section
indexed: true if this field should be indexed (searchable or sortable)
stored: true if this field should be retrievable
compressed: [false] if this field should be stored using gzip compression
(this will only apply if the field type is compressable; among
the standard field types, only TextField and StrField are)
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with
this field (this disables length normalization and index-time
@@ -432,9 +470,6 @@
when adding a document.
-->
<!-- **************************** Vitro Fields *************************** -->
<field name="DocId" type="string" indexed="true" stored="true" required="true" />
@@ -460,20 +495,25 @@
<field name="modType" type="ignored"/>
<field name="JCLASS" type="ignored"/>
<!-- **************************** End Vitro Fields *************************** -->
<!-- catchall field, containing all other searchable text fields (implemented
via copyField further on in this schema -->
<!-- Same as ALLTEXT
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
-->
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
<!-- Uncommenting the following will create a "timestamp" field using
a default value of "NOW" to indicate when each document was indexed.
-->
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
@@ -485,10 +525,16 @@
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
<!-- Type used to index the lat and lon components for the "location" FieldType -->
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
<!-- some trie-coded dynamic fields for faster range queries -->
<dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
@@ -527,8 +573,6 @@
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching. -->
<!-- <copyField source="name" dest="text"/> -->
<!-- Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same
destination field is to use the dynamic field syntax.

File diff suppressed because it is too large.

@@ -12,11 +12,9 @@
#-----------------------------------------------------------------------
#some test synonym mappings unlikely to appear in real input text
aaa => aaaa
bbb => bbbb1 bbbb2
ccc => cccc1,cccc2
a\=>a => b\=>b
a\,a => b\,b
aaafoo => aaabar
bbbfoo => bbbfoo bbbbar
cccfoo => cccbar cccbaz
fooaaa,baraaa,bazaaa
# Some synonym groups specific to this example

@@ -24,7 +24,7 @@
xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
>
<xsl:output media-type="text/html; charset=UTF-8" encoding="UTF-8"/>
<xsl:output media-type="text/html" encoding="UTF-8"/>
<xsl:variable name="title" select="concat('Solr search results (',response/result/@numFound,' documents)')"/>

@@ -27,7 +27,7 @@
<xsl:output
method="xml"
encoding="utf-8"
media-type="text/xml; charset=UTF-8"
media-type="application/xml"
/>
<xsl:template match='/'>

@@ -27,7 +27,7 @@
<xsl:output
method="xml"
encoding="utf-8"
media-type="text/xml; charset=UTF-8"
media-type="application/xml"
/>
<xsl:template match='/'>
<rss version="2.0">

@@ -28,7 +28,7 @@
<xsl:output
method="html"
encoding="UTF-8"
media-type="text/html; charset=UTF-8"
media-type="text/html"
doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
/>

solr/exampleSolr/solr.xml Normal file
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
All (relative) paths are relative to the installation path
persistent: Save changes made via the API to this file
sharedLib: path to a lib directory that will be shared across all cores
-->
<solr persistent="false">
<!--
adminPath: RequestHandler path to manage cores.
If 'null' (or absent), cores will not be manageable via request handler
-->
<cores adminPath="/admin/cores" defaultCoreName="collection1">
<core name="collection1" instanceDir="." />
</cores>
</solr>
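
This solr.xml is new because the Solr 3.1 example runs a CoreContainer even for a single index. With defaultCoreName="collection1", requests that omit the core name should fall through to collection1, so existing single-core client URLs keep working. An untested SolrJ sketch; the host, port, and the presence of the example /admin/ping handler are assumptions:

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;

public class CoreUrlSketch {
    public static void main(String[] args) throws Exception {
        // Both URLs should reach the same index: the bare /solr path resolves to the
        // defaultCoreName ("collection1") declared in solr.xml.
        SolrServer defaultCore = new CommonsHttpSolrServer("http://localhost:8983/solr");
        SolrServer namedCore = new CommonsHttpSolrServer("http://localhost:8983/solr/collection1");

        System.out.println(defaultCore.ping().getStatus());
        System.out.println(namedCore.ping().getStatus());
    }
}
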

Binary files not shown (3 files).

@@ -14,7 +14,6 @@ import net.sf.jga.fn.UnaryFunctor;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.analysis.HTMLStripReader;
import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
@@ -141,23 +140,27 @@ public abstract class VitroHighlighter extends UnaryFunctor<String,String> {
private final String stripHtml(String in){
/* make a string with html stripped out */
Reader stripIn =new HTMLStripReader( new StringReader( in ) );
StringWriter stripOut = new StringWriter(in.length());
char bytes[] = new char[5000];
int bytesRead = 0;
try {
//this is a mess, there must be a better way to do this.
while ( true ){
bytesRead = stripIn.read( bytes );
if( bytesRead == -1 ) break;
stripOut.write(bytes, 0, bytesRead );
}
} catch (IOException e1) {
log.error("LuceneHighlighter.getHighlightFragments()" +
" - unable to strip html" + e1);
}
return stripOut.toString();
// ryounes 5/16/2011 Broken with upgrade to Solr 3.1: HTMLStripReader has been removed.
// According to change list, should use HTMLStripCharFilter, but it's not immediately clear how
// to migrate this code. Will enter Jira issue.
// Reader stripIn = new HTMLStripReader( new StringReader( in ) );
// StringWriter stripOut = new StringWriter(in.length());
//
// char bytes[] = new char[5000];
// int bytesRead = 0;
// try {
// //this is a mess, there must be a better way to do this.
// while ( true ){
// bytesRead = stripIn.read( bytes );
// if( bytesRead == -1 ) break;
// stripOut.write(bytes, 0, bytesRead );
// }
// } catch (IOException e1) {
// log.error("LuceneHighlighter.getHighlightFragments()" +
// " - unable to strip html" + e1);
// }
// return stripOut.toString();
return in;
}
}
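
The commented-out block above notes that HTMLStripReader is gone in Solr 3.1 and that the migration path is unclear. One plausible replacement, sketched here as an untested suggestion rather than the committed fix, is HTMLStripCharFilter: it is itself a Reader once wrapped around a CharStream, so the original copy loop carries over almost unchanged.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;

import org.apache.lucene.analysis.CharReader;
import org.apache.solr.analysis.HTMLStripCharFilter;

public class HtmlStripSketch {
    /** Strip markup from a fragment, falling back to the raw text on error. */
    static String stripHtml(String in) {
        // HTMLStripCharFilter is a CharFilter, i.e. itself a Reader, so the old
        // read/write loop from VitroHighlighter.stripHtml() can be kept as-is.
        Reader stripIn = new HTMLStripCharFilter(CharReader.get(new StringReader(in)));
        StringWriter stripOut = new StringWriter(in.length());
        char[] buffer = new char[5000];
        try {
            int read;
            while ((read = stripIn.read(buffer)) != -1) {
                stripOut.write(buffer, 0, read);
            }
        } catch (IOException e) {
            return in; // same fallback as the committed code: return the unstripped text
        }
        return stripOut.toString();
    }
}
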

@@ -141,7 +141,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//There may be other non-html formats in the future
Format format = getFormat(vreq);
boolean wasXmlRequested = Format.XML == format;
log.debug("xml was the requested format");
log.debug("Requested format was " + (wasXmlRequested ? "xml" : "html"));
boolean wasHtmlRequested = ! wasXmlRequested;
try {

@@ -128,7 +128,7 @@ public class SolrPagedSearchController extends FreemarkerHttpServlet {
//There may be other non-html formats in the future
Format format = getFormat(vreq);
boolean wasXmlRequested = Format.XML == format;
log.debug("xml was the requested format");
log.debug("Requested format was " + (wasXmlRequested ? "xml" : "html"));
boolean wasHtmlRequested = ! wasXmlRequested;
try {