NIHVIVO-2437 Upgrade to Solr 3.1

ryounes 2011-05-16 21:58:24 +00:00
parent ad98e7723c
commit 265a86019c
27 changed files with 7060 additions and 2693 deletions

Binary file not shown.

solr/apache-solr-3.1.0.war (new binary file)

Binary file not shown.


@@ -12,7 +12,7 @@
 <property name="solr.build.dir" location="."/>
 <property name="solr.example.dir" location="${solr.build.dir}/exampleSolr" />
 <property name="solr.context.config.example" location="${solr.build.dir}/exampleSolrContext.xml"/>
-<property name="solr.war" location="${solr.build.dir}/apache-solr-1.4.1.war"/>
+<property name="solr.war" location="${solr.build.dir}/apache-solr-3.1.0.war"/>
 <!-- =================================
      target: describe

File diff suppressed because it is too large.


@@ -45,7 +45,16 @@
      that avoids logging every request
   -->
-<schema name="example" version="1.2">
+<schema name="example" version="1.3">
+  <!-- attribute "name" is the name of this schema and is only used for display purposes.
+       Applications should change this to reflect the nature of the search collection.
+       version="1.2" is Solr's version number for the schema syntax and semantics. It should
+       not normally be changed by applications.
+       1.0: multiValued attribute did not exist, all fields are multiValued by nature
+       1.1: multiValued attribute introduced, false by default
+       1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
+       1.3: removed optional field compress feature
+    -->
   <types>
     <!-- field type definitions. The "name" attribute is
@@ -56,15 +65,12 @@
        org.apache.solr.analysis package.
      -->
-    <!-- The StrField type is not analyzed, but indexed/stored verbatim.
-      StrField and TextField support an optional compressThreshold which
-      limits compression (if enabled in the derived fields) to values which
-      exceed a certain size (in characters).
-    -->
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
     <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
     <!-- boolean type: "true" or "false" -->
     <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
     <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
     <fieldtype name="binary" class="solr.BinaryField"/>
@@ -208,8 +214,12 @@
         words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
         so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
         Synonyms and stopwords are customized by external files, and stemming is enabled.
+        The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to
+        form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
+        to generate text:"pdp 11" rather than (text:PDP OR text:11).
+        NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
         -->
-    <fieldType name="text" class="solr.TextField" positionIncrementGap="100">
+    <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
       <analyzer type="index">
         <tokenizer class="solr.WhitespaceTokenizerFactory"/>
         <!-- in this example, we will only use synonyms at query time
@@ -224,8 +234,10 @@
                 words="stopwords.txt"
                 enablePositionIncrements="true"
                 />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <!-- <filter class="solr.PorterStemFilterFactory"/> -->
         <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
       </analyzer>
       <analyzer type="query">
@@ -238,6 +250,7 @@
                 />
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
         <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
         <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
       </analyzer>
     </fieldType>
@@ -266,7 +279,8 @@
         <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
         <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
         <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
+        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
+        <filter class="solr.EnglishMinimalStemFilterFactory"/>
         <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
              possible with WordDelimiterFilter in conjuncton with stemming. -->
         <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
@@ -397,11 +411,38 @@
       </analyzer>
     </fieldType>
+    <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
     <!-- since fields of this type are by default not stored or indexed,
          any data added to them will be ignored outright. -->
     <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
+    <!-- This point type indexes the coordinates as separate fields (subFields)
+      If subFieldType is defined, it references a type, and a dynamic field
+      definition is created matching *___<typename>. Alternately, if
+      subFieldSuffix is defined, that is used to create the subFields.
+      Example: if subFieldType="double", then the coordinates would be
+        indexed in fields myloc_0___double,myloc_1___double.
+      Example: if subFieldSuffix="_d" then the coordinates would be indexed
+        in fields myloc_0_d,myloc_1_d
+      The subFields are an implementation detail of the fieldType, and end
+      users normally should not need to know about them.
+     -->
+    <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
+    <!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. -->
+    <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
+    <!--
+      A Geohash is a compact representation of a latitude longitude pair in a single field.
+      See http://wiki.apache.org/solr/SpatialSearch
+     -->
+    <fieldtype name="geohash" class="solr.GeoHashField"/>
   </types>
@@ -412,9 +453,6 @@
       <types> section
     indexed: true if this field should be indexed (searchable or sortable)
     stored: true if this field should be retrievable
-    compressed: [false] if this field should be stored using gzip compression
-      (this will only apply if the field type is compressable; among
-      the standard field types, only TextField and StrField are)
     multiValued: true if this field may contain multiple values per document
     omitNorms: (expert) set to true to omit the norms associated with
       this field (this disables length normalization and index-time
@@ -432,9 +470,6 @@
      when adding a document.
      -->
   <!-- **************************** Vitro Fields *************************** -->
   <field name="DocId" type="string" indexed="true" stored="true" required="true" />
@@ -460,20 +495,25 @@
   <field name="modType" type="ignored"/>
   <field name="JCLASS" type="ignored"/>
+  <!-- **************************** End Vitro Fields *************************** -->
   <!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema -->
+  <!-- Same as ALLTEXT
   <field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
+  -->
   <!-- catchall text field that indexes tokens both normally and in reverse for efficient
        leading wildcard queries. -->
   <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
+  <!-- Uncommenting the following will create a "timestamp" field using
+       a default value of "NOW" to indicate when each document was indexed.
+    -->
   <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
   <!-- Dynamic field definitions. If a field name is not found, dynamicFields
        will be used if the name matches any of the patterns.
        RESTRICTION: the glob-like pattern in the name attribute must have
@@ -485,10 +525,16 @@
   <dynamicField name="*_s" type="string" indexed="true" stored="true"/>
   <dynamicField name="*_l" type="long" indexed="true" stored="true"/>
   <dynamicField name="*_t" type="text" indexed="true" stored="true"/>
+  <dynamicField name="*_txt" type="text" indexed="true" stored="true" multiValued="true"/>
   <dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
   <dynamicField name="*_f" type="float" indexed="true" stored="true"/>
   <dynamicField name="*_d" type="double" indexed="true" stored="true"/>
+  <!-- Type used to index the lat and lon components for the "location" FieldType -->
+  <dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/>
   <dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
+  <dynamicField name="*_p" type="location" indexed="true" stored="true"/>
   <!-- some trie-coded dynamic fields for faster range queries -->
   <dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
@@ -527,8 +573,6 @@
   is added to the index. It's used either to index the same field differently,
   or to add multiple fields to the same field for easier/faster searching. -->
-  <!-- <copyField source="name" dest="text"/> -->
   <!-- Above, multiple source fields are copied to the [text] field.
        Another way to map multiple source fields to the same
        destination field is to use the dynamic field syntax.
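
The point, location, and geohash types added above, together with the new *_coordinate and *_p dynamic fields, expose the spatial search support introduced in Solr 3.1: a field matching *_p holds a "lat,lon" string and is backed by hidden *_0_coordinate / *_1_coordinate tdouble subfields. Below is a minimal SolrJ sketch of indexing and filtering on such a field, assuming a local Solr instance at http://localhost:8080/solr and an illustrative field name (myLocation_p) that is not part of the Vitro schema itself.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.common.SolrInputDocument;

public class SpatialFieldSketch {
    public static void main(String[] args) throws Exception {
        // Assumed local instance; adjust host, port, and webapp path for a real deployment.
        SolrServer solr = new CommonsHttpSolrServer("http://localhost:8080/solr");

        // "myLocation_p" matches the new "*_p" dynamic field of type "location";
        // the value is "latitude,longitude" and is split into the *_coordinate subfields.
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("DocId", "spatial-demo-1");          // DocId is the required key field in this schema
        doc.addField("myLocation_p", "42.4534,-76.4735");
        solr.add(doc);
        solr.commit();

        // Restrict results to documents within 10 km of a point using the
        // geodetic filter query parser that ships with Solr 3.1 spatial search.
        SolrQuery query = new SolrQuery("*:*");
        query.addFilterQuery("{!geofilt sfield=myLocation_p pt=42.4534,-76.4735 d=10}");
        System.out.println(solr.query(query).getResults().getNumFound());
    }
}

The {!geofilt} parser is part of the spatial support described at http://wiki.apache.org/solr/SpatialSearch.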

File diff suppressed because it is too large.


@@ -12,11 +12,9 @@
 #-----------------------------------------------------------------------
 #some test synonym mappings unlikely to appear in real input text
-aaa => aaaa
-bbb => bbbb1 bbbb2
-ccc => cccc1,cccc2
-a\=>a => b\=>b
-a\,a => b\,b
+aaafoo => aaabar
+bbbfoo => bbbfoo bbbbar
+cccfoo => cccbar cccbaz
 fooaaa,baraaa,bazaaa
 # Some synonym groups specific to this example


@@ -24,7 +24,7 @@
   xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
 >
-  <xsl:output media-type="text/html; charset=UTF-8" encoding="UTF-8"/>
+  <xsl:output media-type="text/html" encoding="UTF-8"/>
   <xsl:variable name="title" select="concat('Solr search results (',response/result/@numFound,' documents)')"/>


@@ -27,7 +27,7 @@
   <xsl:output
     method="xml"
     encoding="utf-8"
-    media-type="text/xml; charset=UTF-8"
+    media-type="application/xml"
   />
   <xsl:template match='/'>


@@ -27,7 +27,7 @@
   <xsl:output
     method="xml"
     encoding="utf-8"
-    media-type="text/xml; charset=UTF-8"
+    media-type="application/xml"
   />
   <xsl:template match='/'>
     <rss version="2.0">


@@ -28,7 +28,7 @@
   <xsl:output
     method="html"
     encoding="UTF-8"
-    media-type="text/html; charset=UTF-8"
+    media-type="text/html"
     doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
     doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
   />

solr/exampleSolr/solr.xml (new file, 34 lines)

@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
All (relative) paths are relative to the installation path
persistent: Save changes made via the API to this file
sharedLib: path to a lib directory that will be shared across all cores
-->
<solr persistent="false">
<!--
adminPath: RequestHandler path to manage cores.
If 'null' (or absent), cores will not be manageable via request handler
-->
<cores adminPath="/admin/cores" defaultCoreName="collection1">
<core name="collection1" instanceDir="." />
</cores>
</solr>
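
For context on the solr.xml above: with persistent="false", core changes made through the admin handler are not written back to this file, and defaultCoreName="collection1" means the single core answers both at its own path and at the bare /solr path. A minimal SolrJ 3.1 sketch of that equivalence follows; the host, port, and webapp path are placeholder assumptions.

import java.io.IOException;
import java.net.MalformedURLException;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;

public class SolrCoreSmokeTest {
    public static void main(String[] args)
            throws MalformedURLException, SolrServerException, IOException {
        // Because defaultCoreName="collection1", both URLs reach the same core.
        SolrServer viaDefaultPath = new CommonsHttpSolrServer("http://localhost:8080/solr");
        SolrServer viaCoreName = new CommonsHttpSolrServer("http://localhost:8080/solr/collection1");

        // ping() issues a request to /admin/ping; a status of 0 means the core is healthy.
        System.out.println(viaDefaultPath.ping().getStatus());
        System.out.println(viaCoreName.ping().getStatus());
    }
}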

Binary file not shown.

Binary file not shown.

Binary file not shown.


@@ -14,7 +14,6 @@ import net.sf.jga.fn.UnaryFunctor;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
-import org.apache.solr.analysis.HTMLStripReader;
 import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
 import edu.cornell.mannlib.vitro.webapp.beans.Individual;
@@ -141,23 +140,27 @@ public abstract class VitroHighlighter extends UnaryFunctor<String,String> {
     private final String stripHtml(String in){
         /* make a string with html stripped out */
-        Reader stripIn = new HTMLStripReader( new StringReader( in ) );
-        StringWriter stripOut = new StringWriter(in.length());
-
-        char bytes[] = new char[5000];
-        int bytesRead = 0;
-        try {
-            //this is a mess, there must be a better way to do this.
-            while ( true ){
-                bytesRead = stripIn.read( bytes );
-                if( bytesRead == -1 ) break;
-                stripOut.write(bytes, 0, bytesRead );
-            }
-        } catch (IOException e1) {
-            log.error("LuceneHighlighter.getHighlightFragments()" +
-                    " - unable to strip html" + e1);
-        }
-        return stripOut.toString();
+        // ryounes 5/16/2011 Broken with upgrade to Solr 3.1: HTMLStripReader has been removed.
+        // According to change list, should use HTMLStripCharFilter, but it's not immediately clear how
+        // to migrate this code. Will enter Jira issue.
+//        Reader stripIn = new HTMLStripReader( new StringReader( in ) );
+//        StringWriter stripOut = new StringWriter(in.length());
+//
+//        char bytes[] = new char[5000];
+//        int bytesRead = 0;
+//        try {
+//            //this is a mess, there must be a better way to do this.
+//            while ( true ){
+//                bytesRead = stripIn.read( bytes );
+//                if( bytesRead == -1 ) break;
+//                stripOut.write(bytes, 0, bytesRead );
+//            }
+//        } catch (IOException e1) {
+//            log.error("LuceneHighlighter.getHighlightFragments()" +
+//                    " - unable to strip html" + e1);
+//        }
+//        return stripOut.toString();
+        return in;
     }
 }
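
The deferred migration noted in the comments above is probably small: Solr 3.1 replaces HTMLStripReader with HTMLStripCharFilter, which wraps a CharStream instead of a plain Reader but is still itself a Reader. A sketch of what the replacement might look like, assuming org.apache.solr.analysis.HTMLStripCharFilter and org.apache.lucene.analysis.CharReader from the Solr/Lucene 3.1 jars (not verified against this codebase):

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;

import org.apache.lucene.analysis.CharReader;
import org.apache.solr.analysis.HTMLStripCharFilter;

final class HtmlStripSketch {

    /** Strip HTML markup from a string, mirroring what the old HTMLStripReader loop did. */
    static String stripHtml(String in) {
        // HTMLStripCharFilter expects a CharStream; CharReader.get() adapts a plain Reader.
        Reader stripIn = new HTMLStripCharFilter(CharReader.get(new StringReader(in)));
        StringWriter stripOut = new StringWriter(in.length());
        char[] buffer = new char[5000];
        try {
            int read;
            while ((read = stripIn.read(buffer)) != -1) {
                stripOut.write(buffer, 0, read);
            }
        } catch (IOException e) {
            // Fall back to the unstripped input, matching the patched method's behavior.
            return in;
        }
        return stripOut.toString();
    }
}

If this works out, the method could go back to returning stripped text instead of the raw input.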


@@ -141,7 +141,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
         //There may be other non-html formats in the future
         Format format = getFormat(vreq);
         boolean wasXmlRequested = Format.XML == format;
-        log.debug("xml was the requested format");
+        log.debug("Requested format was " + (wasXmlRequested ? "xml" : "html"));
         boolean wasHtmlRequested = ! wasXmlRequested;
         try {


@@ -128,7 +128,7 @@ public class SolrPagedSearchController extends FreemarkerHttpServlet {
         //There may be other non-html formats in the future
         Format format = getFormat(vreq);
         boolean wasXmlRequested = Format.XML == format;
-        log.debug("xml was the requested format");
+        log.debug("Requested format was " + (wasXmlRequested ? "xml" : "html"));
         boolean wasHtmlRequested = ! wasXmlRequested;
         try {