NIHVIVO-2437 Upgrade to Solr 3.1

ryounes 2011-05-16 21:58:24 +00:00
parent ad98e7723c
commit 265a86019c
27 changed files with 7060 additions and 2693 deletions


BIN solr/apache-solr-3.1.0.war Normal file
Binary file not shown.

@@ -12,7 +12,7 @@
<property name="solr.build.dir" location="."/>
<property name="solr.example.dir" location="${solr.build.dir}/exampleSolr" />
<property name="solr.context.config.example" location="${solr.build.dir}/exampleSolrContext.xml"/>
<property name="solr.war" location="${solr.build.dir}/apache-solr-1.4.1.war"/>
<property name="solr.war" location="${solr.build.dir}/apache-solr-3.1.0.war"/>
<!-- =================================
target: describe

File diff suppressed because it is too large.

@@ -45,7 +45,16 @@
that avoids logging every request
-->
<schema name="example" version="1.2">
<schema name="example" version="1.3">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.2" is Solr's version number for the schema syntax and semantics. It should
not normally be changed by applications.
1.0: multiValued attribute did not exist, all fields are multiValued by nature
1.1: multiValued attribute introduced, false by default
1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
1.3: removed optional field compress feature
-->
<types>
<!-- field type definitions. The "name" attribute is
@@ -56,15 +65,12 @@
org.apache.solr.analysis package.
-->
<!-- The StrField type is not analyzed, but indexed/stored verbatim.
- StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values which
exceed a certain size (in characters).
-->
<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/>
<!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
<fieldtype name="binary" class="solr.BinaryField"/>
@@ -208,8 +214,12 @@
words on case-change, alpha numeric boundaries, and non-alphanumeric chars,
so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi".
Synonyms and stopwords are customized by external files, and stemming is enabled.
The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to
form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser
to generate text:"pdp 11" rather than (text:PDP OR text:11).
NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
<fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- in this example, we will only use synonyms at query time
@@ -224,8 +234,10 @@
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<!-- <filter class="solr.PorterStemFilterFactory"/> -->
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
<analyzer type="query">
@@ -238,6 +250,7 @@
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
</fieldType>
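
The autoGeneratePhraseQueries note above is easiest to check with debugQuery output. The following is a rough, untested SolrJ sketch, not part of this commit; the server URL and the assumption that Vitro's ALLTEXT catchall field uses this "text" type are mine. Because the analyzer splits "wi-fi" into two tokens, the parser should report the phrase query ALLTEXT:"wi fi" rather than a boolean OR.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;

public class PhraseQuerySketch {
    public static void main(String[] args) throws Exception {
        // Assumed URL; adjust to wherever the example Solr webapp is deployed.
        SolrServer solr = new CommonsHttpSolrServer("http://localhost:8983/solr");

        // "wi-fi" is split by WordDelimiterFilter; with autoGeneratePhraseQueries="true"
        // the query parser turns the split tokens into the phrase ALLTEXT:"wi fi".
        SolrQuery query = new SolrQuery("ALLTEXT:wi-fi");
        query.set("debugQuery", "true"); // the parsedquery entry shows the generated query

        QueryResponse response = solr.query(query);
        System.out.println(response.getDebugMap().get("parsedquery"));
    }
}
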
@@ -266,7 +279,8 @@
<filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
<filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
<filter class="solr.EnglishMinimalStemFilterFactory"/>
<!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
possible with WordDelimiterFilter in conjunction with stemming. -->
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
@@ -397,11 +411,38 @@
</analyzer>
</fieldType>
<fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.PathHierarchyTokenizerFactory"/>
</analyzer>
</fieldType>
<!-- since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright. -->
<fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
<!-- This point type indexes the coordinates as separate fields (subFields)
If subFieldType is defined, it references a type, and a dynamic field
definition is created matching *___<typename>. Alternately, if
subFieldSuffix is defined, that is used to create the subFields.
Example: if subFieldType="double", then the coordinates would be
indexed in fields myloc_0___double,myloc_1___double.
Example: if subFieldSuffix="_d" then the coordinates would be indexed
in fields myloc_0_d,myloc_1_d
The subFields are an implementation detail of the fieldType, and end
users normally should not need to know about them.
-->
<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
<!-- A specialized field for geospatial search. If indexed, this fieldType must not be multivalued. -->
<fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
<!--
A Geohash is a compact representation of a latitude longitude pair in a single field.
See http://wiki.apache.org/solr/SpatialSearch
-->
<fieldtype name="geohash" class="solr.GeoHashField"/>
</types>
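
The point, location, and geohash types tie into the *_coordinate and *_p dynamic fields added further down in this schema. As a rough illustration only, here is an untested SolrJ sketch; the server URL, the invented store_p field name, and the assumption that DocId is the only required field are mine. It indexes one lat,lon pair and filters by distance with the {!geofilt} parser introduced in Solr 3.1.

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrInputDocument;

public class SpatialSketch {
    public static void main(String[] args) throws Exception {
        SolrServer solr = new CommonsHttpSolrServer("http://localhost:8983/solr");

        // "store_p" matches the *_p dynamicField (type "location"); LatLonType stores the
        // two coordinates in store_p_0_coordinate and store_p_1_coordinate via *_coordinate.
        SolrInputDocument doc = new SolrInputDocument();
        doc.addField("DocId", "spatial-example-1"); // required field in this schema
        doc.addField("store_p", "42.4440,-76.5019");
        solr.add(doc);
        solr.commit();

        // Restrict to documents within 10 km of a point using the geofilt query parser.
        SolrQuery query = new SolrQuery("*:*");
        query.addFilterQuery("{!geofilt sfield=store_p pt=42.44,-76.50 d=10}");
        QueryResponse response = solr.query(query);
        System.out.println(response.getResults().getNumFound() + " documents within 10 km");
    }
}
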
@@ -412,9 +453,6 @@
<types> section
indexed: true if this field should be indexed (searchable or sortable)
stored: true if this field should be retrievable
compressed: [false] if this field should be stored using gzip compression
(this will only apply if the field type is compressable; among
the standard field types, only TextField and StrField are)
multiValued: true if this field may contain multiple values per document
omitNorms: (expert) set to true to omit the norms associated with
this field (this disables length normalization and index-time
@@ -432,9 +470,6 @@
when adding a document.
-->
<!-- **************************** Vitro Fields *************************** -->
<field name="DocId" type="string" indexed="true" stored="true" required="true" />
@@ -460,20 +495,25 @@
<field name="modType" type="ignored"/>
<field name="JCLASS" type="ignored"/>
<!-- **************************** End Vitro Fields *************************** -->
<!-- catchall field, containing all other searchable text fields (implemented
via copyField further on in this schema -->
<!-- Same as ALLTEXT
<field name="text" type="text" indexed="true" stored="false" multiValued="true"/>
-->
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
<!-- Uncommenting the following will create a "timestamp" field using
a default value of "NOW" to indicate when each document was indexed.
-->
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
@@ -485,10 +525,16 @@
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_t" type="text" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
<!-- Type used to index the lat and lon components for the "location" FieldType -->
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
<!-- some trie-coded dynamic fields for faster range queries -->
<dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
@@ -527,8 +573,6 @@
is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching. -->
<!-- <copyField source="name" dest="text"/> -->
<!-- Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same
destination field is to use the dynamic field syntax.

File diff suppressed because it is too large.

@@ -12,11 +12,9 @@
#-----------------------------------------------------------------------
#some test synonym mappings unlikely to appear in real input text
aaa => aaaa
bbb => bbbb1 bbbb2
ccc => cccc1,cccc2
a\=>a => b\=>b
a\,a => b\,b
aaafoo => aaabar
bbbfoo => bbbfoo bbbbar
cccfoo => cccbar cccbaz
fooaaa,baraaa,bazaaa
# Some synonym groups specific to this example

@@ -24,7 +24,7 @@
xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
>
<xsl:output media-type="text/html; charset=UTF-8" encoding="UTF-8"/>
<xsl:output media-type="text/html" encoding="UTF-8"/>
<xsl:variable name="title" select="concat('Solr search results (',response/result/@numFound,' documents)')"/>

@@ -27,7 +27,7 @@
<xsl:output
method="xml"
encoding="utf-8"
media-type="text/xml; charset=UTF-8"
media-type="application/xml"
/>
<xsl:template match='/'>

@@ -27,7 +27,7 @@
<xsl:output
method="xml"
encoding="utf-8"
media-type="text/xml; charset=UTF-8"
media-type="application/xml"
/>
<xsl:template match='/'>
<rss version="2.0">

@@ -28,7 +28,7 @@
<xsl:output
method="html"
encoding="UTF-8"
media-type="text/html; charset=UTF-8"
media-type="text/html"
doctype-public="-//W3C//DTD XHTML 1.0 Strict//EN"
doctype-system="http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
/>

solr/exampleSolr/solr.xml Normal file
@@ -0,0 +1,34 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
All (relative) paths are relative to the installation path
persistent: Save changes made via the API to this file
sharedLib: path to a lib directory that will be shared across all cores
-->
<solr persistent="false">
<!--
adminPath: RequestHandler path to manage cores.
If 'null' (or absent), cores will not be manageable via request handler
-->
<cores adminPath="/admin/cores" defaultCoreName="collection1">
<core name="collection1" instanceDir="." />
</cores>
</solr>
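
This solr.xml is new because the Solr 3.1 example runs a CoreContainer even for a single index. With defaultCoreName="collection1", requests that omit the core name should fall through to collection1, so existing single-core client URLs keep working. An untested SolrJ sketch; the host, port, and the presence of the example /admin/ping handler are assumptions:

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;

public class CoreUrlSketch {
    public static void main(String[] args) throws Exception {
        // Both URLs should reach the same index: the bare /solr path resolves to the
        // defaultCoreName ("collection1") declared in solr.xml.
        SolrServer defaultCore = new CommonsHttpSolrServer("http://localhost:8983/solr");
        SolrServer namedCore = new CommonsHttpSolrServer("http://localhost:8983/solr/collection1");

        System.out.println(defaultCore.ping().getStatus());
        System.out.println(namedCore.ping().getStatus());
    }
}
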

Binary files not shown (3 files).

@@ -14,7 +14,6 @@ import net.sf.jga.fn.UnaryFunctor;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.analysis.HTMLStripReader;
import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
@@ -141,23 +140,27 @@ public abstract class VitroHighlighter extends UnaryFunctor<String,String> {
private final String stripHtml(String in){
/* make a string with html stripped out */
Reader stripIn =new HTMLStripReader( new StringReader( in ) );
StringWriter stripOut = new StringWriter(in.length());
char bytes[] = new char[5000];
int bytesRead = 0;
try {
//this is a mess, there must be a better way to do this.
while ( true ){
bytesRead = stripIn.read( bytes );
if( bytesRead == -1 ) break;
stripOut.write(bytes, 0, bytesRead );
}
} catch (IOException e1) {
log.error("LuceneHighlighter.getHighlightFragments()" +
" - unable to strip html" + e1);
}
return stripOut.toString();
// ryounes 5/16/2011 Broken with upgrade to Solr 3.1: HTMLStripReader has been removed.
// According to change list, should use HTMLStripCharFilter, but it's not immediately clear how
// to migrate this code. Will enter Jira issue.
// Reader stripIn = new HTMLStripReader( new StringReader( in ) );
// StringWriter stripOut = new StringWriter(in.length());
//
// char bytes[] = new char[5000];
// int bytesRead = 0;
// try {
// //this is a mess, there must be a better way to do this.
// while ( true ){
// bytesRead = stripIn.read( bytes );
// if( bytesRead == -1 ) break;
// stripOut.write(bytes, 0, bytesRead );
// }
// } catch (IOException e1) {
// log.error("LuceneHighlighter.getHighlightFragments()" +
// " - unable to strip html" + e1);
// }
// return stripOut.toString();
return in;
}
}
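
The commented-out block above notes that HTMLStripReader is gone in Solr 3.1 and that the migration path is unclear. One plausible replacement, sketched here as an untested suggestion rather than the committed fix, is HTMLStripCharFilter: it is itself a Reader once wrapped around a CharStream, so the original copy loop carries over almost unchanged.

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;

import org.apache.lucene.analysis.CharReader;
import org.apache.solr.analysis.HTMLStripCharFilter;

public class HtmlStripSketch {
    /** Strip markup from a fragment, falling back to the raw text on error. */
    static String stripHtml(String in) {
        // HTMLStripCharFilter is a CharFilter, i.e. itself a Reader, so the old
        // read/write loop from VitroHighlighter.stripHtml() can be kept as-is.
        Reader stripIn = new HTMLStripCharFilter(CharReader.get(new StringReader(in)));
        StringWriter stripOut = new StringWriter(in.length());
        char[] buffer = new char[5000];
        try {
            int read;
            while ((read = stripIn.read(buffer)) != -1) {
                stripOut.write(buffer, 0, read);
            }
        } catch (IOException e) {
            return in; // same fallback as the committed code: return the unstripped text
        }
        return stripOut.toString();
    }
}
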

@@ -141,7 +141,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
//There may be other non-html formats in the future
Format format = getFormat(vreq);
boolean wasXmlRequested = Format.XML == format;
log.debug("xml was the requested format");
log.debug("Requested format was " + (wasXmlRequested ? "xml" : "html"));
boolean wasHtmlRequested = ! wasXmlRequested;
try {

@@ -128,7 +128,7 @@ public class SolrPagedSearchController extends FreemarkerHttpServlet {
//There may be other non-html formats in the future
Format format = getFormat(vreq);
boolean wasXmlRequested = Format.XML == format;
log.debug("xml was the requested format");
log.debug("Requested format was " + (wasXmlRequested ? "xml" : "html"));
boolean wasHtmlRequested = ! wasXmlRequested;
try {