From 997b3ef2cda97eb8cddc321654568a742f33ebb2 Mon Sep 17 00:00:00 2001
From: ryounes <ryounes@440791e1-c2bf-4c1e-ad7c-caf3f8b0ebe8>
Date: Tue, 28 Jun 2011 19:18:03 +0000
Subject: [PATCH] NIHVIVO-2459 Tokenized, stemmed autocomplete search

---
 solr/exampleSolr/conf/schema.xml              | 31 ++++++++-
 .../webapp/search/VitroSearchTermNames.java   |  2 +-
 .../SolrAutocompleteController.java           | 68 +++++++++----------
 .../search/solr/IndividualToSolrDocument.java |  2 +
 4 files changed, 64 insertions(+), 39 deletions(-)
diff --git a/solr/exampleSolr/conf/schema.xml b/solr/exampleSolr/conf/schema.xml
index 65cc4019d..2e9c1544c 100644
--- a/solr/exampleSolr/conf/schema.xml
+++ b/solr/exampleSolr/conf/schema.xml
@@ -479,6 +479,35 @@
         <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>  
       </analyzer>      
     </fieldtype>
+ 
+    <!-- Commenting this fieldtype out for now because we have no use case for 
+      a tokenized, unstemmed autocomplete field. Identical to edgengram_stemmed but without
+      the stemming.
+    <fieldtype name="edgengram_unstemmed" class="solr.TextField">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>   
+        <filter class="solr.StopFilterFactory" ignoreCase="true"
+          words="stopwords.txt"  enablePositionIncrements="true" /> 
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"                 
+          generateNumberParts="1" catenateWords="0"                 
+          catenateNumbers="0" catenateAll="0"                
+          splitOnCaseChange="1" />  
+        <filter class="solr.LowerCaseFilterFactory" />
+        <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="25" side="front"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>   
+        <filter class="solr.StopFilterFactory" ignoreCase="true"
+          words="stopwords.txt"  enablePositionIncrements="true" />  
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"                 
+          generateNumberParts="1" catenateWords="0"                 
+          catenateNumbers="0" catenateAll="0"                
+          splitOnCaseChange="1" />   
+        <filter class="solr.LowerCaseFilterFactory" /> 
+      </analyzer>      
+    </fieldtype>
+    -->
+    
  </types>
 
 
@@ -529,7 +558,7 @@
    
 <!-- Autocomplete search fields -->
 <field name="acNameUntokenized" type="edgengram_untokenized" indexed="true" stored="false" multiValued="true" />
-<!-- <field name="acNameTokenized" type="edgengram_tokenized" indexed="true" stored="false" multiValued="true" /> -->
+<!-- <field name="acNameUnstemmed" type="edgengram_unstemmed" indexed="true" stored="false" multiValued="true" /> -->
 <field name="acNameStemmed" type="edgengram_stemmed" indexed="true" stored="false" multiValued="true" />
 
 <field name="indexedTime" type="long" indexed="true" stored="true"/>
diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/VitroSearchTermNames.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/VitroSearchTermNames.java
index 98bf7c2e5..033159885 100644
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/VitroSearchTermNames.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/VitroSearchTermNames.java
@@ -69,7 +69,7 @@ public class VitroSearchTermNames {
     public static String AC_NAME_STEMMED = "acNameStemmed";
     
     /* There is currently no use case for an autocomplete search field that is tokenized but not stemmed. 
-    public static String AC_NAME_TOKENIZED = "acNameTokenized";  */
+    public static String AC_NAME_UNSTEMMED = "acNameUnstemmed";  */
     
     /** field for beta values of all documents **/
     public static final String BETA = "BETA";
diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java
index 16d785129..621da6dea 100644
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java
@@ -178,50 +178,44 @@ public class SolrAutocompleteController extends VitroAjaxController {
     }
     
     private void setTokenizedNameQuery(SolrQuery query, String queryStr, HttpServletRequest request) {
-        
-        // RY 5/18/2011 For now, just doing untokenized query, due to the interactions of wildcard
-        // query and stemming described below. Need to find a way to do this in Solr.
-        // Should take the same approach if we can figure out how to do a disjunction.
-        // Probably just add an explicit "OR" between the terms.
  
-//        String stemParam = (String) request.getParameter("stem"); 
-//        boolean stem = "true".equals(stemParam);
-//        String termName = stem ? VitroSearchTermNames.AC_NAME_STEMMED : VitroSearchTermNames.AC_NAME_UNSTEMMED  ;
+        /* We currently have no use case for a tokenized, unstemmed autocomplete search field, so the option
+         * has been disabled. If one is needed in the future, we will need to enable a field and field type (see the
+         * commented-out acNameUnstemmed / edgengram_unstemmed in schema.xml) like AC_NAME_STEMMED but without the stemmer.
+        String stemParam = (String) request.getParameter("stem"); 
+        boolean stem = "true".equals(stemParam);
+        if (stem) {
+            String acTermName = VitroSearchTermNames.AC_NAME_STEMMED;
+            String nonAcTermName = VitroSearchTermNames.NAME_STEMMED;
+        } else {
+            String acTermName = VitroSearchTermNames.AC_NAME_UNSTEMMED;
+            String nonAcTermName = VitroSearchTermNames.NAME_UNSTEMMED;        
+        }
+        */
         
-//        // Use the query parser to analyze the search term the same way the indexed text was analyzed.
-//        // For example, text is lowercased, and function words are stripped out.
-//        QueryParser parser = getQueryParser(termName);
-//        
-//        // The wildcard query doesn't play well with stemming. Query term name:tales* doesn't match
-//        // "tales", which is indexed as "tale", while query term name:tales does. Obviously we need 
-//        // the wildcard for name:tal*, so the only way to get them all to match is use a disjunction 
-//        // of wildcard and non-wildcard queries. The query will have only an implicit disjunction
-//        // operator: e.g., +(name:tales name:tales*)
-//        try {
-//            log.debug("Adding non-wildcard query for " + querystr);
-//            Query query = parser.parse(querystr);
-//            boolQuery.add(query, BooleanClause.Occur.SHOULD);
-//
-//            // Prevent ParseException here when adding * after a space.
-//            // If there's a space at the end, we don't need the wildcard query.
-//            if (! querystr.endsWith(" ")) {
-//                log.debug("Adding wildcard query for " + querystr);
-//                Query wildcardQuery = parser.parse(querystr + "*");            
-//                boolQuery.add(wildcardQuery, BooleanClause.Occur.SHOULD);
-//            }
-//            
-//            log.debug("Name query is: " + boolQuery.toString());
-//        } catch (ParseException e) {
-//            log.warn(e, e);
-//        }
-       
-        setUntokenizedNameQuery(query, queryStr);
+        String acTermName = VitroSearchTermNames.AC_NAME_STEMMED;
+        String nonAcTermName = VitroSearchTermNames.NAME_STEMMED;
+        
+        if (queryStr.endsWith(" ")) {
+            // Solr wants whitespace to be escaped with a backslash
+            queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
+            queryStr = nonAcTermName + ":" + queryStr;            
+        } else {
+            int indexOfLastWord = queryStr.lastIndexOf(" ") + 1;
+            String queryStr1 = queryStr.substring(0, indexOfLastWord);
+            String queryStr2 = queryStr.substring(indexOfLastWord);
+            queryStr = nonAcTermName + ":\"" + queryStr1 + "\"+" + acTermName + ":" + queryStr2;
+        }
+        
+        log.debug("Tokenized name query string = " + queryStr);
+        query.setQuery(queryStr);
+
     }
 
     private void setUntokenizedNameQuery(SolrQuery query, String queryStr) {
         
+        queryStr = queryStr.trim();
         // Solr wants whitespace to be escaped with a backslash
-        // Better: replace \s+
         queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
         queryStr = VitroSearchTermNames.AC_NAME_UNTOKENIZED + ":" + queryStr;
         query.setQuery(queryStr);
diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java
index a8d913c41..54e432ca3 100644
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java
@@ -201,6 +201,7 @@ public class IndividualToSolrDocument {
         	 doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST);
         	 doc.addField(term.AC_NAME_UNTOKENIZED, value);
              doc.addField(term.AC_NAME_STEMMED, value);
+             // doc.addField(term.AC_NAME_UNSTEMMED, value);
         }else{
         	 doc.addField(term.NAME_RAW, value);
         	 doc.addField(term.NAME_LOWERCASE, value);
@@ -209,6 +210,7 @@ public class IndividualToSolrDocument {
         	 doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST);
              doc.addField(term.AC_NAME_UNTOKENIZED, value);             
              doc.addField(term.AC_NAME_STEMMED, value);
+             // doc.addField(term.AC_NAME_UNSTEMMED, value);
         }