diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java
index 9212e409f..d3852cdb2 100644
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java
@@ -38,6 +38,7 @@ import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.util.Version;
 
 import edu.cornell.mannlib.vitro.webapp.beans.DataProperty;
 import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
@@ -690,7 +691,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
 //            map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
 //            qp.setStemmedToUnstemmed(map);
 
-            MultiFieldQueryParser qp = new MultiFieldQueryParser(new String[]{ "name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" }, analyzer);
+            MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{ "name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" }, analyzer);
 
             qp.setDefaultOperator(QueryParser.AND_OPERATOR);
 
diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java
index 366026ec4..02b09a6b2 100644
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java
@@ -3,14 +3,19 @@
 package edu.cornell.mannlib.vitro.webapp.search.lucene;
 
 import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
 
+import org.apache.lucene.analysis.ASCIIFoldingFilter;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.ISOLatin1AccentFilter;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.util.Version;
 
 public class HtmlLowerStopAnalyzer extends Analyzer {
     /* much of this code is from
@@ -18,7 +23,7 @@ public class HtmlLowerStopAnalyzer extends Analyzer {
      * bdc34
      */
 
-    private static String[] _stopWords;
+    private static Set _stopWords;
 
     /**
      * An array containing some common English words
@@ -52,13 +57,16 @@ public class HtmlLowerStopAnalyzer extends Analyzer {
         "j", "k", "l", "m", "n", "o", "p", "q", "r",
         "s", "t", "u", "v", "w", "x", "y", "z"
     };
-    
+
+    public static final List stopWordsList = Arrays.asList(STOP_WORDS);
+    public static final Set STOP_WORDS_SET = new HashSet(stopWordsList);
+
     /**
      * Builds an analyzer.
      */
     public HtmlLowerStopAnalyzer()
     {
-        this(STOP_WORDS);
+        this(STOP_WORDS_SET);
     }
 
     /**
@@ -66,9 +74,9 @@ public class HtmlLowerStopAnalyzer extends Analyzer {
      *
-     * @param stopWords a String array of stop words
+     * @param stopWords a Set of stop words
      */
-    public HtmlLowerStopAnalyzer(String[] stopWords)
+    public HtmlLowerStopAnalyzer(Set stopWords)
     {
         _stopWords = stopWords;
     }
 
     /**
@@ -99,15 +107,16 @@ public class HtmlLowerStopAnalyzer extends Analyzer {
 //        return stopFilter;
 //
 
-        TokenStream result = new StandardTokenizer(arg0);
+        TokenStream result = new StandardTokenizer(Version.LUCENE_29, arg0);
         result = new StandardFilter(result);                        //break into tokens
         result = new LowerCaseFilter(result);                       //lower case
-        result = new StopFilter(result, _stopWords, IGNORE_CASE);   //remove stop words
-        result = new ISOLatin1AccentFilter(result);                 //ISO-8859-1 accented chars are replace by unaccented
+        result = new StopFilter(ENABLE_POSITION_INCREMENTS, result, _stopWords, IGNORE_CASE); //remove stop words
+        result = new ASCIIFoldingFilter(result); //this class converts alphabetic, symbolic and numerical characters into their ASCII equivalents.
 
         return result;
     }
 
     private static final boolean IGNORE_CASE = true;
+    private static final boolean ENABLE_POSITION_INCREMENTS = false;
 
 }
diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopStemAnalyzer.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopStemAnalyzer.java
index c4a61ab2c..3abdfbecd 100644
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopStemAnalyzer.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopStemAnalyzer.java
@@ -3,6 +3,7 @@
 package edu.cornell.mannlib.vitro.webapp.search.lucene;
 
 import java.io.Reader;
+import java.util.Set;
 
 import org.apache.lucene.analysis.PorterStemFilter;
 import org.apache.lucene.analysis.TokenStream;
@@ -19,7 +20,7 @@ public class HtmlLowerStopStemAnalyzer extends HtmlLowerStopAnalyzer {
         super();
     }
 
-    public HtmlLowerStopStemAnalyzer(String[] stopWords){
+    public HtmlLowerStopStemAnalyzer(Set stopWords){
         super(stopWords);
     }
 
diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java
index 25f03ff55..b9db431a8 100644
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java
@@ -24,6 +24,7 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.PerFieldAnalyzerWrapper;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.util.Version;
 
 import com.hp.hpl.jena.ontology.OntModel;
 
@@ -230,14 +231,14 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
      * @return
      */
     private Analyzer getAnalyzer() {
-        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer());
+        PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer(Version.LUCENE_29));
         analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer());
         analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer());
         analyzer.addAnalyzer(ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer());
         analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer());
-        analyzer.addAnalyzer(NAME, new StandardAnalyzer());
-        analyzer.addAnalyzer(MONIKER, new StandardAnalyzer());
-        analyzer.addAnalyzer(RDFTYPE, new StandardAnalyzer());
+        analyzer.addAnalyzer(NAME, new StandardAnalyzer(Version.LUCENE_29));
+        analyzer.addAnalyzer(MONIKER, new StandardAnalyzer(Version.LUCENE_29));
+        analyzer.addAnalyzer(RDFTYPE, new StandardAnalyzer(Version.LUCENE_29));
         return analyzer;
     }
 