From 702f904e756085589893aadc69c54f03e13b5c2d Mon Sep 17 00:00:00 2001 From: ryounes Date: Fri, 13 May 2011 22:43:47 +0000 Subject: [PATCH] NIHVIVO-2459 Work on SolrAutocompleteController (not working yet). Refactoring VitroLuceneTermNames rdfs:label field names to clarify what the fields are. Restored 1.2 Lucene analysis to name fields. --- solr/exampleSolr/conf/schema.xml | 10 +- .../controller/EntityURLController.java | 7 +- .../controller/JSONReconcileServlet.java | 4 +- .../freemarker/IndividualListController.java | 4 +- .../controller/AutocompleteController.java | 9 +- .../controller/PagedSearchController.java | 17 +- .../SolrAutocompleteController.java | 255 +++++++----------- .../controller/SolrPagedSearchController.java | 54 ++-- .../search/lucene/Entity2LuceneDoc.java | 52 ++-- .../search/lucene/HtmlLowerStopAnalyzer.java | 3 +- .../webapp/search/lucene/LuceneSetup.java | 9 +- 11 files changed, 184 insertions(+), 240 deletions(-) diff --git a/solr/exampleSolr/conf/schema.xml b/solr/exampleSolr/conf/schema.xml index dadb1c40c..f21c5769a 100644 --- a/solr/exampleSolr/conf/schema.xml +++ b/solr/exampleSolr/conf/schema.xml @@ -427,16 +427,16 @@ - + - - - - + + + + diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/EntityURLController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/EntityURLController.java index 8b67cb98b..1f874b601 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/EntityURLController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/EntityURLController.java @@ -29,6 +29,7 @@ import com.hp.hpl.jena.rdf.model.ResourceFactory; import com.hp.hpl.jena.vocabulary.RDF; import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc; +import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames; import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory; import edu.cornell.mannlib.vitro.webapp.web.ContentType; @@ -74,13 +75,13 @@ public void doGet (HttpServletRequest req, HttpServletResponse res) throws IOExc String classUri = (String) getServletContext().getAttribute("classuri"); BooleanQuery query = new BooleanQuery(); query.add( - new TermQuery( new Term(Entity2LuceneDoc.term.RDFTYPE, classUri)), + new TermQuery( new Term(VitroLuceneTermNames.RDFTYPE, classUri)), BooleanClause.Occur.MUST ); IndexSearcher index = LuceneIndexFactory.getIndexSearcher(getServletContext()); TopDocs docs = index.search(query, null, ENTITY_LIST_CONTROLLER_MAX_RESULTS, - new Sort(Entity2LuceneDoc.term.NAMELOWERCASE)); + new Sort(VitroLuceneTermNames.NAME_LOWERCASE)); if( docs == null ){ log.error("Search of lucene index returned null"); @@ -97,7 +98,7 @@ public void doGet (HttpServletRequest req, HttpServletResponse res) throws IOExc if (hit != null) { Document doc = index.doc(hit.doc); if (doc != null) { - String uri = doc.getField(Entity2LuceneDoc.term.URI).stringValue(); + String uri = doc.getField(VitroLuceneTermNames.URI).stringValue(); resource = ResourceFactory.createResource(uri); node = (RDFNode) ResourceFactory.createResource(classUri); model.add(resource, RDF.type, node); diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/JSONReconcileServlet.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/JSONReconcileServlet.java index 276e0af4e..fe4b9f63b 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/JSONReconcileServlet.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/JSONReconcileServlet.java @@ -377,7 +377,7 @@ public class JSONReconcileServlet extends VitroHttpServlet { String stemParam = (String) request.getParameter("stem"); boolean stem = "true".equals(stemParam); - String termName = stem ? VitroLuceneTermNames.NAME : VitroLuceneTermNames.NAMEUNSTEMMED; + String termName = stem ? VitroLuceneTermNames.NAME_STEMMED : VitroLuceneTermNames.NAME_UNSTEMMED; BooleanQuery boolQuery = new BooleanQuery(); @@ -415,7 +415,7 @@ public class JSONReconcileServlet extends VitroHttpServlet { private Query makeUntokenizedNameQuery(String querystr) { querystr = querystr.toLowerCase(); - String termName = VitroLuceneTermNames.NAMELOWERCASE; + String termName = VitroLuceneTermNames.NAME_LOWERCASE; BooleanQuery query = new BooleanQuery(); log.debug("Adding wildcard query on unanalyzed name"); query.add( diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/freemarker/IndividualListController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/freemarker/IndividualListController.java index c3c4b3874..531bb833e 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/freemarker/IndividualListController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/controller/freemarker/IndividualListController.java @@ -188,7 +188,7 @@ public class IndividualListController extends FreemarkerHttpServlet { try{ docs = index.search(query, null, ENTITY_LIST_CONTROLLER_MAX_RESULTS, - new Sort(Entity2LuceneDoc.term.NAMELOWERCASE)); + new Sort(Entity2LuceneDoc.term.NAME_LOWERCASE)); }catch(Throwable th){ log.error("Could not run search. " + th.getMessage()); docs = null; @@ -258,7 +258,7 @@ public class IndividualListController extends FreemarkerHttpServlet { Query alphaQuery = null; if( alpha != null && !"".equals(alpha) && alpha.length() == 1){ alphaQuery = - new PrefixQuery(new Term(Entity2LuceneDoc.term.NAMELOWERCASE, alpha.toLowerCase())); + new PrefixQuery(new Term(Entity2LuceneDoc.term.NAME_LOWERCASE, alpha.toLowerCase())); query.add(alphaQuery,BooleanClause.Occur.MUST); } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/AutocompleteController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/AutocompleteController.java index f950ed5bc..aa37476ab 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/AutocompleteController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/AutocompleteController.java @@ -30,8 +30,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.util.Version; import org.json.JSONArray; - -import com.hp.hpl.jena.sparql.lib.org.json.JSONObject; +import org.json.JSONObject; import edu.cornell.mannlib.vitro.webapp.auth.requestedAction.Actions; import edu.cornell.mannlib.vitro.webapp.auth.requestedAction.usepages.UseBasicAjaxControllers; @@ -118,7 +117,7 @@ public class AutocompleteController extends VitroAjaxController { try{ Document doc = searcherForRequest.doc(topDocs.scoreDocs[i].doc); String uri = doc.get(VitroLuceneTermNames.URI); - String name = doc.get(VitroLuceneTermNames.NAMERAW); + String name = doc.get(VitroLuceneTermNames.NAME_RAW); SearchResult result = new SearchResult(name, uri); results.add(result); } catch(Exception e){ @@ -208,7 +207,7 @@ public class AutocompleteController extends VitroAjaxController { String stemParam = (String) request.getParameter("stem"); boolean stem = "true".equals(stemParam); - String termName = stem ? VitroLuceneTermNames.NAME : VitroLuceneTermNames.NAMEUNSTEMMED; + String termName = stem ? VitroLuceneTermNames.NAME_STEMMED : VitroLuceneTermNames.NAME_UNSTEMMED; BooleanQuery boolQuery = new BooleanQuery(); @@ -245,7 +244,7 @@ public class AutocompleteController extends VitroAjaxController { private Query makeUntokenizedNameQuery(String querystr) { querystr = querystr.toLowerCase(); - String termName = VitroLuceneTermNames.NAMELOWERCASE; + String termName = VitroLuceneTermNames.NAME_LOWERCASE; BooleanQuery query = new BooleanQuery(); log.debug("Adding wildcard query on unanalyzed name"); query.add( diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java index 576c8176e..4aa6a5756 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java @@ -65,6 +65,7 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQuery; import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory; import edu.cornell.mannlib.vitro.webapp.search.lucene.CustomSimilarity; import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc; +import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames; import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory; import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup; import edu.cornell.mannlib.vitro.webapp.web.templatemodels.LinkTemplateModel; @@ -228,7 +229,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear Document document = searcherForRequest.doc(scoreDoc.doc); Explanation explanation = searcherForRequest.explain(query, scoreDoc.doc); - log.debug("Document title: "+ document.get(Entity2LuceneDoc.VitroLuceneTermNames.NAME) + " score: " +scoreDoc.score); + log.debug("Document title: "+ document.get(Entity2LuceneDoc.VitroLuceneTermNames.NAME_STEMMED) + " score: " +scoreDoc.score); log.debug("Scoring of the doc explained " + explanation.toString()); log.debug("Explanation's description "+ explanation.getDescription()); log.debug("ALLTEXT: " + document.get(Entity2LuceneDoc.VitroLuceneTermNames.ALLTEXT)); @@ -404,7 +405,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear Document doc; try { doc = searcher.doc(topDocs.scoreDocs[i].doc); - String name =doc.get(Entity2LuceneDoc.term.NAME); + String name =doc.get(Entity2LuceneDoc.term.NAME_STEMMED); if( name != null && name.length() > 0) alphas.add( name.substring(0, 1)); } catch (CorruptIndexException e) { @@ -621,7 +622,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear BooleanQuery boolQuery = new BooleanQuery(); boolQuery.add( query, BooleanClause.Occur.MUST ); boolQuery.add( - new WildcardQuery(new Term(Entity2LuceneDoc.term.NAME, alpha+'*')), + new WildcardQuery(new Term(Entity2LuceneDoc.term.NAME_STEMMED, alpha+'*')), BooleanClause.Occur.MUST); query = boolQuery; } @@ -682,7 +683,15 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear // qp.setStemmedToUnstemmed(map); MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{ - "name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" , "classLocalName", "classLocalNameLowerCase" }, analyzer); + VitroLuceneTermNames.NAME_STEMMED, + VitroLuceneTermNames.NAME_UNSTEMMED, + VitroLuceneTermNames.RDFTYPE, + VitroLuceneTermNames.MONIKER, + VitroLuceneTermNames.ALLTEXT, + VitroLuceneTermNames.ALLTEXTUNSTEMMED, + VitroLuceneTermNames.NAME_RAW, + VitroLuceneTermNames.CLASSLOCALNAME, + VitroLuceneTermNames.CLASSLOCALNAMELOWERCASE }, analyzer); // QueryParser qp = new QueryParser(Version.LUCENE_29, "name", analyzer); diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java index 146f2ab29..677244105 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java @@ -9,43 +9,34 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import javax.servlet.ServletContext; import javax.servlet.ServletException; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.Term; -import org.apache.lucene.queryParser.ParseException; -import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TopDocs; -import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.util.Version; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.SolrServer; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; import org.json.JSONArray; - -import com.hp.hpl.jena.sparql.lib.org.json.JSONObject; +import org.json.JSONObject; import edu.cornell.mannlib.vitro.webapp.auth.requestedAction.Actions; import edu.cornell.mannlib.vitro.webapp.auth.requestedAction.usepages.UseBasicAjaxControllers; import edu.cornell.mannlib.vitro.webapp.controller.VitroRequest; import edu.cornell.mannlib.vitro.webapp.controller.ajax.VitroAjaxController; -import edu.cornell.mannlib.vitro.webapp.search.SearchException; import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames; -import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory; -import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup; +import edu.cornell.mannlib.vitro.webapp.search.solr.SolrSetup; /** * AutocompleteController generates autocomplete content - * through a Lucene search. + * through a Solr search. */ + +// RY Rename to AutocompleteController once the transition to Solr is complete. public class SolrAutocompleteController extends VitroAjaxController { private static final long serialVersionUID = 1L; @@ -53,11 +44,14 @@ public class SolrAutocompleteController extends VitroAjaxController { //private static final String TEMPLATE_DEFAULT = "autocompleteResults.ftl"; - private static String QUERY_PARAMETER_NAME = "term"; + private static final String PARAM_QUERY = "term"; + private static final String PARAM_RDFTYPE = "type"; String NORESULT_MSG = ""; - private int defaultMaxSearchSize= 1000; + private static final int DEFAULT_MAX_HIT_COUNT = 1000; + public static final int MAX_QUERY_LENGTH = 500; + @Override protected Actions requiredActions(VitroRequest vreq) { return new Actions(new UseBasicAjaxControllers()); @@ -68,13 +62,10 @@ public class SolrAutocompleteController extends VitroAjaxController { throws IOException, ServletException { try { - - int maxHitSize = defaultMaxSearchSize; - String qtxt = vreq.getParameter(QUERY_PARAMETER_NAME); - Analyzer analyzer = getAnalyzer(getServletContext()); + String qtxt = vreq.getParameter(PARAM_QUERY); - Query query = getQuery(vreq, analyzer, qtxt); + SolrQuery query = getQuery(qtxt, vreq); if (query == null ) { log.debug("query for '" + qtxt +"' is null."); doNoQuery(response); @@ -82,43 +73,35 @@ public class SolrAutocompleteController extends VitroAjaxController { } log.debug("query for '" + qtxt +"' is " + query.toString()); - IndexSearcher searcherForRequest = LuceneIndexFactory.getIndexSearcher(getServletContext()); - - TopDocs topDocs = null; - try{ - topDocs = searcherForRequest.search(query,null,maxHitSize); - }catch(Throwable t){ - log.error("in first pass at search: " + t); - // this is a hack to deal with odd cases where search and index threads interact - try{ - wait(150); - topDocs = searcherForRequest.search(query,null,maxHitSize); - }catch (Exception e){ - log.error(e, e); - doNoSearchResults(response); - return; - } - } + SolrServer solr = SolrSetup.getSolrServer(getServletContext()); + QueryResponse queryResponse = solr.query(query); - if( topDocs == null || topDocs.scoreDocs == null){ - log.error("topDocs for a search was null"); + if ( queryResponse == null) { + log.error("Query response for a search was null"); doNoSearchResults(response); return; } - int hitsLength = topDocs.scoreDocs.length; - if ( hitsLength < 1 ){ + SolrDocumentList docs = queryResponse.getResults(); + + if ( docs == null) { + log.error("Docs for a search was null"); + doNoSearchResults(response); + return; + } + + long hitCount = docs.getNumFound(); + log.debug("Number of hits = " + hitCount); + if ( hitCount < 1 ) { doNoSearchResults(response); return; } - log.debug("found "+hitsLength+" hits"); List results = new ArrayList(); - for(int i=0; i MAX_QUERY_LENGTH ){ - log.debug("The search was too long. The maximum " + - "query length is " + MAX_QUERY_LENGTH ); - return null; - } - - query = makeNameQuery(querystr, analyzer, vreq); - - // Filter by type - { - BooleanQuery boolQuery = new BooleanQuery(); - String typeParam = (String) vreq.getParameter("type"); - boolQuery.add( new TermQuery( - new Term(VitroLuceneTermNames.RDFTYPE, - typeParam)), - BooleanClause.Occur.MUST); - boolQuery.add(query, BooleanClause.Occur.MUST); - query = boolQuery; - } - - } catch (Exception ex){ - throw new SearchException(ex.getMessage()); + private SolrQuery getQuery(String querystr, VitroRequest vreq) { + + if ( querystr == null) { + log.error("There was no parameter '"+ PARAM_QUERY + +"' in the request."); + return null; + } else if( querystr.length() > MAX_QUERY_LENGTH ) { + log.debug("The search was too long. The maximum " + + "query length is " + MAX_QUERY_LENGTH ); + return null; } + + SolrQuery query = new SolrQuery(); + query = query.setStart(0); + query = query.setRows(DEFAULT_MAX_HIT_COUNT); + + query = setNameQuery(query, querystr, vreq); + + // Filter by type + String typeParam = (String) vreq.getParameter(PARAM_RDFTYPE); + if (typeParam != null) { + query = query.addFilterQuery(VitroLuceneTermNames.RDFTYPE + ":\"" + typeParam + "\""); + } + + // Set the fields to retrieve **** RY + // query = query.setFields( ... ); return query; } - private Query makeNameQuery(String querystr, Analyzer analyzer, HttpServletRequest request) { + private SolrQuery setNameQuery(SolrQuery query, String querystr, HttpServletRequest request) { String tokenizeParam = (String) request.getParameter("tokenize"); boolean tokenize = "true".equals(tokenizeParam); // Note: Stemming is only relevant if we are tokenizing: an untokenized name // query will not be stemmed. So we don't look at the stem parameter until we get to - // makeTokenizedNameQuery(). + // setTokenizedNameQuery(). if (tokenize) { - return makeTokenizedNameQuery(querystr, analyzer, request); + return setTokenizedNameQuery(query, querystr, request); } else { - return makeUntokenizedNameQuery(querystr); + return setUntokenizedNameQuery(query, querystr); } } - private Query makeTokenizedNameQuery(String querystr, Analyzer analyzer, HttpServletRequest request) { + private SolrQuery setTokenizedNameQuery(SolrQuery query, String querystr, HttpServletRequest request) { String stemParam = (String) request.getParameter("stem"); boolean stem = "true".equals(stemParam); - String termName = stem ? VitroLuceneTermNames.NAME : VitroLuceneTermNames.NAMEUNSTEMMED; + String termName = stem ? VitroLuceneTermNames.NAME_STEMMED : VitroLuceneTermNames.NAME_UNSTEMMED; BooleanQuery boolQuery = new BooleanQuery(); - // Use the query parser to analyze the search term the same way the indexed text was analyzed. - // For example, text is lowercased, and function words are stripped out. - QueryParser parser = getQueryParser(termName, analyzer); - - // The wildcard query doesn't play well with stemming. Query term name:tales* doesn't match - // "tales", which is indexed as "tale", while query term name:tales does. Obviously we need - // the wildcard for name:tal*, so the only way to get them all to match is use a disjunction - // of wildcard and non-wildcard queries. The query will look have only an implicit disjunction - // operator: e.g., +(name:tales name:tales*) - try { - log.debug("Adding non-wildcard query for " + querystr); - Query query = parser.parse(querystr); - boolQuery.add(query, BooleanClause.Occur.SHOULD); - - // Prevent ParseException here when adding * after a space. - // If there's a space at the end, we don't need the wildcard query. - if (! querystr.endsWith(" ")) { - log.debug("Adding wildcard query for " + querystr); - Query wildcardQuery = parser.parse(querystr + "*"); - boolQuery.add(wildcardQuery, BooleanClause.Occur.SHOULD); - } - - log.debug("Name query is: " + boolQuery.toString()); - } catch (ParseException e) { - log.warn(e, e); - } +// // Use the query parser to analyze the search term the same way the indexed text was analyzed. +// // For example, text is lowercased, and function words are stripped out. +// QueryParser parser = getQueryParser(termName); +// +// // The wildcard query doesn't play well with stemming. Query term name:tales* doesn't match +// // "tales", which is indexed as "tale", while query term name:tales does. Obviously we need +// // the wildcard for name:tal*, so the only way to get them all to match is use a disjunction +// // of wildcard and non-wildcard queries. The query will look have only an implicit disjunction +// // operator: e.g., +(name:tales name:tales*) +// try { +// log.debug("Adding non-wildcard query for " + querystr); +// Query query = parser.parse(querystr); +// boolQuery.add(query, BooleanClause.Occur.SHOULD); +// +// // Prevent ParseException here when adding * after a space. +// // If there's a space at the end, we don't need the wildcard query. +// if (! querystr.endsWith(" ")) { +// log.debug("Adding wildcard query for " + querystr); +// Query wildcardQuery = parser.parse(querystr + "*"); +// boolQuery.add(wildcardQuery, BooleanClause.Occur.SHOULD); +// } +// +// log.debug("Name query is: " + boolQuery.toString()); +// } catch (ParseException e) { +// log.warn(e, e); +// } - return boolQuery; + return query; } - private Query makeUntokenizedNameQuery(String querystr) { + private SolrQuery setUntokenizedNameQuery(SolrQuery query, String querystr) { - querystr = querystr.toLowerCase(); - String termName = VitroLuceneTermNames.NAMELOWERCASE; - BooleanQuery query = new BooleanQuery(); - log.debug("Adding wildcard query on unanalyzed name"); - query.add( - new WildcardQuery(new Term(termName, querystr + "*")), - BooleanClause.Occur.MUST); + //querystr = querystr.toLowerCase(); + querystr += "*"; + query = query.setQuery(querystr); + // *** It's the df parameter that sets the field to search + //String field = VitroLuceneTermNames.LABEL_LOWERCASE; return query; } - private QueryParser getQueryParser(String searchField, Analyzer analyzer){ - // searchField indicates which field to search against when there is no term - // indicated in the query string. - // The analyzer is needed so that we use the same analyzer on the search queries as - // was used on the text that was indexed. - QueryParser qp = new QueryParser(Version.LUCENE_29, searchField,analyzer); - //this sets the query parser to AND all of the query terms it finds. - qp.setDefaultOperator(QueryParser.AND_OPERATOR); - return qp; - } - private void doNoQuery(HttpServletResponse response) throws IOException { - // For now, we are not sending an error message back to the client because with the default autocomplete configuration it - // chokes. + // For now, we are not sending an error message back to the client because + // with the default autocomplete configuration it chokes. doNoSearchResults(response); } private void doSearchError(HttpServletResponse response) throws IOException { - // For now, we are not sending an error message back to the client because with the default autocomplete configuration it - // chokes. + // For now, we are not sending an error message back to the client because + // with the default autocomplete configuration it chokes. doNoSearchResults(response); } @@ -282,8 +237,6 @@ public class SolrAutocompleteController extends VitroAjaxController { response.getWriter().write("[]"); } - public static final int MAX_QUERY_LENGTH = 500; - public class SearchResult implements Comparable { private String label; private String uri; diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrPagedSearchController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrPagedSearchController.java index d71b5e73e..476645077 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrPagedSearchController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrPagedSearchController.java @@ -22,10 +22,6 @@ import javax.servlet.http.HttpServletResponse; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.queryParser.MultiFieldQueryParser; -import org.apache.lucene.queryParser.QueryParser; -import org.apache.lucene.util.Version; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.response.QueryResponse; @@ -73,9 +69,8 @@ public class SolrPagedSearchController extends FreemarkerHttpServlet { private static final long serialVersionUID = 1L; private static final Log log = LogFactory.getLog(SolrPagedSearchController.class); - private static final int DEFAULT_HITS_PER_PAGE = 25; - private static final int DEFAULT_MAX_SEARCH_SIZE = 1000; + private static final int DEFAULT_MAX_HIT_COUNT = 1000; private static final String PARAM_XML_REQUEST = "xml"; private static final String PARAM_START_INDEX = "startIndex"; @@ -169,15 +164,15 @@ public class SolrPagedSearchController extends FreemarkerHttpServlet { } log.debug("hitsPerPage is " + hitsPerPage); - int maxHitCount = DEFAULT_MAX_SEARCH_SIZE ; - if( startIndex >= DEFAULT_MAX_SEARCH_SIZE - hitsPerPage ) - maxHitCount = startIndex + DEFAULT_MAX_SEARCH_SIZE ; + int maxHitCount = DEFAULT_MAX_HIT_COUNT ; + if( startIndex >= DEFAULT_MAX_HIT_COUNT - hitsPerPage ) + maxHitCount = startIndex + DEFAULT_MAX_HIT_COUNT ; log.debug("maxHitSize is " + maxHitCount); String qtxt = vreq.getParameter(VitroQuery.QUERY_PARAMETER_NAME); - log.debug("Query text is "+ qtxt); // + " Analyzer is "+ analyzer.toString()); + log.debug("Query text is \""+ qtxt + "\""); SolrQuery query = getQuery(qtxt, maxHitCount, vreq); @@ -440,52 +435,33 @@ public class SolrPagedSearchController extends FreemarkerHttpServlet { private SolrQuery getQuery(String queryText, int maxHitCount, VitroRequest vreq) { SolrQuery query = new SolrQuery(queryText); - //SolrQuery query = new SolrQuery(); - //query.setQuery(queryText); // Solr requires these values, but we don't want them to be the real values for this page // of results, else the refinement links won't work correctly: each page of results needs to // show refinement links generated for all results, not just for the results on the current page. - query.setStart(0); - query.setRows(maxHitCount); + query.setStart(0) + .setRows(maxHitCount); // Classgroup filtering - Object param = vreq.getParameter(PARAM_CLASSGROUP); - if( param != null && !"".equals(param)){ + String classgroupParam = (String) vreq.getParameter(PARAM_CLASSGROUP); + if ( ! StringUtils.isBlank(classgroupParam) ) { log.debug("Firing classgroup query "); - log.debug("request.getParameter(classgroup) is "+ param.toString()); - query = query.addFilterQuery(VitroLuceneTermNames.CLASSGROUP_URI + ":\"" + param + "\""); + log.debug("request.getParameter(classgroup) is "+ classgroupParam); + query.addFilterQuery(VitroLuceneTermNames.CLASSGROUP_URI + ":\"" + classgroupParam + "\""); } // rdf:type filtering - param = vreq.getParameter(PARAM_RDFTYPE); - if( param != null && !"".equals(param)){ + String typeParam = (String) vreq.getParameter(PARAM_RDFTYPE); + if ( ! StringUtils.isBlank(typeParam) ) { log.debug("Firing type query "); - log.debug("request.getParameter(type) is "+ param.toString()); - query = query.addFilterQuery(VitroLuceneTermNames.RDFTYPE + ":\"" + param + "\""); + log.debug("request.getParameter(type) is "+ typeParam); + query.addFilterQuery(VitroLuceneTermNames.RDFTYPE + ":\"" + typeParam + "\""); } //query.setQuery(queryText); log.debug("Query = " + query.toString()); return query; } - - @SuppressWarnings("unused") - private QueryParser getQueryParser(Analyzer analyzer){ - - MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[] { - VitroLuceneTermNames.NAME, - VitroLuceneTermNames.NAMEUNSTEMMED, - VitroLuceneTermNames.RDFTYPE, - VitroLuceneTermNames.ALLTEXT, - VitroLuceneTermNames.ALLTEXTUNSTEMMED, - VitroLuceneTermNames.NAMERAW, - VitroLuceneTermNames.CLASSLOCALNAME, - VitroLuceneTermNames.CLASSLOCALNAMELOWERCASE }, analyzer); - //"name", "nameunstemmed", "type", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" , "classLocalName", "classLocalNameLowerCase" }, analyzer); - - return qp; - } private class VClassGroupSearchLink extends LinkTemplateModel { diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java index abc7cb07b..23cbcaa56 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java @@ -45,14 +45,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{ public static String CLASSGROUP_URI = "classgroup"; /** Modtime from db */ public static String MODTIME = "modTime"; - /** Name of entity, tab or vclass */ - public static String NAME = "name"; - /** rdfs:label unanalyzed */ - public static String NAMELOWERCASE = "nameunanalyzed" ; - /** Name of entity, unstemmed */ - public static String NAMEUNSTEMMED = "nameunstemmed"; - /** Unaltered name of individual, un-lowercased, un-stemmed, un-tokenized" */ - public static String NAMERAW = "nameraw"; + /** time of index in msec since epoc */ public static String INDEXEDTIME= "indexedTime"; /** timekey of entity in yyyymmddhhmm */ @@ -77,7 +70,21 @@ public class Entity2LuceneDoc implements Obj2DocIface{ /** class names in human readable form of an individual*/ public static final String CLASSLOCALNAMELOWERCASE = "classLocalNameLowerCase"; /** class names in human readable form of an individual*/ - public static final String CLASSLOCALNAME = "classLocalName"; + public static final String CLASSLOCALNAME = "classLocalName"; + + // Fields derived from rdfs:label + /** Raw rdfs:label: no lowercasing, no tokenizing, no stop words, no stemming **/ + public static String NAME_RAW = "nameRaw"; // was NAMERAW + + /** rdfs:label lowercased, no tokenizing, no stop words, no stemming **/ + public static String NAME_LOWERCASE = "nameLowercase"; // was NAMELOWERCASE + + /** rdfs:label lowercased, tokenized, stop words, no stemming **/ + public static String NAME_UNSTEMMED = "nameUnstemmed"; // was NAMEUNSTEMMED + + /** rdfs:label lowercased, tokenized, stop words, stemmed **/ + public static String NAME_STEMMED = "nameStemmed"; // was NAME + } private static final Log log = LogFactory.getLog(Entity2LuceneDoc.class.getName()); @@ -189,7 +196,7 @@ public class Entity2LuceneDoc implements Obj2DocIface{ //java class doc.add( new Field(term.JCLASS, entClassName, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS)); - //Entity Name + // Individual label if( ent.getRdfsLabel() != null ) value=ent.getRdfsLabel(); else{ @@ -198,21 +205,22 @@ public class Entity2LuceneDoc implements Obj2DocIface{ log.debug("Using local name for individual with rdfs:label " + ent.getURI()); value = ent.getLocalName(); } - Field name = new Field(term.NAME, value, Field.Store.YES, Field.Index.ANALYZED); - doc.add( name ); + + Field labelRaw = new Field(term.NAME_RAW, value, Field.Store.YES, Field.Index.NOT_ANALYZED); + labelRaw.setBoost(NAME_BOOST); + doc.add(labelRaw); - Field nameUn = new Field(term.NAMEUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED); - nameUn.setBoost(NAME_BOOST); - doc.add( nameUn ); + Field labelLowerCase = new Field(term.NAME_LOWERCASE, value, Field.Store.YES, Field.Index.NOT_ANALYZED); + labelLowerCase.setBoost(NAME_BOOST); + doc.add(labelLowerCase); - // BK nameunanalyzed is used by IndividualListController - Field nameUnanalyzed = new Field(term.NAMELOWERCASE, value.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED); - nameUnanalyzed.setBoost(NAME_BOOST); - doc.add( nameUnanalyzed ); + Field labelUnstemmed = new Field(term.NAME_UNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED); + labelUnstemmed.setBoost(NAME_BOOST); + doc.add(labelUnstemmed); - Field nameRaw = new Field(term.NAMERAW, value, Field.Store.YES, Field.Index.NOT_ANALYZED); - nameRaw.setBoost(NAME_BOOST); - doc.add(nameRaw); + Field labelStemmed = new Field(term.NAME_STEMMED, value, Field.Store.NO, Field.Index.ANALYZED); + labelStemmed.setBoost(NAME_BOOST); + doc.add(labelStemmed); //Moniker diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java index 02b09a6b2..e6a32bed7 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/HtmlLowerStopAnalyzer.java @@ -81,8 +81,7 @@ public class HtmlLowerStopAnalyzer extends Analyzer { /** * Processes the input by first converting it to - * lower case, then by eliminating stop words, and - * finally by performing Porter stemming on it. + * lower case, then by eliminating stop words. * * @param reader the Reader that * provides access to the input text diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java index 9a5376c93..94411f132 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneSetup.java @@ -7,8 +7,8 @@ import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.Vi import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.CLASSLOCALNAME; import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.CLASSLOCALNAMELOWERCASE; import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.MONIKER; -import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.NAME; -import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.NAMEUNSTEMMED; +import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.NAME_STEMMED; +import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.NAME_UNSTEMMED; import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.RDFTYPE; import java.io.File; @@ -245,10 +245,9 @@ public class LuceneSetup implements javax.servlet.ServletContextListener { PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper( new StandardAnalyzer(Version.LUCENE_29)); analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer()); -// analyzer.addAnalyzer(NAME, new HtmlLowerStopStemAnalyzer()); analyzer.addAnalyzer(ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer()); - analyzer.addAnalyzer(NAMEUNSTEMMED, new HtmlLowerStopAnalyzer()); - analyzer.addAnalyzer(NAME, new StandardAnalyzer(Version.LUCENE_29)); + analyzer.addAnalyzer(NAME_UNSTEMMED, new HtmlLowerStopAnalyzer()); + analyzer.addAnalyzer(NAME_STEMMED, new HtmlLowerStopStemAnalyzer()); analyzer.addAnalyzer(MONIKER, new StandardAnalyzer(Version.LUCENE_29)); analyzer.addAnalyzer(RDFTYPE, new StandardAnalyzer(Version.LUCENE_29)); analyzer.addAnalyzer(CLASSLOCALNAME, new HtmlLowerStopAnalyzer());