From a7c271a9bd7d21c054688af4520e89d49ce00d71 Mon Sep 17 00:00:00 2001 From: ryounes Date: Tue, 28 Jun 2011 16:57:47 +0000 Subject: [PATCH] NIHVIVO-2459 Two edgeNGram field definitions for autocomplete. Untokenized autocomplete search. --- solr/exampleSolr/conf/schema.xml | 116 +++++++++--------- .../webapp/search/VitroSearchTermNames.java | 7 ++ .../SolrAutocompleteController.java | 33 ++--- .../search/solr/IndividualToSolrDocument.java | 51 ++++---- 4 files changed, 102 insertions(+), 105 deletions(-) diff --git a/solr/exampleSolr/conf/schema.xml b/solr/exampleSolr/conf/schema.xml index 36d0e14cb..65cc4019d 100644 --- a/solr/exampleSolr/conf/schema.xml +++ b/solr/exampleSolr/conf/schema.xml @@ -220,7 +220,7 @@ NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages. --> - + @@ -229,65 +229,43 @@ add enablePositionIncrements=true in both the index and query analyzers to leave a 'gap' for more accurate phrase queries. --> - - + + - - - - - - - - - - - - + + + + + + + + + + + - - - - - - - - + + + @@ -475,7 +453,32 @@ - + + + + + + + + + + + + + + + + + + @@ -522,13 +525,12 @@ - - + + + - + + diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/VitroSearchTermNames.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/VitroSearchTermNames.java index ba0fa123e..98bf7c2e5 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/VitroSearchTermNames.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/VitroSearchTermNames.java @@ -64,6 +64,13 @@ public class VitroSearchTermNames { /** rdfs:label lowercased, untokenized, edge-n-gram-filtered for autocomplete on people names **/ public static String AC_NAME_UNTOKENIZED = "acNameUntokenized"; + /** rdfs:label lowercased, tokenized, stop words, stemmed, edge-n-gram-filtered for autocomplete + * on non-person labels such as book titles and grant names **/ + public static String AC_NAME_STEMMED = "acNameStemmed"; + + /* There is currently no use case for an autocomplete search field that is tokenized but not stemmed. + public static String AC_NAME_TOKENIZED = "acNameTokenized"; */ + /** field for beta values of all documents **/ public static final String BETA = "BETA"; public static final String PHI = "PHI"; diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java index 3b290b157..16d785129 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/SolrAutocompleteController.java @@ -142,7 +142,7 @@ public class SolrAutocompleteController extends VitroAjaxController { query.setStart(0) .setRows(DEFAULT_MAX_HIT_COUNT); - setQuery(query, queryStr, vreq); + setNameQuery(query, queryStr, vreq); // Filter by type String typeParam = (String) vreq.getParameter(PARAM_RDFTYPE); @@ -158,7 +158,7 @@ public class SolrAutocompleteController extends VitroAjaxController { return query; } - private void setQuery(SolrQuery query, String queryStr, HttpServletRequest request) { + private void setNameQuery(SolrQuery query, String queryStr, HttpServletRequest request) { if (StringUtils.isBlank(queryStr)) { log.error("No query string"); @@ -171,13 +171,13 @@ public class SolrAutocompleteController extends VitroAjaxController { // query will not be stemmed. So we don't look at the stem parameter until we get to // setTokenizedNameQuery(). if (tokenize) { - setTokenizedQuery(query, queryStr, request); + setTokenizedNameQuery(query, queryStr, request); } else { - setUntokenizedQuery(query, queryStr); + setUntokenizedNameQuery(query, queryStr); } } - private void setTokenizedQuery(SolrQuery query, String queryStr, HttpServletRequest request) { + private void setTokenizedNameQuery(SolrQuery query, String queryStr, HttpServletRequest request) { // RY 5/18/2011 For now, just doing untokenized query, due to the interactions of wildcard // query and stemming described below. Need to find a way to do this in Solr. @@ -215,30 +215,15 @@ public class SolrAutocompleteController extends VitroAjaxController { // log.warn(e, e); // } - //setUntokenizedQuery(query, queryStr); - - String stemParam = (String) request.getParameter("stem"); - boolean stem = "true".equals(stemParam); - String termName = stem ? VitroSearchTermNames.NAME_STEMMED : VitroSearchTermNames.NAME_UNSTEMMED; - - // We have to lowercase manually, because Solr doesn't do text analysis on wildcard queries - queryStr = queryStr.toLowerCase(); - // Solr wants whitespace to be escaped with a backslash - // Better: replace \s+ - queryStr = queryStr.replaceAll(" ", "\\\\ "); - queryStr = termName + ":" + queryStr + "*"; - query.setQuery(queryStr); - + setUntokenizedNameQuery(query, queryStr); } - private void setUntokenizedQuery(SolrQuery query, String queryStr) { + private void setUntokenizedNameQuery(SolrQuery query, String queryStr) { - // We have to lowercase manually, because Solr doesn't do text analysis on wildcard queries - queryStr = queryStr.toLowerCase(); // Solr wants whitespace to be escaped with a backslash // Better: replace \s+ - queryStr = queryStr.replaceAll(" ", "\\\\ "); - queryStr = VitroSearchTermNames.NAME_LOWERCASE + ":" + queryStr + "*"; + queryStr = queryStr.replaceAll("\\s+", "\\\\ "); + queryStr = VitroSearchTermNames.AC_NAME_UNTOKENIZED + ":" + queryStr; query.setQuery(queryStr); } diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java index c6d0846a1..a8d913c41 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java @@ -175,47 +175,50 @@ public class IndividualToSolrDocument { String t=null; addUri = new StringBuffer(); addUri.append(""); - List objectPropertyStatements = ind.getObjectPropertyStatements(); - if (objectPropertyStatements != null) { - Iterator objectPropertyStmtIter = objectPropertyStatements.iterator(); - while (objectPropertyStmtIter.hasNext()) { - ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next(); - if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) ) - continue; - try { - objectNames.append(" "); - objectNames.append(((t=objectPropertyStmt.getObject().getName()) == null)?"":t); - addUri.append(" "); - addUri.append(((t=objectPropertyStmt.getObject().getURI()) == null)?"":t); - } catch (Exception e) { + List objectPropertyStatements = ind.getObjectPropertyStatements(); + if (objectPropertyStatements != null) { + Iterator objectPropertyStmtIter = objectPropertyStatements.iterator(); + while (objectPropertyStmtIter.hasNext()) { + ObjectPropertyStatement objectPropertyStmt = objectPropertyStmtIter.next(); + if( "http://www.w3.org/2002/07/owl#differentFrom".equals(objectPropertyStmt.getPropertyURI()) ) + continue; + try { + objectNames.append(" "); + objectNames.append(((t=objectPropertyStmt.getObject().getName()) == null)?"":t); + addUri.append(" "); + addUri.append(((t=objectPropertyStmt.getObject().getURI()) == null)?"":t); + } catch (Exception e) { log.debug("could not index name of related object: " + e.getMessage()); - } - } - } + } + } + } - if(documentModifiers == null || documentModifiers.isEmpty()){ + if(documentModifiers == null || documentModifiers.isEmpty()){ doc.addField(term.NAME_RAW, value, NAME_BOOST); doc.addField(term.NAME_LOWERCASE, value, NAME_BOOST); - doc.addField(term.NAME_UNSTEMMED, value,NAME_BOOST); + doc.addField(term.NAME_UNSTEMMED, value, NAME_BOOST); doc.addField(term.NAME_STEMMED, value, NAME_BOOST); doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST); doc.addField(term.AC_NAME_UNTOKENIZED, value); - }else{ + doc.addField(term.AC_NAME_STEMMED, value); + }else{ doc.addField(term.NAME_RAW, value); doc.addField(term.NAME_LOWERCASE, value); doc.addField(term.NAME_UNSTEMMED, value); doc.addField(term.NAME_STEMMED, value); doc.addField(term.NAME_PHONETIC, value, PHONETIC_BOOST); - doc.addField(term.AC_NAME_UNTOKENIZED, value); - } + doc.addField(term.AC_NAME_UNTOKENIZED, value); + doc.addField(term.AC_NAME_STEMMED, value); + } long tMoniker = System.currentTimeMillis(); if(documentModifiers == null || documentModifiers.isEmpty()){ - //boost for entity - if(ind.getSearchBoost() != null && ind.getSearchBoost() != 0) - doc.setDocumentBoost(ind.getSearchBoost()); + //boost for entity + if(ind.getSearchBoost() != null && ind.getSearchBoost() != 0) { + doc.setDocumentBoost(ind.getSearchBoost()); + } } //thumbnail