NIHVIVO-2459 Solr field definitions for autocomplete

This commit is contained in:
ryounes 2011-05-16 19:16:47 +00:00
parent eac9e5dca1
commit ad98e7723c
7 changed files with 56 additions and 29 deletions

View file

@ -224,7 +224,7 @@
words="stopwords.txt"
enablePositionIncrements="true"
/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/>
</analyzer>
@ -242,6 +242,20 @@
</analyzer>
</fieldType>
<!-- Like text, but without synonyms and stemming. Good for autocomplete matching of proper names, where we want to remove
stop words but not stem. -->
<fieldType name="textUnstemmed" class="solr.TextField" positionIncrementGap="100">
<!-- A single analyzer with no "type" attribute, so per the Solr schema documentation the same
chain applies at both index time and query time. The chain below contains no synonym filter
and no stemming filter. -->
<analyzer>
<!-- Step 1: split the input on whitespace only. -->
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!-- Step 2: drop tokens listed in stopwords.txt, compared case-insensitively (ignoreCase="true"). -->
<filter class="solr.StopFilterFactory"
ignoreCase="true"
words="stopwords.txt"
enablePositionIncrements="true"
/>
<!-- Step 3: break tokens into word parts and number parts (generateWordParts/generateNumberParts="1")
and on case changes (splitOnCaseChange="1"). All catenate options are "0", so no joined variants
are emitted. This filter is placed before lowercasing, so case transitions are still visible to it. -->
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
<!-- Step 4: lowercase every remaining token. -->
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldType>
<!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
@ -423,8 +437,6 @@
<!-- **************************** Vitro Fields *************************** -->
<field name="DocId" type="string" indexed="true" stored="true" required="true" />
<field name="type" type="string" indexed="true" stored="true" omitNorms="true" multiValued="true"/>
@ -434,10 +446,10 @@
<field name="PROHIBITED_FROM_TEXT_RESULTS" type="string" indexed="true" stored="false" multiValued="false"/>
<field name="URI" type="string" indexed="true" stored="true" multiValued="false" omitNorms="true"/>
<field name="nameRaw" type="string" indexed="true" stored="true" multiValued="true"/>
<!-- RY Not sure if we need to store nameLowercase -->
<field name="nameLowercase" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="nameUnstemmed" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="nameStemmed" type="text" indexed="true" stored="false" multiValued="true"/>
<!-- RY Not sure if we need to store nameLowercase. Is it ever displayed? -->
<field name="nameLowercase" type="lowercase" indexed="true" stored="true" multiValued="true"/>
<field name="acNameUnstemmed" type="textUnstemmed" indexed="true" stored="false" multiValued="true"/>
<field name="acNameStemmed" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="indexedTime" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="ALLTEXT" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="ALLTEXTUNSTEMMED" type="textgen" indexed="true" stored="false" multiValued="true"/>

View file

@ -377,7 +377,7 @@ public class JSONReconcileServlet extends VitroHttpServlet {
String stemParam = (String) request.getParameter("stem");
boolean stem = "true".equals(stemParam);
String termName = stem ? VitroLuceneTermNames.NAME_STEMMED : VitroLuceneTermNames.NAME_UNSTEMMED;
String termName = stem ? VitroLuceneTermNames.AC_NAME_STEMMED : VitroLuceneTermNames.AC_NAME_UNSTEMMED;
BooleanQuery boolQuery = new BooleanQuery();

View file

@ -207,7 +207,7 @@ public class AutocompleteController extends VitroAjaxController {
String stemParam = (String) request.getParameter("stem");
boolean stem = "true".equals(stemParam);
String termName = stem ? VitroLuceneTermNames.NAME_STEMMED : VitroLuceneTermNames.NAME_UNSTEMMED;
String termName = stem ? VitroLuceneTermNames.AC_NAME_STEMMED : VitroLuceneTermNames.AC_NAME_UNSTEMMED;
BooleanQuery boolQuery = new BooleanQuery();

View file

@ -229,7 +229,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
Document document = searcherForRequest.doc(scoreDoc.doc);
Explanation explanation = searcherForRequest.explain(query, scoreDoc.doc);
log.debug("Document title: "+ document.get(Entity2LuceneDoc.VitroLuceneTermNames.NAME_STEMMED) + " score: " +scoreDoc.score);
log.debug("Document title: "+ document.get(Entity2LuceneDoc.VitroLuceneTermNames.AC_NAME_STEMMED) + " score: " +scoreDoc.score);
log.debug("Scoring of the doc explained " + explanation.toString());
log.debug("Explanation's description "+ explanation.getDescription());
log.debug("ALLTEXT: " + document.get(Entity2LuceneDoc.VitroLuceneTermNames.ALLTEXT));
@ -405,7 +405,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
Document doc;
try {
doc = searcher.doc(topDocs.scoreDocs[i].doc);
String name =doc.get(Entity2LuceneDoc.term.NAME_STEMMED);
String name =doc.get(Entity2LuceneDoc.term.AC_NAME_STEMMED);
if( name != null && name.length() > 0)
alphas.add( name.substring(0, 1));
} catch (CorruptIndexException e) {
@ -622,7 +622,7 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
BooleanQuery boolQuery = new BooleanQuery();
boolQuery.add( query, BooleanClause.Occur.MUST );
boolQuery.add(
new WildcardQuery(new Term(Entity2LuceneDoc.term.NAME_STEMMED, alpha+'*')),
new WildcardQuery(new Term(Entity2LuceneDoc.term.AC_NAME_STEMMED, alpha+'*')),
BooleanClause.Occur.MUST);
query = boolQuery;
}
@ -683,8 +683,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
// qp.setStemmedToUnstemmed(map);
MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{
VitroLuceneTermNames.NAME_STEMMED,
VitroLuceneTermNames.NAME_UNSTEMMED,
VitroLuceneTermNames.AC_NAME_STEMMED,
VitroLuceneTermNames.AC_NAME_UNSTEMMED,
VitroLuceneTermNames.RDFTYPE,
VitroLuceneTermNames.MONIKER,
VitroLuceneTermNames.ALLTEXT,

View file

@ -12,6 +12,7 @@ import java.util.Map;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -21,8 +22,10 @@ import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.core.SolrConfig;
import org.json.JSONArray;
import org.json.JSONObject;
import org.xml.sax.SAXException;
import edu.cornell.mannlib.vitro.webapp.auth.requestedAction.Actions;
import edu.cornell.mannlib.vitro.webapp.auth.requestedAction.usepages.UseBasicAjaxControllers;
@ -176,7 +179,7 @@ public class SolrAutocompleteController extends VitroAjaxController {
String stemParam = (String) request.getParameter("stem");
boolean stem = "true".equals(stemParam);
String termName = stem ? VitroLuceneTermNames.NAME_STEMMED : VitroLuceneTermNames.NAME_UNSTEMMED;
String termName = stem ? VitroLuceneTermNames.AC_NAME_STEMMED : VitroLuceneTermNames.AC_NAME_UNSTEMMED ;
BooleanQuery boolQuery = new BooleanQuery();
@ -214,9 +217,18 @@ public class SolrAutocompleteController extends VitroAjaxController {
//querystr = querystr.toLowerCase();
querystr += "*";
query = query.setQuery(querystr);
// *** It's the df parameter that sets the field to search
//String field = VitroLuceneTermNames.LABEL_LOWERCASE;
//query = query.setQuery(VitroLuceneTermNames.NAME_LOWERCASE + ":" + querystr);
//query.addFilterQuery(VitroLuceneTermNames.NAME_LOWERCASE);
//query.setQuery(querystr);
try {
SolrConfig config = new SolrConfig();
} catch (Exception e) {
// TODO Auto-generated catch block
log.error(e, e);
return null;
}
return query;
}

View file

@ -73,17 +73,20 @@ public class Entity2LuceneDoc implements Obj2DocIface{
public static final String CLASSLOCALNAME = "classLocalName";
// Fields derived from rdfs:label
/** Raw rdfs:label: no lowercasing, no tokenizing, no stop words, no stemming **/
/** Raw rdfs:label: no lowercasing, no tokenizing, no stop words, no stemming.
* Used only in retrieval rather than search. **/
public static String NAME_RAW = "nameRaw"; // was NAMERAW
/** rdfs:label lowercased, no tokenizing, no stop words, no stemming **/
public static String NAME_LOWERCASE = "nameLowercase"; // was NAMELOWERCASE
/** rdfs:label lowercased, tokenized, stop words, no stemming **/
public static String NAME_UNSTEMMED = "nameUnstemmed"; // was NAMEUNSTEMMED
/** rdfs:label lowercased, tokenized, stop words, no stemming.
* Used for autocomplete matching on proper names. **/
public static String AC_NAME_UNSTEMMED = "acNameUnstemmed"; // was NAMEUNSTEMMED
/** rdfs:label lowercased, tokenized, stop words, stemmed **/
public static String NAME_STEMMED = "nameStemmed"; // was NAME
/** rdfs:label lowercased, tokenized, stop words, stemmed.
* Used for autocomplete matching where stemming is desired (e.g., book titles) **/
public static String AC_NAME_STEMMED = "acNameStemmed"; // was NAME
}
@ -215,11 +218,11 @@ public class Entity2LuceneDoc implements Obj2DocIface{
nameLowerCase.setBoost(NAME_BOOST);
doc.add(nameLowerCase);
Field nameUnstemmed = new Field(term.NAME_UNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED);
Field nameUnstemmed = new Field(term.AC_NAME_UNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED);
nameUnstemmed.setBoost(NAME_BOOST);
doc.add(nameUnstemmed);
Field nameStemmed = new Field(term.NAME_STEMMED, value, Field.Store.NO, Field.Index.ANALYZED);
Field nameStemmed = new Field(term.AC_NAME_STEMMED, value, Field.Store.NO, Field.Index.ANALYZED);
nameStemmed.setBoost(NAME_BOOST);
doc.add(nameStemmed);

View file

@ -7,8 +7,8 @@ import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.Vi
import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.CLASSLOCALNAME;
import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.CLASSLOCALNAMELOWERCASE;
import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.MONIKER;
import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.NAME_STEMMED;
import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.NAME_UNSTEMMED;
import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.AC_NAME_STEMMED;
import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.AC_NAME_UNSTEMMED;
import static edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc.VitroLuceneTermNames.RDFTYPE;
import java.io.File;
@ -246,8 +246,8 @@ public class LuceneSetup implements javax.servlet.ServletContextListener {
analyzer.addAnalyzer(ALLTEXT, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(ALLTEXTUNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(NAME_UNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(NAME_STEMMED, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(AC_NAME_UNSTEMMED, new HtmlLowerStopAnalyzer());
analyzer.addAnalyzer(AC_NAME_STEMMED, new HtmlLowerStopStemAnalyzer());
analyzer.addAnalyzer(MONIKER, new StandardAnalyzer(Version.LUCENE_29));
analyzer.addAnalyzer(RDFTYPE, new StandardAnalyzer(Version.LUCENE_29));
analyzer.addAnalyzer(CLASSLOCALNAME, new HtmlLowerStopAnalyzer());