NIHVIVO-2459 Improvements to stemmed autocomplete matching. NIHVIVO-2801 Fix error in SolrAutocompleteController.

This commit is contained in:
ryounes 2011-07-01 16:19:31 +00:00
parent cebc368738
commit 50b159710b
7 changed files with 97 additions and 36 deletions

View file

@ -259,7 +259,7 @@
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
words="stopwords-name.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
@ -458,7 +458,7 @@
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
words="stopwords-name.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"
@ -470,7 +470,7 @@
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
words="stopwords-name.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0"

View file

@ -0,0 +1,38 @@
# Stop words for autocomplete (label-matching) fields.
# Based on the standard English stop words taken from Lucene's StopAnalyzer. The full list in stopwords.txt
# may be too inclusive for these fields: labels include things like book titles, so we want a smaller set of stop words.
a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
s
t
that
the
their
then
there
these
they
this
to
was
will
with

View file

@ -130,9 +130,6 @@ our
out
over
re
said
same
see
should
since
so
@ -156,13 +153,10 @@ to
too
under
up
use
very
want
was
way
we
well
were
what
when

View file

@ -254,7 +254,7 @@ public class SolrIndividualListController extends FreemarkerHttpServlet {
if (individual != null) {
individualsAdded++;
individuals.add(individual);
log.debug("Adding individual " + uri + " to individuals for display");
log.debug("Adding individual " + uri + " to individual list display");
} else {
log.debug("No existing individual for search document with uri = " + uri);
}

View file

@ -53,7 +53,7 @@ public class IndividualJena extends IndividualImpl implements Individual {
private OntResource ind = null;
private WebappDaoFactoryJena webappDaoFactory = null;
private Float _searchBoostJena = null;
private boolean retreivedNullRdfsLabel = false;
private boolean retrievedNullRdfsLabel = false;
public IndividualJena(OntResource ind, WebappDaoFactoryJena wadf) {
this.ind = ind;
@ -88,13 +88,13 @@ public class IndividualJena extends IndividualImpl implements Individual {
public String getRdfsLabel() {
if (this.rdfsLabel != null) {
return rdfsLabel;
} else if( this.rdfsLabel == null && retreivedNullRdfsLabel ){
} else if( this.rdfsLabel == null && retrievedNullRdfsLabel ){
return null;
} else {
ind.getOntModel().enterCriticalSection(Lock.READ);
try {
this.rdfsLabel = webappDaoFactory.getJenaBaseDao().getLabel(ind);
retreivedNullRdfsLabel = this.rdfsLabel == null;
retrievedNullRdfsLabel = this.rdfsLabel == null;
return this.rdfsLabel;
} finally {
ind.getOntModel().leaveCriticalSection();

View file

@ -101,13 +101,24 @@ public class SolrAutocompleteController extends VitroAjaxController {
for (SolrDocument doc : docs) {
try {
String uri = doc.get(VitroSearchTermNames.URI).toString();
// VitroSearchTermNames.NAME_RAW is a multivalued field, so doc.get() returns a list
// RY 7/1/2011
// This comment used to read: "VitroSearchTermNames.NAME_RAW is a multivalued field, so doc.get() returns a list."
// It is more accurate to say that doc.get() could return a list, so both a List and a single value are handled.
// In practice, lists are no longer being returned even for individuals with multiple labels; it is not
// clear whether this is new behavior.
Object nameRaw = doc.get(VitroSearchTermNames.NAME_RAW);
String name = null;
if (nameRaw instanceof List<?>) {
@SuppressWarnings("unchecked")
String name = ((List<String>) doc.get(VitroSearchTermNames.NAME_RAW)).get(0);
List<String> nameRawList = (List<String>) nameRaw;
name = nameRawList.get(0);
} else {
name = (String) nameRaw;
}
SearchResult result = new SearchResult(name, uri);
results.add(result);
} catch(Exception e){
log.error("problem getting usable Individuals from search " +
log.error("problem getting usable individuals from search " +
"hits" + e.getMessage());
}
}
@ -195,31 +206,48 @@ public class SolrAutocompleteController extends VitroAjaxController {
String acTermName = VitroSearchTermNames.AC_NAME_STEMMED;
String nonAcTermName = VitroSearchTermNames.NAME_STEMMED;
String acQueryStr;
if (queryStr.endsWith(" ")) {
// Solr wants whitespace to be escaped with a backslash
queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
queryStr = nonAcTermName + ":" + queryStr;
acQueryStr = makeTermQuery(nonAcTermName, queryStr, true);
} else {
int indexOfLastWord = queryStr.lastIndexOf(" ") + 1;
String queryStr1 = queryStr.substring(0, indexOfLastWord);
String queryStr2 = queryStr.substring(indexOfLastWord);
queryStr = nonAcTermName + ":\"" + queryStr1 + "\"+" + acTermName + ":" + queryStr2;
List<String> terms = new ArrayList<String>(2);
String allButLastWord = queryStr.substring(0, indexOfLastWord);
if (StringUtils.isNotBlank(allButLastWord)) {
terms.add(makeTermQuery(nonAcTermName, allButLastWord, true));
}
log.debug("Tokenized name query string = " + queryStr);
query.setQuery(queryStr);
String lastWord = queryStr.substring(indexOfLastWord);
if (StringUtils.isNotBlank(lastWord)) {
terms.add(makeTermQuery(acTermName, lastWord, false));
}
acQueryStr = StringUtils.join(terms, " AND ");
}
log.debug("Tokenized name query string = " + acQueryStr);
query.setQuery(acQueryStr);
}
/**
 * Sets the query to match the whole (trimmed) query string against the
 * untokenized autocomplete name field.
 *
 * @param query    the Solr query whose query string is set
 * @param queryStr the raw query string
 */
private void setUntokenizedNameQuery(SolrQuery query, String queryStr) {
    // makeTermQuery already escapes whitespace, quotes the value and prefixes the
    // field name, so the string must not be escaped or prefixed a second time here.
    String termQuery = makeTermQuery(VitroSearchTermNames.AC_NAME_UNTOKENIZED, queryStr.trim(), true);
    query.setQuery(termQuery);
}
/**
 * Builds a single {@code field:value} clause.
 *
 * @param term                 the name of the field to query
 * @param queryStr             the value to match
 * @param mayContainWhitespace when true, whitespace in the value is escaped and the
 *                             value is wrapped in double quotes
 * @return the field name, a colon, and the (possibly quoted) value
 */
private String makeTermQuery(String term, String queryStr, boolean mayContainWhitespace) {
    String value = mayContainWhitespace
            ? "\"" + escapeWhitespaceInQueryString(queryStr) + "\""
            : queryStr;
    return term + ":" + value;
}
/**
 * Escapes whitespace in a query string for Solr.
 *
 * @param queryStr the string to escape
 * @return the string with every run of whitespace replaced by one backslash-escaped space
 */
private String escapeWhitespaceInQueryString(String queryStr) {
    // Solr wants whitespace to be escaped with a backslash
    final String whitespaceRun = "\\s+";
    // In a regex replacement string, "\\\\ " yields a literal backslash followed by a space.
    final String escapedSpace = "\\\\ ";
    return queryStr.replaceAll(whitespaceRun, escapedSpace);
}
private void doNoQuery(HttpServletResponse response) throws IOException {

View file

@ -178,9 +178,10 @@ public class IndividualToSolrDocument {
private void addLabel(Individual ind, SolrInputDocument doc) {
String value = "";
if(ind.getRdfsLabel() != null)
value = ind.getRdfsLabel();
else{
String label = ind.getRdfsLabel();
if (label != null) {
value = label;
} else {
value = ind.getLocalName();
}
doc.addField(term.NAME_RAW, value);