NIHVIVO-2459 Improvements to stemmed autocomplete matching. NIHVIVO-2801 Fix error in SolrAutocompleteController.
This commit is contained in:
parent
cebc368738
commit
50b159710b
7 changed files with 97 additions and 36 deletions
|
@ -259,7 +259,7 @@
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||||
words="stopwords.txt" enablePositionIncrements="true" />
|
words="stopwords-name.txt" enablePositionIncrements="true" />
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
|
||||||
generateNumberParts="1" catenateWords="0"
|
generateNumberParts="1" catenateWords="0"
|
||||||
catenateNumbers="0" catenateAll="0"
|
catenateNumbers="0" catenateAll="0"
|
||||||
|
@ -458,7 +458,7 @@
|
||||||
<analyzer type="index">
|
<analyzer type="index">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||||
words="stopwords.txt" enablePositionIncrements="true" />
|
words="stopwords-name.txt" enablePositionIncrements="true" />
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
|
||||||
generateNumberParts="1" catenateWords="0"
|
generateNumberParts="1" catenateWords="0"
|
||||||
catenateNumbers="0" catenateAll="0"
|
catenateNumbers="0" catenateAll="0"
|
||||||
|
@ -470,7 +470,7 @@
|
||||||
<analyzer type="query">
|
<analyzer type="query">
|
||||||
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
|
||||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||||
words="stopwords.txt" enablePositionIncrements="true" />
|
words="stopwords-name.txt" enablePositionIncrements="true" />
|
||||||
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
|
<filter class="solr.WordDelimiterFilterFactory" generateWordParts="1"
|
||||||
generateNumberParts="1" catenateWords="0"
|
generateNumberParts="1" catenateWords="0"
|
||||||
catenateNumbers="0" catenateAll="0"
|
catenateNumbers="0" catenateAll="0"
|
||||||
|
|
38
solr/exampleSolr/conf/stopwords-name.txt
Normal file
38
solr/exampleSolr/conf/stopwords-name.txt
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
# Standard English stop words taken from Lucene's StopAnalyzer
|
||||||
|
# Stopwords used in autocomplete (label-matching) fields, since the full list in stopwords.txt may be too inclusive. Because labels include things like
|
||||||
|
# book titles, we want a smaller set of stopwords.
|
||||||
|
|
||||||
|
a
|
||||||
|
an
|
||||||
|
and
|
||||||
|
are
|
||||||
|
as
|
||||||
|
at
|
||||||
|
be
|
||||||
|
but
|
||||||
|
by
|
||||||
|
for
|
||||||
|
if
|
||||||
|
in
|
||||||
|
into
|
||||||
|
is
|
||||||
|
it
|
||||||
|
no
|
||||||
|
not
|
||||||
|
of
|
||||||
|
on
|
||||||
|
or
|
||||||
|
s
|
||||||
|
t
|
||||||
|
that
|
||||||
|
the
|
||||||
|
their
|
||||||
|
then
|
||||||
|
there
|
||||||
|
these
|
||||||
|
they
|
||||||
|
this
|
||||||
|
to
|
||||||
|
was
|
||||||
|
will
|
||||||
|
with
|
|
@ -130,9 +130,6 @@ our
|
||||||
out
|
out
|
||||||
over
|
over
|
||||||
re
|
re
|
||||||
said
|
|
||||||
same
|
|
||||||
see
|
|
||||||
should
|
should
|
||||||
since
|
since
|
||||||
so
|
so
|
||||||
|
@ -156,13 +153,10 @@ to
|
||||||
too
|
too
|
||||||
under
|
under
|
||||||
up
|
up
|
||||||
use
|
|
||||||
very
|
very
|
||||||
want
|
want
|
||||||
was
|
was
|
||||||
way
|
|
||||||
we
|
we
|
||||||
well
|
|
||||||
were
|
were
|
||||||
what
|
what
|
||||||
when
|
when
|
||||||
|
|
|
@ -254,7 +254,7 @@ public class SolrIndividualListController extends FreemarkerHttpServlet {
|
||||||
if (individual != null) {
|
if (individual != null) {
|
||||||
individualsAdded++;
|
individualsAdded++;
|
||||||
individuals.add(individual);
|
individuals.add(individual);
|
||||||
log.debug("Adding individual " + uri + " to individuals for display");
|
log.debug("Adding individual " + uri + " to individual list display");
|
||||||
} else {
|
} else {
|
||||||
log.debug("No existing individual for search document with uri = " + uri);
|
log.debug("No existing individual for search document with uri = " + uri);
|
||||||
}
|
}
|
||||||
|
|
|
@ -53,7 +53,7 @@ public class IndividualJena extends IndividualImpl implements Individual {
|
||||||
private OntResource ind = null;
|
private OntResource ind = null;
|
||||||
private WebappDaoFactoryJena webappDaoFactory = null;
|
private WebappDaoFactoryJena webappDaoFactory = null;
|
||||||
private Float _searchBoostJena = null;
|
private Float _searchBoostJena = null;
|
||||||
private boolean retreivedNullRdfsLabel = false;
|
private boolean retrievedNullRdfsLabel = false;
|
||||||
|
|
||||||
public IndividualJena(OntResource ind, WebappDaoFactoryJena wadf) {
|
public IndividualJena(OntResource ind, WebappDaoFactoryJena wadf) {
|
||||||
this.ind = ind;
|
this.ind = ind;
|
||||||
|
@ -88,13 +88,13 @@ public class IndividualJena extends IndividualImpl implements Individual {
|
||||||
public String getRdfsLabel() {
|
public String getRdfsLabel() {
|
||||||
if (this.rdfsLabel != null) {
|
if (this.rdfsLabel != null) {
|
||||||
return rdfsLabel;
|
return rdfsLabel;
|
||||||
} else if( this.rdfsLabel == null && retreivedNullRdfsLabel ){
|
} else if( this.rdfsLabel == null && retrievedNullRdfsLabel ){
|
||||||
return null;
|
return null;
|
||||||
} else {
|
} else {
|
||||||
ind.getOntModel().enterCriticalSection(Lock.READ);
|
ind.getOntModel().enterCriticalSection(Lock.READ);
|
||||||
try {
|
try {
|
||||||
this.rdfsLabel = webappDaoFactory.getJenaBaseDao().getLabel(ind);
|
this.rdfsLabel = webappDaoFactory.getJenaBaseDao().getLabel(ind);
|
||||||
retreivedNullRdfsLabel = this.rdfsLabel == null;
|
retrievedNullRdfsLabel = this.rdfsLabel == null;
|
||||||
return this.rdfsLabel;
|
return this.rdfsLabel;
|
||||||
} finally {
|
} finally {
|
||||||
ind.getOntModel().leaveCriticalSection();
|
ind.getOntModel().leaveCriticalSection();
|
||||||
|
|
|
@ -99,15 +99,26 @@ public class SolrAutocompleteController extends VitroAjaxController {
|
||||||
|
|
||||||
List<SearchResult> results = new ArrayList<SearchResult>();
|
List<SearchResult> results = new ArrayList<SearchResult>();
|
||||||
for (SolrDocument doc : docs) {
|
for (SolrDocument doc : docs) {
|
||||||
try{
|
try {
|
||||||
String uri = doc.get(VitroSearchTermNames.URI).toString();
|
String uri = doc.get(VitroSearchTermNames.URI).toString();
|
||||||
// VitroSearchTermNames.NAME_RAW is a multivalued field, so doc.get() returns a list
|
// RY 7/1/2011
|
||||||
@SuppressWarnings("unchecked")
|
// Comment was: VitroSearchTermNames.NAME_RAW is a multivalued field, so doc.get() returns a list.
|
||||||
String name = ((List<String>) doc.get(VitroSearchTermNames.NAME_RAW)).get(0);
|
// Changed to: VitroSearchTermNames.NAME_RAW is a multivalued field, so doc.get() could return a list
|
||||||
|
// But in fact: I'm no longer seeing any lists returned for individuals with multiple labels. Not sure
|
||||||
|
// whether this is new behavior or not (unresolved).
|
||||||
|
Object nameRaw = doc.get(VitroSearchTermNames.NAME_RAW);
|
||||||
|
String name = null;
|
||||||
|
if (nameRaw instanceof List<?>) {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
List<String> nameRawList = (List<String>) nameRaw;
|
||||||
|
name = nameRawList.get(0);
|
||||||
|
} else {
|
||||||
|
name = (String) nameRaw;
|
||||||
|
}
|
||||||
SearchResult result = new SearchResult(name, uri);
|
SearchResult result = new SearchResult(name, uri);
|
||||||
results.add(result);
|
results.add(result);
|
||||||
} catch(Exception e){
|
} catch(Exception e){
|
||||||
log.error("problem getting usable Individuals from search " +
|
log.error("problem getting usable individuals from search " +
|
||||||
"hits" + e.getMessage());
|
"hits" + e.getMessage());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -195,31 +206,48 @@ public class SolrAutocompleteController extends VitroAjaxController {
|
||||||
|
|
||||||
String acTermName = VitroSearchTermNames.AC_NAME_STEMMED;
|
String acTermName = VitroSearchTermNames.AC_NAME_STEMMED;
|
||||||
String nonAcTermName = VitroSearchTermNames.NAME_STEMMED;
|
String nonAcTermName = VitroSearchTermNames.NAME_STEMMED;
|
||||||
|
String acQueryStr;
|
||||||
|
|
||||||
if (queryStr.endsWith(" ")) {
|
if (queryStr.endsWith(" ")) {
|
||||||
// Solr wants whitespace to be escaped with a backslash
|
acQueryStr = makeTermQuery(nonAcTermName, queryStr, true);
|
||||||
queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
|
|
||||||
queryStr = nonAcTermName + ":" + queryStr;
|
|
||||||
} else {
|
} else {
|
||||||
int indexOfLastWord = queryStr.lastIndexOf(" ") + 1;
|
int indexOfLastWord = queryStr.lastIndexOf(" ") + 1;
|
||||||
String queryStr1 = queryStr.substring(0, indexOfLastWord);
|
List<String> terms = new ArrayList<String>(2);
|
||||||
String queryStr2 = queryStr.substring(indexOfLastWord);
|
|
||||||
queryStr = nonAcTermName + ":\"" + queryStr1 + "\"+" + acTermName + ":" + queryStr2;
|
String allButLastWord = queryStr.substring(0, indexOfLastWord);
|
||||||
|
if (StringUtils.isNotBlank(allButLastWord)) {
|
||||||
|
terms.add(makeTermQuery(nonAcTermName, allButLastWord, true));
|
||||||
|
}
|
||||||
|
|
||||||
|
String lastWord = queryStr.substring(indexOfLastWord);
|
||||||
|
if (StringUtils.isNotBlank(lastWord)) {
|
||||||
|
terms.add(makeTermQuery(acTermName, lastWord, false));
|
||||||
|
}
|
||||||
|
|
||||||
|
acQueryStr = StringUtils.join(terms, " AND ");
|
||||||
}
|
}
|
||||||
|
|
||||||
log.debug("Tokenized name query string = " + queryStr);
|
log.debug("Tokenized name query string = " + acQueryStr);
|
||||||
query.setQuery(queryStr);
|
query.setQuery(acQueryStr);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setUntokenizedNameQuery(SolrQuery query, String queryStr) {
|
private void setUntokenizedNameQuery(SolrQuery query, String queryStr) {
|
||||||
|
|
||||||
queryStr = queryStr.trim();
|
queryStr = queryStr.trim();
|
||||||
// Solr wants whitespace to be escaped with a backslash
|
queryStr = makeTermQuery(VitroSearchTermNames.AC_NAME_UNTOKENIZED, queryStr, true);
|
||||||
queryStr = queryStr.replaceAll("\\s+", "\\\\ ");
|
|
||||||
queryStr = VitroSearchTermNames.AC_NAME_UNTOKENIZED + ":" + queryStr;
|
|
||||||
query.setQuery(queryStr);
|
query.setQuery(queryStr);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String makeTermQuery(String term, String queryStr, boolean mayContainWhitespace) {
|
||||||
|
if (mayContainWhitespace) {
|
||||||
|
queryStr = "\"" + escapeWhitespaceInQueryString(queryStr) + "\"";
|
||||||
|
}
|
||||||
|
return term + ":" + queryStr;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String escapeWhitespaceInQueryString(String queryStr) {
|
||||||
|
// Solr wants whitespace to be escaped with a backslash
|
||||||
|
return queryStr.replaceAll("\\s+", "\\\\ ");
|
||||||
}
|
}
|
||||||
|
|
||||||
private void doNoQuery(HttpServletResponse response) throws IOException {
|
private void doNoQuery(HttpServletResponse response) throws IOException {
|
||||||
|
|
|
@ -178,9 +178,10 @@ public class IndividualToSolrDocument {
|
||||||
|
|
||||||
private void addLabel(Individual ind, SolrInputDocument doc) {
|
private void addLabel(Individual ind, SolrInputDocument doc) {
|
||||||
String value = "";
|
String value = "";
|
||||||
if(ind.getRdfsLabel() != null)
|
String label = ind.getRdfsLabel();
|
||||||
value = ind.getRdfsLabel();
|
if (label != null) {
|
||||||
else{
|
value = label;
|
||||||
|
} else {
|
||||||
value = ind.getLocalName();
|
value = ind.getLocalName();
|
||||||
}
|
}
|
||||||
doc.addField(term.NAME_RAW, value);
|
doc.addField(term.NAME_RAW, value);
|
||||||
|
|
Loading…
Add table
Reference in a new issue