Override the default Similarity scoring provided by Lucene by adding a CustomSimilarity class.
This commit is contained in:
parent
7f679af874
commit
bcddbbad2e
5 changed files with 98 additions and 16 deletions
|
@ -227,7 +227,7 @@ public class AutocompleteController extends VitroAjaxController {
|
|||
// operator: e.g., +(name:tales name:tales*)
|
||||
try {
|
||||
log.debug("Adding non-wildcard query for " + querystr);
|
||||
Query query = parser.parse(querystr);
|
||||
Query query = parser.parse(querystr);
|
||||
boolQuery.add(query, BooleanClause.Occur.SHOULD);
|
||||
|
||||
// Prevent ParseException here when adding * after a space.
|
||||
|
|
|
@ -70,6 +70,7 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.Searcher;
|
|||
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQuery;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.lucene.CustomSimilarity;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory;
|
||||
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
|
||||
|
@ -211,6 +212,16 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
|
|||
|
||||
IndexSearcher searcherForRequest = LuceneIndexFactory.getIndexSearcher(getServletContext());
|
||||
|
||||
/* Using CustomSimilarity so that:
 *  1) the rarity of a term (idf) does not affect the document score,
 *  2) the fraction of query terms matched (coord) does not affect the score,
 *  3) field length (lengthNorm) does not affect the score.
 *
 * NOTE(review): CustomSimilarity does not override tf(), so the number of
 * occurrences of a query term within a matched document still contributes
 * to the score — confirm this is intended.
 *
 * 3/29/2011 bk392
 */
|
||||
CustomSimilarity customSimilarity = new CustomSimilarity();
|
||||
searcherForRequest.setSimilarity(customSimilarity);
|
||||
|
||||
TopDocs topDocs = null;
|
||||
try{
|
||||
log.debug("Searching for query term in the Index with maxHitSize "+ maxHitSize);
|
||||
|
@ -382,8 +393,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
|
|||
return doSearchError(e,format);
|
||||
}
|
||||
}
|
||||
|
||||
private void alphaSortIndividuals(List<Individual> beans) {
|
||||
|
||||
private void alphaSortIndividuals(List<Individual> beans) {
|
||||
Collections.sort(beans, new Comparator< Individual >(){
|
||||
public int compare(Individual o1, Individual o2) {
|
||||
if( o1 == null || o1.getName() == null )
|
||||
|
@ -605,10 +616,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
|
|||
|
||||
QueryParser parser = getQueryParser(analyzer);
|
||||
query = parser.parse(querystr);
|
||||
|
||||
|
||||
String alpha = request.getParameter("alpha");
|
||||
|
||||
|
||||
if( alpha != null && !"".equals(alpha) && alpha.length() == 1){
|
||||
|
||||
log.debug("Firing alpha query ");
|
||||
|
@ -688,9 +698,14 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
|
|||
// map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
|
||||
// qp.setStemmedToUnstemmed(map);
|
||||
|
||||
MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{
|
||||
"name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" , "classLocalName", "classLocalNameLowerCase" }, analyzer);
|
||||
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||
// MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{
|
||||
// "name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" , "classLocalName", "classLocalNameLowerCase" }, analyzer);
|
||||
|
||||
QueryParser qp = new QueryParser(Version.LUCENE_29, "name", analyzer);
|
||||
|
||||
//AND_OPERATOR returns documents even if the terms in the query lie in different fields.
|
||||
//The only requirement is that they exist in a single document.
|
||||
//qp.setDefaultOperator(QueryParser.AND_OPERATOR);
|
||||
|
||||
|
||||
return qp;
|
||||
|
|
|
@ -0,0 +1,65 @@
|
|||
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
|
||||
|
||||
package edu.cornell.mannlib.vitro.webapp.search.lucene;
|
||||
|
||||
import org.apache.lucene.search.DefaultSimilarity;
|
||||
|
||||
public class CustomSimilarity extends DefaultSimilarity {
|
||||
|
||||
public CustomSimilarity(){}
|
||||
|
||||
/**
|
||||
* According to Lucene Javadoc, idf computes a score factor
|
||||
* based on a term's document frequency (the number of documents
|
||||
* that contain the term).
|
||||
*
|
||||
* idf = log(numDocs/(docFreq + 1))
|
||||
*
|
||||
* From this formula we see that, the lower the value of docFreq
|
||||
* higher the value of idf. In other words, rare terms have higher
|
||||
* idf scores.
|
||||
*
|
||||
* Returning a value of 1.0f here for idf, since we wan't the
|
||||
* rarity of a term not to effect the score of a document.
|
||||
*
|
||||
* bk392 3/29/2011
|
||||
*/
|
||||
@Override
|
||||
public float idf(int docFreq, int numDocs){
|
||||
return 1.0f;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Coord computes a score factor based on the fraction of all query terms
|
||||
* that a document contains. The default implementation is
|
||||
*
|
||||
* coord = (overlap/ maxOverlap)
|
||||
*
|
||||
* overlap is the number of queryterms matched in the document and maxOverlap
|
||||
* is the total number of terms present in the query. That means, more number of
|
||||
* query terms matched in a document, higher the score. Here, we are returning a
|
||||
* value of 1.0f to override this effect.
|
||||
*/
|
||||
@Override
|
||||
public float coord(int overlap, int maxOverlap){
|
||||
return 1.0f;
|
||||
}
|
||||
|
||||
/**
|
||||
* From Lucene Javadoc, lengthNorm computes the normalization value
|
||||
* for a given field. These values together with the field boosts, are
|
||||
* stored in an index and multiplied into scores for hits on each field by
|
||||
* the search code.
|
||||
*
|
||||
* lengthNorm = 1 / sqrt(numTerms)
|
||||
*
|
||||
* In other words, the document score is inversely proportional to the number of terms
|
||||
* contained in the field of interest. Higher the number, lower the doc score. We don't
|
||||
* want this since most of our fields contain single value. (except ALLTEXT and type)
|
||||
*/
|
||||
@Override
|
||||
public float lengthNorm(String fieldName, int numTerms){
|
||||
return 1.0f;
|
||||
}
|
||||
}
|
|
@ -200,20 +200,21 @@ public class Entity2LuceneDoc implements Obj2DocIface{
|
|||
log.debug("Using local name for individual with rdfs:label " + ent.getURI());
|
||||
value = ent.getLocalName();
|
||||
}
|
||||
Field name =new Field(term.NAME, value,
|
||||
Field.Store.YES, Field.Index.ANALYZED);
|
||||
Field name = new Field(term.NAME, value, Field.Store.YES, Field.Index.ANALYZED);
|
||||
doc.add( name );
|
||||
|
||||
Field nameUn = new Field(term.NAMEUNSTEMMED, value,
|
||||
Field.Store.NO, Field.Index.ANALYZED);
|
||||
Field nameUn = new Field(term.NAMEUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED);
|
||||
nameUn.setBoost(NAME_BOOST);
|
||||
doc.add( nameUn );
|
||||
|
||||
|
||||
// BK nameunanalyzed is used by IndividualListController
|
||||
Field nameUnanalyzed = new Field(term.NAMELOWERCASE, value.toLowerCase(),
|
||||
Field.Store.YES, Field.Index.NOT_ANALYZED);
|
||||
Field nameUnanalyzed = new Field(term.NAMELOWERCASE, value.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED);
|
||||
nameUnanalyzed.setBoost(NAME_BOOST);
|
||||
doc.add( nameUnanalyzed );
|
||||
|
||||
doc.add( new Field(term.NAMERAW, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
|
||||
Field nameRaw = new Field(term.NAMERAW, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
|
||||
nameRaw.setBoost(NAME_BOOST);
|
||||
doc.add(nameRaw);
|
||||
|
||||
|
||||
//Moniker
|
||||
|
|
|
@ -131,6 +131,7 @@ public class LuceneIndexer implements IndexerIface {
|
|||
String offLineDir = getOffLineBuildDir();
|
||||
this.currentOffLineDir = offLineDir;
|
||||
writer = new IndexWriter(offLineDir, analyzer, true, MAX_FIELD_LENGTH);
|
||||
writer.setSimilarity(new CustomSimilarity());
|
||||
}else{
|
||||
writer = getLiveIndexWriter(false);
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue