Overriding the default Similarity score provided by lucene by adding a CustomSimilarity class.

This commit is contained in:
deepakkoni 2011-03-29 15:06:18 +00:00
parent 7f679af874
commit bcddbbad2e
5 changed files with 98 additions and 16 deletions

View file

@ -227,7 +227,7 @@ public class AutocompleteController extends VitroAjaxController {
// operator: e.g., +(name:tales name:tales*)
try {
log.debug("Adding non-wildcard query for " + querystr);
Query query = parser.parse(querystr);
Query query = parser.parse(querystr);
boolQuery.add(query, BooleanClause.Occur.SHOULD);
// Prevent ParseException here when adding * after a space.

View file

@ -70,6 +70,7 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.Searcher;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQuery;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.CustomSimilarity;
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
@ -211,6 +212,16 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
IndexSearcher searcherForRequest = LuceneIndexFactory.getIndexSearcher(getServletContext());
/* using the CustomSimilarity to override effects such as
* 1) rarity of a term doesn't affect the document score.
* 2) the fraction of the query's terms that a document matches (coord) doesn't affect the document score
* 3) field length doesn't affect the document score
*
* 3/29/2011 bk392
*/
CustomSimilarity customSimilarity = new CustomSimilarity();
searcherForRequest.setSimilarity(customSimilarity);
TopDocs topDocs = null;
try{
log.debug("Searching for query term in the Index with maxHitSize "+ maxHitSize);
@ -382,8 +393,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
return doSearchError(e,format);
}
}
private void alphaSortIndividuals(List<Individual> beans) {
private void alphaSortIndividuals(List<Individual> beans) {
Collections.sort(beans, new Comparator< Individual >(){
public int compare(Individual o1, Individual o2) {
if( o1 == null || o1.getName() == null )
@ -605,10 +616,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
QueryParser parser = getQueryParser(analyzer);
query = parser.parse(querystr);
String alpha = request.getParameter("alpha");
if( alpha != null && !"".equals(alpha) && alpha.length() == 1){
log.debug("Firing alpha query ");
@ -688,9 +698,14 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear
// map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED);
// qp.setStemmedToUnstemmed(map);
MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{
"name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" , "classLocalName", "classLocalNameLowerCase" }, analyzer);
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
// MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{
// "name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" , "classLocalName", "classLocalNameLowerCase" }, analyzer);
QueryParser qp = new QueryParser(Version.LUCENE_29, "name", analyzer);
//AND_OPERATOR returns documents even if the terms in the query lie in different fields.
//The only requirement is that they exist in a single document.
//qp.setDefaultOperator(QueryParser.AND_OPERATOR);
return qp;

View file

@ -0,0 +1,65 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.search.lucene;
import org.apache.lucene.search.DefaultSimilarity;
public class CustomSimilarity extends DefaultSimilarity {

    /**
     * Multiplier returned by every overridden factor. Multiplying a score
     * by one leaves it unchanged, which is how each factor is switched off.
     */
    private static final float NEUTRAL_FACTOR = 1.0f;

    /**
     * Creates a similarity whose idf, coord and lengthNorm factors are all
     * neutral; everything else is inherited from DefaultSimilarity.
     */
    public CustomSimilarity() {
        super();
    }

    /**
     * Switches off inverse document frequency.
     *
     * Per the Lucene Javadoc, idf is a score factor derived from a term's
     * document frequency (how many documents contain the term); the stock
     * implementation is based on
     *
     *     log(numDocs / (docFreq + 1))
     *
     * so a smaller docFreq gives a larger idf, i.e. rare terms score
     * higher. We do not want the rarity of a term to affect a document's
     * score, so a constant is returned instead.
     *
     * bk392 3/29/2011
     *
     * @param docFreq number of documents containing the term (ignored)
     * @param numDocs total number of documents (ignored)
     * @return always 1.0f
     */
    @Override
    public float idf(int docFreq, int numDocs) {
        return NEUTRAL_FACTOR;
    }

    /**
     * Switches off the coordination factor.
     *
     * Coord is a score factor based on the fraction of all query terms
     * that a document contains. The default implementation is
     *
     *     coord = overlap / maxOverlap
     *
     * where overlap is the number of query terms matched in the document
     * and maxOverlap is the total number of terms in the query, so
     * matching more of the query terms normally raises the score. A
     * constant is returned here to cancel that effect.
     *
     * @param overlap    number of query terms matched in the document (ignored)
     * @param maxOverlap total number of terms in the query (ignored)
     * @return always 1.0f
     */
    @Override
    public float coord(int overlap, int maxOverlap) {
        return NEUTRAL_FACTOR;
    }

    /**
     * Switches off field-length normalization.
     *
     * Per the Lucene Javadoc, lengthNorm computes the normalization value
     * for a field; these values, together with the field boosts, are
     * stored in the index and multiplied into the scores of hits on each
     * field by the search code. The default is
     *
     *     lengthNorm = 1 / sqrt(numTerms)
     *
     * meaning the more terms a field holds, the lower the document
     * scores. That is unwanted here because most of our fields contain a
     * single value (ALLTEXT and type being the exceptions).
     *
     * @param fieldName name of the field being normalized (ignored)
     * @param numTerms  number of terms in the field (ignored)
     * @return always 1.0f
     */
    @Override
    public float lengthNorm(String fieldName, int numTerms) {
        return NEUTRAL_FACTOR;
    }
}

View file

@ -200,20 +200,21 @@ public class Entity2LuceneDoc implements Obj2DocIface{
log.debug("Using local name for individual with rdfs:label " + ent.getURI());
value = ent.getLocalName();
}
Field name =new Field(term.NAME, value,
Field.Store.YES, Field.Index.ANALYZED);
Field name = new Field(term.NAME, value, Field.Store.YES, Field.Index.ANALYZED);
doc.add( name );
Field nameUn = new Field(term.NAMEUNSTEMMED, value,
Field.Store.NO, Field.Index.ANALYZED);
Field nameUn = new Field(term.NAMEUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED);
nameUn.setBoost(NAME_BOOST);
doc.add( nameUn );
// BK nameunanalyzed is used by IndividualListController
Field nameUnanalyzed = new Field(term.NAMELOWERCASE, value.toLowerCase(),
Field.Store.YES, Field.Index.NOT_ANALYZED);
Field nameUnanalyzed = new Field(term.NAMELOWERCASE, value.toLowerCase(), Field.Store.YES, Field.Index.NOT_ANALYZED);
nameUnanalyzed.setBoost(NAME_BOOST);
doc.add( nameUnanalyzed );
doc.add( new Field(term.NAMERAW, value, Field.Store.YES, Field.Index.NOT_ANALYZED));
Field nameRaw = new Field(term.NAMERAW, value, Field.Store.YES, Field.Index.NOT_ANALYZED);
nameRaw.setBoost(NAME_BOOST);
doc.add(nameRaw);
//Moniker

View file

@ -131,6 +131,7 @@ public class LuceneIndexer implements IndexerIface {
String offLineDir = getOffLineBuildDir();
this.currentOffLineDir = offLineDir;
writer = new IndexWriter(offLineDir, analyzer, true, MAX_FIELD_LENGTH);
writer.setSimilarity(new CustomSimilarity());
}else{
writer = getLiveIndexWriter(false);
}