diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/AutocompleteController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/AutocompleteController.java index 6e929c4d3..8953f6519 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/AutocompleteController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/AutocompleteController.java @@ -227,7 +227,7 @@ public class AutocompleteController extends VitroAjaxController { // operator: e.g., +(name:tales name:tales*) try { log.debug("Adding non-wildcard query for " + querystr); - Query query = parser.parse(querystr); + Query query = parser.parse(querystr); boolQuery.add(query, BooleanClause.Occur.SHOULD); // Prevent ParseException here when adding * after a space. diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java index d6a225cb4..f4472d550 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/controller/PagedSearchController.java @@ -70,6 +70,7 @@ import edu.cornell.mannlib.vitro.webapp.search.beans.Searcher; import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter; import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQuery; import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory; +import edu.cornell.mannlib.vitro.webapp.search.lucene.CustomSimilarity; import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc; import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory; import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup; @@ -211,6 +212,16 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear IndexSearcher searcherForRequest = LuceneIndexFactory.getIndexSearcher(getServletContext()); + /* 
using the CustomSimilarity to override effects such as + * 1) rarity of a term doesn't affect the document score. + * 2) the fraction of query terms matched by a document (coord) doesn't affect the document score; term frequency (tf) is not overridden + * 3) field length doesn't affect the document score + * + * 3/29/2011 bk392 + */ + CustomSimilarity customSimilarity = new CustomSimilarity(); + searcherForRequest.setSimilarity(customSimilarity); + TopDocs topDocs = null; try{ log.debug("Searching for query term in the Index with maxHitSize "+ maxHitSize); @@ -382,8 +393,8 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear return doSearchError(e,format); } } - - private void alphaSortIndividuals(List beans) { + + private void alphaSortIndividuals(List beans) { Collections.sort(beans, new Comparator< Individual >(){ public int compare(Individual o1, Individual o2) { if( o1 == null || o1.getName() == null ) @@ -605,10 +616,9 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear QueryParser parser = getQueryParser(analyzer); query = parser.parse(querystr); - + String alpha = request.getParameter("alpha"); - if( alpha != null && !"".equals(alpha) && alpha.length() == 1){ log.debug("Firing alpha query "); @@ -688,9 +698,14 @@ public class PagedSearchController extends FreemarkerHttpServlet implements Sear // map.put(Entity2LuceneDoc.term.ALLTEXT,Entity2LuceneDoc.term.ALLTEXTUNSTEMMED); // qp.setStemmedToUnstemmed(map); - MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{ - "name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" , "classLocalName", "classLocalNameLowerCase" }, analyzer); - qp.setDefaultOperator(QueryParser.AND_OPERATOR); +// MultiFieldQueryParser qp = new MultiFieldQueryParser(Version.LUCENE_29, new String[]{ +// "name", "nameunstemmed", "type", "moniker", "ALLTEXT", "ALLTEXTUNSTEMMED", "nameraw" , "classLocalName", "classLocalNameLowerCase" }, analyzer); + + 
QueryParser qp = new QueryParser(Version.LUCENE_29, "name", analyzer); + + //AND_OPERATOR returns documents even if the terms in the query lie in different fields. + //The only requirement is that they exist in a single document. + //qp.setDefaultOperator(QueryParser.AND_OPERATOR); return qp; diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/CustomSimilarity.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/CustomSimilarity.java new file mode 100644 index 000000000..880fae449 --- /dev/null +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/CustomSimilarity.java @@ -0,0 +1,65 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.search.lucene; + +import org.apache.lucene.search.DefaultSimilarity; + +public class CustomSimilarity extends DefaultSimilarity { + + public CustomSimilarity(){} + + /** + * According to Lucene Javadoc, idf computes a score factor + * based on a term's document frequency (the number of documents + * that contain the term). + * + * idf = log(numDocs/(docFreq + 1)) + * + * From this formula we see that, the lower the value of docFreq + * higher the value of idf. In other words, rare terms have higher + * idf scores. + * + * Returning a value of 1.0f here for idf, since we want the + * rarity of a term not to affect the score of a document. + * + * bk392 3/29/2011 + */ + @Override + public float idf(int docFreq, int numDocs){ + return 1.0f; + } + + + /** + * Coord computes a score factor based on the fraction of all query terms + * that a document contains. The default implementation is + * + * coord = (overlap/ maxOverlap) + * + * overlap is the number of queryterms matched in the document and maxOverlap + * is the total number of terms present in the query. That means, the more + * query terms matched in a document, the higher the score. Here, we are returning a + * value of 1.0f to override this effect. 
+ */ + @Override + public float coord(int overlap, int maxOverlap){ + return 1.0f; + } + + /** + * From Lucene Javadoc, lengthNorm computes the normalization value + * for a given field. These values together with the field boosts, are + * stored in an index and multiplied into scores for hits on each field by + * the search code. + * + * lengthNorm = 1 / sqrt(numTerms) + * + * In other words, the document score is inversely proportional to the number of terms + * contained in the field of interest. The higher the number, the lower the doc score. We don't + * want this since most of our fields contain a single value (except ALLTEXT and type). + */ + @Override + public float lengthNorm(String fieldName, int numTerms){ + return 1.0f; + } +} diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java index c45b52f33..e544b7b24 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/Entity2LuceneDoc.java @@ -200,20 +200,21 @@ public class Entity2LuceneDoc implements Obj2DocIface{ log.debug("Using local name for individual with rdfs:label " + ent.getURI()); value = ent.getLocalName(); } - Field name =new Field(term.NAME, value, - Field.Store.YES, Field.Index.ANALYZED); + Field name = new Field(term.NAME, value, Field.Store.YES, Field.Index.ANALYZED); doc.add( name ); - Field nameUn = new Field(term.NAMEUNSTEMMED, value, - Field.Store.NO, Field.Index.ANALYZED); + Field nameUn = new Field(term.NAMEUNSTEMMED, value, Field.Store.NO, Field.Index.ANALYZED); + nameUn.setBoost(NAME_BOOST); doc.add( nameUn ); - + // BK nameunanalyzed is used by IndividualListController - Field nameUnanalyzed = new Field(term.NAMELOWERCASE, value.toLowerCase(), - Field.Store.YES, Field.Index.NOT_ANALYZED); + Field nameUnanalyzed = new Field(term.NAMELOWERCASE, value.toLowerCase(), 
Field.Store.YES, Field.Index.NOT_ANALYZED); + nameUnanalyzed.setBoost(NAME_BOOST); doc.add( nameUnanalyzed ); - doc.add( new Field(term.NAMERAW, value, Field.Store.YES, Field.Index.NOT_ANALYZED)); + Field nameRaw = new Field(term.NAMERAW, value, Field.Store.YES, Field.Index.NOT_ANALYZED); + nameRaw.setBoost(NAME_BOOST); + doc.add(nameRaw); //Moniker diff --git a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneIndexer.java b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneIndexer.java index 270932831..e7bc2b8cc 100644 --- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneIndexer.java +++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/lucene/LuceneIndexer.java @@ -131,6 +131,7 @@ public class LuceneIndexer implements IndexerIface { String offLineDir = getOffLineBuildDir(); this.currentOffLineDir = offLineDir; writer = new IndexWriter(offLineDir, analyzer, true, MAX_FIELD_LENGTH); + writer.setSimilarity(new CustomSimilarity()); }else{ writer = getLiveIndexWriter(false); }