Changes to lucen index creation NIHVIVO-1128

This commit is contained in:
bdc34 2010-10-06 21:51:59 +00:00
parent 5002c3d761
commit 1041759072
5 changed files with 51 additions and 268 deletions

View file

@ -585,7 +585,7 @@ public class IndividualJena extends IndividualImpl implements Individual {
try {
webappDaoFactory.getLinksDao().addLinksToIndividual( this );
} catch (Exception e) {
log.error(this.getClass().getName()+" could not addLinksToIndividual for "+this.getURI());
log.debug(this.getClass().getName()+" could not addLinksToIndividual for "+this.getURI());
}
return this.linksList;
}
@ -598,7 +598,7 @@ public class IndividualJena extends IndividualImpl implements Individual {
try {
webappDaoFactory.getLinksDao().addPrimaryLinkToIndividual( this );
} catch (Exception e) {
log.error(this.getClass().getName()+" could not addPrimaryLinkToIndividual for "+this.getURI());
log.debug(this.getClass().getName()+" could not addPrimaryLinkToIndividual for "+this.getURI());
}
return this.primaryLink;
}
@ -612,7 +612,7 @@ public class IndividualJena extends IndividualImpl implements Individual {
try {
this.setKeywords(webappDaoFactory.getIndividualDao().getKeywordsForIndividual(this.getURI()));
} catch (Exception e) {
log.error(this.getClass().getName()+" could not getKeywords for "+this.getURI());
log.debug(this.getClass().getName()+" could not getKeywords for "+this.getURI());
}
return this.keywords;
}

View file

@ -13,6 +13,9 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
import com.hp.hpl.jena.ontology.OntModel;
import com.hp.hpl.jena.ontology.ObjectProperty;
@ -33,6 +36,8 @@ import edu.cornell.mannlib.vitro.webapp.dao.WebappDaoFactory;
public class LinksDaoJena extends JenaBaseDao implements LinksDao {
protected static final Log log = LogFactory.getLog(LinksDaoJena.class.getName());
public LinksDaoJena(WebappDaoFactoryJena wadf) {
super(wadf);
}
@ -329,7 +334,7 @@ public class LinksDaoJena extends JenaBaseDao implements LinksDao {
} else if (l.getDatatype()==XSDDatatype.XSDstring) {
link.setDisplayRank(l.getString());
} else {
log.error("unexpected literal datatype; saved as displayRank 10");
log.debug("unexpected literal datatype; saved as displayRank 10");
link.setDisplayRank("10");
}
}

View file

@ -4,16 +4,12 @@ package edu.cornell.mannlib.vitro.webapp.search.controller;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
@ -35,8 +31,6 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
import edu.cornell.mannlib.vitro.webapp.controller.VitroRequest;
@ -44,12 +38,9 @@ import edu.cornell.mannlib.vitro.webapp.controller.freemarker.FreemarkerHttpServ
import edu.cornell.mannlib.vitro.webapp.dao.IndividualDao;
import edu.cornell.mannlib.vitro.webapp.flags.PortalFlag;
import edu.cornell.mannlib.vitro.webapp.search.SearchException;
import edu.cornell.mannlib.vitro.webapp.search.beans.Searcher;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQuery;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexer;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
import edu.cornell.mannlib.vitro.webapp.utils.FlagMathUtils;
import freemarker.template.Configuration;
@ -59,7 +50,7 @@ import freemarker.template.Configuration;
* through a Lucene search.
*/
public class AutocompleteController extends FreemarkerHttpServlet implements Searcher{
public class AutocompleteController extends FreemarkerHttpServlet{
private static final long serialVersionUID = 1L;
private static final Log log = LogFactory.getLog(AutocompleteController.class);
@ -68,24 +59,9 @@ public class AutocompleteController extends FreemarkerHttpServlet implements Sea
private static String QUERY_PARAMETER_NAME = "term";
private IndexSearcher searcher = null;
String NORESULT_MSG = "";
private int defaultMaxSearchSize= 1000;
public void init(ServletConfig config) throws ServletException {
super.init(config);
LuceneIndexer indexer=(LuceneIndexer)getServletContext()
.getAttribute(LuceneIndexer.class.getName());
indexer.addSearcher(this);
try{
String indexDir = getIndexDir(getServletContext());
getIndexSearcher(indexDir);
}catch(Exception ex){
}
}
public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
doGet(request, response);
@ -125,9 +101,9 @@ public class AutocompleteController extends FreemarkerHttpServlet implements Sea
doNoQuery(map, config, response);
return;
}
IndexSearcher searcherForRequest = LuceneIndexFactory.getIndexSearcher(getServletContext());
IndexSearcher searcherForRequest = getIndexSearcher(indexDir);
TopDocs topDocs = null;
try{
topDocs = searcherForRequest.search(query,null,maxHitSize);
@ -428,21 +404,6 @@ public class AutocompleteController extends FreemarkerHttpServlet implements Sea
return qp;
}
private synchronized IndexSearcher getIndexSearcher(String indexDir) {
if( searcher == null ){
try {
Directory fsDir = FSDirectory.getDirectory(indexDir);
searcher = new IndexSearcher(fsDir);
} catch (IOException e) {
log.error("LuceneSearcher: could not make indexSearcher "+e);
log.error("It is likely that you have not made a directory for the lucene index. "+
"Create the directory indicated in the error and set permissions/ownership so"+
" that the tomcat server can read/write to it.");
//The index directory is created by LuceneIndexer.makeNewIndex()
}
}
return searcher;
}
private void doNoQuery(Map<String, Object> map, Configuration config, HttpServletResponse response) {
@ -491,24 +452,6 @@ public class AutocompleteController extends FreemarkerHttpServlet implements Sea
}
}
/**
* Need to accept notification from indexer that the index has been changed.
*/
public void close() {
searcher = null;
}
public VitroHighlighter getHighlighter(VitroQuery q) {
throw new Error("AutocompleteController.getHighlighter() is unimplemented");
}
public VitroQueryFactory getQueryFactory() {
throw new Error("AutocompleteController.getQueryFactory() is unimplemented");
}
public List search(VitroQuery query) throws SearchException {
throw new Error("AutocompleteController.search() is unimplemented");
}
}

View file

@ -16,7 +16,6 @@ import java.util.List;
import java.util.Set;
import javax.servlet.RequestDispatcher;
import javax.servlet.ServletConfig;
import javax.servlet.ServletContext;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
@ -38,39 +37,25 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import edu.cornell.mannlib.vitro.webapp.beans.DataProperty;
import edu.cornell.mannlib.vitro.webapp.beans.DataPropertyStatement;
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl;
import edu.cornell.mannlib.vitro.webapp.beans.ObjectProperty;
import edu.cornell.mannlib.vitro.webapp.beans.ObjectPropertyStatement;
import edu.cornell.mannlib.vitro.webapp.beans.Portal;
import edu.cornell.mannlib.vitro.webapp.beans.VClass;
import edu.cornell.mannlib.vitro.webapp.beans.VClassGroup;
import edu.cornell.mannlib.vitro.webapp.controller.Controllers;
import edu.cornell.mannlib.vitro.webapp.controller.VitroHttpServlet;
import edu.cornell.mannlib.vitro.webapp.controller.VitroRequest;
import edu.cornell.mannlib.vitro.webapp.dao.DataPropertyDao;
import edu.cornell.mannlib.vitro.webapp.dao.IndividualDao;
import edu.cornell.mannlib.vitro.webapp.dao.ObjectPropertyDao;
import edu.cornell.mannlib.vitro.webapp.dao.VClassDao;
import edu.cornell.mannlib.vitro.webapp.dao.VClassGroupDao;
import edu.cornell.mannlib.vitro.webapp.dao.VitroVocabulary;
import edu.cornell.mannlib.vitro.webapp.flags.PortalFlag;
import edu.cornell.mannlib.vitro.webapp.search.SearchException;
import edu.cornell.mannlib.vitro.webapp.search.beans.Searcher;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroHighlighter;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQuery;
import edu.cornell.mannlib.vitro.webapp.search.beans.VitroQueryFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexer;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneIndexFactory;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
import edu.cornell.mannlib.vitro.webapp.search.lucene.SimpleLuceneHighlighter;
import edu.cornell.mannlib.vitro.webapp.utils.FlagMathUtils;
import edu.cornell.mannlib.vitro.webapp.utils.Html2Text;
/**
* PagedSearchController is the new search controller that interacts
@ -79,27 +64,12 @@ import edu.cornell.mannlib.vitro.webapp.utils.Html2Text;
* @author bdc34
*
*/
public class PagedSearchController extends VitroHttpServlet implements Searcher{
private IndexSearcher searcher = null;
public class PagedSearchController extends VitroHttpServlet {
private static final Log log = LogFactory.getLog(PagedSearchController.class.getName());
String NORESULT_MSG = "The search returned no results.";
private int defaultHitsPerPage = 25;
private int defaultMaxSearchSize= 1000;
public void init(ServletConfig config) throws ServletException {
super.init(config);
LuceneIndexer indexer=(LuceneIndexer)getServletContext()
.getAttribute(LuceneIndexer.class.getName());
indexer.addSearcher(this);
try{
String indexDir = getIndexDir(getServletContext());
getIndexSearcher(indexDir);
}catch(Exception ex){
}
}
public void doPost(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
doGet(request, response);
@ -130,7 +100,8 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
startIndex = Integer.parseInt(request.getParameter("startIndex"));
}catch (Throwable e) {
startIndex = 0;
}
}
log.debug("startIndex is " + startIndex);
int hitsPerPage = defaultHitsPerPage;
try{
@ -138,6 +109,7 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
} catch (Throwable e) {
hitsPerPage = defaultHitsPerPage;
}
log.debug("hitsPerPage is " + hitsPerPage);
int maxHitSize = defaultMaxSearchSize;
if( startIndex >= defaultMaxSearchSize - hitsPerPage )
@ -146,15 +118,14 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
maxHitSize = maxHitSize * 2;
hitsPerPage = maxHitSize;
}
String indexDir = getIndexDir(getServletContext());
log.debug("maxHitSize is " + maxHitSize);
String qtxt = vreq.getParameter(VitroQuery.QUERY_PARAMETER_NAME);
Analyzer analyzer = getAnalyzer(getServletContext());
Query query = null;
try {
query = getQuery(vreq, portalFlag, analyzer, indexDir, qtxt);
query = getQuery(vreq, portalFlag, analyzer, qtxt);
} catch (ParseException e) {
log.warn("Query parse exception: " + e);
doBadQuery(qtxt, request, response);
@ -163,7 +134,7 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
log.debug("query for '" + qtxt +"' is " + query.toString());
IndexSearcher searcherForRequest = getIndexSearcher(indexDir);
IndexSearcher searcherForRequest = LuceneIndexFactory.getIndexSearcher(getServletContext());
TopDocs topDocs = null;
try{
@ -181,7 +152,7 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
doFailedSearch(request, response, msg, qtxt);
return;
}
}
}
if( topDocs == null || topDocs.scoreDocs == null){
log.error("topDocs for a search was null");
@ -202,18 +173,25 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
lastHitToShow = hitsLength;
else
lastHitToShow = startIndex + hitsPerPage - 1;
List<Individual> beans = new LinkedList<Individual>();
for(int i=startIndex; i<topDocs.scoreDocs.length ;i++){
for(int i=startIndex; i<topDocs.scoreDocs.length ;i++){
try{
if( (i >= startIndex) && (i <= lastHitToShow) ){
Document doc = searcherForRequest.doc(topDocs.scoreDocs[i].doc);
String uri = doc.get(Entity2LuceneDoc.term.URI);
Individual ent = new IndividualImpl();
ent.setURI(uri);
ent = iDao.getIndividualByURI(uri);
if(ent!=null)
beans.add(ent);
Individual ent = iDao.getIndividualByURI(uri);
if(ent != null ){
List<VClass>vcs = ent.getVClasses() ;
if( vcs != null && !vcs.isEmpty() ){
beans.add(ent);
log.debug("found individual for search hit in model" + uri );
}else{
log.debug("filtered out classless individual from search results" + uri);
}
}else{
log.debug("could not find individual for search hit in model " + uri);
}
}
}catch(Exception e){
log.error("problem getting usable Individuals from search " +
@ -225,7 +203,7 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
if( request.getParameter("classgroup") == null && request.getParameter("type") == null){
request.setAttribute("classgroups",
getClassGroups(grpDao, topDocs, searcherForRequest));
request.setAttribute("refinement", "");
request.setAttribute("refinment", "");
}else{
//search request for a classgroup so add rdf:type search refinement links
@ -233,23 +211,18 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
if( request.getParameter("classgroup") != null && request.getParameter("type") == null ){
request.setAttribute("types",
getVClasses(vclassDao,topDocs,searcherForRequest));
request.setAttribute("refinement", "&classgroup="
request.setAttribute("refinment", "&classgroup="
+ URLEncoder.encode(request.getParameter("classgroup"),"UTF-8"));
}else{
if( alphaFilter != null && !"".equals(alphaFilter)){
request.setAttribute("alphas", getAlphas(topDocs, searcherForRequest));
alphaSortIndividuals(beans);
}else{
request.setAttribute("refinement", "&type="
request.setAttribute("refinment", "&type="
+ URLEncoder.encode(request.getParameter("type"),"UTF-8"));
}
}
}
beans = highlightBeans( beans ,
vreq.getWebappDaoFactory().getDataPropertyDao(),
vreq.getWebappDaoFactory().getObjectPropertyDao(),
new SimpleLuceneHighlighter(query,analyzer) );
}
//stick the results in the requestScope and prepare to forward to JSP
request.setAttribute("beans", beans);
@ -412,15 +385,7 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
}
}
return typesInHits;
}
private String getIndexDir(ServletContext servletContext) throws SearchException {
Object obj = servletContext.getAttribute(LuceneSetup.INDEX_DIR);
if( obj == null || !(obj instanceof String) )
throw new SearchException("Could not get IndexDir for luecene index");
else
return (String)obj;
}
}
private Analyzer getAnalyzer(ServletContext servletContext) throws SearchException {
Object obj = servletContext.getAttribute(LuceneSetup.ANALYZER);
@ -431,7 +396,7 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
}
private Query getQuery(VitroRequest request, PortalFlag portalState,
Analyzer analyzer, String indexDir, String querystr ) throws SearchException, ParseException {
Analyzer analyzer, String querystr ) throws SearchException, ParseException {
Query query = null;
try{
//String querystr = request.getParameter(VitroQuery.QUERY_PARAMETER_NAME);
@ -445,14 +410,7 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
return null;
}
QueryParser parser = getQueryParser(analyzer);
// Prevent org.apache.lucene.queryParser.ParseException:
// Cannot parse 'mary *': '*' or '?' not allowed as first character in WildcardQuery
// The * is redundant in this case anyway, so just remove it.
log.debug("Query string is '" + querystr + "'");
querystr = querystr.replaceAll("([\\s^])[?*]", "$1");
log.debug("Cleaned query string is '" + querystr + "'");
query = parser.parse(querystr);
String alpha = request.getParameter("alpha");
@ -576,113 +534,7 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
//we have no flags set, so no flag filtering
return null;
}
}
private synchronized IndexSearcher getIndexSearcher(String indexDir) {
if( searcher == null ){
try {
Directory fsDir = FSDirectory.getDirectory(indexDir);
searcher = new IndexSearcher(fsDir);
} catch (IOException e) {
log.error("LuceneSearcher: could not make indexSearcher "+e);
log.error("It is likely that you have not made a directory for the lucene index. "+
"Create the directory indicated in the error and set permissions/ownership so"+
" that the tomcat server can read/write to it.");
//The index directory is created by LuceneIndexer.makeNewIndex()
}
}
return searcher;
}
private List<Individual> highlightBeans(List<Individual> beans,
DataPropertyDao dpDao, ObjectPropertyDao opDao, VitroHighlighter highlighter) {
if( beans == null ){
log.debug("List of beans passed to highlightBeans() was null");
return Collections.EMPTY_LIST;
}else if( highlighter == null ){
log.debug("Null highlighter passed to highlightBeans()");
return beans;
}
Iterator<Individual> it = beans.iterator();
while(it.hasNext()){
Individual ent = it.next();
try{
dpDao.fillDataPropertiesForIndividual(ent);
opDao.fillObjectPropertiesForIndividual(ent);
fragmentHighlight(ent, highlighter);
}catch( Exception ex ){
log.debug("Error while doing search highlighting" , ex);
}
}
return beans;
}
/**
* Highlights the name and then replaces the description with
* highlighted fragments.
* @param ent
* @param highlighter
*/
public void fragmentHighlight(Individual ent, VitroHighlighter hl){
try{
if( ent == null ) return;
Html2Text h2t = new Html2Text();
StringBuffer sb = new StringBuffer("");
if(ent.getBlurb() != null)
sb.append(ent.getBlurb()).append(' ');
if(ent.getDescription() != null )
sb.append(ent.getDescription()).append(' ');
if(ent.getDataPropertyList() != null) {
Iterator edIt = ent.getDataPropertyList().iterator();
while (edIt.hasNext()) {
try{
DataProperty dp = (DataProperty)edIt.next();
if( getDataPropertyBlacklist().contains(dp.getURI()))
continue;
for(DataPropertyStatement dps : dp.getDataPropertyStatements()){
sb.append(dp.getPublicName()).append(' ')
.append(dps.getData()).append(' ');
}
}catch(Throwable e){
log.debug("Error highlighting data property statment " +
"for individual "+ent.getURI());
}
}
}
if(ent.getObjectPropertyList() != null) {
Iterator edIt = ent.getObjectPropertyList().iterator();
String t = null;
while (edIt.hasNext()) {
try {
ObjectProperty op = (ObjectProperty)edIt.next();
if( getObjectPropertyBlacklist().contains(op.getURI()))
continue;
for( ObjectPropertyStatement stmt : op.getObjectPropertyStatements()){
sb.append( ( (t = op.getDomainPublic()) != null) ? t : "" );
sb.append(' ');
sb.append( ( (t = stmt.getObject().getName()) != null) ? t : "" );
sb.append(' ');
}
} catch (Throwable e) {
log.debug("Error highlighting object property " +
"statement for individual "+ent.getURI());
}
}
}
String keywords = ent.getKeywordString();
if( keywords != null )
sb.append(keywords);
ent.setDescription(hl.getHighlightFragments( h2t.stripHtml( sb.toString() )));
}catch(Throwable th){
log.debug("could not hightlight for entity " + ent.getURI(),th);
}
}
}
private void doBadQuery(String queryStr, HttpServletRequest request,
HttpServletResponse response)
@ -785,24 +637,4 @@ public class PagedSearchController extends VitroHttpServlet implements Searcher{
private final String defaultSearchField = "ALLTEXT";
public static final int MAX_QUERY_LENGTH = 500;
/**
* Need to accept notification from indexer that the index has been changed.
*/
public void close() {
searcher = null;
}
public VitroHighlighter getHighlighter(VitroQuery q) {
throw new Error("PagedSearchController.getHighlighter() is unimplemented");
}
public VitroQueryFactory getQueryFactory() {
throw new Error("PagedSearchController.getQueryFactory() is unimplemented");
}
public List search(VitroQuery query) throws SearchException {
throw new Error("PagedSearchController.search() is unimplemented");
}
}

View file

@ -200,6 +200,7 @@ public class IndexBuilder {
listOfIterators.add( (new IndexBuilder.BuilderObjectSource(updatedInds)).getUpdatedSinceIterator(0) );
doBuild( listOfIterators, deletedInds, false, UPDATE_DOCS );
log.debug("Ending updateIndex()");
}
/**
@ -342,15 +343,17 @@ public class IndexBuilder {
for( VClass vclass : ind.getVClasses() ){
if( classesProhibitedFromSearch.isClassProhibited(vclass.getURI()) ){
prohibitedClass = true;
log.debug("not indexing " + ind.getURI() + " because class " + vclass.getURI() + " is on prohibited list.");
log.debug("removing " + ind.getURI() + " from index because class " + vclass.getURI() + " is on prohibited list.");
break;
}
}
}
if( !prohibitedClass )
if( !prohibitedClass ){
log.debug("Indexing '" + ind.getName() + "' URI: " + ind.getURI());
indexer.index(ind, newDoc);
else
}else{
indexer.removeFromIndex(ind);
}
}catch(Throwable ex){
log.debug("IndexBuilder.indexItem() Error indexing "