Adding solr indexing prototype

This commit is contained in:
briancaruso 2011-04-11 17:21:38 +00:00
parent 9bfa6acbd5
commit 543c1cd945
9 changed files with 442 additions and 58 deletions

View file

@ -46,14 +46,6 @@
--> -->
<schema name="example" version="1.2"> <schema name="example" version="1.2">
<!-- attribute "name" is the name of this schema and is only used for display purposes.
Applications should change this to reflect the nature of the search collection.
version="1.2" is Solr's version number for the schema syntax and semantics. It should
not normally be changed by applications.
1.0: multiValued attribute did not exist, all fields are multiValued by nature
1.1: multiValued attribute introduced, false by default
1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
-->
<types> <types>
<!-- field type definitions. The "name" attribute is <!-- field type definitions. The "name" attribute is
@ -426,36 +418,37 @@
when adding a document. when adding a document.
--> -->
<field name="id" type="string" indexed="true" stored="true" required="true" />
<field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
<field name="name" type="textgen" indexed="true" stored="true"/>
<field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
<field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/>
<field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" />
<field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<field name="weight" type="float" indexed="true" stored="true"/>
<field name="price" type="float" indexed="true" stored="true"/>
<field name="popularity" type="int" indexed="true" stored="true" />
<field name="inStock" type="boolean" indexed="true" stored="true" />
<!-- Common metadata fields, named specifically to match up with
SolrCell metadata when parsing rich documents such as Word, PDF. <!-- **************************** Vitro Fields *************************** -->
Some fields are multiValued only because Tika currently may return
multiple values for them.
-->
<field name="title" type="text" indexed="true" stored="true" multiValued="true"/> <field name="DocId" type="string" indexed="true" stored="true" required="true" />
<field name="subject" type="text" indexed="true" stored="true"/>
<field name="description" type="text" indexed="true" stored="true"/> <field name="type" type="string" indexed="true" stored="true" omitNorms="true" multiValued="true"/>
<field name="comments" type="text" indexed="true" stored="true"/> <field name="classLocalName" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="author" type="textgen" indexed="true" stored="true"/> <field name="classLocalNameLowerCase" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="keywords" type="textgen" indexed="true" stored="true"/> <field name="classgroup" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="category" type="textgen" indexed="true" stored="true"/> <field name="PROHIBITED_FROM_TEXT_RESULTS" type="string" indexed="true" stored="false" multiValued="false"/>
<field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/> <field name="URI" type="string" indexed="true" stored="true" multiValued="false" omitNorms="true"/>
<field name="last_modified" type="date" indexed="true" stored="true"/> <field name="name" type="text" indexed="true" stored="true" multiValued="true"/>
<field name="links" type="string" indexed="true" stored="true" multiValued="true"/> <field name="nameunstemmed" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="nameunanalyzed" type="string" indexed="true" stored="false" multiValued="true"/>
<field name="nameraw" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="indexedTime" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="ALLTEXT" type="text" indexed="true" stored="false" multiValued="true"/>
<field name="ALLTEXTUNSTEMMED" type="textgen" indexed="true" stored="false" multiValued="true"/>
<field name="THUMBNAIL" type="string" indexed="true" stored="true"/>
<field name="moniker" type="ignored" />
<field name="modType" type="ignored"/>
<field name="JCLASS" type="ignored"/>
<!-- catchall field, containing all other searchable text fields (implemented <!-- catchall field, containing all other searchable text fields (implemented
@ -466,19 +459,7 @@
leading wildcard queries. --> leading wildcard queries. -->
<field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/> <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
<!-- non-tokenized version of manufacturer to make it easier to sort or group
results by manufacturer. copied from "manu" via copyField -->
<field name="manu_exact" type="string" indexed="true" stored="false"/>
<field name="payloads" type="payloads" indexed="true" stored="true"/>
<!-- Uncommenting the following will create a "timestamp" field using
a default value of "NOW" to indicate when each document was indexed.
-->
<!--
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/> <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
-->
<!-- Dynamic field definitions. If a field name is not found, dynamicFields <!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns. will be used if the name matches any of the patterns.
@ -521,10 +502,10 @@
<!-- Field to use to determine and enforce document uniqueness. <!-- Field to use to determine and enforce document uniqueness.
Unless this field is marked with required="false", it will be a required field Unless this field is marked with required="false", it will be a required field
--> -->
<uniqueKey>id</uniqueKey> <uniqueKey>DocId</uniqueKey>
<!-- field for the QueryParser to use when an explicit fieldname is absent --> <!-- field for the QueryParser to use when an explicit fieldname is absent -->
<defaultSearchField>text</defaultSearchField> <defaultSearchField>ALLTEXT</defaultSearchField>
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" --> <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="OR"/> <solrQueryParser defaultOperator="OR"/>
@ -533,12 +514,7 @@
is added to the index. It's used either to index the same field differently, is added to the index. It's used either to index the same field differently,
or to add multiple fields to the same field for easier/faster searching. --> or to add multiple fields to the same field for easier/faster searching. -->
<copyField source="cat" dest="text"/> <!-- <copyField source="name" dest="text"/> -->
<copyField source="name" dest="text"/>
<copyField source="manu" dest="text"/>
<copyField source="features" dest="text"/>
<copyField source="includes" dest="text"/>
<copyField source="manu" dest="manu_exact"/>
<!-- Above, multiple source fields are copied to the [text] field. <!-- Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same Another way to map multiple source fields to the same

View file

@ -68,7 +68,7 @@
<!-- Used to specify an alternate directory to hold all index data <!-- Used to specify an alternate directory to hold all index data
other than the default ./data under the Solr home. other than the default ./data under the Solr home.
If replication is in use, this should match the replication configuration. --> If replication is in use, this should match the replication configuration. -->
<dataDir>${solr.data.dir:./solr/data}</dataDir> <!-- <dataDir>${solr.data.dir:./solr/data}</dataDir> -->
<!-- WARNING: this <indexDefaults> section only provides defaults for index writers <!-- WARNING: this <indexDefaults> section only provides defaults for index writers

View file

@ -52,6 +52,14 @@ VitroConnection.DataSource.url = jdbc:mysql://localhost/vitro
VitroConnection.DataSource.username = vitroweb VitroConnection.DataSource.username = vitroweb
VitroConnection.DataSource.password = vitrovitro VitroConnection.DataSource.password = vitrovitro
#
# The URL of the Solr service that is used by the application.
# The Solr service provides the application with full-text search and many
# other features. If you leave this commented out, the application will attempt
# to use the Solr instance on the same Tomcat server, at the context
# ${webapp.name}solr
# vitro.local.solr.url =
# #
# The name of your first admin user for the VIVO application. The password # The name of your first admin user for the VIVO application. The password
# for this user is initially set to "defaultAdmin", but you will be asked to # for this user is initially set to "defaultAdmin", but you will be asked to

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,45 @@
package edu.cornell.mannlib.vitro.webapp.search.solr;
import org.apache.solr.common.SolrDocument;
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
/**
 * Translates an {@link Individual} into a Solr document by chaining two
 * translators: the {@link Entity2LuceneDoc} supplied to the constructor, and
 * a {@link LuceneDocToSolrDoc} created internally.
 */
public class IndividualToSolrDocument implements Obj2DocIface {

    /** Second stage: converts the intermediate Lucene document to Solr. */
    protected LuceneDocToSolrDoc luceneToSolr;

    /** First stage: converts an Individual to a Lucene document. */
    protected Entity2LuceneDoc entityToLucene;

    /**
     * @param e2d translator used for the Individual-to-Lucene step
     */
    public IndividualToSolrDocument(Entity2LuceneDoc e2d) {
        entityToLucene = e2d;
        luceneToSolr = new LuceneDocToSolrDoc();
    }

    /**
     * @return true only when obj is a non-null {@link Individual}
     */
    @Override
    public boolean canTranslate(Object obj) {
        // instanceof is already false for null, no separate null check needed
        return obj instanceof Individual;
    }

    /**
     * @return true only when result is a non-null {@link SolrDocument}
     */
    @Override
    public boolean canUnTranslate(Object result) {
        return result instanceof SolrDocument;
    }

    /**
     * Not implemented for Solr.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public Object getIndexId(Object obj) {
        throw new UnsupportedOperationException(
                "IndividualToSolrDocument.getIndexId() is unimplemented");
    }

    /**
     * Runs obj through the Lucene translator and then converts the result
     * to a Solr document.
     *
     * @return the Solr document, or null when the Lucene translator produced
     *         no document (the second stage is skipped in that case)
     * @throws IndexingException if either translation step fails
     */
    @Override
    public Object translate(Object obj) throws IndexingException {
        Object luceneDoc = entityToLucene.translate(obj);
        if (luceneDoc == null) {
            // Nothing to convert; pass the "no document" result through
            // instead of letting the second stage dereference null.
            return null;
        }
        return luceneToSolr.translate(luceneDoc);
    }

    /**
     * Delegates to {@link LuceneDocToSolrDoc#unTranslate(Object)}.
     */
    @Override
    public Object unTranslate(Object result) {
        return luceneToSolr.unTranslate(result);
    }
}

View file

@ -0,0 +1,60 @@
package edu.cornell.mannlib.vitro.webapp.search.solr;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl;
import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
/**
 * Translates a Lucene {@link Document} into a {@link SolrInputDocument}
 * suitable for sending to a Solr server, and translates a
 * {@link SolrDocument} back into an {@link Individual} that carries only
 * the URI.
 */
public class LuceneDocToSolrDoc implements Obj2DocIface {

    /**
     * @return true only when obj is a non-null Lucene {@link Document}
     */
    @Override
    public boolean canTranslate(Object obj) {
        // instanceof is already false for null, no separate null check needed
        return obj instanceof Document;
    }

    /**
     * @return true only when result is a non-null {@link SolrDocument}
     */
    @Override
    public boolean canUnTranslate(Object result) {
        return result instanceof SolrDocument;
    }

    /**
     * This method isn't useful for Solr.
     *
     * @return always null
     */
    @Override
    public Object getIndexId(Object obj) {
        return null;
    }

    /**
     * Copies every field of a Lucene document, by name and string value,
     * into a new {@link SolrInputDocument}.
     *
     * @param obj a Lucene {@link Document}, or null
     * @return the new SolrInputDocument, or null when obj is null
     * @throws IndexingException if obj is not a Lucene Document
     */
    @Override
    public Object translate(Object obj) throws IndexingException {
        if (obj == null) {
            // No source document means there is nothing to index.
            return null;
        }
        if (!(obj instanceof Document)) {
            throw new IndexingException(
                    "LuceneDocToSolrDoc: cannot translate object of type "
                            + obj.getClass().getName());
        }

        Document luceneDoc = (Document) obj;
        SolrInputDocument solrDoc = new SolrInputDocument();
        for (Object f : luceneDoc.getFields()) {
            Field field = (Field) f;
            solrDoc.addField(field.name(), field.stringValue());
        }
        return solrDoc;
    }

    /**
     * Builds an {@link Individual} from a Solr hit. Only the URI is set,
     * read from the field named by {@code Entity2LuceneDoc.term.URI}.
     *
     * @return the Individual, or null when result is not a SolrDocument
     */
    @Override
    public Object unTranslate(Object result) {
        if (!(result instanceof SolrDocument)) {
            return null;
        }
        SolrDocument hit = (SolrDocument) result;
        String id = (String) hit.getFieldValue(Entity2LuceneDoc.term.URI);
        Individual ind = new IndividualImpl();
        ind.setURI(id);
        return ind;
    }
}

View file

@ -0,0 +1,163 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.search.solr;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import edu.cornell.mannlib.vitro.webapp.beans.Individual;
import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
import edu.cornell.mannlib.vitro.webapp.search.indexing.IndexerIface;
/**
 * An {@link IndexerIface} that sends documents to a Solr server.
 *
 * Usage: call {@link #startIndexing()}, then {@link #index(Individual, boolean)}
 * for each individual, then {@link #endIndexing()}, which commits and
 * optimizes. Several methods are still stubs (see the TODO markers).
 */
public class SolrIndexer implements IndexerIface {
    private final static Log log = LogFactory.getLog(SolrIndexer.class);

    /** Solr server that receives the documents. */
    protected SolrServer server;

    /** True between startIndexing() and endIndexing(). */
    protected boolean indexing;

    /** Translators tried, in order, for each individual. */
    protected List<Obj2DocIface> obj2DocList;

    /** URIs already sent during the current indexing run. */
    protected HashSet<String> urisIndexed;

    /**
     * @param server Solr server to add documents to
     * @param o2d    translators used to turn individuals into Solr documents
     */
    public SolrIndexer(SolrServer server, List<Obj2DocIface> o2d) {
        this.server = server;
        this.obj2DocList = o2d;
    }

    /**
     * Translates the individual with every translator that accepts it and
     * adds the resulting documents to the Solr server. An individual whose
     * URI was already indexed in the current run is skipped.
     *
     * @param ind    individual to index; null is logged and ignored
     * @param newDoc currently unused; adds and updates are sent the same way
     * @throws IndexingException if startIndexing() has not been called, or
     *                           if sending the document to Solr fails
     */
    @Override
    public synchronized void index(Individual ind, boolean newDoc) throws IndexingException {
        if (!indexing) {
            throw new IndexingException("SolrIndexer: must call "
                    + "startIndexing() before index().");
        }

        if (ind == null) {
            log.debug("Individual to index was null, ignoring.");
            return; // without this the getURI() call below would throw
        }

        String uri = ind.getURI();
        if (urisIndexed.contains(uri)) {
            log.debug("already indexed " + uri);
            return;
        }
        urisIndexed.add(uri);
        log.debug("indexing " + uri);

        try {
            for (Obj2DocIface obj2doc : getObj2DocList()) {
                if (!obj2doc.canTranslate(ind)) {
                    continue;
                }
                SolrInputDocument solrDoc = (SolrInputDocument) obj2doc.translate(ind);
                if (solrDoc != null) {
                    // sending each doc individually is inefficient
                    Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
                    docs.add(solrDoc);
                    server.add(docs);
                } else {
                    // TODO: deletion is not implemented yet; a null document
                    // is only logged here.
                    log.debug("removing from index " + uri);
                }
            }
        } catch (IOException ex) {
            // Log with the exception so the stack trace is not lost; only
            // the message is carried by the rethrown IndexingException.
            log.debug("IOException while indexing " + uri, ex);
            throw new IndexingException(ex.getMessage());
        } catch (SolrServerException ex) {
            log.debug("SolrServerException while indexing " + uri, ex);
            throw new IndexingException(ex.getMessage());
        }
    }

    /**
     * @return true between startIndexing() and endIndexing()
     */
    @Override
    public boolean isIndexing() {
        return indexing;
    }

    @Override
    public void prepareForRebuild() throws IndexingException {
        // TODO Auto-generated method stub
    }

    @Override
    public void removeFromIndex(Individual ind) throws IndexingException {
        // TODO Auto-generated method stub
    }

    /**
     * Blocks until no other indexing run is active, then marks this indexer
     * as indexing and resets the set of already-indexed URIs.
     *
     * If the thread is interrupted while waiting, it keeps waiting, and the
     * interrupt flag is restored before this method returns.
     */
    @Override
    public synchronized void startIndexing() throws IndexingException {
        boolean interrupted = false;
        while (indexing) { // wait for indexing to end.
            log.debug("SolrIndexer.startIndexing() waiting...");
            try {
                wait();
            } catch (InterruptedException ex) {
                // Re-interrupting here would make wait() throw again at
                // once, so remember it and restore the flag after the loop.
                interrupted = true;
            }
        }
        if (interrupted) {
            Thread.currentThread().interrupt();
        }

        log.debug("Starting to index");
        indexing = true;
        urisIndexed = new HashSet<String>();
        notifyAll();
    }

    /**
     * Appends a translator to the list; null is ignored.
     */
    public synchronized void addObj2Doc(Obj2DocIface o2d) {
        if (o2d != null) {
            obj2DocList.add(o2d);
        }
    }

    /**
     * @return the translator list itself (not a copy)
     */
    public synchronized List<Obj2DocIface> getObj2DocList() {
        return obj2DocList;
    }

    /**
     * Currently identical to {@link #endIndexing()}: pending documents are
     * committed, not rolled back.
     */
    @Override
    public void abortIndexingAndCleanUp() {
        endIndexing();
    }

    /**
     * Commits and then optimizes the Solr index, clears the indexing flag
     * and wakes threads waiting in startIndexing(). Commit and optimize
     * failures are logged, not thrown, so the flag is always cleared.
     */
    @Override
    public synchronized void endIndexing() {
        try {
            server.commit();
        } catch (Exception e) {
            log.error("Could not commit to solr server", e);
        }
        try {
            server.optimize();
        } catch (Exception e) {
            log.error("Could not optimize solr server", e);
        }
        indexing = false;
        notifyAll();
    }

    @Override
    public long getModified() {
        // TODO Auto-generated method stub
        return 0;
    }

    /**
     * @return always false for now (not implemented)
     */
    public boolean isIndexEmpty() {
        // TODO Auto-generated method stub
        return false;
    }
}

View file

@ -0,0 +1,132 @@
package edu.cornell.mannlib.vitro.webapp.search.solr;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import javax.servlet.ServletContext;
import javax.servlet.ServletContextEvent;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import com.hp.hpl.jena.ontology.OntModel;
import edu.cornell.mannlib.vitro.webapp.beans.BaseResourceBean.RoleLevel;
import edu.cornell.mannlib.vitro.webapp.config.ConfigurationProperties;
import edu.cornell.mannlib.vitro.webapp.dao.DisplayVocabulary;
import edu.cornell.mannlib.vitro.webapp.dao.WebappDaoFactory;
import edu.cornell.mannlib.vitro.webapp.dao.filtering.WebappDaoFactoryFiltering;
import edu.cornell.mannlib.vitro.webapp.dao.filtering.filters.VitroFilterUtils;
import edu.cornell.mannlib.vitro.webapp.dao.filtering.filters.VitroFilters;
import edu.cornell.mannlib.vitro.webapp.dao.jena.ModelContext;
import edu.cornell.mannlib.vitro.webapp.dao.jena.SearchReindexingListener;
import edu.cornell.mannlib.vitro.webapp.search.beans.IndividualProhibitedFromSearch;
import edu.cornell.mannlib.vitro.webapp.search.beans.ObjectSourceIface;
import edu.cornell.mannlib.vitro.webapp.search.beans.ProhibitedFromSearch;
import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
import edu.cornell.mannlib.vitro.webapp.search.indexing.IndexBuilder;
import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
import edu.cornell.mannlib.vitro.webapp.servlet.setup.AbortStartup;
/**
 * Servlet context listener that wires up Solr indexing at startup.
 *
 * It creates the HTTP connection to the Solr server, builds the
 * Individual-to-Solr translator and a {@link SolrIndexer}, stores an
 * {@link IndexBuilder} in the servlet context, registers a model-change
 * listener, and runs a blocking index rebuild if one was requested.
 */
public class SolrSetup implements javax.servlet.ServletContextListener {
    private static final Log log = LogFactory.getLog(SolrSetup.class.getName());

    /** Servlet context attribute under which the SolrServer is stored. */
    protected static final String LOCAL_SOLR_SERVER = "vitro.local.solr.server";

    /**
     * Performs the setup described on the class. Does nothing if startup
     * was aborted. Any failure is logged rather than propagated.
     */
    @Override
    public void contextInitialized(ServletContextEvent sce) {
        ServletContext context = sce.getServletContext();
        if (AbortStartup.isStartupAborted(context)) {
            return;
        }

        try {
            /* setup the http connection with the solr server */
            String solrServerUrl = ConfigurationProperties.getBean(sce).getProperty("vitro.local.solr.url");
            // A blank value is as unusable as a missing one.
            if (solrServerUrl == null || solrServerUrl.trim().length() == 0) {
                log.error("Could not find vitro.local.solr.url in deploy.properties. "
                        + "Vitro application needs a URL of a solr server that it can use to index its data. "
                        + "It should be something like http://localhost:${port}" + context.getContextPath() + "solr");
                return;
            }

            CommonsHttpSolrServer server = new CommonsHttpSolrServer(solrServerUrl.trim());
            server.setSoTimeout(1000); // socket read timeout
            server.setConnectionTimeout(100);
            server.setDefaultMaxConnectionsPerHost(100);
            server.setMaxTotalConnections(100);
            server.setMaxRetries(1);
            context.setAttribute(LOCAL_SOLR_SERVER, server);

            /* setup the individual to solr doc translation */
            // first we need a ent2luceneDoc translator
            OntModel displayOntModel = (OntModel) context.getAttribute("displayOntModel");
            Entity2LuceneDoc ent2LuceneDoc = new Entity2LuceneDoc(
                    new ProhibitedFromSearch(DisplayVocabulary.PRIMARY_LUCENE_INDEX_URI, displayOntModel),
                    new IndividualProhibitedFromSearch(context));
            IndividualToSolrDocument indToSolrDoc = new IndividualToSolrDocument(ent2LuceneDoc);
            List<Obj2DocIface> o2d = new ArrayList<Obj2DocIface>();
            o2d.add(indToSolrDoc);

            /* setup solr indexer */
            SolrIndexer solrIndexer = new SolrIndexer(server, o2d);
            if (solrIndexer.isIndexEmpty()) {
                log.info("solr index is empty, requesting rebuild");
                context.setAttribute(LuceneSetup.INDEX_REBUILD_REQUESTED_AT_STARTUP, Boolean.TRUE);
            }

            // This is where the builder gets the list of places to try to
            // get objects to index. It is filtered so that non-public text
            // does not get into the search index.
            WebappDaoFactory wadf = (WebappDaoFactory) context.getAttribute("webappDaoFactory");
            VitroFilters vf = VitroFilterUtils.getDisplayFilterByRoleLevel(RoleLevel.PUBLIC, wadf);
            wadf = new WebappDaoFactoryFiltering(wadf, vf);
            List<ObjectSourceIface> sources = new ArrayList<ObjectSourceIface>();
            sources.add(wadf.getIndividualDao());

            IndexBuilder builder = new IndexBuilder(context, solrIndexer, sources);
            // Save the builder to the servlet context so we can access it
            // later in the webapp.
            context.setAttribute(IndexBuilder.class.getName(), builder);

            // set up listeners so search index builder is notified of changes to model
            SearchReindexingListener srl = new SearchReindexingListener(builder);
            ModelContext.registerListenerForChanges(context, srl);

            // Boolean.TRUE.equals() is false for both a missing attribute
            // and a non-Boolean one, same as the instanceof-and-unbox test.
            if (Boolean.TRUE.equals(context.getAttribute(LuceneSetup.INDEX_REBUILD_REQUESTED_AT_STARTUP))) {
                log.info("Rebuild of solr index required before startup.");
                builder.doIndexRebuild();
                waitForRebuild(builder);
            }

            log.info("Setup of Solr index completed.");
        } catch (InterruptedException e) {
            // Restore the flag so the container can see the interruption.
            Thread.currentThread().interrupt();
            log.error("could not setup local solr server", e);
        } catch (Throwable e) {
            log.error("could not setup local solr server", e);
        }
    }

    /**
     * Polls the builder every 500 ms until no reindex is requested or
     * running, logging a progress message every 20 polls.
     */
    private void waitForRebuild(IndexBuilder builder) throws InterruptedException {
        int n = 0;
        while (builder.isReindexRequested() || builder.isIndexing()) {
            n++;
            if (n % 20 == 0) { // output message every 10 sec.
                log.info("Still rebuilding solr index");
            }
            Thread.sleep(500);
        }
    }

    /** No cleanup is performed on shutdown. */
    @Override
    public void contextDestroyed(ServletContextEvent sce) {
    }

    /**
     * @return the SolrServer stored by contextInitialized(), or null if
     *         the attribute is not set
     */
    public static SolrServer getSolrServer(ServletContext ctx) {
        return (SolrServer) ctx.getAttribute(LOCAL_SOLR_SERVER);
    }
}