Adding solr indexing prototype

2011-04-11 17:21:38 +00:00 · 2011-04-11 17:21:38 +00:00 · 543c1cd945
commit 543c1cd945
parent 9bfa6acbd5
9 changed files with 442 additions and 58 deletions
--- a/solr/exampleSolr/conf/schema.xml
+++ b/solr/exampleSolr/conf/schema.xml
@ -46,14 +46,6 @@
 -->
 <schema name="example" version="1.2">
  <!-- attribute "name" is the name of this schema and is only used for display purposes.
       Applications should change this to reflect the nature of the search collection.
       version="1.2" is Solr's version number for the schema syntax and semantics.  It should
       not normally be changed by applications.
       1.0: multiValued attribute did not exist, all fields are multiValued by nature
       1.1: multiValued attribute introduced, false by default 
       1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields.
     -->
  <types>
    <!-- field type definitions. The "name" attribute is
@ -426,36 +418,37 @@
       when adding a document.
   -->
   <field name="id" type="string" indexed="true" stored="true" required="true" /> 
   <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/>
   <field name="name" type="textgen" indexed="true" stored="true"/>
   <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/>
   <field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/>
   <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" />
   <field name="features" type="text" indexed="true" stored="true" multiValued="true"/>
   <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
   <field name="weight" type="float" indexed="true" stored="true"/>
   <field name="price"  type="float" indexed="true" stored="true"/>
   <field name="popularity" type="int" indexed="true" stored="true" />
   <field name="inStock" type="boolean" indexed="true" stored="true" />
-   <!-- Common metadata fields, named specifically to match up with
+
-     SolrCell metadata when parsing rich documents such as Word, PDF.
+<!-- ****************************  Vitro Fields *************************** -->
-     Some fields are multiValued only because Tika currently may return
+
-     multiple values for them.
+
-   -->
+
-   <field name="title" type="text" indexed="true" stored="true" multiValued="true"/>
+<field name="DocId" type="string" indexed="true" stored="true" required="true" /> 
-   <field name="subject" type="text" indexed="true" stored="true"/>
+
-   <field name="description" type="text" indexed="true" stored="true"/>
+<field name="type" type="string" indexed="true" stored="true" omitNorms="true" multiValued="true"/>
-   <field name="comments" type="text" indexed="true" stored="true"/>
+<field name="classLocalName" type="text" indexed="true" stored="true" multiValued="true"/>
-   <field name="author" type="textgen" indexed="true" stored="true"/>
+<field name="classLocalNameLowerCase" type="text" indexed="true" stored="false" multiValued="true"/>
-   <field name="keywords" type="textgen" indexed="true" stored="true"/>
+<field name="classgroup" type="string" indexed="true" stored="true" multiValued="true"/>
-   <field name="category" type="textgen" indexed="true" stored="true"/>
+<field name="PROHIBITED_FROM_TEXT_RESULTS" type="string" indexed="true" stored="false" multiValued="false"/>
-   <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
+<field name="URI" type="string" indexed="true" stored="true" multiValued="false" omitNorms="true"/>
-   <field name="last_modified" type="date" indexed="true" stored="true"/>
+<field name="name" type="text" indexed="true" stored="true" multiValued="true"/>
-   <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
+<field name="nameunstemmed" type="text" indexed="true" stored="false" multiValued="true"/>
 <field name="nameunanalyzed" type="string" indexed="true" stored="false" multiValued="true"/>
 <field name="nameraw" type="string" indexed="true" stored="true" multiValued="true"/>
 <field name="indexedTime" type="string" indexed="true" stored="true" multiValued="true"/>
 <field name="ALLTEXT" type="text" indexed="true" stored="false" multiValued="true"/>
 <field name="ALLTEXTUNSTEMMED" type="textgen" indexed="true" stored="false" multiValued="true"/>
 <field name="THUMBNAIL" type="string" indexed="true" stored="true"/>
 <field name="moniker" type="ignored" />
 <field name="modType" type="ignored"/>
 <field name="JCLASS" type="ignored"/>
   <!-- catchall field, containing all other searchable text fields (implemented
@ -466,19 +459,7 @@
        leading wildcard queries. -->
   <field name="text_rev" type="text_rev" indexed="true" stored="false" multiValued="true"/>
   <!-- non-tokenized version of manufacturer to make it easier to sort or group
        results by manufacturer.  copied from "manu" via copyField -->
   <field name="manu_exact" type="string" indexed="true" stored="false"/>
   <field name="payloads" type="payloads" indexed="true" stored="true"/>
   <!-- Uncommenting the following will create a "timestamp" field using
        a default value of "NOW" to indicate when each document was indexed.
     -->
   <!--
   <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>   
     -->
   <!-- Dynamic field definitions.  If a field name is not found, dynamicFields
        will be used if the name matches any of the patterns.
@ -521,10 +502,10 @@
 <!-- Field to use to determine and enforce document uniqueness. 
      Unless this field is marked with required="false", it will be a required field
   -->
- <uniqueKey>id</uniqueKey>
+ <uniqueKey>DocId</uniqueKey>
 <!-- field for the QueryParser to use when an explicit fieldname is absent -->
- <defaultSearchField>text</defaultSearchField>
+ <defaultSearchField>ALLTEXT</defaultSearchField>
 <!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
 <solrQueryParser defaultOperator="OR"/>
@ -533,12 +514,7 @@
        is added to the index.  It's used either to index the same field differently,
        or to add multiple fields to the same field for easier/faster searching.  -->
-   <copyField source="cat" dest="text"/>
+<!--   <copyField source="name" dest="text"/> -->
   <copyField source="name" dest="text"/>
   <copyField source="manu" dest="text"/>
   <copyField source="features" dest="text"/>
   <copyField source="includes" dest="text"/>
   <copyField source="manu" dest="manu_exact"/>
   <!-- Above, multiple source fields are copied to the [text] field. 
 	  Another way to map multiple source fields to the same 
--- a/solr/exampleSolr/conf/solrconfig.xml
+++ b/solr/exampleSolr/conf/solrconfig.xml
@ -68,7 +68,7 @@
  <!-- Used to specify an alternate directory to hold all index data
       other than the default ./data under the Solr home.
       If replication is in use, this should match the replication configuration. -->
-  <dataDir>${solr.data.dir:./solr/data}</dataDir>
+<!--  <dataDir>${solr.data.dir:./solr/data}</dataDir> -->
  <!-- WARNING: this <indexDefaults> section only provides defaults for index writers
--- a/webapp/config/example.deploy.properties
+++ b/webapp/config/example.deploy.properties
@ -52,6 +52,14 @@ VitroConnection.DataSource.url = jdbc:mysql://localhost/vitro
 VitroConnection.DataSource.username = vitroweb
 VitroConnection.DataSource.password = vitrovitro
 #
 # The URL of the Solr service that is used by the application.
 # The Solr service provides the application with full-text search and many
 # other features.  This property must be set: if it is left commented out,
 # Solr setup logs an error and the search index is not configured.  A typical
 # value is http://localhost:${port}/${webapp.name}solr
 #
 # vitro.local.solr.url = 
 #
 # The name of your first admin user for the VIVO application. The password 
 # for this user is initially set to "defaultAdmin", but you will be asked to 
--- a/webapp/lib/apache-solr-solrj-1.4.1.jar
+++ b/webapp/lib/apache-solr-solrj-1.4.1.jar
--- a/webapp/lib/jcl-over-slf4j-1.5.5.jar
+++ b/webapp/lib/jcl-over-slf4j-1.5.5.jar
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/IndividualToSolrDocument.java
@ -0,0 +1,45 @@
 package edu.cornell.mannlib.vitro.webapp.search.solr;
 import org.apache.solr.common.SolrDocument;
 import edu.cornell.mannlib.vitro.webapp.beans.Individual;
 import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
 import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
 import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
 /**
  * Translates an {@link Individual} into a Solr document by chaining two
  * translators: the Individual is first turned into a Lucene document by
  * {@link Entity2LuceneDoc}, and that document is then converted by
  * {@link LuceneDocToSolrDoc}.
  */
 public class IndividualToSolrDocument implements Obj2DocIface {
     /** Second stage: Lucene document to Solr document. */
     protected LuceneDocToSolrDoc luceneToSolr;
     /** First stage: Individual to Lucene document. */
     protected Entity2LuceneDoc entityToLucene;
     /**
      * @param e2d translator used for the Individual-to-Lucene stage
      */
     public IndividualToSolrDocument(Entity2LuceneDoc e2d){
         entityToLucene = e2d;
         luceneToSolr = new LuceneDocToSolrDoc();
     }
     /** @return true when {@code obj} is an {@link Individual} */
     @Override
     public boolean canTranslate(Object obj) {
         // instanceof is already false for null, no separate null check needed
         return obj instanceof Individual;
     }
     /** @return true when {@code result} is a {@link SolrDocument} */
     @Override
     public boolean canUnTranslate(Object result) {
         return result instanceof SolrDocument;
     }
     /**
      * Not supported by this translator.
      *
      * @throws UnsupportedOperationException always
      */
     @Override
     public Object getIndexId(Object obj) {
         throw new UnsupportedOperationException(
                 "IndividualToSolrDocument.getIndexId() is unimplemented");
     }
     /**
      * Runs both translation stages on {@code obj}.
      *
      * @return the result of the Lucene-to-Solr stage, or null when the
      *         first stage produced no Lucene document
      * @throws IndexingException if either stage fails
      */
     @Override
     public Object translate(Object obj) throws IndexingException {
         Object luceneDoc = entityToLucene.translate( obj );
         if( luceneDoc == null ){
             // nothing to convert; pass the "no document" result on
             // instead of letting the second stage dereference null
             return null;
         }
         return luceneToSolr.translate( luceneDoc );
     }
     /** Delegates to {@link LuceneDocToSolrDoc#unTranslate(Object)}. */
     @Override
     public Object unTranslate(Object result) {
         return luceneToSolr.unTranslate( result );
     }
 }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/LuceneDocToSolrDoc.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/LuceneDocToSolrDoc.java
@ -0,0 +1,60 @@
 package edu.cornell.mannlib.vitro.webapp.search.solr;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
 import edu.cornell.mannlib.vitro.webapp.beans.Individual;
 import edu.cornell.mannlib.vitro.webapp.beans.IndividualImpl;
 import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
 import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
 import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
 /**
 * Translates a Lucene Document into a SolrInputDocument for indexing, and
 * turns a SolrDocument search hit back into an Individual that carries
 * only the URI stored in the hit.
 */
 public class LuceneDocToSolrDoc implements Obj2DocIface {
     /** @return true when {@code obj} is a Lucene {@link Document} */
     @Override
     public boolean canTranslate(Object obj) {
         // instanceof is already false for null, no separate null check needed
         return obj instanceof Document;
     }
     /** @return true when {@code result} is a {@link SolrDocument} */
     @Override
     public boolean canUnTranslate(Object result) {
         return result instanceof SolrDocument;
     }
     /** Always returns null. */
     @Override
     public Object getIndexId(Object obj) {
         //"this method isn't useful for solr"
         return null;
     }
     /**
      * Copies every field of a Lucene document, by name and string value,
      * into a new {@link SolrInputDocument}.
      *
      * @param obj a Lucene {@link Document}, or null
      * @return the new SolrInputDocument, or null when {@code obj} is null
      * @throws ClassCastException if {@code obj} is not a Document or one of
      *         its fields is not a {@link Field}
      */
     @Override
     public Object translate(Object obj) throws IndexingException {
         if( obj == null ){
             // no Lucene document means there is nothing to send to Solr
             return null;
         }
         Document luceneDoc = (Document)obj;
         SolrInputDocument solrDoc = new SolrInputDocument();
         for( Object f : luceneDoc.getFields()){
             Field field = (Field)f;
             solrDoc.addField( field.name(), field.stringValue() );
         }
         return solrDoc;
     }
     /**
      * Builds an {@link Individual} from a Solr search hit.
      *
      * @param result a {@link SolrDocument}; anything else yields null
      * @return a new Individual whose URI is the hit's URI field value,
      *         or null when {@code result} is not a SolrDocument
      */
     @Override
     public Object unTranslate(Object result) {
         if( !(result instanceof SolrDocument) ){
             return null;
         }
         SolrDocument hit = (SolrDocument)result;
         String id = (String) hit.getFieldValue(Entity2LuceneDoc.term.URI);
         Individual ind = new IndividualImpl();
         ind.setURI(id);
         return ind;
     }
 }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrIndexer.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrIndexer.java
@ -0,0 +1,163 @@
 /* $This file is distributed under the terms of the license in /doc/license.txt$ */
 package edu.cornell.mannlib.vitro.webapp.search.solr;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
 import edu.cornell.mannlib.vitro.webapp.beans.Individual;
 import edu.cornell.mannlib.vitro.webapp.search.IndexingException;
 import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
 import edu.cornell.mannlib.vitro.webapp.search.indexing.IndexerIface;
 /**
  * {@link IndexerIface} implementation that sends Individuals to a Solr
  * server through SolrJ.
  *
  * A run is bracketed by {@link #startIndexing()} and {@link #endIndexing()}.
  * Within a run each URI is sent at most once, and the changes are committed
  * when the run ends.
  */
 public class SolrIndexer implements IndexerIface {
     private final static Log log = LogFactory.getLog(SolrIndexer.class);
     /** Solr server that documents are added to. */
     protected SolrServer server;
     /** True between startIndexing() and endIndexing(). */
     protected boolean indexing;
     /** Translators tried, in order, for every Individual. */
     protected List<Obj2DocIface> obj2DocList;
     /** URIs already handled in the current run; replaced by startIndexing(). */
     protected HashSet<String> urisIndexed;
     /**
      * @param server Solr server to index into
      * @param o2d translators to use; null is treated as an empty list
      */
     public SolrIndexer( SolrServer server, List<Obj2DocIface> o2d){
         this.server = server;
         this.obj2DocList = (o2d != null) ? o2d : new ArrayList<Obj2DocIface>();
     }
     /**
      * Translates an Individual with every translator that accepts it and
      * adds the resulting documents to the Solr server. A null Individual or
      * a URI already seen in this run is ignored.
      *
      * @param ind the Individual to index, may be null
      * @param newDoc currently unused
      * @throws IndexingException if startIndexing() has not been called, or
      *         if sending a document to Solr fails
      */
     @Override
     public synchronized void index(Individual ind, boolean newDoc) throws IndexingException {
         if( ! indexing )
             throw new IndexingException("SolrIndexer: must call " +
                     "startIndexing() before index().");
         if( ind == null ){
             log.debug("Individual to index was null, ignoring.");
             return;
         }
         String uri = ind.getURI();
         // Set.add() returns false when the URI was already recorded
         if( ! urisIndexed.add(uri) ){
             log.debug("already indexed " + uri );
             return;
         }
         log.debug("indexing " + uri);
         try{
             for( Obj2DocIface obj2doc : getObj2DocList() ){
                 if( ! obj2doc.canTranslate(ind) )
                     continue;
                 SolrInputDocument solrDoc = (SolrInputDocument) obj2doc.translate(ind);
                 if( solrDoc != null ){
                     //sending each doc individually is inefficient
                     server.add( solrDoc );
                 }else{
                     // TODO: nothing is deleted from the Solr index yet
                     log.debug("removing from index " + uri);
                 }
             }
         } catch (IOException ex) {
             throw asIndexingException(ex);
         } catch (SolrServerException ex) {
             throw asIndexingException(ex);
         }
     }
     /** Wraps a low-level failure, keeping it as the cause where possible. */
     private static IndexingException asIndexingException(Exception cause) {
         IndexingException wrapped = new IndexingException(cause.getMessage());
         try {
             wrapped.initCause(cause);
         } catch (IllegalStateException ignored) {
             // initCause() rejects a second cause; presumably only reachable
             // if the IndexingException constructor already set one
         }
         return wrapped;
     }
     /** @return true while an indexing run is in progress */
     @Override
     public boolean isIndexing() {
         return indexing;
     }
     /** Not implemented yet; does nothing. */
     @Override
     public void prepareForRebuild() throws IndexingException {
         // TODO Auto-generated method stub
     }
     /** Not implemented yet; does nothing. */
     @Override
     public void removeFromIndex(Individual ind) throws IndexingException {
         // TODO Auto-generated method stub
     }
     /**
      * Begins an indexing run, first waiting for any run in progress to end.
      * Resets the set of URIs seen. If the thread is interrupted while
      * waiting, it keeps waiting and restores the interrupt status on exit.
      */
     @Override
     public synchronized void startIndexing() throws IndexingException {
         boolean interrupted = false;
         while( indexing ){ //wait for indexing to end.
             log.debug("SolrIndexer.startIndexing() waiting...");
             try{
                 wait();
             } catch(InterruptedException ex){
                 // remember the interrupt; re-asserting it here would make
                 // the next wait() throw immediately
                 interrupted = true;
             }
         }
         log.debug("Starting to index");
         indexing = true;
         urisIndexed = new HashSet<String>();
         notifyAll();
         if( interrupted )
             Thread.currentThread().interrupt();
     }
     /** Appends a translator; null is ignored. */
     public synchronized void addObj2Doc(Obj2DocIface o2d) {
         if (o2d != null)
             obj2DocList.add(o2d);
     }
     /** @return the live list of translators, not a copy */
     public synchronized List<Obj2DocIface> getObj2DocList() {
         return obj2DocList;
     }
     /**
      * Ends the current run by calling {@link #endIndexing()}.
      * NOTE(review): this commits whatever was added before the abort;
      * confirm that is intended.
      */
     @Override
     public void abortIndexingAndCleanUp() {
         endIndexing();
     }
     /**
      * Commits and then optimizes the Solr index, marks the run finished and
      * wakes threads waiting in startIndexing(). Commit and optimize
      * failures are logged, not thrown.
      */
     @Override
     public synchronized void endIndexing() {
         try {
             server.commit();
         } catch (Exception e) {
             log.error("Could not commit to solr server", e);
         }
         try {
             server.optimize();
         } catch (Exception e) {
             log.error("Could not optimize solr server", e);
         }
         indexing = false;
         notifyAll();
     }
     /** Not implemented yet; always returns 0. */
     @Override
     public long getModified() {
         // TODO Auto-generated method stub
         return 0;
     }
     /** Not implemented yet; always returns false. */
     public boolean isIndexEmpty() {
         // TODO Auto-generated method stub
         return false;
     }
 }
--- a/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrSetup.java
+++ b/webapp/src/edu/cornell/mannlib/vitro/webapp/search/solr/SolrSetup.java
@ -0,0 +1,132 @@
 package edu.cornell.mannlib.vitro.webapp.search.solr;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import javax.servlet.ServletContext;
 import javax.servlet.ServletContextEvent;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.solr.client.solrj.SolrServer;
 import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
 import com.hp.hpl.jena.ontology.OntModel;
 import edu.cornell.mannlib.vitro.webapp.beans.BaseResourceBean.RoleLevel;
 import edu.cornell.mannlib.vitro.webapp.config.ConfigurationProperties;
 import edu.cornell.mannlib.vitro.webapp.dao.DisplayVocabulary;
 import edu.cornell.mannlib.vitro.webapp.dao.WebappDaoFactory;
 import edu.cornell.mannlib.vitro.webapp.dao.filtering.WebappDaoFactoryFiltering;
 import edu.cornell.mannlib.vitro.webapp.dao.filtering.filters.VitroFilterUtils;
 import edu.cornell.mannlib.vitro.webapp.dao.filtering.filters.VitroFilters;
 import edu.cornell.mannlib.vitro.webapp.dao.jena.ModelContext;
 import edu.cornell.mannlib.vitro.webapp.dao.jena.SearchReindexingListener;
 import edu.cornell.mannlib.vitro.webapp.search.beans.IndividualProhibitedFromSearch;
 import edu.cornell.mannlib.vitro.webapp.search.beans.ObjectSourceIface;
 import edu.cornell.mannlib.vitro.webapp.search.beans.ProhibitedFromSearch;
 import edu.cornell.mannlib.vitro.webapp.search.docbuilder.Obj2DocIface;
 import edu.cornell.mannlib.vitro.webapp.search.indexing.IndexBuilder;
 import edu.cornell.mannlib.vitro.webapp.search.lucene.Entity2LuceneDoc;
 import edu.cornell.mannlib.vitro.webapp.search.lucene.LuceneSetup;
 import edu.cornell.mannlib.vitro.webapp.servlet.setup.AbortStartup;
 /**
  * Servlet context listener that connects the webapp to its Solr server at
  * startup. It creates the SolrJ client, builds the Individual-to-Solr
  * translation chain, registers an {@link IndexBuilder} in the servlet
  * context, and runs a full rebuild if one was requested at startup.
  */
 public class SolrSetup implements javax.servlet.ServletContextListener{
     private static final Log log = LogFactory.getLog(SolrSetup.class.getName());
     /** Servlet context attribute under which the SolrServer is stored. */
     protected static final String LOCAL_SOLR_SERVER  = "vitro.local.solr.server";
     /** Configuration property that holds the URL of the Solr service. */
     private static final String SOLR_URL_PROPERTY = "vitro.local.solr.url";
     /**
      * Sets up the Solr client, indexer and index builder. Does nothing if
      * startup was aborted. Any failure is logged and not rethrown.
      */
     @Override
     public void contextInitialized(ServletContextEvent sce) {
         ServletContext context = sce.getServletContext();
         if (AbortStartup.isStartupAborted(context)) {
             return;
         }
         try {
             /* setup the http connection with the solr server */
             String solrServerUrl = ConfigurationProperties.getBean(sce).getProperty(SOLR_URL_PROPERTY);
             // a blank value is as unusable as a missing one
             if( solrServerUrl == null || solrServerUrl.trim().length() == 0 ){
                 log.error("Could not find vitro.local.solr.url in deploy.properties.  "+
                         "Vitro application needs a URL of a solr server that it can use to index its data. " +
                         "It should be something like http://localhost:${port}" + context.getContextPath() + "solr"
                         );
                 return;
             }
             CommonsHttpSolrServer server = new CommonsHttpSolrServer( solrServerUrl.trim() );
             // NOTE(review): this same server is used for commit() and
             // optimize() in SolrIndexer; a 1000 ms read timeout may be too
             // short for those on a large index -- confirm.
             server.setSoTimeout(1000);  // socket read timeout
             server.setConnectionTimeout(100);
             server.setDefaultMaxConnectionsPerHost(100);
             server.setMaxTotalConnections(100);
             server.setMaxRetries(1);
             context.setAttribute(LOCAL_SOLR_SERVER, server);
             /* setup the individual to solr doc translation */
             //first we need a ent2luceneDoc translator
             OntModel displayOntModel = (OntModel) context.getAttribute("displayOntModel");
             Entity2LuceneDoc ent2LuceneDoc = new Entity2LuceneDoc(
                     new ProhibitedFromSearch(DisplayVocabulary.PRIMARY_LUCENE_INDEX_URI, displayOntModel),
                     new IndividualProhibitedFromSearch(context) );
             IndividualToSolrDocument indToSolrDoc = new IndividualToSolrDocument( ent2LuceneDoc );
             List<Obj2DocIface> o2d = new ArrayList<Obj2DocIface>();
             o2d.add(indToSolrDoc);
             /* setup solr indexer */
             SolrIndexer solrIndexer = new SolrIndexer(server, o2d);
             if( solrIndexer.isIndexEmpty() ){
                 log.info("solr index is empty, requesting rebuild");
                 context.setAttribute(LuceneSetup.INDEX_REBUILD_REQUESTED_AT_STARTUP, Boolean.TRUE);
             }
             // This is where the builder gets the list of places to try to
             // get objects to index. It is filtered so that non-public text
             // does not get into the search index.
             WebappDaoFactory wadf = (WebappDaoFactory) context.getAttribute("webappDaoFactory");
             VitroFilters vf = VitroFilterUtils.getDisplayFilterByRoleLevel(RoleLevel.PUBLIC, wadf);
             wadf = new WebappDaoFactoryFiltering(wadf, vf);
             List<ObjectSourceIface> sources = new ArrayList<ObjectSourceIface>();
             sources.add(wadf.getIndividualDao());
             IndexBuilder builder = new IndexBuilder(context, solrIndexer, sources);
             // Save the builder to the servlet context so we can access it
             // later in the webapp.
             context.setAttribute(IndexBuilder.class.getName(), builder);
             // set up listeners so search index builder is notified of changes to model
             SearchReindexingListener srl = new SearchReindexingListener(builder);
             ModelContext.registerListenerForChanges(context, srl);
             // Boolean.TRUE.equals() is false for null and for non-Boolean values
             if( Boolean.TRUE.equals(context.getAttribute(LuceneSetup.INDEX_REBUILD_REQUESTED_AT_STARTUP)) ){
                 log.info("Rebuild of solr index required before startup.");
                 builder.doIndexRebuild();
                 // block startup until the rebuild has finished
                 int n = 0;
                 while( builder.isReindexRequested() || builder.isIndexing() ){
                     n++;
                     if( n % 20 == 0 ) //output message every 10 sec.
                         log.info("Still rebuilding solr index");
                     Thread.sleep(500);
                 }
             }
             log.info("Setup of Solr index completed.");
         } catch (InterruptedException e) {
             // restore the interrupt status for the container
             Thread.currentThread().interrupt();
             log.error("could not setup local solr server",e);
         } catch (Throwable e) {
             log.error("could not setup local solr server",e);
         }
     }
     /** Nothing to clean up. */
     @Override
     public void contextDestroyed(ServletContextEvent sce) {
     }
     /**
      * @return the SolrServer stored by contextInitialized(), or null when
      *         the attribute has not been set
      */
     public static SolrServer getSolrServer(ServletContext ctx){
         return (SolrServer) ctx.getAttribute(LOCAL_SOLR_SERVER);
     }
 }