diff --git a/api/pom.xml b/api/pom.xml
index f77e271d2..7ff47245c 100644
--- a/api/pom.xml
+++ b/api/pom.xml
@@ -58,6 +58,11 @@
             <artifactId>argon2-jvm</artifactId>
             <version>2.4</version>
         </dependency>
+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>fluent-hc</artifactId>
+            <version>4.5.6</version>
+        </dependency>
         <dependency>
             <groupId>org.vivoweb</groupId>
             <artifactId>vitro-dependencies</artifactId>
diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESAdder.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESAdder.java
new file mode 100644
index 000000000..860e02b8d
--- /dev/null
+++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESAdder.java
@@ -0,0 +1,92 @@
+/* $This file is distributed under the terms of the license in /doc/license.txt$ */
+
+package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
+
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.http.client.fluent.Request;
+import org.apache.http.client.fluent.Response;
+import org.apache.http.entity.ContentType;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputDocument;
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputField;
+
+/**
+ * The nuts and bolts of adding a document to the Elasticsearch index
+ */
+public class ESAdder {
+    private static final Log log = LogFactory.getLog(ESAdder.class);
+
+    private final String baseUrl;
+
+    public ESAdder(String baseUrl) {
+        this.baseUrl = baseUrl;
+    }
+
+    public void add(Collection<SearchInputDocument> docs)
+            throws SearchEngineException {
+        for (SearchInputDocument doc : docs) {
+            addDocument(doc);
+        }
+    }
+
+    private void addDocument(SearchInputDocument doc)
+            throws SearchEngineException {
+        try {
+            Map<String, List<Object>> map = convertDocToMap(doc);
+            String json = new ObjectMapper().writeValueAsString(map);
+            log.debug("Adding document for '" + doc.getField("DocId") + "': "
+                    + json);
+
+            putToElastic(json, (String) doc.getField("DocId").getFirstValue());
+        } catch (Exception e) {
+            throw new SearchEngineException("Failed to convert to JSON", e);
+        }
+    }
+
+    /**
+     * Some field values are collections. Add the members of the collection
+     * instead.
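+     *
+     * An illustrative example (hypothetical values, not taken from any real
+     * document): a field holding the values ["Jones, Pat", ["faculty",
+     * "person"]] is flattened to the JSON array ["Jones, Pat", "faculty",
+     * "person"] before the document is serialized.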
+     */
+    private Map<String, List<Object>> convertDocToMap(SearchInputDocument doc) {
+        Map<String, List<Object>> map = new HashMap<>();
+        for (SearchInputField field : doc.getFieldMap().values()) {
+            ArrayList<Object> list = new ArrayList<>();
+            for (Object value : field.getValues()) {
+                if (value instanceof Collection) {
+                    Collection<?> cValue = (Collection<?>) value;
+                    list.addAll(cValue);
+                } else {
+                    list.add(value);
+                }
+            }
+            map.put(field.getName(), list);
+        }
+        return map;
+    }
+
+    private void putToElastic(String json, String docId)
+            throws SearchEngineException {
+        try {
+            String url = baseUrl + "/_doc/"
+                    + URLEncoder.encode(docId, "UTF8");
+            Response response = Request.Put(url)
+                    .bodyString(json, ContentType.APPLICATION_JSON).execute();
+            log.debug("Response from Elasticsearch: "
+                    + response.returnContent().asString());
+        } catch (Exception e) {
+            throw new SearchEngineException("Failed to put to Elasticsearch",
+                    e);
+        }
+    }
+}
diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESCounter.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESCounter.java
new file mode 100644
index 000000000..14246484d
--- /dev/null
+++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESCounter.java
@@ -0,0 +1,42 @@
+/* $This file is distributed under the terms of the license in /doc/license.txt$ */
+
+package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.http.client.fluent.Request;
+import org.apache.http.client.fluent.Response;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
+
+/**
+ * The nuts and bolts of getting the number of documents in the Elasticsearch
+ * index.
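+ *
+ * Issues a GET against {baseUrl}/_doc/_count and reads the "count" field of
+ * the JSON reply, e.g. (illustrative value): {"count": 417, ...}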
+ */
+public class ESCounter {
+    private final String baseUrl;
+
+    public ESCounter(String baseUrl) {
+        this.baseUrl = baseUrl;
+    }
+
+    public int count() throws SearchEngineException {
+        try {
+            String url = baseUrl + "/_doc/_count";
+            Response response = Request.Get(url).execute();
+            String json = response.returnContent().asString();
+
+            @SuppressWarnings("unchecked")
+            Map<String, Object> map = new ObjectMapper().readValue(json,
+                    HashMap.class);
+            return (Integer) map.get("count");
+        } catch (Exception e) {
+            throw new SearchEngineException(
+                    "Failed to get the document count from Elasticsearch", e);
+        }
+    }
+
+}
diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESDeleter.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESDeleter.java
new file mode 100644
index 000000000..8267c6f21
--- /dev/null
+++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESDeleter.java
@@ -0,0 +1,147 @@
+/* $This file is distributed under the terms of the license in /doc/license.txt$ */
+
+package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
+
+import java.io.IOException;
+import java.net.URLEncoder;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.http.Header;
+import org.apache.http.HttpEntity;
+import org.apache.http.StatusLine;
+import org.apache.http.client.HttpResponseException;
+import org.apache.http.client.ResponseHandler;
+import org.apache.http.client.fluent.Request;
+import org.apache.http.client.fluent.Response;
+import org.apache.http.entity.ContentType;
+import org.apache.http.util.EntityUtils;
+
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery;
+import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchQuery;
+
+/**
+ * The nuts and bolts of deleting documents from the Elasticsearch index.
+ */
+public class ESDeleter {
+    private static final Log log = LogFactory.getLog(ESDeleter.class);
+
+    private final String baseUrl;
+
+    /**
+     * @param baseUrl
+     */
+    public ESDeleter(String baseUrl) {
+        this.baseUrl = baseUrl;
+    }
+
+    public void deleteByIds(Collection<String> ids)
+            throws SearchEngineException {
+        for (String id : ids) {
+            deleteById(id);
+        }
+    }
+
+    private void deleteById(String id) throws SearchEngineException {
+        try {
+            String url = baseUrl + "/_doc/"
+                    + URLEncoder.encode(id, "UTF8");
+            Response response = Request.Delete(url).execute();
+            String json = response.returnContent().asString();
+        } catch (HttpResponseException e) {
+            if (e.getStatusCode() == 404) {
+                // Don't care if it has already been deleted.
+ } else { + throw new SearchEngineException( + "Failed to delete Elasticsearch document " + id, e); + } + } catch (Exception e) { + throw new SearchEngineException( + "Failed to delete Elasticsearch document " + id, e); + } + } + + public void deleteByQuery(String queryString) throws SearchEngineException { + String url = baseUrl + "/_delete_by_query"; + SearchQuery query = new BaseSearchQuery().setQuery(queryString); + String queryJson = new QueryConverter(query).asString(); + + try { + Response response = Request.Post(url) + .bodyString(queryJson, ContentType.APPLICATION_JSON) + .execute(); + + BaseResponseHandler handler = new BaseResponseHandler(); + response.handleResponse(handler); + if (handler.getStatusCode() >= 400) { + log.warn(String.format( + "Failed to delete Elasticsearch documents by query: %s, %d - %s\n%s", + queryString, handler.getStatusCode(), + handler.getReasonPhrase(), handler.getContentString())); + } + } catch (IOException e) { + throw new SearchEngineException("Failed to delete Elasticsearch " + + "documents by query " + queryString, e); + } + } + + // ---------------------------------------------------------------------- + // Helper class for interpreting HttpResponse errors + // ---------------------------------------------------------------------- + + private class BaseResponseHandler implements ResponseHandler { + private int statusCode; + private String reasonPhrase; + private Map> headers; + private String contentString; + + @Override + public Object handleResponse(org.apache.http.HttpResponse innerResponse) + throws IOException { + StatusLine statusLine = innerResponse.getStatusLine(); + statusCode = statusLine.getStatusCode(); + reasonPhrase = statusLine.getReasonPhrase(); + + headers = new HashMap<>(); + for (Header header : innerResponse.getAllHeaders()) { + String name = header.getName(); + if (!headers.containsKey(name)) { + headers.put(name, new ArrayList()); + } + headers.get(name).add(header.getValue()); + } + + HttpEntity entity = innerResponse.getEntity(); + if (entity == null) { + contentString = ""; + } else { + contentString = EntityUtils.toString(entity); + } + return ""; + } + + public int getStatusCode() { + return statusCode; + } + + public String getReasonPhrase() { + return reasonPhrase; + } + + public Map> getHeaders() { + return headers; + } + + public String getContentString() { + return contentString; + } + + } + +} diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESFlusher.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESFlusher.java new file mode 100644 index 000000000..1b32e8e45 --- /dev/null +++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESFlusher.java @@ -0,0 +1,41 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.http.client.fluent.Request; +import org.apache.http.client.fluent.Response; + +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException; + +/** + * Just does a "commit" or "flush" to the index. 
+ */ +public class ESFlusher { + private static final Log log = LogFactory.getLog(ESFlusher.class); + + private final String baseUrl; + + public ESFlusher(String baseUrl) { + this.baseUrl = baseUrl; + } + + public void flush() throws SearchEngineException { + flush(false); + } + + public void flush(boolean wait) throws SearchEngineException { + try { + String url = baseUrl + "/_flush" + + (wait ? "?wait_for_ongoing" : ""); + Response response = Request.Get(url).execute(); + String json = response.returnContent().asString(); + log.debug("flush response: " + json); + } catch (Exception e) { + throw new SearchEngineException("Failed to put to Elasticsearch", + e); + } + } + +} diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESQuery.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESQuery.java new file mode 100644 index 000000000..01fcf37f3 --- /dev/null +++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ESQuery.java @@ -0,0 +1,106 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch; + +import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.http.HttpResponse; +import org.apache.http.client.methods.HttpEntityEnclosingRequestBase; +import org.apache.http.entity.ContentType; +import org.apache.http.entity.StringEntity; + +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResponse; +import edu.cornell.mannlib.vitro.webapp.utils.http.HttpClientFactory; + +/** + * Convert a SearchQuery to JSON, send it to Elasticsearch, and convert the JSON + * response to a SearchResponse. + */ +public class ESQuery { + private static final Log log = LogFactory.getLog(ESQuery.class); + + private final String baseUrl; + + public ESQuery(String baseUrl) { + this.baseUrl = baseUrl; + } + + public SearchResponse query(SearchQuery query) + throws SearchEngineException { + String queryString = new QueryConverter(query).asString(); + String response = doTheQuery(queryString); + return new ResponseParser(response).parse(); + } + + private String doTheQuery(String queryString) { + log.debug("QUERY: " + queryString); + try { + String url = baseUrl + "/_search"; + HttpResponse response = new ESFunkyGetRequest(url) + .bodyString(queryString, ContentType.APPLICATION_JSON) + .execute(); + String responseString = IOUtils + .toString(response.getEntity().getContent()); + log.debug("RESPONSE: " + responseString); + return responseString; + } catch (Exception e) { + log.error("Failed to put to Elasticsearch", e); + return ""; + } + } + + // ---------------------------------------------------------------------- + // Helper class -- a GET request that accepts a body + // ---------------------------------------------------------------------- + + /** + * The HttpClient implementations, both regular and conversational, do not + * allow you to put a body on a GET request. In online discussion, some say + * that the HTTP spec is ambiguous on this point, so each implementation + * makes its own choice. For example, CURL allows it. 
+ * + * More to the point however, is that ElasticSearch requires it. So here's a + * simple class to make that possible. + * + * USE POST INSTEAD!! + */ + private static class ESFunkyGetRequest + extends HttpEntityEnclosingRequestBase { + public ESFunkyGetRequest(String url) throws SearchEngineException { + super(); + try { + setURI(new URI(url)); + } catch (URISyntaxException e) { + throw new SearchEngineException(e); + } + } + + public ESFunkyGetRequest bodyString(String contents, + ContentType contentType) { + setEntity(new StringEntity(contents, contentType)); + return this; + } + + public HttpResponse execute() throws SearchEngineException { + try { + return HttpClientFactory.getHttpClient().execute(this); + } catch (IOException e) { + throw new SearchEngineException(e); + } + } + + @Override + public String getMethod() { + return "GET"; + } + + } + +} diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ElasticSearchEngine.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ElasticSearchEngine.java new file mode 100644 index 000000000..80b1da984 --- /dev/null +++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ElasticSearchEngine.java @@ -0,0 +1,142 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch; + +import java.util.Arrays; +import java.util.Collection; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import edu.cornell.mannlib.vitro.webapp.modules.Application; +import edu.cornell.mannlib.vitro.webapp.modules.ComponentStartupStatus; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputDocument; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResponse; +import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchInputDocument; +import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchQuery; +import edu.cornell.mannlib.vitro.webapp.utils.configuration.Property; +import edu.cornell.mannlib.vitro.webapp.utils.configuration.Validation; + +/** + * A first draft of an Elasticsearch implementation. 
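+ *
+ * Selected and configured through applicationSetup.n3 (see the notes file in
+ * this package). The only setting is the base URL of the index; the notes
+ * show a configuration along these lines (type URIs omitted here):
+ *
+ *   :elasticSearchEngine
+ *       a ... ;
+ *       :hasBaseUrl "http://localhost:9200/vivo" .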
+ */ +public class ElasticSearchEngine implements SearchEngine { + private static final Log log = LogFactory.getLog(ElasticSearchEngine.class); + + // ---------------------------------------------------------------------- + // Configuration + // ---------------------------------------------------------------------- + + private String baseUrl; + + @Property(uri = "http://vitro.mannlib.cornell.edu/ns/vitro/ApplicationSetup#hasBaseUrl") + public void setBaseUrl(String url) { + if (baseUrl == null) { + if (url.endsWith("/")) { + url = url.substring(0, url.length() - 1); + } + baseUrl = url; + } else { + throw new IllegalStateException( + "Configuration includes multiple base URLs: " + url + + ", and " + baseUrl); + } + } + + @Validation + public void validate() throws Exception { + if (baseUrl == null) { + throw new IllegalStateException( + "Configuration did not include a base URL."); + } + } + + // ---------------------------------------------------------------------- + // The instance + // ---------------------------------------------------------------------- + + @Override + public void startup(Application application, ComponentStartupStatus ss) { + log.warn("ElasticSearchEngine.startup() not implemented."); // TODO + } + + @Override + public void shutdown(Application application) { + // TODO Flush the buffers + log.warn("ElasticSearchEngine.shutdown not implemented."); + } + + @Override + public void ping() throws SearchEngineException { + // TODO What's the simplest we can do? Another smoke test? + log.warn("ElasticSearchEngine.ping() not implemented."); // TODO + } + + @Override + public SearchInputDocument createInputDocument() { + return new BaseSearchInputDocument(); + } + + @Override + public void add(SearchInputDocument... docs) throws SearchEngineException { + add(Arrays.asList(docs)); + } + + @Override + public void add(Collection docs) + throws SearchEngineException { + new ESAdder(baseUrl).add(docs); + } + + @Override + public void commit() throws SearchEngineException { + new ESFlusher(baseUrl).flush(); + } + + @Override + public void commit(boolean wait) throws SearchEngineException { + new ESFlusher(baseUrl).flush(wait); + } + + @Override + public void deleteById(String... 
ids) throws SearchEngineException { + deleteById(Arrays.asList(ids)); + } + + @Override + public void deleteById(Collection ids) + throws SearchEngineException { + new ESDeleter(baseUrl).deleteByIds(ids); + } + + @Override + public void deleteByQuery(String query) throws SearchEngineException { + new ESDeleter(baseUrl).deleteByQuery(query); + } + + @Override + public SearchQuery createQuery() { + return new BaseSearchQuery(); + } + + @Override + public SearchQuery createQuery(String queryText) { + BaseSearchQuery query = new BaseSearchQuery(); + query.setQuery(queryText); + return query; + } + + @Override + public SearchResponse query(SearchQuery query) + throws SearchEngineException { + return new ESQuery(baseUrl).query(query); + } + + @Override + public int documentCount() throws SearchEngineException { + return new ESCounter(baseUrl).count(); + } +} diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ElasticSearchResultDocumentList.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ElasticSearchResultDocumentList.java new file mode 100644 index 000000000..a10e4670b --- /dev/null +++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ElasticSearchResultDocumentList.java @@ -0,0 +1,53 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; + +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResultDocument; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResultDocumentList; + +/** + * A simple implementation. In fact, this is so simple that perhaps it should be + * named BaseSearchResultDocumentList. + */ +class ElasticSearchResultDocumentList implements SearchResultDocumentList { + private final List documents; + private final long numberFound; + + public ElasticSearchResultDocumentList(List documents, + long numberFound) { + this.documents = documents; + this.numberFound = numberFound; + } + + @Override + public Iterator iterator() { + return documents.iterator(); + } + + @Override + public long getNumFound() { + return documents.size(); + } + + @Override + public int size() { + return documents.size(); + } + + @Override + public SearchResultDocument get(int i) { + return documents.get(i); + } + + @Override + public String toString() { + return String.format( + "ElasticSearchResultDocumentList[numberFound=%s, documents=%s]", + numberFound, documents); + } + +} \ No newline at end of file diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/Elasticsearch_notes_on_the_first_draft.md b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/Elasticsearch_notes_on_the_first_draft.md new file mode 100644 index 000000000..2ffd3aade --- /dev/null +++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/Elasticsearch_notes_on_the_first_draft.md @@ -0,0 +1,216 @@ +# What is this package? +* The first draft of a Elasticsearch driver for VIVO + +# What has been done? +* Implement the `SearchEngine` interface + * Classes in `edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch` +* No attempt to add new functions. + +# How to experiment with it? +* Install elasticsearch somewhere. +* Create a search index with the appropriate mapping (see below). 
+* Check out VIVO and this branch of Vitro (see below), and do the usual installation procedure. +* Modify `{vitro_home}/config/applicationSetup.n3` to use this driver (see below). +* Start elasticsearch +* Start VIVO + +# Not ready for production +* Documentation + * Instructions on how to install and configure the driver. + * Instructions on how to setup elasticsearch? +* Smoke test + * Display a warning if the elasticsearch server is not responding. +* Functional testing + * Are we getting the proper search results? + * Are search results in the order that we would like? +* Improved snippets + * Copy the technique used for Solr +* Code improvement + * Rigorous closing of HTTP connections. + * IOC for HTTP code, to help in unit testing + * Consistent use of exceptions and logging +* Unit tests +* Automatic initialization of the index + * If VIVO detects an empty index, apply the mapping. + +# The next steps: adding functionality + +## Stay within the framework +* Add fields that enhance the contents of the search index documents (see below). +* Add data distributors that run queries and format the output (see below). + +## Go outside the framework +* Add functions to the Elasticsearch driver that the Solr driver will simply ignore. + * Or remove Solr entirely +* Query Elasticsearch directly + * Or write a data distributor that will run the query + +# The details: + +## Check out VIVO and Vitro +* For now, the Elasticsearch driver only lives in my fork of Vitro +* No changes to VIVO are required (yet). + +``` +git clone https://github.com/vivo-project/VIVO.git +git clone -b feature/elasticsearchExperiments https://github.com/j2blake/Vitro.git +``` + +## A mapping for the search index +* If the index uses the default mapping, it will not work correctly. +* Some fields must be declared as `keyword`, some as unstemmed, etc. + +* Example mapping script: + +``` +curl -X PUT "localhost:9200/vivo?pretty" -H 'Content-Type: application/json' -d' +{ + "mappings": { + "_doc": { + "properties": { + "ALLTEXT": { + "type": "text", + "analyzer": "english" + }, + "ALLTEXTUNSTEMMED": { + "type": "text", + "analyzer": "standard" + }, + "DocId": { + "type": "keyword" + }, + "classgroup": { + "type": "keyword" + }, + "type": { + "type": "keyword" + }, + "mostSpecificTypeURIs": { + "type": "keyword" + }, + "indexedTime": { + "type": "long" + }, + "nameRaw": { + "type": "keyword" + }, + "URI": { + "type": "keyword" + }, + "THUMBNAIL": { + "type": "integer" + }, + "THUMBNAIL_URL": { + "type": "keyword" + }, + "nameLowercaseSingleValued": { + "type": "text", + "analyzer": "standard", + "fielddata": "true" + }, + "BETA" : { + "type" : "float" + } + } + } + }, + "query": { + "default_field": "ALLTEXT" + } +} +' +``` +* __*Note:*__ The first line of the script specifies the name of the index as `vivo`. +Any name may be used, but it must match the "base URL" that is specified in `applicationSetup.n3` (see below). +* __*Note:*__ The same first line specifies the location and port number of the elasticsearch server. +Again, any location and port may be used, but they must match the "base URL" in `applicationSetup.n3`. + +## Modify `applicationSetup.n3` +* Change this: + +``` +# ---------------------------- +# +# Search engine module: +# The Solr-based implementation is the only standard option, but it can be +# wrapped in an "instrumented" wrapper, which provides additional logging +# and more rigorous life-cycle checking. +# + +:instrumentedSearchEngineWrapper + a , + ; + :wraps :solrSearchEngine . 
+ +``` + +* To this: + +``` +# ---------------------------- +# +# Search engine module: +# The Solr-based implementation is the only standard option, but it can be +# wrapped in an "instrumented" wrapper, which provides additional logging +# and more rigorous life-cycle checking. +# + +:instrumentedSearchEngineWrapper + a , + ; + :wraps :elasticSearchEngine . + +:elasticSearchEngine + a , + ; + :hasBaseUrl "http://localhost:9200/vivo" . +``` + +## Enhance the contents of the search index +### An example: Publication URIs in the author's search document +* Add a keyword field to the search index + +``` + "publicationURI": { + "type": "keyword" + }, +``` + +* Add a `DocumentModifier` to VIVO. + +``` +:documentModifier_publications + a , + ; + rdfs:label "URIs of publications are added to publicationURI field." ; + :hasTargetField "publicationURI" ; + :hasSelectQuery """ + PREFIX rdfs: + PREFIX vivo: + PREFIX bibo: + SELECT ?publication + WHERE { + ?uri vivo:relatedBy ?authorship . + ?authorship a vivo:Authorship . + ?authorship vivo:relates ?publication . + ?publication a bibo:Document . + } + """ . +``` + +## Use data distributors to query the search index +* Install the Data Distribution API +* Add a distributor: + +``` +:drill_by_URI + a , + ; + :actionName "searchAndDrill" . +``` + +* Run the query: + +``` +http://localhost:8080/vivo/api/dataRequest/searchAndDrill?uri=http://scholars.cornell.edu/individual/mj495 +``` \ No newline at end of file diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/JsonTree.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/JsonTree.java new file mode 100644 index 000000000..6c44ce971 --- /dev/null +++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/JsonTree.java @@ -0,0 +1,77 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch; + +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Build a Map of Objects, suitable for marshalling by Jackson. + * + * Include conditional logic, so null values, empty maps, or empty lists will + * not be added, unless you use the special values. + */ +public class JsonTree { + /** + * Empty maps will not be added, except for this one. + */ + public static final Map EMPTY_JSON_MAP = Collections + .emptyMap(); + + /** + * Empty lists will not be added, except for this one. + */ + public static final List EMPTY_JSON_LIST = Collections.emptyList(); + + /** + * Create the tree + */ + public static JsonTree tree() { + return new JsonTree(); + } + + /** + * This will cause negative integers to be ignored. + */ + public static Integer ifPositive(int i) { + return (i > 0) ? 
i : null; + } + + private Map map = new HashMap<>(); + + public JsonTree put(String key, Object value) { + if (isSignificant(value)) { + storeIt(key, value); + } + return this; + } + + private boolean isSignificant(Object value) { + if (value == null) { + return false; + } + if (value instanceof Map && ((Map) value).isEmpty() + && value != EMPTY_JSON_MAP) { + return false; + } + if (value instanceof List && ((List) value).isEmpty() + && value != EMPTY_JSON_LIST) { + return false; + } + return true; + } + + private void storeIt(String key, Object value) { + if (value instanceof JsonTree) { + map.put(key, ((JsonTree) value).asMap()); + } else { + map.put(key, value); + } + } + + public Map asMap() { + return new HashMap<>(map); + } +} diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/QueryConverter.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/QueryConverter.java new file mode 100644 index 000000000..d756054b9 --- /dev/null +++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/QueryConverter.java @@ -0,0 +1,172 @@ +/* $This file is distributed under the terms of the license in /doc/license.txt$ */ + +package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch; + +import static edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch.JsonTree.EMPTY_JSON_MAP; +import static edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch.JsonTree.ifPositive; +import static edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch.JsonTree.tree; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery; +import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery.Order; + +/** + * Accept a SearchQuery and make it available as a JSON string, suitable for + * Elasticsearch. 
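+ *
+ * A rough sketch of the JSON produced by figureFullMap() (field names in
+ * capitals are placeholders; empty or non-positive pieces are omitted by
+ * JsonTree):
+ *
+ *   { "query": { ... },
+ *     "from": ..., "size": ...,
+ *     "sort": { "FIELD": "asc" },
+ *     "_source": [ ... ],
+ *     "highlight": { "fields": { "ALLTEXT": {} } },
+ *     "aggregations": { "facet_FIELD": { "terms": { "field": "FIELD", ... } } } }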
+ */ +public class QueryConverter { + private static final Log log = LogFactory.getLog(QueryConverter.class); + + private final SearchQuery query; + private final Map queryAndFilters; + private final Map sortFields; + private final Map facets; + private final Map highlighter; + private final List returnFields; + private final Map fullMap; + + public QueryConverter(SearchQuery query) { + this.query = query; + this.queryAndFilters = filteredOrNot(); + this.sortFields = figureSortFields(); + this.facets = figureFacets(); + this.highlighter = figureHighlighter(); + this.returnFields = figureReturnFields(); + + this.fullMap = figureFullMap(); + } + + private Map filteredOrNot() { + if (query.getFilters().isEmpty()) { + return new QueryStringMap(query.getQuery()).map; + } else { + return buildFilterStructure(); + } + } + + private Map buildFilterStructure() { + return tree() // + .put("bool", tree() // + .put("must", new QueryStringMap(query.getQuery()).map) // + .put("filter", buildFiltersList())) // + .asMap(); + } + + private List> buildFiltersList() { + List> list = new ArrayList<>(); + for (String filter : query.getFilters()) { + list.add(new QueryStringMap(filter).map); + } + return list; + } + + private Map figureSortFields() { + Map fields = query.getSortFields(); + Map map = new HashMap<>(); + for (String name : fields.keySet()) { + String sortOrder = fields.get(name).toString().toLowerCase(); + map.put(name, sortOrder); + } + return map; + } + + private Map figureFacets() { + Map map = new HashMap<>(); + for (String field : query.getFacetFields()) { + map.put("facet_" + field, figureFacet(field)); + } + return map; + } + + private Map figureHighlighter() { + return tree() // + .put("fields", tree() // + .put("ALLTEXT", EMPTY_JSON_MAP)) + .asMap(); + } + + private Map figureFacet(String field) { + return tree() // + .put("terms", tree() // + .put("field", field) // + .put("size", ifPositive(query.getFacetLimit())) // + .put("min_doc_count", + ifPositive(query.getFacetMinCount()))) // + .asMap(); + } + + private List figureReturnFields() { + return new ArrayList<>(query.getFieldsToReturn()); + } + + private Map figureFullMap() { + return tree() // + .put("query", queryAndFilters) // + .put("from", ifPositive(query.getStart())) // + .put("highlight", highlighter) + .put("size", ifPositive(query.getRows())) // + .put("sort", sortFields) // + .put("_source", returnFields) // + .put("aggregations", facets) // + .asMap(); + } + + public String asString() throws SearchEngineException { + try { + return new ObjectMapper().writeValueAsString(fullMap); + } catch (JsonProcessingException e) { + throw new SearchEngineException(e); + } + } + + private static class QueryStringMap { + public final Map map; + + public QueryStringMap(String queryString) { + map = new HashMap<>(); + map.put("query_string", makeInnerMap(escape(queryString))); + } + + /** + * This is a kluge, but perhaps it will work for now. + * + * Apparently Solr is willing to put up with query strings that contain + * special characters in odd places, but Elasticsearch is not. + * + * So, a query string of "classgroup:http://this/that" must be escaped + * as "classgroup:http\:\/\/this\/that". Notice that the first colon + * delimits the field name, and so must not be escaped. + * + * But what if no field is specified? Then all colons must be escaped. + * How would we distinguish that? + * + * And what if the query is more complex, and more than one field is + * specified? What if other special characters are included? 
+         *
+         * This could be a real problem.
+         */
+        private String escape(String queryString) {
+            return queryString.replace(":", "\\:").replace("/", "\\/")
+                    .replaceFirst("\\\\:", ":");
+        }
+
+        private Map<String, Object> makeInnerMap(String queryString) {
+            Map<String, Object> inner = new HashMap<>();
+            inner.put("default_field", "ALLTEXT");
+            inner.put("query", queryString);
+            return inner;
+        }
+    }
+
+}
diff --git a/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ResponseParser.java b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ResponseParser.java
new file mode 100644
index 000000000..7ac521065
--- /dev/null
+++ b/api/src/main/java/edu/cornell/mannlib/vitro/webapp/searchengine/elasticsearch/ResponseParser.java
@@ -0,0 +1,182 @@
+/* $This file is distributed under the terms of the license in /doc/license.txt$ */
+
+package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchFacetField;
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchFacetField.Count;
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResponse;
+import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResultDocument;
+import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchFacetField;
+import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchFacetField.BaseCount;
+import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchResponse;
+import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchResultDocument;
+
+/**
+ * Elasticsearch sends a JSON response to a query. Parse it into a
+ * SearchResponse.
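+ *
+ * Only a few parts of the response are examined; a rough sketch, based on the
+ * fields this parser reads rather than on a captured reply:
+ *
+ *   { "hits": { "total": ...,
+ *               "hits": [ { "_id": ..., "_source": { ... },
+ *                           "highlight": { "ALLTEXT": [ ... ] } } ] },
+ *     "aggregations": { "facet_FIELD": {
+ *         "buckets": [ { "key": ..., "doc_count": ... } ] } } }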
+ */ +class ResponseParser { + private static final Log log = LogFactory.getLog(ResponseParser.class); + + private final Map responseMap; + + private Map>> highlightingMap; + private Map facetFieldsMap; + private long totalHits; + private List documentList; + + @SuppressWarnings("unchecked") + public ResponseParser(String responseString) throws SearchEngineException { + try { + this.responseMap = new ObjectMapper().readValue(responseString, + HashMap.class); + } catch (IOException e) { + throw new SearchEngineException(e); + } + } + + public SearchResponse parse() { + parseDocumentList(); + parseFacetFields(); + SearchResponse response = new BaseSearchResponse(highlightingMap, + facetFieldsMap, + new ElasticSearchResultDocumentList(documentList, totalHits)); + log.debug("ESQuery.ResponseParser.parse: " + response); + return response; + } + + private void parseFacetFields() { + facetFieldsMap = new HashMap<>(); + + @SuppressWarnings("unchecked") + Map> aggregations = (Map>) responseMap + .get("aggregations"); + if (aggregations == null) { + return; + } + + for (String key : aggregations.keySet()) { + if (key.startsWith("facet_")) { + String name = key.substring(6); + parseFacetField(name, aggregations.get(key)); + } + } + } + + private void parseFacetField(String name, Map facetMap) { + @SuppressWarnings("unchecked") + List> bucketsList = (List>) facetMap + .get("buckets"); + if (bucketsList == null) { + return; + } + + List counts = new ArrayList<>(); + for (Map bucket : bucketsList) { + counts.add(new BaseCount((String) bucket.get("key"), + (Integer) bucket.get("doc_count"))); + } + + facetFieldsMap.put(name, new BaseSearchFacetField(name, counts)); + } + + private void parseDocumentList() { + documentList = new ArrayList<>(); + highlightingMap = new HashMap<>(); + + @SuppressWarnings("unchecked") + Map uberHits = (Map) responseMap + .get("hits"); + if (uberHits == null) { + log.warn("Didn't find a 'hits' field " + "in the query response: " + + responseMap); + return; + } + + Integer total = (Integer) uberHits.get("total"); + if (total == null) { + log.warn("Didn't find a 'hits.total' field " + + "in the query response: " + responseMap); + return; + } + + @SuppressWarnings("unchecked") + List> hits = (List>) uberHits + .get("hits"); + if (hits == null) { + log.warn("Didn't find a 'hits.hits' field " + + "in the query response: " + responseMap); + return; + } + + parseDocuments(hits); + } + + private void parseDocuments(List> hits) { + for (Map hit : hits) { + SearchResultDocument doc = parseDocument(hit); + if (doc != null) { + documentList.add(doc); + + Map> highlight = parseHighlight(hit); + if (highlight != null) { + highlightingMap.put(doc.getUniqueId(), highlight); + } + } + } + } + + private SearchResultDocument parseDocument(Map hitMap) { + @SuppressWarnings("unchecked") + Map> sourceMap = (Map>) hitMap + .get("_source"); + if (sourceMap == null) { + log.warn("Didn't find a '_source' field in the hit: " + hitMap); + return null; + } + + String id = (String) hitMap.get("_id"); + if (id == null) { + log.warn("Didn't find a '_id' field in the hit: " + hitMap); + return null; + } + + return new BaseSearchResultDocument(id, sourceMap); + } + + private Map> parseHighlight( + Map hitMap) { + @SuppressWarnings("unchecked") + Map> highlightMap = (Map>) hitMap + .get("highlight"); + if (highlightMap == null) { + log.debug("Didn't find a 'highlight' field in the hit: " + hitMap); + return null; + } + + @SuppressWarnings("unchecked") + List snippets = highlightMap.get("ALLTEXT"); + if 
(snippets == null) {
+            log.warn("Didn't find a 'highlight.ALLTEXT' field in the hit: "
+                    + hitMap);
+            return null;
+        }
+
+        Map<String, List<String>> snippetMap = new HashMap<>();
+        snippetMap.put("ALLTEXT", snippets);
+        return snippetMap;
+    }
+}
\ No newline at end of file