initial files for ElasticSearch: ticket vivo-1587 (#85)

* initial files from ticket vivo-1587
* Add https://mvnrepository.com/artifact/org.apache.httpcomponents/fluent-hc/4.5.6 dependency
* Resolves: https://jira.duraspace.org/browse/VIVO-1587
This commit is contained in:
Don Elsborg 2018-09-24 20:05:42 -06:00 committed by Andrew Woods
parent 734b9ccf68
commit 40f78e58a8
12 changed files with 1275 additions and 0 deletions

View file

@ -58,6 +58,11 @@
<artifactId>argon2-jvm</artifactId> <artifactId>argon2-jvm</artifactId>
<version>2.4</version> <version>2.4</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
<version>4.5.6</version>
</dependency>
<dependency> <dependency>
<groupId>org.vivoweb</groupId> <groupId>org.vivoweb</groupId>
<artifactId>vitro-dependencies</artifactId> <artifactId>vitro-dependencies</artifactId>

View file

@ -0,0 +1,92 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import org.apache.http.entity.ContentType;
import com.fasterxml.jackson.databind.ObjectMapper;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputDocument;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputField;
/**
* The nuts and bolts of adding a document to the Elasticsearch index
*/
public class ESAdder {
    private static final Log log = LogFactory.getLog(ESAdder.class);

    private final String baseUrl;

    public ESAdder(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /**
     * Add the documents to the index, one PUT request per document.
     */
    public void add(Collection<SearchInputDocument> docs)
            throws SearchEngineException {
        for (SearchInputDocument doc : docs) {
            addDocument(doc);
        }
    }

    /**
     * Convert one document to JSON and PUT it to Elasticsearch.
     */
    private void addDocument(SearchInputDocument doc)
            throws SearchEngineException {
        try {
            Map<String, List<Object>> map = convertDocToMap(doc);
            String json = new ObjectMapper().writeValueAsString(map);
            log.debug("Adding document for '" + doc.getField("DocId") + "': "
                    + json);
            putToElastic(json, (String) doc.getField("DocId").getFirstValue());
        } catch (SearchEngineException e) {
            // BUG FIX: putToElastic() already wrapped its failure with an
            // accurate message -- don't re-wrap it as a JSON conversion error.
            throw e;
        } catch (Exception e) {
            throw new SearchEngineException("Failed to convert to JSON", e);
        }
    }

    /**
     * Some field values are collections. Add the members of the collection
     * instead.
     */
    private Map<String, List<Object>> convertDocToMap(SearchInputDocument doc) {
        Map<String, List<Object>> map = new HashMap<>();
        for (SearchInputField field : doc.getFieldMap().values()) {
            List<Object> list = new ArrayList<>();
            for (Object value : field.getValues()) {
                if (value instanceof Collection) {
                    list.addAll((Collection<?>) value);
                } else {
                    list.add(value);
                }
            }
            map.put(field.getName(), list);
        }
        return map;
    }

    /**
     * PUT the JSON to the "_doc" endpoint, using the document's ID as the
     * Elasticsearch document ID.
     */
    private void putToElastic(String json, String docId)
            throws SearchEngineException {
        try {
            // "UTF-8" is the canonical charset name ("UTF8" was an alias).
            String url = baseUrl + "/_doc/"
                    + URLEncoder.encode(docId, "UTF-8");
            Response response = Request.Put(url)
                    .bodyString(json, ContentType.APPLICATION_JSON).execute();
            log.debug("Response from Elasticsearch: "
                    + response.returnContent().asString());
        } catch (Exception e) {
            throw new SearchEngineException("Failed to put to Elasticsearch",
                    e);
        }
    }
}

View file

@ -0,0 +1,42 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import java.util.HashMap;
import java.util.Map;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import com.fasterxml.jackson.databind.ObjectMapper;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
/**
* The nuts and bolts of getting the number of documents in the Elasticsearch
* index.
*/
public class ESCounter {
    private final String baseUrl;

    public ESCounter(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /**
     * Ask Elasticsearch how many documents are in the index.
     *
     * @return the value of the "count" field from the "_count" endpoint
     * @throws SearchEngineException if the request or the JSON parsing fails
     */
    public int count() throws SearchEngineException {
        try {
            String url = baseUrl + "/_doc/_count";
            Response response = Request.Get(url).execute();
            String json = response.returnContent().asString();

            @SuppressWarnings("unchecked")
            Map<String, Object> map = new ObjectMapper().readValue(json,
                    HashMap.class);
            return (Integer) map.get("count");
        } catch (Exception e) {
            // BUG FIX: the message used to say "Failed to put", which was
            // misleading for a count operation.
            throw new SearchEngineException(
                    "Failed to get document count from Elasticsearch", e);
        }
    }
}

View file

@ -0,0 +1,147 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import java.io.IOException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpResponseException;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import org.apache.http.entity.ContentType;
import org.apache.http.util.EntityUtils;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery;
import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchQuery;
/**
* The nuts and bolts of deleting documents from the Elasticsearch index.
*/
public class ESDeleter {
    private static final Log log = LogFactory.getLog(ESDeleter.class);

    private final String baseUrl;

    /**
     * @param baseUrl the base URL of the Elasticsearch index
     */
    public ESDeleter(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /**
     * Delete each of the documents, one DELETE request per ID.
     */
    public void deleteByIds(Collection<String> ids)
            throws SearchEngineException {
        for (String id : ids) {
            deleteById(id);
        }
    }

    private void deleteById(String id) throws SearchEngineException {
        try {
            // "UTF-8" is the canonical charset name ("UTF8" was an alias).
            String url = baseUrl + "/_doc/"
                    + URLEncoder.encode(id, "UTF-8");
            Response response = Request.Delete(url).execute();
            // returnContent() throws HttpResponseException on a non-2xx
            // status -- that is how the 404 is detected below. The content
            // itself is not needed.
            response.returnContent().asString();
        } catch (HttpResponseException e) {
            if (e.getStatusCode() == 404) {
                // Don't care if it has already been deleted.
            } else {
                throw new SearchEngineException(
                        "Failed to delete Elasticsearch document " + id, e);
            }
        } catch (Exception e) {
            throw new SearchEngineException(
                    "Failed to delete Elasticsearch document " + id, e);
        }
    }

    /**
     * POST the query to the "_delete_by_query" endpoint. A failure status is
     * logged as a warning rather than thrown, since some documents may have
     * been deleted before the failure.
     */
    public void deleteByQuery(String queryString) throws SearchEngineException {
        String url = baseUrl + "/_delete_by_query";
        SearchQuery query = new BaseSearchQuery().setQuery(queryString);
        String queryJson = new QueryConverter(query).asString();
        try {
            Response response = Request.Post(url)
                    .bodyString(queryJson, ContentType.APPLICATION_JSON)
                    .execute();
            BaseResponseHandler handler = new BaseResponseHandler();
            response.handleResponse(handler);
            if (handler.getStatusCode() >= 400) {
                log.warn(String.format(
                        "Failed to delete Elasticsearch documents by query: %s, %d - %s\n%s",
                        queryString, handler.getStatusCode(),
                        handler.getReasonPhrase(), handler.getContentString()));
            }
        } catch (IOException e) {
            throw new SearchEngineException("Failed to delete Elasticsearch "
                    + "documents by query " + queryString, e);
        }
    }

    // ----------------------------------------------------------------------
    // Helper class for interpreting HttpResponse errors
    // ----------------------------------------------------------------------

    /**
     * Captures status, headers, and body of a response for later inspection.
     * Static, since it uses no state from the enclosing instance.
     */
    private static class BaseResponseHandler implements ResponseHandler<Object> {
        private int statusCode;
        private String reasonPhrase;
        private Map<String, List<String>> headers;
        private String contentString;

        @Override
        public Object handleResponse(org.apache.http.HttpResponse innerResponse)
                throws IOException {
            StatusLine statusLine = innerResponse.getStatusLine();
            statusCode = statusLine.getStatusCode();
            reasonPhrase = statusLine.getReasonPhrase();

            headers = new HashMap<>();
            for (Header header : innerResponse.getAllHeaders()) {
                String name = header.getName();
                if (!headers.containsKey(name)) {
                    headers.put(name, new ArrayList<String>());
                }
                headers.get(name).add(header.getValue());
            }

            HttpEntity entity = innerResponse.getEntity();
            if (entity == null) {
                contentString = "";
            } else {
                contentString = EntityUtils.toString(entity);
            }
            return "";
        }

        public int getStatusCode() {
            return statusCode;
        }

        public String getReasonPhrase() {
            return reasonPhrase;
        }

        public Map<String, List<String>> getHeaders() {
            return headers;
        }

        public String getContentString() {
            return contentString;
        }
    }
}

View file

@ -0,0 +1,41 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.client.fluent.Request;
import org.apache.http.client.fluent.Response;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
/**
* Just does a "commit" or "flush" to the index.
*/
public class ESFlusher {
    private static final Log log = LogFactory.getLog(ESFlusher.class);

    private final String baseUrl;

    public ESFlusher(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /** Flush without waiting for any other ongoing flush. */
    public void flush() throws SearchEngineException {
        flush(false);
    }

    /**
     * Ask Elasticsearch to flush the index.
     *
     * @param wait if true, adds "wait_for_ongoing" so the request blocks
     *            until any other running flush operation has completed
     */
    public void flush(boolean wait) throws SearchEngineException {
        try {
            String url = baseUrl + "/_flush"
                    + (wait ? "?wait_for_ongoing" : "");
            Response response = Request.Get(url).execute();
            String json = response.returnContent().asString();
            log.debug("flush response: " + json);
        } catch (Exception e) {
            // BUG FIX: the message used to say "Failed to put".
            throw new SearchEngineException(
                    "Failed to flush to Elasticsearch", e);
        }
    }
}

View file

@ -0,0 +1,106 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpEntityEnclosingRequestBase;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResponse;
import edu.cornell.mannlib.vitro.webapp.utils.http.HttpClientFactory;
/**
* Convert a SearchQuery to JSON, send it to Elasticsearch, and convert the JSON
* response to a SearchResponse.
*/
public class ESQuery {
    private static final Log log = LogFactory.getLog(ESQuery.class);

    private final String baseUrl;

    public ESQuery(String baseUrl) {
        this.baseUrl = baseUrl;
    }

    /**
     * Convert the query to JSON, send it to the "_search" endpoint, and parse
     * the JSON response.
     */
    public SearchResponse query(SearchQuery query)
            throws SearchEngineException {
        String queryString = new QueryConverter(query).asString();
        String response = doTheQuery(queryString);
        return new ResponseParser(response).parse();
    }

    private String doTheQuery(String queryString)
            throws SearchEngineException {
        log.debug("QUERY: " + queryString);
        try {
            String url = baseUrl + "/_search";
            HttpResponse response = new ESFunkyGetRequest(url)
                    .bodyString(queryString, ContentType.APPLICATION_JSON)
                    .execute();
            // Specify the charset explicitly; the charset-less overload is
            // deprecated and uses the platform default.
            String responseString = IOUtils
                    .toString(response.getEntity().getContent(), "UTF-8");
            log.debug("RESPONSE: " + responseString);
            return responseString;
        } catch (Exception e) {
            // BUG FIX: the original logged the failure and returned "", which
            // made the ResponseParser fail later with a confusing JSON error.
            // Propagate the real cause instead.
            throw new SearchEngineException("Failed to query Elasticsearch",
                    e);
        }
    }

    // ----------------------------------------------------------------------
    // Helper class -- a GET request that accepts a body
    // ----------------------------------------------------------------------

    /**
     * The HttpClient implementations, both regular and conversational, do not
     * allow you to put a body on a GET request. In online discussion, some say
     * that the HTTP spec is ambiguous on this point, so each implementation
     * makes its own choice. For example, CURL allows it.
     *
     * More to the point however, is that ElasticSearch requires it. So here's a
     * simple class to make that possible.
     *
     * USE POST INSTEAD!!
     */
    private static class ESFunkyGetRequest
            extends HttpEntityEnclosingRequestBase {
        public ESFunkyGetRequest(String url) throws SearchEngineException {
            super();
            try {
                setURI(new URI(url));
            } catch (URISyntaxException e) {
                throw new SearchEngineException(e);
            }
        }

        public ESFunkyGetRequest bodyString(String contents,
                ContentType contentType) {
            setEntity(new StringEntity(contents, contentType));
            return this;
        }

        public HttpResponse execute() throws SearchEngineException {
            try {
                return HttpClientFactory.getHttpClient().execute(this);
            } catch (IOException e) {
                throw new SearchEngineException(e);
            }
        }

        @Override
        public String getMethod() {
            return "GET";
        }
    }
}

View file

@ -0,0 +1,142 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import java.util.Arrays;
import java.util.Collection;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import edu.cornell.mannlib.vitro.webapp.modules.Application;
import edu.cornell.mannlib.vitro.webapp.modules.ComponentStartupStatus;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchInputDocument;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResponse;
import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchInputDocument;
import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchQuery;
import edu.cornell.mannlib.vitro.webapp.utils.configuration.Property;
import edu.cornell.mannlib.vitro.webapp.utils.configuration.Validation;
/**
* A first draft of an Elasticsearch implementation.
*/
public class ElasticSearchEngine implements SearchEngine {
private static final Log log = LogFactory.getLog(ElasticSearchEngine.class);
// ----------------------------------------------------------------------
// Configuration
// ----------------------------------------------------------------------
private String baseUrl;
@Property(uri = "http://vitro.mannlib.cornell.edu/ns/vitro/ApplicationSetup#hasBaseUrl")
public void setBaseUrl(String url) {
if (baseUrl == null) {
if (url.endsWith("/")) {
url = url.substring(0, url.length() - 1);
}
baseUrl = url;
} else {
throw new IllegalStateException(
"Configuration includes multiple base URLs: " + url
+ ", and " + baseUrl);
}
}
@Validation
public void validate() throws Exception {
if (baseUrl == null) {
throw new IllegalStateException(
"Configuration did not include a base URL.");
}
}
// ----------------------------------------------------------------------
// The instance
// ----------------------------------------------------------------------
@Override
public void startup(Application application, ComponentStartupStatus ss) {
log.warn("ElasticSearchEngine.startup() not implemented."); // TODO
}
@Override
public void shutdown(Application application) {
// TODO Flush the buffers
log.warn("ElasticSearchEngine.shutdown not implemented.");
}
@Override
public void ping() throws SearchEngineException {
// TODO What's the simplest we can do? Another smoke test?
log.warn("ElasticSearchEngine.ping() not implemented."); // TODO
}
@Override
public SearchInputDocument createInputDocument() {
return new BaseSearchInputDocument();
}
@Override
public void add(SearchInputDocument... docs) throws SearchEngineException {
add(Arrays.asList(docs));
}
@Override
public void add(Collection<SearchInputDocument> docs)
throws SearchEngineException {
new ESAdder(baseUrl).add(docs);
}
@Override
public void commit() throws SearchEngineException {
new ESFlusher(baseUrl).flush();
}
@Override
public void commit(boolean wait) throws SearchEngineException {
new ESFlusher(baseUrl).flush(wait);
}
@Override
public void deleteById(String... ids) throws SearchEngineException {
deleteById(Arrays.asList(ids));
}
@Override
public void deleteById(Collection<String> ids)
throws SearchEngineException {
new ESDeleter(baseUrl).deleteByIds(ids);
}
@Override
public void deleteByQuery(String query) throws SearchEngineException {
new ESDeleter(baseUrl).deleteByQuery(query);
}
@Override
public SearchQuery createQuery() {
return new BaseSearchQuery();
}
@Override
public SearchQuery createQuery(String queryText) {
BaseSearchQuery query = new BaseSearchQuery();
query.setQuery(queryText);
return query;
}
@Override
public SearchResponse query(SearchQuery query)
throws SearchEngineException {
return new ESQuery(baseUrl).query(query);
}
@Override
public int documentCount() throws SearchEngineException {
return new ESCounter(baseUrl).count();
}
}

View file

@ -0,0 +1,53 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResultDocument;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResultDocumentList;
/**
* A simple implementation. In fact, this is so simple that perhaps it should be
* named BaseSearchResultDocumentList.
*/
class ElasticSearchResultDocumentList implements SearchResultDocumentList {
    // The page of documents actually returned by the search.
    private final List<SearchResultDocument> documents;
    // The total number of hits in the index, which may exceed documents.size().
    private final long numberFound;

    public ElasticSearchResultDocumentList(List<SearchResultDocument> documents,
            long numberFound) {
        this.documents = documents;
        this.numberFound = numberFound;
    }

    @Override
    public Iterator<SearchResultDocument> iterator() {
        return documents.iterator();
    }

    @Override
    public long getNumFound() {
        // BUG FIX: previously returned documents.size(), silently ignoring
        // the numberFound value stored by the constructor. That broke paging
        // whenever the total hit count exceeded the page size.
        return numberFound;
    }

    @Override
    public int size() {
        return documents.size();
    }

    @Override
    public SearchResultDocument get(int i) {
        return documents.get(i);
    }

    @Override
    public String toString() {
        return String.format(
                "ElasticSearchResultDocumentList[numberFound=%s, documents=%s]",
                numberFound, documents);
    }
}

View file

@ -0,0 +1,216 @@
# What is this package?
* The first draft of an Elasticsearch driver for VIVO
# What has been done?
* Implement the `SearchEngine` interface
* Classes in `edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch`
* No attempt to add new functions.
# How to experiment with it?
* Install elasticsearch somewhere.
* Create a search index with the appropriate mapping (see below).
* Check out VIVO and this branch of Vitro (see below), and do the usual installation procedure.
* Modify `{vitro_home}/config/applicationSetup.n3` to use this driver (see below).
* Start elasticsearch
* Start VIVO
# Not ready for production
* Documentation
* Instructions on how to install and configure the driver.
* Instructions on how to set up Elasticsearch?
* Smoke test
* Display a warning if the elasticsearch server is not responding.
* Functional testing
* Are we getting the proper search results?
* Are search results in the order that we would like?
* Improved snippets
* Copy the technique used for Solr
* Code improvement
* Rigorous closing of HTTP connections.
* IOC for HTTP code, to help in unit testing
* Consistent use of exceptions and logging
* Unit tests
* Automatic initialization of the index
* If VIVO detects an empty index, apply the mapping.
# The next steps: adding functionality
## Stay within the framework
* Add fields that enhance the contents of the search index documents (see below).
* Add data distributors that run queries and format the output (see below).
## Go outside the framework
* Add functions to the Elasticsearch driver that the Solr driver will simply ignore.
* Or remove Solr entirely
* Query Elasticsearch directly
* Or write a data distributor that will run the query
# The details:
## Check out VIVO and Vitro
* For now, the Elasticsearch driver only lives in my fork of Vitro
* No changes to VIVO are required (yet).
```
git clone https://github.com/vivo-project/VIVO.git
git clone -b feature/elasticsearchExperiments https://github.com/j2blake/Vitro.git
```
## A mapping for the search index
* If the index uses the default mapping, it will not work correctly.
* Some fields must be declared as `keyword`, some as unstemmed, etc.
* Example mapping script:
```
curl -X PUT "localhost:9200/vivo?pretty" -H 'Content-Type: application/json' -d'
{
"mappings": {
"_doc": {
"properties": {
"ALLTEXT": {
"type": "text",
"analyzer": "english"
},
"ALLTEXTUNSTEMMED": {
"type": "text",
"analyzer": "standard"
},
"DocId": {
"type": "keyword"
},
"classgroup": {
"type": "keyword"
},
"type": {
"type": "keyword"
},
"mostSpecificTypeURIs": {
"type": "keyword"
},
"indexedTime": {
"type": "long"
},
"nameRaw": {
"type": "keyword"
},
"URI": {
"type": "keyword"
},
"THUMBNAIL": {
"type": "integer"
},
"THUMBNAIL_URL": {
"type": "keyword"
},
"nameLowercaseSingleValued": {
"type": "text",
"analyzer": "standard",
"fielddata": "true"
},
"BETA" : {
"type" : "float"
}
}
}
},
"query": {
"default_field": "ALLTEXT"
}
}
'
```
* __*Note:*__ The first line of the script specifies the name of the index as `vivo`.
Any name may be used, but it must match the "base URL" that is specified in `applicationSetup.n3` (see below).
* __*Note:*__ The same first line specifies the location and port number of the elasticsearch server.
Again, any location and port may be used, but they must match the "base URL" in `applicationSetup.n3`.
## Modify `applicationSetup.n3`
* Change this:
```
# ----------------------------
#
# Search engine module:
# The Solr-based implementation is the only standard option, but it can be
# wrapped in an "instrumented" wrapper, which provides additional logging
# and more rigorous life-cycle checking.
#
:instrumentedSearchEngineWrapper
a <java:edu.cornell.mannlib.vitro.webapp.searchengine.InstrumentedSearchEngineWrapper> ,
<java:edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine> ;
:wraps :solrSearchEngine .
```
* To this:
```
# ----------------------------
#
# Search engine module:
# The Solr-based implementation is the only standard option, but it can be
# wrapped in an "instrumented" wrapper, which provides additional logging
# and more rigorous life-cycle checking.
#
:instrumentedSearchEngineWrapper
a <java:edu.cornell.mannlib.vitro.webapp.searchengine.InstrumentedSearchEngineWrapper> ,
<java:edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine> ;
:wraps :elasticSearchEngine .
:elasticSearchEngine
a <java:edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch.ElasticSearchEngine> ,
<java:edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngine> ;
:hasBaseUrl "http://localhost:9200/vivo" .
```
## Enhance the contents of the search index
### An example: Publication URIs in the author's search document
* Add a keyword field to the search index
```
"publicationURI": {
"type": "keyword"
},
```
* Add a `DocumentModifier` to VIVO.
```
:documentModifier_publications
a <java:edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.SelectQueryDocumentModifier> ,
<java:edu.cornell.mannlib.vitro.webapp.searchindex.documentBuilding.DocumentModifier> ;
rdfs:label "URIs of publications are added to publicationURI field." ;
:hasTargetField "publicationURI" ;
:hasSelectQuery """
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX vivo: <http://vivoweb.org/ontology/core#>
PREFIX bibo: <http://purl.org/ontology/bibo/>
SELECT ?publication
WHERE {
?uri vivo:relatedBy ?authorship .
?authorship a vivo:Authorship .
?authorship vivo:relates ?publication .
?publication a bibo:Document .
}
""" .
```
## Use data distributors to query the search index
* Install the Data Distribution API
* Add a distributor:
```
:drill_by_URI
a <java:edu.cornell.library.scholars.webapp.controller.api.distribute.DataDistributor> ,
<java:edu.cornell.library.scholars.webapp.controller.api.distribute.search.DrillDownSearchByUriDataDistributor> ;
:actionName "searchAndDrill" .
```
* Run the query:
```
http://localhost:8080/vivo/api/dataRequest/searchAndDrill?uri=http://scholars.cornell.edu/individual/mj495
```

View file

@ -0,0 +1,77 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Build a Map of Objects, suitable for marshalling by Jackson.
*
* Include conditional logic, so null values, empty maps, or empty lists will
* not be added, unless you use the special values.
*/
public class JsonTree {
    /**
     * Empty maps will not be added, except for this one.
     */
    public static final Map<String, Object> EMPTY_JSON_MAP = Collections
            .emptyMap();

    /**
     * Empty lists will not be added, except for this one.
     */
    public static final List<Object> EMPTY_JSON_LIST = Collections.emptyList();

    /**
     * Create the tree.
     */
    public static JsonTree tree() {
        return new JsonTree();
    }

    /**
     * Returns null for zero or negative values, so put() will ignore them.
     * (The original comment said "negative", but zero is also dropped.)
     */
    public static Integer ifPositive(int i) {
        return (i > 0) ? i : null;
    }

    // The accumulated key/value pairs. Final: the reference never changes.
    private final Map<String, Object> map = new HashMap<>();

    /**
     * Add the value under this key, unless it is null, an empty map, or an
     * empty list. The special EMPTY_JSON_MAP / EMPTY_JSON_LIST constants are
     * always added.
     *
     * @return this tree, for call chaining
     */
    public JsonTree put(String key, Object value) {
        if (isSignificant(value)) {
            storeIt(key, value);
        }
        return this;
    }

    private boolean isSignificant(Object value) {
        if (value == null) {
            return false;
        }
        // Wildcard casts instead of raw types, to avoid unchecked warnings.
        if (value instanceof Map && ((Map<?, ?>) value).isEmpty()
                && value != EMPTY_JSON_MAP) {
            return false;
        }
        if (value instanceof List && ((List<?>) value).isEmpty()
                && value != EMPTY_JSON_LIST) {
            return false;
        }
        return true;
    }

    private void storeIt(String key, Object value) {
        if (value instanceof JsonTree) {
            // Nested trees are stored as their map form, so the whole
            // structure marshals cleanly with Jackson.
            map.put(key, ((JsonTree) value).asMap());
        } else {
            map.put(key, value);
        }
    }

    /**
     * @return a defensive copy of the accumulated map
     */
    public Map<String, Object> asMap() {
        return new HashMap<>(map);
    }
}

View file

@ -0,0 +1,172 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import static edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch.JsonTree.EMPTY_JSON_MAP;
import static edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch.JsonTree.ifPositive;
import static edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch.JsonTree.tree;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchQuery.Order;
/**
* Accept a SearchQuery and make it available as a JSON string, suitable for
* Elasticsearch.
*/
public class QueryConverter {
    private static final Log log = LogFactory.getLog(QueryConverter.class);

    private final SearchQuery query;
    private final Map<String, Object> queryAndFilters;
    private final Map<String, Object> sortFields;
    private final Map<String, Object> facets;
    private final Map<String, Object> highlighter;
    private final List<String> returnFields;
    private final Map<String, Object> fullMap;

    /**
     * Build all of the pieces up front; the full map must be assembled last,
     * since it references the other pieces.
     */
    public QueryConverter(SearchQuery query) {
        this.query = query;
        this.queryAndFilters = filteredOrNot();
        this.sortFields = figureSortFields();
        this.facets = figureFacets();
        this.highlighter = figureHighlighter();
        this.returnFields = figureReturnFields();
        this.fullMap = figureFullMap();
    }

    /** A bare query string, unless filters require a "bool" wrapper. */
    private Map<String, Object> filteredOrNot() {
        return query.getFilters().isEmpty()
                ? new QueryStringMap(query.getQuery()).map
                : buildFilterStructure();
    }

    private Map<String, Object> buildFilterStructure() {
        return tree() //
                .put("bool", tree() //
                        .put("must", new QueryStringMap(query.getQuery()).map) //
                        .put("filter", buildFiltersList())) //
                .asMap();
    }

    private List<Map<String, Object>> buildFiltersList() {
        List<Map<String, Object>> filterMaps = new ArrayList<>();
        for (String filterString : query.getFilters()) {
            filterMaps.add(new QueryStringMap(filterString).map);
        }
        return filterMaps;
    }

    /** Field name maps to "asc" or "desc". */
    private Map<String, Object> figureSortFields() {
        Map<String, Object> result = new HashMap<>();
        for (Map.Entry<String, Order> entry : query.getSortFields()
                .entrySet()) {
            result.put(entry.getKey(),
                    entry.getValue().toString().toLowerCase());
        }
        return result;
    }

    /** Each facet field becomes a "facet_"-prefixed aggregation. */
    private Map<String, Object> figureFacets() {
        Map<String, Object> result = new HashMap<>();
        for (String facetField : query.getFacetFields()) {
            result.put("facet_" + facetField, figureFacet(facetField));
        }
        return result;
    }

    private Map<String, Object> figureHighlighter() {
        return tree() //
                .put("fields", tree() //
                        .put("ALLTEXT", EMPTY_JSON_MAP))
                .asMap();
    }

    private Map<String, Object> figureFacet(String field) {
        return tree() //
                .put("terms", tree() //
                        .put("field", field) //
                        .put("size", ifPositive(query.getFacetLimit())) //
                        .put("min_doc_count",
                                ifPositive(query.getFacetMinCount()))) //
                .asMap();
    }

    private List<String> figureReturnFields() {
        return new ArrayList<>(query.getFieldsToReturn());
    }

    /** Assemble the complete request body. */
    private Map<String, Object> figureFullMap() {
        return tree() //
                .put("query", queryAndFilters) //
                .put("from", ifPositive(query.getStart())) //
                .put("highlight", highlighter)
                .put("size", ifPositive(query.getRows())) //
                .put("sort", sortFields) //
                .put("_source", returnFields) //
                .put("aggregations", facets) //
                .asMap();
    }

    /** Marshal the assembled structure to a JSON string. */
    public String asString() throws SearchEngineException {
        try {
            return new ObjectMapper().writeValueAsString(fullMap);
        } catch (JsonProcessingException e) {
            throw new SearchEngineException(e);
        }
    }

    private static class QueryStringMap {
        public final Map<String, Object> map;

        public QueryStringMap(String queryString) {
            map = new HashMap<>();
            map.put("query_string", makeInnerMap(escape(queryString)));
        }

        /**
         * This is a kluge, but perhaps it will work for now.
         *
         * Apparently Solr is willing to put up with query strings that contain
         * special characters in odd places, but Elasticsearch is not.
         *
         * So, a query string of "classgroup:http://this/that" must be escaped
         * as "classgroup:http\:\/\/this\/that". Notice that the first colon
         * delimits the field name, and so must not be escaped.
         *
         * But what if no field is specified? Then all colons must be escaped.
         * How would we distinguish that?
         *
         * And what if the query is more complex, and more than one field is
         * specified? What if other special characters are included?
         *
         * This could be a real problem.
         */
        private String escape(String queryString) {
            return queryString.replace(":", "\\:").replace("/", "\\/")
                    .replaceFirst("\\\\:", ":");
        }

        private Map<String, String> makeInnerMap(String queryString) {
            Map<String, String> inner = new HashMap<>();
            inner.put("default_field", "ALLTEXT");
            inner.put("query", queryString);
            return inner;
        }
    }
}

View file

@ -0,0 +1,182 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.searchengine.elasticsearch;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchEngineException;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchFacetField;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchFacetField.Count;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResponse;
import edu.cornell.mannlib.vitro.webapp.modules.searchEngine.SearchResultDocument;
import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchFacetField;
import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchFacetField.BaseCount;
import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchResponse;
import edu.cornell.mannlib.vitro.webapp.searchengine.base.BaseSearchResultDocument;
/**
 * Elasticsearch sends a JSON response to a query. Parse it to a
 * SearchResponse.
 *
 * <p>
 * The response is read into a nested Map/List structure by Jackson, and the
 * relevant parts ("hits", "aggregations", "highlight") are picked out of it.
 * Missing or malformed sections are logged and skipped rather than treated
 * as fatal.
 */
class ResponseParser {
    private static final Log log = LogFactory.getLog(ResponseParser.class);

    // The entire query response, parsed from JSON into maps and lists.
    private final Map<String, Object> responseMap;

    // Populated by parse(): doc ID -> (field name -> highlight snippets)
    private Map<String, Map<String, List<String>>> highlightingMap;
    // Populated by parse(): facet name -> facet field with counts
    private Map<String, SearchFacetField> facetFieldsMap;
    // Populated by parse(): total number of matching documents ("hits.total")
    private long totalHits;
    // Populated by parse(): the documents in this page of results
    private List<SearchResultDocument> documentList;

    /**
     * @param responseString the raw JSON response body from Elasticsearch
     * @throws SearchEngineException if the response is not valid JSON
     */
    @SuppressWarnings("unchecked")
    public ResponseParser(String responseString) throws SearchEngineException {
        try {
            this.responseMap = new ObjectMapper().readValue(responseString,
                    HashMap.class);
        } catch (IOException e) {
            throw new SearchEngineException(e);
        }
    }

    /**
     * Walk the parsed response and assemble a SearchResponse from the
     * documents, highlights, and facets it contains.
     */
    public SearchResponse parse() {
        parseDocumentList();
        parseFacetFields();
        SearchResponse response = new BaseSearchResponse(highlightingMap,
                facetFieldsMap,
                new ElasticSearchResultDocumentList(documentList, totalHits));
        log.debug("ESQuery.ResponseParser.parse: " + response);
        return response;
    }

    /**
     * Each facet was requested under an aggregation named "facet_[fieldName]"
     * (presumably by the query builder -- TODO confirm against ESQuery), so
     * only aggregations with that prefix are treated as facets.
     */
    private void parseFacetFields() {
        facetFieldsMap = new HashMap<>();

        @SuppressWarnings("unchecked")
        Map<String, Map<String, Object>> aggregations = (Map<String, Map<String, Object>>) responseMap
                .get("aggregations");
        if (aggregations == null) {
            return;
        }

        for (Map.Entry<String, Map<String, Object>> entry : aggregations
                .entrySet()) {
            String key = entry.getKey();
            if (key.startsWith("facet_")) {
                // Strip the "facet_" prefix to recover the field name.
                String name = key.substring("facet_".length());
                parseFacetField(name, entry.getValue());
            }
        }
    }

    /**
     * Convert one aggregation's "buckets" list into a SearchFacetField,
     * where each bucket contributes a (value, count) pair.
     */
    private void parseFacetField(String name, Map<String, Object> facetMap) {
        @SuppressWarnings("unchecked")
        List<Map<String, Object>> bucketsList = (List<Map<String, Object>>) facetMap
                .get("buckets");
        if (bucketsList == null) {
            return;
        }

        List<Count> counts = new ArrayList<>();
        for (Map<String, Object> bucket : bucketsList) {
            counts.add(new BaseCount((String) bucket.get("key"),
                    (Integer) bucket.get("doc_count")));
        }

        facetFieldsMap.put(name, new BaseSearchFacetField(name, counts));
    }

    /**
     * Read "hits.total" and "hits.hits" from the response. Any missing
     * section leaves the document list empty (with a warning) rather than
     * failing the whole parse.
     */
    private void parseDocumentList() {
        documentList = new ArrayList<>();
        highlightingMap = new HashMap<>();

        @SuppressWarnings("unchecked")
        Map<String, Object> uberHits = (Map<String, Object>) responseMap
                .get("hits");
        if (uberHits == null) {
            log.warn("Didn't find a 'hits' field " + "in the query response: "
                    + responseMap);
            return;
        }

        Integer total = (Integer) uberHits.get("total");
        if (total == null) {
            log.warn("Didn't find a 'hits.total' field "
                    + "in the query response: " + responseMap);
            return;
        }
        // BUG FIX: 'total' was previously read but never stored, so the
        // response always reported zero total hits.
        totalHits = total;

        @SuppressWarnings("unchecked")
        List<Map<String, Object>> hits = (List<Map<String, Object>>) uberHits
                .get("hits");
        if (hits == null) {
            log.warn("Didn't find a 'hits.hits' field "
                    + "in the query response: " + responseMap);
            return;
        }

        parseDocuments(hits);
    }

    /**
     * Convert each hit into a result document, and record its highlight
     * snippets (if any) under the document's unique ID.
     */
    private void parseDocuments(List<Map<String, Object>> hits) {
        for (Map<String, Object> hit : hits) {
            SearchResultDocument doc = parseDocument(hit);
            if (doc != null) {
                documentList.add(doc);
                Map<String, List<String>> highlight = parseHighlight(hit);
                if (highlight != null) {
                    highlightingMap.put(doc.getUniqueId(), highlight);
                }
            }
        }
    }

    /**
     * Build a result document from a hit's "_id" and "_source" fields.
     *
     * @return the document, or null (with a warning) if either field is
     *         missing.
     */
    private SearchResultDocument parseDocument(Map<String, Object> hitMap) {
        @SuppressWarnings("unchecked")
        Map<String, Collection<Object>> sourceMap = (Map<String, Collection<Object>>) hitMap
                .get("_source");
        if (sourceMap == null) {
            log.warn("Didn't find a '_source' field in the hit: " + hitMap);
            return null;
        }

        String id = (String) hitMap.get("_id");
        if (id == null) {
            log.warn("Didn't find a '_id' field in the hit: " + hitMap);
            return null;
        }

        return new BaseSearchResultDocument(id, sourceMap);
    }

    /**
     * Extract the "ALLTEXT" highlight snippets from a hit.
     *
     * @return a single-entry map of "ALLTEXT" to its snippets, or null if
     *         the hit has no usable highlight section. A missing "highlight"
     *         field is normal (debug-level); a highlight without "ALLTEXT"
     *         is unexpected (warn-level).
     */
    private Map<String, List<String>> parseHighlight(
            Map<String, Object> hitMap) {
        @SuppressWarnings("unchecked")
        Map<String, List<String>> highlightMap = (Map<String, List<String>>) hitMap
                .get("highlight");
        if (highlightMap == null) {
            log.debug("Didn't find a 'highlight' field in the hit: " + hitMap);
            return null;
        }

        List<String> snippets = highlightMap.get("ALLTEXT");
        if (snippets == null) {
            log.warn("Didn't find a 'highlight.ALLTEXT' field in the hit: "
                    + hitMap);
            return null;
        }

        Map<String, List<String>> snippetMap = new HashMap<>();
        snippetMap.put("ALLTEXT", snippets);
        return snippetMap;
    }
}