[VIVO-1668] Efficiency improvements in AboxRecomputer (#108)

* Ensure we only use statements for the current individual in Recomputer
* Make AboxRecomputer more efficient when acquiring the list of individuals

Resolves: https://jira.duraspace.org/browse/VIVO-1668
This commit is contained in:
Graham Triggs 2019-02-12 14:40:02 +00:00 committed by Andrew Woods
parent 6e717446b4
commit c3e8cc739b

View file

@@ -14,6 +14,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Queue; import java.util.Queue;
import java.util.Set; import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@@ -259,8 +260,8 @@ public class ABoxRecomputer {
long start = System.currentTimeMillis(); long start = System.currentTimeMillis();
Model types = ModelFactory.createDefaultModel(); Model types = ModelFactory.createDefaultModel();
types.add(assertions.listStatements(null, RDF.type, (RDFNode) null)); types.add(assertions.listStatements(individual, RDF.type, (RDFNode) null));
types.add(rebuildModel.listStatements(null, RDF.type, (RDFNode) null)); types.add(rebuildModel.listStatements(individual, RDF.type, (RDFNode) null));
Model inferredTypes = rewriteInferences(getInferredTypes(individual, types, caches), aliasURI); Model inferredTypes = rewriteInferences(getInferredTypes(individual, types, caches), aliasURI);
rebuildModel.add(inferredTypes); rebuildModel.add(inferredTypes);
log.trace((System.currentTimeMillis() - start) + " to infer " + inferredTypes.size() + " types"); log.trace((System.currentTimeMillis() - start) + " to infer " + inferredTypes.size() + " types");
@@ -457,39 +458,46 @@ public class ABoxRecomputer {
return individualURIs; return individualURIs;
} }
protected void getIndividualURIs(String queryString, Queue<String> individuals) { protected void getIndividualURIs(String queryString, final Queue<String> individuals) {
int batchSize = 50000; final int batchSize = 50000;
int offset = 0; int offset = 0;
boolean done = false; final AtomicBoolean done = new AtomicBoolean(false);
while (!done) { while (!done.get()) {
String queryStr = queryString + " LIMIT " + batchSize + " OFFSET " + offset; String queryStr = queryString + " LIMIT " + batchSize + " OFFSET " + offset;
if(log.isDebugEnabled()) { if(log.isDebugEnabled()) {
log.debug(queryStr); log.debug(queryStr);
} }
ResultSet results = null;
try { try {
InputStream in = rdfService.sparqlSelectQuery(queryStr, RDFService.ResultFormat.JSON); rdfService.sparqlSelectQuery(queryStr, new ResultSetConsumer() {
results = ResultSetFactory.fromJSON(in); private int count = 0;
} catch (RDFServiceException e) {
throw new RuntimeException(e); @Override
} protected void processQuerySolution(QuerySolution qs) {
if (!results.hasNext()) { count++;
done = true; Resource resource = qs.getResource("s");
}
while (results.hasNext()) {
QuerySolution solution = results.next();
Resource resource = solution.getResource("s");
if ((resource != null) && !resource.isAnon()) { if ((resource != null) && !resource.isAnon()) {
individuals.add(resource.getURI()); individuals.add(resource.getURI());
} }
} }
@Override
protected void endProcessing() {
super.endProcessing();
if (count < batchSize) {
done.set(true);
}
}
});
} catch (RDFServiceException e) {
throw new RuntimeException(e);
}
if(log.isDebugEnabled()) { if(log.isDebugEnabled()) {
log.debug(individuals.size() + " in set"); log.debug(individuals.size() + " in set");
} }
offset += batchSize; offset += batchSize;
} }
} }
protected void addInferenceStatementsFor(String individualUri, Model addTo) throws RDFServiceException { protected void addInferenceStatementsFor(String individualUri, Model addTo) throws RDFServiceException {