Revise the language filtering from RDF. Create unit tests.

This commit is contained in:
j2blake 2013-02-07 11:58:10 -05:00
parent 866c4435b6
commit 522068edd8
3 changed files with 570 additions and 50 deletions

View file

@ -11,7 +11,6 @@ import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -42,26 +41,20 @@ public class LanguageFilteringRDFService implements RDFService {
this.langs = normalizeLangs(langs);
}
/**
 * Reduces the preferred language tags to their base languages.
 *
 * Each tag is lower-cased, cut at the first "-", and the resulting base
 * language is kept the first time it is seen, so the result holds distinct
 * base languages in order of first appearance.
 *
 * @param langs preferred language tags, most preferred first
 * @return the distinct lower-cased base languages, in first-seen order
 */
private List<String> normalizeLangs(List<String> langs) {
    List<String> baseLangs = new ArrayList<String>();
    for (String tag : langs) {
        String base = StringUtils.lowerCase(tag).split("-")[0];
        boolean alreadyListed = baseLangs.contains(base);
        if (!alreadyListed) {
            baseLangs.add(base);
        }
    }
    return baseLangs;
}
/**
 * Builds the working list of preferred languages: every tag exactly as
 * supplied, followed by the base language (the text before the first "-")
 * of each tag, unless that base language is already in the list.
 *
 * @param langs preferred language tags, most preferred first
 * @return a new list holding the supplied tags followed by any missing base
 *         languages
 */
private List<String> normalizeLangs(List<String> langs) {
    log.debug("Preferred languages:" + langs);

    List<String> result = new ArrayList<String>(langs);
    Iterator<String> tags = langs.iterator();
    while (tags.hasNext()) {
        String primary = tags.next().split("-")[0];
        if (result.indexOf(primary) < 0) {
            // Base languages are appended, so they rank after every
            // tag the caller listed explicitly.
            result.add(primary);
        }
    }

    log.debug("Normalized languages:" + result);
    return result;
}
@Override
public boolean changeSetUpdate(ChangeSet changeSet)
@ -106,6 +99,7 @@ public class LanguageFilteringRDFService implements RDFService {
}
private Model filterModel(Model m) {
log.debug("filterModel");
List<Statement> retractions = new ArrayList<Statement>();
StmtIterator stmtIt = m.listStatements();
while (stmtIt.hasNext()) {
@ -117,6 +111,7 @@ public class LanguageFilteringRDFService implements RDFService {
continue;
}
Collections.sort(candidatesForRemoval, new StatementSortByLang());
log.debug("sorted statements: " + showSortedStatements(candidatesForRemoval));
Iterator<Statement> candIt = candidatesForRemoval.iterator();
String langRegister = null;
boolean chuckRemaining = false;
@ -142,9 +137,27 @@ public class LanguageFilteringRDFService implements RDFService {
return m;
}
@Override
/**
 * Debug helper: renders the language tag of each statement's object, in
 * list order. A null entry is shown as "null stmt" and a statement whose
 * object is not a literal as "not literal".
 *
 * @param candidatesForRemoval the statements to describe
 * @return the collected labels, formatted by {@code List.toString()}
 */
private String showSortedStatements(List<Statement> candidatesForRemoval) {
    List<String> labels = new ArrayList<String>();
    Iterator<Statement> statements = candidatesForRemoval.iterator();
    while (statements.hasNext()) {
        Statement statement = statements.next();
        if (statement == null) {
            labels.add("null stmt");
            continue;
        }
        RDFNode object = statement.getObject();
        labels.add(object.isLiteral()
                ? object.asLiteral().getLanguage()
                : "not literal");
    }
    return labels.toString();
}
@Override
public InputStream sparqlSelectQuery(String query,
ResultFormat resultFormat) throws RDFServiceException {
log.debug("sparqlSelectQuery: " + query.replaceAll("\\s+", " "));
ResultSet resultSet = ResultSetFactory.fromJSON(
s.sparqlSelectQuery(query, RDFService.ResultFormat.JSON));
List<QuerySolution> solnList = getSolutionList(resultSet);
@ -178,6 +191,7 @@ public class LanguageFilteringRDFService implements RDFService {
continue;
}
Collections.sort(candidatesForRemoval, new RowIndexedLiteralSortByLang());
log.debug("sorted RowIndexedLiterals: " + showSortedRILs(candidatesForRemoval));
Iterator<RowIndexedLiteral> candIt = candidatesForRemoval.iterator();
String langRegister = null;
boolean chuckRemaining = false;
@ -223,7 +237,15 @@ public class LanguageFilteringRDFService implements RDFService {
return new ByteArrayInputStream(outputStream.toByteArray());
}
private class RowIndexedLiteral {
/**
 * Debug helper: renders the language tag of each row-indexed literal, in
 * list order.
 *
 * @param candidatesForRemoval the row-indexed literals to describe
 * @return the language tags, formatted by {@code List.toString()}
 */
private String showSortedRILs(List<RowIndexedLiteral> candidatesForRemoval) {
    List<String> tags = new ArrayList<String>();
    Iterator<RowIndexedLiteral> entries = candidatesForRemoval.iterator();
    while (entries.hasNext()) {
        Literal literal = entries.next().getLiteral();
        tags.add(literal.getLanguage());
    }
    return tags.toString();
}
private class RowIndexedLiteral {
private Literal literal;
private int index;
@ -324,37 +346,44 @@ public class LanguageFilteringRDFService implements RDFService {
}
private class LangSort {
// any inexact match is worse than any exact match
private int inexactMatchPenalty = langs.size();
// no language is worse than any inexact match (unless it is one of the preferred languages)
private int noLanguage = 2 * inexactMatchPenalty;
// no match is worse than no language.
private int noMatch = noLanguage+1;
/**
 * Orders two language tags for sorting; a negative result places t1lang
 * ahead of t2lang.
 *
 * NOTE(review): as shown here the body contains two implementations back to
 * back: an if/else chain in which every branch returns, followed by a single
 * return built on languageIndex(). That trailing return cannot be reached
 * after the chain. This looks like the removed and added lines of a diff
 * rendered together -- confirm which version is live.
 */
protected int compareLangs(String t1lang, String t2lang) {
// Lower-case both tags before comparing (StringUtils.lowerCase presumably
// passes null through, since null is tested right after -- verify).
t1lang = StringUtils.lowerCase(t1lang);
t2lang = StringUtils.lowerCase(t2lang);
if ( t1lang == null && t2lang == null) {
// Two missing tags are equal.
return 0;
} else if (t1lang == null) {
// A missing tag sorts after a present one.
return 1;
} else if (t2lang == null) {
return -1;
} else {
// Position of each tag in the preferred list, or -1 when not listed.
int t1langPref = langs.indexOf(t1lang);
int t2langPref = langs.indexOf(t2lang);
if (t1langPref == -1 && t2langPref == -1) {
// Neither tag is preferred: the empty tag sorts ahead of a non-empty one.
if ("".equals(t1lang) && "".equals(t2lang)) {
return 0;
} else if ("".equals(t1lang) && !("".equals(t2lang))) {
return -1;
} else {
// Reached whenever t1lang is non-empty, including when t2lang is also
// non-empty, so two unlisted non-empty tags compare as 1 in either order.
return 1;
}
} else if (t1langPref > -1 && t2langPref == -1) {
// A preferred tag sorts ahead of one that is not preferred.
return -1;
} else if (t1langPref == -1 && t2langPref > -1) {
return 1;
} else {
// Both preferred: the earlier position in the list wins.
return t1langPref - t2langPref;
}
}
// NOTE(review): every branch above returns, so this statement is never
// reached as written -- see the method comment.
return languageIndex(t1lang) - languageIndex(t2lang);
}
/**
 * Ranks a language tag against the preferred-language list; a lower value
 * means a better match. In order of preference the result is:
 * <pre>
 * exact match:   position of the tag in langs
 * partial match: position of the tag's primary subtag in langs,
 *                plus inexactMatchPenalty
 * language-free: noLanguage (null is treated like the empty tag)
 * no match:      noMatch
 * </pre>
 * An empty tag that is itself listed in langs counts as an exact match.
 *
 * @param lang the language tag of a literal; may be null
 * @return the rank of the tag, as described above
 */
private int languageIndex(String lang) {
    // A missing tag is handled exactly like the empty tag.
    if (lang == null) {
        lang = "";
    }

    int index = langs.indexOf(lang);
    if (index >= 0) {
        return index;
    }

    // Partial match on the primary subtag, i.e. the text before the first
    // "-". This mirrors how normalizeLangs() derives base languages, so
    // three-letter primary subtags (e.g. "fil-PH") match as well, and a
    // tag without a region is never matched on its first two letters only.
    int dash = lang.indexOf('-');
    if (dash > 0) {
        index = langs.indexOf(lang.substring(0, dash));
        if (index >= 0) {
            return index + inexactMatchPenalty;
        }
    }

    if (lang.isEmpty()) {
        return noLanguage;
    }

    return noMatch;
}
}
private class RowIndexedLiteralSortByLang extends LangSort implements Comparator<RowIndexedLiteral> {

View file

@ -0,0 +1,314 @@
/* $This file is distributed under the terms of the license in /doc/license.txt$ */
package edu.cornell.mannlib.vitro.webapp.rdfservice.filter;
import static org.junit.Assert.*;
import static org.junit.Assert.assertEquals;
import java.lang.reflect.Constructor;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.Level;
import org.junit.Before;
import org.junit.Test;
import stubs.com.hp.hpl.jena.rdf.model.LiteralStub;
import com.hp.hpl.jena.rdf.model.Literal;
import edu.cornell.mannlib.vitro.testing.AbstractTestClass;
/**
 * Exercises the language ordering used by {@link LanguageFilteringRDFService}
 * by sorting row-indexed literals with its private comparator.
 *
 * This is the matching order we expect to see, best first:
 * <pre>
 * exact match to preferred, by order.
 * partial match to preferred, by order.
 * vanilla or null (no language)
 * no match
 * </pre>
 */
public class LanguageFilteringRDFServiceTest extends AbstractTestClass {
    private static final Log log = LogFactory
            .getLog(LanguageFilteringRDFServiceTest.class);

    // Binary names of the private inner classes that are reached by reflection.
    private static final String COLLATOR_CLASSNAME = "edu.cornell.mannlib.vitro.webapp.rdfservice.filter.LanguageFilteringRDFService$RowIndexedLiteralSortByLang";
    private static final String RIL_CLASSNAME = "edu.cornell.mannlib.vitro.webapp.rdfservice.filter.LanguageFilteringRDFService$RowIndexedLiteral";

    private LanguageFilteringRDFService filteringRDFService;
    private List<Object> listOfRowIndexedLiterals;
    private int literalIndex;

    // Scenario description, filled in by each test before calling a driver.
    private List<String> preferredLanguages;
    private List<String> availableLanguages;
    private List<String> expectedSortOrders;

    @Before
    public void setup() {
        setLoggerLevel(this.getClass(), Level.DEBUG);
        setLoggerLevel(LanguageFilteringRDFService.class, Level.DEBUG);
    }

    // ----------------------------------------------------------------------
    // The tests
    // ----------------------------------------------------------------------

    @Test
    public void singleMatch() {
        preferredLanguages = list("en-US");
        availableLanguages = list("en-US");
        expectedSortOrders = list("en-US");
        testArbitraryOrder();
    }

    @Test
    public void singleNoMatch() {
        preferredLanguages = list("en-US");
        availableLanguages = list("es-MX");
        expectedSortOrders = list("es-MX");
        testArbitraryOrder();
    }

    @Test
    public void doubleMatch() {
        preferredLanguages = list("en-US", "es-MX");
        availableLanguages = list("en-US", "es-MX");
        expectedSortOrders = list("en-US", "es-MX");
        testBothWays();
    }

    @Test
    public void noMatches() {
        preferredLanguages = list("es-MX");
        availableLanguages = list("en-US", "fr-FR");
        expectedSortOrders = list("en-US", "fr-FR");
        testArbitraryOrder();
    }

    @Test
    public void partialMatches() {
        preferredLanguages = list("en", "es");
        availableLanguages = list("en-US", "es-MX");
        expectedSortOrders = list("en-US", "es-MX");
        testBothWays();
    }

    @Test
    public void matchIsBetterThanNoMatch() {
        preferredLanguages = list("en-US", "es-MX");
        availableLanguages = list("en-US", "fr-FR");
        expectedSortOrders = list("en-US", "fr-FR");
        testBothWays();
    }

    @Test
    public void matchIsBetterThanPartialMatch() {
        preferredLanguages = list("es-ES", "en-US");
        availableLanguages = list("en-US", "es-MX");
        expectedSortOrders = list("en-US", "es-MX");
        testBothWays();
    }

    @Test
    public void exactMatchIsBetterThanPartialMatch() {
        preferredLanguages = list("es");
        availableLanguages = list("es","es-MX");
        expectedSortOrders = list("es", "es-MX");
        testBothWays();
    }

    @Test
    public void matchIsBetterThanVanilla() {
        preferredLanguages = list("en-US");
        availableLanguages = list("en-US", "");
        expectedSortOrders = list("en-US", "");
        testBothWays();
    }

    @Test
    public void partialMatchIsBetterThanVanilla() {
        preferredLanguages = list("es-MX");
        availableLanguages = list("es-ES", "");
        expectedSortOrders = list("es-ES", "");
        testBothWays();
    }

    @Test
    public void vanillaIsBetterThanNoMatch() {
        preferredLanguages = list("es-MX");
        availableLanguages = list("en-US", "");
        expectedSortOrders = list("", "en-US");
        testBothWays();
    }

    @Test
    public void omnibus() {
        preferredLanguages = list("es-MX", "es", "en-UK", "es-PE", "fr");
        availableLanguages = list("es-MX", "es", "fr", "es-ES", "fr-FR", "", "de-DE");
        expectedSortOrders = list("es-MX", "es", "fr", "es-ES", "fr-FR", "", "de-DE");
        testBothWays();
    }

    /**
     * Original test plan; each item has a corresponding test above.
     *
     * <pre>
     * single match
     * single not match
     * double match both ways
     * double, one match, both ways
     * double, no match
     * double, match only languages, both ways
     * double, one match full, one match language, both ways
     * double, one vanilla, one match, both ways
     * double, one vanilla, one no match, both ways
     * </pre>
     */

    // ----------------------------------------------------------------------
    // Helper methods
    // ----------------------------------------------------------------------

    /**
     * Sorts the available languages from both the given and the reversed
     * starting order, and requires the exact expected order each time.
     */
    private void testBothWays() {
        createLanguageFilter();

        buildListOfLiterals();
        sortListOfLiterals();
        assertLanguageOrder("sort literals");

        buildReversedListOfLiterals();
        sortListOfLiterals();
        assertLanguageOrder("sort reversed literals");
    }

    /**
     * Sorts from both starting orders, but only requires that the same set
     * of languages comes out; their relative order is not checked.
     */
    private void testArbitraryOrder() {
        createLanguageFilter();

        buildListOfLiterals();
        sortListOfLiterals();
        assertLanguages("sort literals");

        buildReversedListOfLiterals();
        sortListOfLiterals();
        assertLanguages("sort reversed literals");
    }

    /** Collects the arguments into a new, mutable list. */
    private List<String> list(String... strings) {
        List<String> result = new ArrayList<String>();
        Collections.addAll(result, strings);
        return result;
    }

    /** Creates the service under test; no inner RDFService is supplied. */
    private void createLanguageFilter() {
        filteringRDFService = new LanguageFilteringRDFService(null,
                preferredLanguages);
    }

    /** One row-indexed literal per available language, in the given order. */
    private void buildListOfLiterals() {
        List<Object> literals = new ArrayList<Object>();
        for (String language : availableLanguages) {
            literals.add(buildRowIndexedLiteral(language));
        }
        listOfRowIndexedLiterals = literals;
    }

    /** Same literals as {@link #buildListOfLiterals()}, in reverse order. */
    private void buildReversedListOfLiterals() {
        List<Object> literals = new ArrayList<Object>();
        for (String language : availableLanguages) {
            literals.add(buildRowIndexedLiteral(language));
        }
        Collections.reverse(literals);
        listOfRowIndexedLiterals = literals;
    }

    /** Sorts the current literals in place with the service's comparator. */
    private void sortListOfLiterals() {
        log.debug("before sorting: "
                + languagesFromLiterals(listOfRowIndexedLiterals));
        Collections.sort(listOfRowIndexedLiterals,
                buildRowIndexedLiteralSortByLang());
    }

    /** Requires the sorted languages to equal the expected order exactly. */
    private void assertLanguageOrder(String message) {
        List<String> expected = expectedSortOrders;
        log.debug("expected order: " + expected);

        List<String> actual = languagesFromLiterals(listOfRowIndexedLiterals);
        log.debug("actual order: " + actual);

        assertEquals(message, expected, actual);
    }

    /** Requires the same set of languages, regardless of order. */
    private void assertLanguages(String message) {
        Set<String> expected = new HashSet<String>(expectedSortOrders);
        log.debug("expected languages: " + expected);

        List<String> sorted = languagesFromLiterals(listOfRowIndexedLiterals);
        Set<String> actual = new HashSet<String>(sorted);
        log.debug("actual languages: " + actual);

        assertEquals(message, expected, actual);
    }

    /** Extracts the language tag of each row-indexed literal, in order. */
    private List<String> languagesFromLiterals(List<Object> literals) {
        List<String> languages = new ArrayList<String>();
        for (int i = 0; i < literals.size(); i++) {
            languages.add(getLanguageFromRowIndexedLiteral(literals.get(i)));
        }
        return languages;
    }

    // ----------------------------------------------------------------------
    // Reflection methods to get around "private" declarations.
    // ----------------------------------------------------------------------

    /**
     * Instantiates the private inner class RowIndexedLiteral. The first
     * constructor argument is the enclosing service instance, as required
     * for a non-static inner class.
     */
    private Object buildRowIndexedLiteral(String language) {
        try {
            Constructor<?> ctor = Class.forName(RIL_CLASSNAME)
                    .getDeclaredConstructor(LanguageFilteringRDFService.class,
                            Literal.class, int.class);
            ctor.setAccessible(true);
            return ctor.newInstance(filteringRDFService,
                    new LiteralStub(language), literalIndex++);
        } catch (Exception e) {
            throw new RuntimeException(
                    "Could not create a row-indexed literal", e);
        }
    }

    /** Instantiates the private inner comparator class by reflection. */
    @SuppressWarnings("unchecked")
    private Comparator<Object> buildRowIndexedLiteralSortByLang() {
        try {
            Constructor<?> ctor = Class.forName(COLLATOR_CLASSNAME)
                    .getDeclaredConstructor(LanguageFilteringRDFService.class);
            ctor.setAccessible(true);
            Object collator = ctor.newInstance(filteringRDFService);
            return (Comparator<Object>) collator;
        } catch (Exception e) {
            throw new RuntimeException("Could not create a collator", e);
        }
    }

    /** Calls getLiteral() on a RowIndexedLiteral and returns its language. */
    private String getLanguageFromRowIndexedLiteral(Object ril) {
        try {
            Method getter = ril.getClass().getDeclaredMethod("getLiteral");
            getter.setAccessible(true);
            Literal literal = (Literal) getter.invoke(ril);
            return literal.getLanguage();
        } catch (Exception e) {
            throw new RuntimeException(
                    "Could not get the Literal from a RowIndexedLiteral", e);
        }
    }
}

View file

@ -0,0 +1,177 @@
package stubs.com.hp.hpl.jena.rdf.model;
import com.hp.hpl.jena.datatypes.RDFDatatype;
import com.hp.hpl.jena.graph.Node;
import com.hp.hpl.jena.rdf.model.Literal;
import com.hp.hpl.jena.rdf.model.Model;
import com.hp.hpl.jena.rdf.model.RDFNode;
import com.hp.hpl.jena.rdf.model.RDFVisitor;
import com.hp.hpl.jena.rdf.model.Resource;
/**
 * A minimal {@link Literal} for unit tests: it carries only a language tag.
 *
 * Only implemented what I needed so far. Every other method throws an
 * {@link UnsupportedOperationException} naming the method that was called,
 * so a test that strays outside the stubbed surface fails loudly.
 */
public class LiteralStub implements Literal {
    // ----------------------------------------------------------------------
    // Stub infrastructure
    // ----------------------------------------------------------------------

    /** The language tag reported by {@link #getLanguage()}. */
    private final String language;

    /**
     * @param language the language tag to report; stored and returned as-is
     */
    public LiteralStub(String language) {
        this.language = language;
    }

    /**
     * Builds the exception thrown by every un-implemented method.
     * UnsupportedOperationException is a RuntimeException, so callers that
     * catch RuntimeException are unaffected.
     */
    private static UnsupportedOperationException notImplemented(
            String methodName) {
        return new UnsupportedOperationException("LiteralStub." + methodName
                + "() not implemented.");
    }

    // ----------------------------------------------------------------------
    // Stub methods
    // ----------------------------------------------------------------------

    @Override
    public boolean isLiteral() {
        return true;
    }

    @Override
    public boolean isAnon() {
        return false;
    }

    @Override
    public boolean isResource() {
        return false;
    }

    @Override
    public boolean isURIResource() {
        return false;
    }

    @Override
    public Literal asLiteral() {
        return this;
    }

    @Override
    public Resource asResource() {
        // A literal is never a resource.
        throw new ClassCastException();
    }

    @Override
    public String getLanguage() {
        return language;
    }

    // ----------------------------------------------------------------------
    // Un-implemented methods
    // ----------------------------------------------------------------------

    @Override
    public <T extends RDFNode> T as(Class<T> view) {
        throw notImplemented("as");
    }

    @Override
    public <T extends RDFNode> boolean canAs(Class<T> arg0) {
        throw notImplemented("canAs");
    }

    @Override
    public Model getModel() {
        throw notImplemented("getModel");
    }

    @Override
    public Object visitWith(RDFVisitor arg0) {
        throw notImplemented("visitWith");
    }

    @Override
    public Node asNode() {
        throw notImplemented("asNode");
    }

    @Override
    public boolean getBoolean() {
        throw notImplemented("getBoolean");
    }

    @Override
    public byte getByte() {
        throw notImplemented("getByte");
    }

    @Override
    public char getChar() {
        throw notImplemented("getChar");
    }

    @Override
    public RDFDatatype getDatatype() {
        throw notImplemented("getDatatype");
    }

    @Override
    public String getDatatypeURI() {
        throw notImplemented("getDatatypeURI");
    }

    @Override
    public double getDouble() {
        throw notImplemented("getDouble");
    }

    @Override
    public float getFloat() {
        throw notImplemented("getFloat");
    }

    @Override
    public int getInt() {
        throw notImplemented("getInt");
    }

    @Override
    public String getLexicalForm() {
        throw notImplemented("getLexicalForm");
    }

    @Override
    public long getLong() {
        throw notImplemented("getLong");
    }

    @Override
    public short getShort() {
        throw notImplemented("getShort");
    }

    @Override
    public String getString() {
        throw notImplemented("getString");
    }

    @Override
    public Object getValue() {
        throw notImplemented("getValue");
    }

    @Override
    public Literal inModel(Model arg0) {
        throw notImplemented("inModel");
    }

    @Override
    public boolean isWellFormedXML() {
        throw notImplemented("isWellFormedXML");
    }

    @Override
    public boolean sameValueAs(Literal arg0) {
        throw notImplemented("sameValueAs");
    }
}