VIVO-75 Ruby scripts to help with good-enough translation.

One set will take an ontology file, extract the labels for Google translation, and create RDF for those translated labels. The other set does the same for a properties file.
This commit is contained in:
j2blake 2013-06-11 13:03:03 -04:00
parent 2c49931672
commit c3d7651c82
6 changed files with 575 additions and 0 deletions

View file

@ -0,0 +1,33 @@
#!/usr/bin/ruby
=begin
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
=end
require 'rubygems'
require 'rdf'
require 'rdf/rdfxml'
require 'rdf/ntriples'
include RDF
# Shared helper for the label scripts: loads an RDF file into a graph and runs a
# query against it, returning the filtered solutions ordered by :prop.
class LabelCommon
  # Load rdf_file into a graph.
  #
  # A block given here is remembered in @filter (a default lambda is stored when no
  # block is given), but note that #process only uses the block passed to it.
  def initialize(rdf_file, &filter)
    @filter = filter || lambda{true}
    @graph = Graph.load(rdf_file)
  end

  # Execute query against the loaded graph, keep only the solutions accepted by the
  # given block, and return them ordered by their :prop binding.
  def process(query, &filter)
    matches = query.execute(@graph)
    matches.filter!(&filter)
    matches.order(:prop)
  end
end

View file

@ -0,0 +1,113 @@
#!/usr/bin/ruby
=begin
--------------------------------------------------------------------------------
A utility that reads an RDF file, builds a model, sorts all of the URIs that have
labels, and associates those URIs with the labels in a supplied text file, one line
per label.
These labels are assigned the language specified on the command line, and the
resulting RDF statements are written to the specified output file as N-Triples.
On the command line provide the path to the RDF file, the path to the labels file,
the desired language/locale (like 'es' or 'es-ES'), and the path to the output file.
Add 'ok' as a fifth argument to overwrite an existing output file. E.g.:
label_inserter.rb ../../vivo-core-1.5-annotations.rdf labels.file es-ES labels_es.n3
--------------------------------------------------------------------------------
=end
$: << File.dirname(File.expand_path(__FILE__))
require 'rubygems'
require 'rdf'
require 'label_common'
include RDF
# Pairs the sorted, label-bearing URIs of an RDF file with the lines of a labels
# file (one label per line) and writes the resulting language-tagged rdfs:label
# statements to an output file as N-Triples.
#
# NOTE: although this is the label inserter, the class is named LabelStripper; the
# name is kept because the standalone code at the bottom of the file uses it.
class LabelStripper
  # ------------------------------------------------------------------------------------
  private
  # ------------------------------------------------------------------------------------

  #
  # Parse the arguments and complain if they don't make sense.
  #
  # args - [rdf_file, labels_input_file, locale, n3_output_file], optionally followed
  #        by 'ok' to permit overwriting an existing output file.
  #
  # Returns rdf_file, labels_input_file, locale, n3_output_file.
  # Raises RuntimeError with an explanatory message if anything is wrong.
  #
  def sanity_check_arguments(args)
    raise "usage is: label_inserter.rb <rdf_file> <labels_input_file> <locale> <n3_output_file> [ok]" unless (4..5).include?(args.length)

    ok_flag = args[4]
    if ok_flag.nil?
      ok = false
    elsif ok_flag.downcase == 'ok'
      ok = true
    else
      raise "fifth argument, if present, must be 'ok'"
    end

    rdf_file = args[0]
    raise "File '#{rdf_file}' does not exist." unless File.exist?(rdf_file)

    labels_input_file = args[1]
    raise "File '#{labels_input_file}' does not exist." unless File.exist?(labels_input_file)

    locale = args[2]
    # \A and \z anchor the whole string; ^ and $ would accept "es\nanything".
    raise "Locale should be like 'ab' or 'ab-CD'." unless /\A[a-z]{2}(-[A-Z]{2})?\z/ =~ locale

    n3_output_file = args[3]
    raise "File '#{n3_output_file}' already exists. specify 'ok' to overwrite it." if File.exist?(n3_output_file) && !ok

    return rdf_file, labels_input_file, locale, n3_output_file
  end

  # ------------------------------------------------------------------------------------
  public
  # ------------------------------------------------------------------------------------

  # args - the command-line arguments; see sanity_check_arguments.
  def initialize(args)
    @rdf_file, @labels_input_file, @locale, @n3_output_file = sanity_check_arguments(args)
  end

  # Build the language-tagged label statements and write them to the output file.
  #
  # The optional block is called with each query solution and decides whether that
  # solution is kept; without a block every solution is kept.
  #
  # Raises RuntimeError if the number of labels in the labels file does not match the
  # number of solutions.
  def process(&filter)
    # The filter is yielded one solution, so the default must accept one argument;
    # a zero-argument lambda raises ArgumentError when it is used as a block.
    filter ||= lambda { |_solution| true }

    query = Query.new({
      :prop => {
        RDFS.label => :label,
      }
    })
    solutions = LabelCommon.new(@rdf_file).process(query, &filter)

    labels = IO.readlines(@labels_input_file)
    raise "Number of labels (#{labels.length}) doesn't match number of URIs (#{solutions.length})" unless labels.length == solutions.length

    graph = Graph.new
    solutions.zip(labels).each do |solution, label|
      translated = Literal.new(label.chomp, :language => @locale)
      graph << Statement.new(solution.prop, RDFS.label, translated)
    end

    File.open(@n3_output_file, 'w') do |f|
      f.puts graph.dump(:ntriples)
    end
  end
end
#
#
# ------------------------------------------------------------------------------------
# Standalone calling.
#
# Do this if this program was called from the command line. That is, if the command
# expands to the path of this file.
# ------------------------------------------------------------------------------------
#
# Keep only solutions whose URI is in the VIVO core namespace and whose label is
# not blank.
vivo_filter = lambda do |solution|
  in_vivo_core = solution.prop.start_with?("http://vivoweb.org/ontology/core#")
  in_vivo_core && !solution.label.to_s.strip.empty?
end
# Only run when this file itself is the program being executed.
invoked_directly = File.expand_path($0) == File.expand_path(__FILE__)
LabelStripper.new(ARGV).process(&vivo_filter) if invoked_directly

View file

@ -0,0 +1,95 @@
#!/usr/bin/ruby
=begin
--------------------------------------------------------------------------------
A utility that reads an RDF file, builds a model, sorts all of the URIs that have
labels, and produces a file of those labels, one per line. The idea is that this
file could be translated, and the result could be put into RDF by label_inserter.rb
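This requires the RDF.rb gem: sudo gem install rdf
On the command line provide the path to the RDF file and the path to the labels
output file. Add 'ok' as a third argument to overwrite an existing output file. E.g.:
label_stripper.rb '../../vivo-core-1.5-annotations.rdf' labels.file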
--------------------------------------------------------------------------------
=end
$: << File.dirname(File.expand_path(__FILE__))
require 'rubygems'
require 'rdf'
require 'label_common'
include RDF
# Extracts the labels of the sorted, label-bearing URIs of an RDF file and writes
# them to a text file, one label per line.
class LabelStripper
  # ------------------------------------------------------------------------------------
  private
  # ------------------------------------------------------------------------------------

  #
  # Parse the arguments and complain if they don't make sense.
  #
  # args - [rdf_file, labels_output_file], optionally followed by 'ok' to permit
  #        overwriting an existing output file.
  #
  # Returns rdf_file, labels_output_file.
  # Raises RuntimeError with an explanatory message if anything is wrong.
  #
  def sanity_check_arguments(args)
    raise "usage is: label_stripper.rb <rdf_file> <labels_output_file> [ok]" unless (2..3).include?(args.length)

    ok_flag = args[2]
    if ok_flag.nil?
      ok = false
    elsif ok_flag.downcase == 'ok'
      ok = true
    else
      raise "third argument, if present, must be 'ok'"
    end

    rdf_file = args[0]
    raise "File '#{rdf_file}' does not exist." unless File.exist?(rdf_file)

    labels_output_file = args[1]
    raise "File '#{labels_output_file}' already exists. specify 'ok' to overwrite it." if File.exist?(labels_output_file) && !ok

    return rdf_file, labels_output_file
  end

  # ------------------------------------------------------------------------------------
  public
  # ------------------------------------------------------------------------------------

  # args - the command-line arguments; see sanity_check_arguments.
  def initialize(args)
    @rdf_file, @labels_output_file = sanity_check_arguments(args)
  end

  # Write the label of every selected solution to the output file, one per line.
  #
  # The optional block is called with each query solution and decides whether that
  # solution is kept; without a block every solution is kept.
  def process(&filter)
    # The filter is yielded one solution, so the default must accept one argument;
    # a zero-argument lambda raises ArgumentError when it is used as a block.
    filter ||= lambda { |_solution| true }

    query = Query.new({
      :prop => {
        RDFS.label => :label,
      }
    })
    solutions = LabelCommon.new(@rdf_file).process(query, &filter)

    File.open(@labels_output_file, 'w') do |f|
      solutions.each do |s|
        f.puts s.label
      end
    end
  end
end
#
#
# ------------------------------------------------------------------------------------
# Standalone calling.
#
# Do this if this program was called from the command line. That is, if the command
# expands to the path of this file.
# ------------------------------------------------------------------------------------
#
# Keep only solutions whose URI is in the VIVO core namespace and whose label is
# not blank.
vivo_filter = lambda do |solution|
  in_vivo_core = solution.prop.start_with?("http://vivoweb.org/ontology/core#")
  in_vivo_core && !solution.label.to_s.strip.empty?
end
# Only run when this file itself is the program being executed.
invoked_directly = File.expand_path($0) == File.expand_path(__FILE__)
LabelStripper.new(ARGV).process(&vivo_filter) if invoked_directly

View file

@ -0,0 +1,80 @@
#!/usr/bin/ruby
=begin
--------------------------------------------------------------------------------
Some common routines used both by property_stripper and property_inserter
--------------------------------------------------------------------------------
=end
# A problem noticed while reading a properties file: the number of the line where it
# was noticed, and a message describing it.
#
# This class was originally called Warning, but Ruby 2.4 and later define a built-in
# Warning module, so `class Warning` fails there with "Warning is not a class".
class PropertyWarning
  attr_reader :line
  attr_reader :message

  def initialize(line, message)
    @line = line
    @message = message
  end
end

# Keep the old name working on Rubies that have no built-in Warning constant.
Warning = PropertyWarning unless defined?(Warning)

# One key/value pair from a properties file, with the (1-based) line it came from.
# The value is writable so that it can be replaced by a translation.
class Property
  attr_reader :line
  attr_reader :key
  attr_accessor :value

  def initialize(line, key, value)
    @line = line
    @key = key
    @value = value
  end
end

# Reads a properties file into a hash of key => Property (see #properties), and
# collects a PropertyWarning for every duplicated key (see #warnings).
class PropertiesFile
  attr_reader :properties
  attr_reader :warnings

  # Join each line that ends with a backslash to the line that follows it.
  #
  # The array is modified in place and returned. A joined-in line is replaced by an
  # empty string rather than removed, so that line numbers stay the same.
  def join_continuation_lines(lines)
    # Work from the bottom up, so that a run of continued lines collapses into the first.
    (lines.size - 1).downto(0) do |i|
      match = /(.*)\\$/.match(lines[i])
      next unless match

      following = lines[i + 1]
      if following.nil?
        # A trailing backslash on the very last line has nothing to join with.
        lines[i] = match[1]
      else
        lines[i] = match[1] + following.lstrip
        lines[i + 1] = ''
      end
    end
    lines
  end

  # Parse the lines into @properties, recording a warning (and keeping the first
  # value) when a key appears more than once. The lines themselves are not modified.
  def read_properties(lines)
    lines.each_with_index do |raw_line, index|
      ln = index + 1
      line = raw_line.strip

      # ignore blank lines, and lines starting with '#' or '!'.
      next if line.empty? || line.start_with?('#', '!')

      pair = /(.*?)\s*[=:]\s*(.*)/.match(line)
      if pair
        # key and value are separated by '=' or ':' and optional whitespace.
        key = pair[1].strip
        value = pair[2]
      else
        # No '=' or ':' means that the value is empty.
        key = line
        value = ''
      end

      dupe = @properties[key]
      if dupe
        @warnings << PropertyWarning.new(ln, "Key '#{key}' is duplicated on line #{dupe.line}")
      else
        @properties[key] = Property.new(ln, key, value)
      end
    end
  end

  # Read and parse the properties file at path.
  def initialize(path)
    @properties = {}
    @warnings = []
    lines = IO.readlines(path)
    lines = join_continuation_lines(lines)
    read_properties(lines)
  end
end

View file

@ -0,0 +1,165 @@
#!/usr/bin/ruby
=begin
--------------------------------------------------------------------------------
Builds a property file, using an existing property file as a template, but
getting the property values from a text file of translated text
(see property_stripper.rb) and optionally a partial file of translated properties.
So, if you have a template file of English-language properties (e.g. all.properties),
this will read the file into a properties structure. The text file of translated
values is presumed to have one value per line, associated with the alphabetized
list of keys from the template file. The translated values will replace the original
values, with the exception that any value that starts with @@file will not be
replaced.
If a partially translated file is provided, it will be read and used to replace
any translated values from the text file, which are assumed to be weaker. Note
that this is true of @@file values as well, which are presumed to be corrected
for the language.
Any @@file values that are not overridden by the partial translation will result in
a warning to stderr.
Finally, the template file is processed again, replacing the existing values with
the translated values, but keeping the same comment and spacing structure.
On the command line provide the path to the template file, the text file,
optionally the partial translation, and the path to the output file. Add 'ok' as the
last argument to overwrite an existing output file. E.g.:
property_inserter.rb '../../all.properties' translated.txt '../../all_es.properties' 'all_es.properties'
--------------------------------------------------------------------------------
=end
$: << File.dirname(File.expand_path(__FILE__))
require "property_common"
# Builds a translated properties file: the template file supplies the keys, comments
# and layout; the translated-values file supplies one value per line for the sorted
# keys; an optional partial translation overrides individual values.
class PropertyInserter
  # ------------------------------------------------------------------------------------
  private
  # ------------------------------------------------------------------------------------

  #
  # Parse the arguments and complain if they don't make sense.
  #
  # args - [template_file, translated_values_file, (partial_translation), output_file],
  #        optionally followed by 'ok' to permit overwriting an existing output file.
  #        The array is not modified.
  #
  # Returns template_file, translated_values_file, partial_translation (or nil),
  # output_file. Raises RuntimeError with an explanatory message if anything is wrong.
  #
  def sanity_check_arguments(args)
    usage = "usage is: property_inserter.rb <template_file> <translated_values_file> [partial_translation] <output_file> [ok]"
    raise usage unless (3..5).include?(args.length)

    # Work on a copy so that the caller's array (usually ARGV) is left alone.
    remaining = args.dup
    ok = remaining.last.downcase == 'ok'
    remaining.pop if ok
    output_file = remaining.pop

    # With 'ok' and the output file removed there must be two or three paths left;
    # otherwise 'ok' or the output file would be mistaken for an input file.
    raise usage unless (2..3).include?(remaining.length)

    raise "File '#{output_file}' already exists. specify 'ok' to overwrite it." if File.exist?(output_file) && !ok

    template_file = remaining[0]
    raise "File '#{template_file}' does not exist." unless File.exist?(template_file)

    translated_values_file = remaining[1]
    raise "File '#{translated_values_file}' does not exist." unless File.exist?(translated_values_file)

    partial_translation = remaining[2]
    raise "File '#{partial_translation}' does not exist." if partial_translation && !File.exist?(partial_translation)

    return template_file, translated_values_file, partial_translation, output_file
  end

  # Read the template file and return its hash of key => property.
  def read_template_file()
    PropertiesFile.new(@template_file).properties
  end

  # Replace the template values with the lines of the translated-values file, which
  # are matched to the alphabetically sorted keys. Values starting with "@@file" are
  # left alone.
  #
  # Raises RuntimeError if the number of lines differs from the number of properties.
  def read_and_merge_translated_values()
    lines = IO.readlines(@translated_values_file)
    raise "Number of lines in the translated values file (#{lines.size}) does not match the number of properties in the template file (#{@properties_map.size})." unless lines.size == @properties_map.size

    count = 0
    @properties_map.keys.sort.zip(lines) do |key, translated|
      property = @properties_map[key]
      next if property.value.start_with?("@@file")

      # Drop the line terminator so that the stored value is just the text.
      property.value = translated.chomp
      count += 1
    end
    puts "Merged #{count} translated values."
  end

  # Override values with those from the partial translation, if one was supplied.
  # Keys that the template does not contain are reported on stderr and skipped.
  def read_and_merge_partial_translation()
    count = 0
    @partial_map = {}
    if @partial_translation
      @partial_map = PropertiesFile.new(@partial_translation).properties
      @partial_map.keys.each do |key|
        property = @properties_map[key]
        if property.nil?
          $stderr.puts "Warning: key '#{key}' in the partial translation is not in the template file; ignored."
          next
        end
        property.value = @partial_map[key].value
        count += 1
      end
    end
    puts "Overrode #{count} from partial translation."
    warn_about_untranslated_file_values(@partial_map)
  end

  # Report on stderr every "@@file" value that the partial translation did not
  # override, as described in the header of this file.
  def warn_about_untranslated_file_values(partial_map)
    @properties_map.keys.sort.each do |key|
      next if partial_map.key?(key)

      value = @properties_map[key].value
      next unless value.start_with?("@@file")

      $stderr.puts "Warning: '#{key}' still has the template value '#{value}'; it is not overridden by a partial translation."
    end
  end

  # Write the output file: the template's lines, with each value replaced by the
  # merged value for its key. Comments, blank lines and spacing are preserved.
  def write_result()
    template_lines = merge_continuation_lines(IO.readlines(@template_file))
    File.open(@output_file, 'w') do |f|
      template_lines.each do |line|
        # Classify on the stripped text, the same way the properties are read, so
        # that an indented comment is not mistaken for a key/value line.
        stripped = line.strip
        if stripped.empty? || stripped.start_with?('#', '!')
          # copy blank lines, and lines starting with '#' or '!'.
          f.puts line
          next
        end

        pair = /(.*?)(\s*[=:]\s*)(.*)/.match(line)
        if pair
          # key and value are separated by '=' or ':' and optional whitespace.
          property = @properties_map[pair[1].strip]
          if property
            f.puts "#{pair[1]}#{pair[2]}#{property.value}"
          else
            f.puts line
          end
        else
          # No '=' or ':' means that the value was empty.
          key = stripped
          if @properties_map[key]
            f.puts "#{key} = #{@properties_map[key].value}"
          else
            f.puts line
          end
        end
      end
    end
  end

  # Join each line that ends with a backslash to the line that follows it, removing
  # the joined-in line. The array is modified in place and returned.
  def merge_continuation_lines(lines)
    # Work from the bottom up, so that a run of continued lines collapses into the first.
    (lines.size - 1).downto(0) do |i|
      match = /(.*)\\$/.match(lines[i])
      next unless match

      following = lines[i + 1]
      if following.nil?
        # A trailing backslash on the very last line has nothing to join with.
        lines[i] = match[1]
      else
        lines[i] = match[1] + following.lstrip
        lines.delete_at(i + 1)
      end
    end
    lines
  end

  # ------------------------------------------------------------------------------------
  public
  # ------------------------------------------------------------------------------------

  # args - the command-line arguments; see sanity_check_arguments.
  def initialize(args)
    @template_file, @translated_values_file, @partial_translation, @output_file = sanity_check_arguments(args)
  end

  # Read the template, merge in the translations, and write the output file.
  def process()
    @properties_map = read_template_file()
    read_and_merge_translated_values()
    read_and_merge_partial_translation()
    write_result()
    puts "Wrote #{@properties_map.length} values to '#{@output_file}'"
  end
end
#
#
# ------------------------------------------------------------------------------------
# Standalone calling.
#
# Do this if this program was called from the command line. That is, if the command
# expands to the path of this file.
# ------------------------------------------------------------------------------------
#
# Only run when this file itself is the program being executed.
invoked_directly = File.expand_path($0) == File.expand_path(__FILE__)
PropertyInserter.new(ARGV).process() if invoked_directly

View file

@ -0,0 +1,89 @@
#!/usr/bin/ruby
=begin
--------------------------------------------------------------------------------
Read a property file, sort its keys alphabetically and write their values to
a text file, one value per line.
The idea is that this file could be translated and the result could be used to
create a new property file with property_inserter.rb
On the command line provide the path to the properties file. E.g.:
property_stripper.rb '../../all.properties' output_file
--------------------------------------------------------------------------------
=end
$: << File.dirname(File.expand_path(__FILE__))
require "property_common"
# Writes the values of a properties file to a text file, one value per line, in
# the alphabetical order of their keys.
class PropertyStripper
  # ------------------------------------------------------------------------------------
  private
  # ------------------------------------------------------------------------------------

  #
  # Validate the command-line arguments: a properties file that must exist, an
  # output file that must not exist unless a third argument of 'ok' is given.
  # Returns the two paths; raises RuntimeError with a message otherwise.
  #
  def sanity_check_arguments(args)
    raise "usage is: property_stripper.rb <properties_file> <values_output_file> [ok]" unless (2..3).include?(args.length)

    flag = args[2]
    raise "third argument, if present, must be 'ok'" unless flag.nil? || flag.downcase == 'ok'
    overwrite = !flag.nil?

    source, target = args[0], args[1]
    raise "File '#{source}' does not exist." unless File.exist?(source)
    raise "File '#{target}' already exists. specify 'ok' to overwrite it." if File.exist?(target) && !overwrite

    [source, target]
  end

  # Parse the properties file and return its hash of key => property.
  def read_properties_file(properties_file)
    parsed = PropertiesFile.new(properties_file)
    parsed.properties
  end

  # Write each property's value on its own line, ordered by key.
  def write_values(values_output_file, properties)
    ordered_keys = properties.keys.sort
    File.open(values_output_file, 'w') do |out|
      ordered_keys.each { |name| out.puts properties[name].value }
    end
  end

  # ------------------------------------------------------------------------------------
  public
  # ------------------------------------------------------------------------------------

  # args - the command-line arguments; see sanity_check_arguments.
  def initialize(args)
    checked = sanity_check_arguments(args)
    @properties_file = checked[0]
    @values_output_file = checked[1]
  end

  # Read the properties, write their values, and report how many were written.
  def process()
    @properties = read_properties_file(@properties_file)
    write_values(@values_output_file, @properties)
    puts "Wrote #{@properties.length} values to '#{@values_output_file}'"
  end
end
#
#
# ------------------------------------------------------------------------------------
# Standalone calling.
#
# Do this if this program was called from the command line. That is, if the command
# expands to the path of this file.
# ------------------------------------------------------------------------------------
#
# Only run when this file itself is the program being executed.
invoked_directly = File.expand_path($0) == File.expand_path(__FILE__)
PropertyStripper.new(ARGV).process() if invoked_directly