|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__version__ = '2.0' |
|
|
|
import sys |
|
import os |
|
import codecs |
|
import ConfigParser |
|
from optparse import OptionParser |
|
from copy import deepcopy |
|
|
|
|
|
class makemteval: |
|
|
|
def __init__(self, config=None): |
|
|
|
if isinstance(config,dict): |
|
self.config = deepcopy(config) |
|
else: |
|
self.config = { |
|
'filein': None, |
|
'fileout': None, |
|
'settype': None, |
|
'srclang': None, |
|
'tstlang': None, |
|
'setid': 'SetID', |
|
'refid': 'RefID', |
|
'sysid': 'SysID', |
|
'docid': 'DocID', |
|
'genre': 'Genre', |
|
} |
|
|
|
|
|
def parseini(self, config=None, inifile=None, section='set'): |
|
|
|
if inifile is None: |
|
inifile = os.path.abspath(os.path.dirname(sys.argv[0])) + os.sep + os.path.splitext(os.path.basename(sys.argv[0]))[0] + '.ini' |
|
|
|
if config is None: |
|
config = self.config |
|
|
|
cfgparser = ConfigParser.RawConfigParser() |
|
|
|
if not cfgparser.has_section(section): |
|
cfgparser.add_section(section) |
|
|
|
for option in config: |
|
cfgparser.set(section, option, config[option]) |
|
|
|
cfgparser.read(inifile) |
|
|
|
for option in cfgparser.options(section): |
|
config[option] = cfgparser.get(section, option) |
|
|
|
return deepcopy(config) |
|
|
|
|
|
def writesgm( self, config ): |
|
|
|
try: |
|
filein = codecs.open(os.path.abspath(os.path.expanduser(config['filein'])), "r", 'utf-8-sig') |
|
except IOError, ErrorMessage: |
|
sys.stderr.write("\n: %s\n"%(ErrorMessage)) |
|
sys.stderr.write(": End Program\n") |
|
return True |
|
|
|
if __name__ == "__main__": |
|
sys.stderr.write( ": opened \"%s\" for reading\n"%(os.path.basename( config['filein'] ))) |
|
|
|
lines = [l.replace('"','\"').replace(''','\'').replace('>','>').replace('<','<').replace('&','&') for l in filein.read().splitlines()] |
|
filein.close() |
|
lines = [l.replace('&','&').replace('<','<').replace('>','>').replace('\'',''').replace('\"','"') for l in lines] |
|
|
|
if __name__ == "__main__": |
|
sys.stderr.write(": closed \"%s\"\n"%(os.path.basename( config['filein'] ))) |
|
|
|
try: |
|
fileout = codecs.open(os.path.abspath(os.path.expanduser(config['fileout'])), "w", 'utf8') |
|
except IOError, ErrorMessage: |
|
sys.stderr.write("\n: %s\n"%(ErrorMessage)) |
|
sys.stderr.write(": End Program\n") |
|
return True |
|
|
|
if __name__ == "__main__": |
|
sys.stderr.write(": opened \"%s\" for writing\n"%(os.path.basename( config['fileout'] ))) |
|
|
|
contents = [] |
|
contents.append('<?xml version=\"1.0\" encoding=\"UTF-8\"?>') |
|
contents.append('<!DOCTYPE mteval SYSTEM \"ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-xml-v1.3.dtd\">') |
|
contents.append('<mteval>') |
|
|
|
if config['settype'] == "srcset": |
|
contents.append("<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\">"%(config)) |
|
|
|
elif config['settype'] == "refset": |
|
contents.append('<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\" trglang=\"%(tstlang)s\" refid=\"%(refid)s\">'%(config)) |
|
|
|
elif config['settype'] == "tstset": |
|
contents.append('<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\" trglang=\"%(tstlang)s\" sysid=\"%(sysid)s\" sysbleu=\"%(sysbleu)s\" language=\"%(language)s\">'%(config)) |
|
|
|
else: |
|
fileout.close() |
|
os.unlink(os.path.abspath(os.path.expanduser(config['fileout']))) |
|
sys.stderr.write("\n: Invalid \"settype\" value %s\n"%(config['settype'])) |
|
sys.stderr.write(": End Program\n") |
|
return True |
|
|
|
contents.append('<DOC %sdocid=\"%s\" genre=\"%s\">'%('' if config['settype'] == "srcset" else 'sysid=\"%s\" '%(config['sysid']),config['docid'],config['genre'])) |
|
|
|
if __name__ == "__main__": |
|
sys.stderr.write(": added header\n") |
|
|
|
for i in range(len(lines)): |
|
contents.append('<seg id=\"%d\"> %s </seg>'%(i+1,lines[i])) |
|
|
|
if __name__ == "__main__": |
|
sys.stderr.write(": added %d lines\n"%(i+1)) |
|
|
|
contents.append('</DOC>') |
|
contents.append('</%s>'%(config['settype'])) |
|
contents.append('</mteval>') |
|
|
|
if __name__ == "__main__": |
|
sys.stderr.write(": added footer\n") |
|
|
|
fileout.write('%s\n'%('\n'.join(contents))) |
|
ferror = fileout.close() |
|
|
|
if __name__ == "__main__": |
|
sys.stderr.write(": closed \"" + os.path.basename( config['fileout'] ) + "\"\n") |
|
|
|
return ferror |
|
|
|
|
|
def parsecmd( config = {} ): |
|
|
|
optparser = OptionParser() |
|
|
|
optparser.add_option( |
|
"-i", "--filein", dest = "filein", default = config["filein"], |
|
help = "UNC path to tokenized input file (required)") |
|
|
|
optparser.add_option( |
|
"-o", "--fileout", dest = "fileout", default = config["fileout"], |
|
help = "UNC path of fileout file (required)") |
|
|
|
optparser.add_option( |
|
"-s", "--srclang", dest = "srclang", default = config["srclang"], |
|
help = "2-letter code for source language (required)") |
|
|
|
optparser.add_option( |
|
"-t", "--tstlang", dest = "tstlang", default = config["tstlang"], |
|
help = "2-letter code for test language (required)") |
|
|
|
optparser.add_option( |
|
"-T", "--settype", dest = "settype", default = config["settype"], |
|
help = "Use XML tag: srcset, tstset or refset (required)") |
|
|
|
optparser.add_option( |
|
"-e", "--setid", dest = "setid", default = config["setid"], |
|
help = "Test set ID (default \""+config["setid"]+"\")") |
|
|
|
optparser.add_option( |
|
"-d", "--docid", dest = "docid", default = config["docid"], |
|
help = "Document ID (default \""+config["docid"]+"\")") |
|
|
|
optparser.add_option( |
|
"-r", "--refid", dest = "refid", default = config["refid"], |
|
help = "Reference ID (default \""+config["refid"]+"\")") |
|
|
|
optparser.add_option( |
|
"-S", "--sysid", dest = "sysid", default = config["sysid"], |
|
help = "System ID used to make the test set (default \""+config["sysid"]+"\")") |
|
|
|
optparser.add_option( |
|
"-g", "--genre", dest = "genre", default = config["genre"], |
|
help = "Genre of the test set and system ID (default \""+config["genre"]+"\")") |
|
|
|
options, commands = optparser.parse_args() |
|
|
|
missing = [] |
|
for k,v in { "Error: missing --filein" : options.filein, |
|
"Error: missing --fileout": options.fileout, |
|
"Error: missing --settype": options.settype, |
|
"Error: missing --srclang": options.srclang, |
|
"Error: missing --tstlang": options.tstlang |
|
}.items(): |
|
if not v: |
|
missing.append(k) |
|
|
|
if missing: |
|
for msg in missing: |
|
sys.stderr.write('%s\n'%(msg)) |
|
optparser.print_help() |
|
exit(1) |
|
|
|
config['filein'] = options.filein |
|
config['fileout'] = options.fileout |
|
config['settype'] = options.settype |
|
config['setid'] = options.setid |
|
config['srclang'] = options.srclang |
|
config['tstlang'] = options.tstlang |
|
config['refid'] = options.refid |
|
config['sysid'] = options.sysid |
|
config['docid'] = options.docid |
|
config['genre'] = options.genre |
|
|
|
sys.stderr.write(": Configuration complete\n") |
|
|
|
return |
|
|
|
|
|
licensetxt=u'''CorpusFiltergraph™ |
|
Copyright © 2010-2014 Precision Translation Tools Co., Ltd. |
|
|
|
This module is free software: you can redistribute it and/or modify |
|
it under the terms of the GNU Lesser General Public License as published by |
|
the Free Software Foundation, either version 2.1 of the License, or |
|
(at your option) any later version. |
|
|
|
This program is distributed in the hope that it will be useful, |
|
but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
GNU Lesser General Public License for more details. |
|
|
|
You should have received a copy of the GNU Lesser General Public License |
|
along with this program. If not, see http://www.gnu.org/licenses/. |
|
|
|
For more information, please contact Precision Translation Tools Pte |
|
at: http://www.precisiontranslationtools.com''' |
|
|
|
|
|
def main(): |
|
|
|
mksgm = makemteval() |
|
|
|
mksgm.parseini(mksgm.config) |
|
|
|
parsecmd(mksgm.config) |
|
|
|
mksgm.writesgm(mksgm.config) |
|
|
|
return 0 |
|
|
|
|
|
if __name__ == "__main__": |
|
sys.exit(main()) |
|
|