#! /usr/bin/env python # -*- coding: utf8 -*- #=============================================================================== # Author: Walapa Muangjeen #=============================================================================== __version__ = '2.0' import sys import os import codecs import ConfigParser from optparse import OptionParser from copy import deepcopy class makemteval: def __init__(self, config=None): if isinstance(config,dict): self.config = deepcopy(config) else: self.config = { 'filein': None, 'fileout': None, 'settype': None, 'srclang': None, 'tstlang': None, 'setid': 'SetID', 'refid': 'RefID', 'sysid': 'SysID', 'docid': 'DocID', 'genre': 'Genre', } def parseini(self, config=None, inifile=None, section='set'): if inifile is None: inifile = os.path.abspath(os.path.dirname(sys.argv[0])) + os.sep + os.path.splitext(os.path.basename(sys.argv[0]))[0] + '.ini' if config is None: config = self.config cfgparser = ConfigParser.RawConfigParser() if not cfgparser.has_section(section): cfgparser.add_section(section) for option in config: cfgparser.set(section, option, config[option]) cfgparser.read(inifile) for option in cfgparser.options(section): config[option] = cfgparser.get(section, option) return deepcopy(config) def writesgm( self, config ): try: filein = codecs.open(os.path.abspath(os.path.expanduser(config['filein'])), "r", 'utf-8-sig') except IOError, ErrorMessage: sys.stderr.write("\n: %s\n"%(ErrorMessage)) sys.stderr.write(": End Program\n") return True if __name__ == "__main__": sys.stderr.write( ": opened \"%s\" for reading\n"%(os.path.basename( config['filein'] ))) lines = [l.replace('"','\"').replace(''','\'').replace('>','>').replace('<','<').replace('&','&') for l in filein.read().splitlines()] filein.close() lines = [l.replace('&','&').replace('<','<').replace('>','>').replace('\'',''').replace('\"','"') for l in lines] if __name__ == "__main__": sys.stderr.write(": closed \"%s\"\n"%(os.path.basename( config['filein'] ))) try: fileout = codecs.open(os.path.abspath(os.path.expanduser(config['fileout'])), "w", 'utf8') except IOError, ErrorMessage: sys.stderr.write("\n: %s\n"%(ErrorMessage)) sys.stderr.write(": End Program\n") return True if __name__ == "__main__": sys.stderr.write(": opened \"%s\" for writing\n"%(os.path.basename( config['fileout'] ))) contents = [] contents.append('') contents.append('') contents.append('') if config['settype'] == "srcset": contents.append("<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\">"%(config)) elif config['settype'] == "refset": contents.append('<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\" trglang=\"%(tstlang)s\" refid=\"%(refid)s\">'%(config)) elif config['settype'] == "tstset": contents.append('<%(settype)s setid=\"%(setid)s\" srclang=\"%(srclang)s\" trglang=\"%(tstlang)s\" sysid=\"%(sysid)s\" sysbleu=\"%(sysbleu)s\" language=\"%(language)s\">'%(config)) else: fileout.close() os.unlink(os.path.abspath(os.path.expanduser(config['fileout']))) sys.stderr.write("\n: Invalid \"settype\" value %s\n"%(config['settype'])) sys.stderr.write(": End Program\n") return True contents.append(''%('' if config['settype'] == "srcset" else 'sysid=\"%s\" '%(config['sysid']),config['docid'],config['genre'])) if __name__ == "__main__": sys.stderr.write(": added header\n") for i in range(len(lines)): contents.append(' %s '%(i+1,lines[i])) if __name__ == "__main__": sys.stderr.write(": added %d lines\n"%(i+1)) contents.append('') contents.append(''%(config['settype'])) contents.append('') if __name__ == "__main__": sys.stderr.write(": added footer\n") fileout.write('%s\n'%('\n'.join(contents))) ferror = fileout.close() if __name__ == "__main__": sys.stderr.write(": closed \"" + os.path.basename( config['fileout'] ) + "\"\n") return ferror def parsecmd( config = {} ): optparser = OptionParser() optparser.add_option( "-i", "--filein", dest = "filein", default = config["filein"], help = "UNC path to tokenized input file (required)") optparser.add_option( "-o", "--fileout", dest = "fileout", default = config["fileout"], help = "UNC path of fileout file (required)") optparser.add_option( "-s", "--srclang", dest = "srclang", default = config["srclang"], help = "2-letter code for source language (required)") optparser.add_option( "-t", "--tstlang", dest = "tstlang", default = config["tstlang"], help = "2-letter code for test language (required)") optparser.add_option( "-T", "--settype", dest = "settype", default = config["settype"], help = "Use XML tag: srcset, tstset or refset (required)") optparser.add_option( "-e", "--setid", dest = "setid", default = config["setid"], help = "Test set ID (default \""+config["setid"]+"\")") optparser.add_option( "-d", "--docid", dest = "docid", default = config["docid"], help = "Document ID (default \""+config["docid"]+"\")") optparser.add_option( "-r", "--refid", dest = "refid", default = config["refid"], help = "Reference ID (default \""+config["refid"]+"\")") optparser.add_option( "-S", "--sysid", dest = "sysid", default = config["sysid"], help = "System ID used to make the test set (default \""+config["sysid"]+"\")") optparser.add_option( "-g", "--genre", dest = "genre", default = config["genre"], help = "Genre of the test set and system ID (default \""+config["genre"]+"\")") options, commands = optparser.parse_args() missing = [] for k,v in { "Error: missing --filein" : options.filein, "Error: missing --fileout": options.fileout, "Error: missing --settype": options.settype, "Error: missing --srclang": options.srclang, "Error: missing --tstlang": options.tstlang }.items(): if not v: missing.append(k) if missing: for msg in missing: sys.stderr.write('%s\n'%(msg)) optparser.print_help() exit(1) config['filein'] = options.filein config['fileout'] = options.fileout config['settype'] = options.settype config['setid'] = options.setid config['srclang'] = options.srclang config['tstlang'] = options.tstlang config['refid'] = options.refid config['sysid'] = options.sysid config['docid'] = options.docid config['genre'] = options.genre sys.stderr.write(": Configuration complete\n") return licensetxt=u'''CorpusFiltergraph™ Copyright © 2010-2014 Precision Translation Tools Co., Ltd. This module is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 2.1 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program. If not, see http://www.gnu.org/licenses/. For more information, please contact Precision Translation Tools Pte at: http://www.precisiontranslationtools.com''' def main(): mksgm = makemteval() mksgm.parseini(mksgm.config) parsecmd(mksgm.config) mksgm.writesgm(mksgm.config) return 0 if __name__ == "__main__": sys.exit(main())