sakharamg's picture
Uploading all files
158b61b
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
"""Create a test corpus, using a previously pruned vocabulary."""
import logging
import optparse
import os
import os.path
import sys
import extract
def read_vocab(filename, offset=0):
vocab = {}
for i, line in enumerate(open(filename)):
vocab[line.strip()] = i + offset
return vocab, i + offset
def main():
logging.basicConfig(
format='%(asctime)s %(levelname)s: %(message)s',
datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
parser = optparse.OptionParser("%prog [options]")
parser.add_option(
"-e", "--target-language", type="string", dest="target_language")
parser.add_option(
"-f", "--source-language", type="string", dest="source_language")
parser.add_option(
"-c", "--corpus", type="string", dest="corpus_stem")
parser.add_option(
"-t", "--tagged-corpus", type="string", dest="tagged_stem")
parser.add_option(
"-a", "--align", type="string", dest="align_file")
parser.add_option(
"-w", "--working-dir", type="string", dest="working_dir")
parser.set_defaults(
target_language="en",
source_language="de",
corpus_stem="test",
align_file="test.align",
working_dir="working")
options, args = parser.parse_args(sys.argv)
if not os.path.exists(options.working_dir):
raise Exception(
"Working directory '%s' not found" % options.working_dir)
m, n = None, None
for line in open(options.working_dir + "/info"):
name, value = line[:-1].split()
if name == "m":
m = int(value)
if name == "n":
n = int(value)
if m is None or n is None:
raise Exception("Info file is incomplete.")
tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
svocab, offset = read_vocab(
options.working_dir + "/vocab.source", offset + 1)
file_stem = os.path.basename(options.corpus_stem)
ofh = open(options.working_dir + "/" + file_stem + ".ngrams", "w")
extract.get_ngrams(
options.corpus_stem,
options.align_file,
options.tagged_stem,
svocab,
tvocab,
options.source_language,
options.target_language,
m,
n,
ofh)
numberized_file = options.working_dir + "/" + file_stem + ".numberized"
ngrams_file_handle = open(
os.path.join(options.working_dir, file_stem + ".ngrams"), 'r')
numberized_file_handle = open(numberized_file, 'w')
# Numberize the file.
for line in ngrams_file_handle:
numberized_file_handle.write(extract.numberize(
line, m, n, svocab, tvocab))
numberized_file_handle.close()
ngrams_file_handle.close()
if __name__ == "__main__":
main()