sakharamg's picture
Uploading all files
158b61b
#!/usr/bin/env python
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
from collections import Counter
import logging
import sys
LOG = logging.getLogger(__name__)
BOS = "<s>"
EOS = "</s>"
UNK = "<unk>"
def replace_tags(tokens, tags, vocab):
for i, t in enumerate(tokens):
if t not in vocab:
if i < len(tags):
tokens[i] = tags[i]
else:
print "Error: missing tags for index i:", i
print ' '.join(tokens)
print ' '.join(tags)
tokens[i] = UNK
def replace_unks(tokens, vocab):
for i, t in enumerate(tokens):
if t not in vocab:
tokens[i] = UNK
def numberize(line, m, n, svocab, tvocab):
line = line.split()
source_words = line[:2 * m + 1]
target_words = line[-n:]
line = ' '.join([str(svocab[item]) for item in source_words]) + ' '
line += ' '.join([str(tvocab[item]) for item in target_words]) + '\n'
return line
def get_ngrams(corpus_stem, align_file, tagged_stem, svocab, tvocab, slang,
tlang, m, n, ofh):
"""
m - source context
n - target context
returns set of tags used
"""
tags = Counter()
sfh = open(corpus_stem + "." + slang)
tfh = open(corpus_stem + "." + tlang)
afh = open(align_file)
fhs = [sfh, tfh, afh]
if tagged_stem:
fhs.append(open(tagged_stem + "." + slang))
fhs.append(open(tagged_stem + "." + tlang))
count = 0
ngrams = 0
LOG.info("Extracting ngrams")
for lines in zip(*fhs):
stokens = lines[0][:-1].split()
ttokens = lines[1][:-1].split()
stokens.append(EOS)
ttokens.append(EOS)
if tagged_stem:
stags = lines[3][:-1].split()
ttags = lines[4][:-1].split()
stags.append(EOS)
ttags.append(EOS)
tags.update(stags)
tags.update(ttags)
replace_tags(stokens, stags, svocab)
replace_tags(ttokens, ttags, tvocab)
else:
replace_unks(stokens, svocab)
replace_unks(ttokens, tvocab)
# List aligns for each target.
# Note: align specifies source -> target
target_aligns = [[] for t in range(len(ttokens))]
for atoken in lines[2][:-1].split():
spos, tpos = atoken.split("-")
spos, tpos = int(spos), int(tpos)
target_aligns[tpos].append(spos)
# EOS alignment.
target_aligns[-1] = [len(stokens) - 1]
for tpos, spos_list in enumerate(target_aligns):
# Affiliation heuristics - see Devlin t al. p1371
if not spos_list:
# tpos has no alignment, look right, then left, then
# right-right, then left-left etc.
rpos = tpos + 1
lpos = tpos - 1
while rpos < len(ttokens) or lpos >= 0:
if rpos < len(ttokens) and target_aligns[rpos]:
spos_list = target_aligns[rpos]
break
if lpos >= 0 and target_aligns[lpos]:
spos_list = target_aligns[lpos]
break
rpos += 1
lpos -= 1
if not spos_list:
raise Exception(
"No alignments in sentence \nSRC: " +
lines[0][:-1] + "\nTGT: " + lines[1][:-1])
midpos = (len(spos_list) - 1) / 2
spos = sorted(spos_list)[midpos]
# source-context, target-context, predicted word
for i in range(max(0, m - spos)):
print>>ofh, BOS,
# print [spos-m/2,spos+m/2+1], stokens[spos-m/2:spos+m/2+1]
print>>ofh, " ".join(
[s for s in stokens[max(0, spos - m):spos + m + 1]]),
for i in range(max(0, spos + m + 1 - len(stokens))):
print>>ofh, EOS,
for i in range(max(0, n - (tpos + 1))):
print>>ofh, BOS,
print>>ofh, " ".join(
[t for t in ttokens[max(0, tpos + 1 - n):tpos + 1]]),
print>>ofh
ngrams += 1
count += 1
if count % 1000 == 0:
sys.stderr.write(".")
if count % 50000 == 0:
sys.stderr.write(" [%d]\n" % count)
ofh.close()
sys.stderr.write("\n")
LOG.info("Extracted %d ngrams" % ngrams)
return tags