|
|
|
|
|
|
|
|
|
|
|
""" |
|
The Gacha filter cleans out sentence pairs that have global character mean |
|
lower than a certain threshold. |
|
|
|
Use this cleaner to produce low quantity of high quality sentence pairs. |
|
|
|
It is an aggressive cleaner that cleaned out ~64% of the HindEnCorp during |
|
WMT14 when threshold is set at 20% (Tan and Pal, 2014); achieving lowest TER. |
|
(see http://www.aclweb.org/anthology/W/W14/W14-3323.pdf) |
|
|
|
This is inspired by the global character mean that is used in the Gale-Church |
|
algorithm (Gale and Church, 1993), the c variable in: |
|
|
|
delta = (l2-l1*c)/math.sqrt(l1*s2) |
|
|
|
where: |
|
- l1 = len(source_sentence) |
|
- l2 = len(target_sentence) |
|
- c = global mean, i.e. #char in source corpus / #char in target corpus |
|
- s2 = global variance, i.e. d ((l1 - l2)^2) / d (l1) |
|
|
|
(For details on Gale-Church, see http://www.aclweb.org/anthology/J93-1004.pdf) |
|
""" |
|
|
|
import io
import subprocess
import sys
|
|
|
|
|
# Terminal escape sequences wrapped around diagnostic messages by err_msg():
# `red` switches the colour on, `native` switches back to the default.
red = '\033[01;31m'
native = '\033[m'


def err_msg(txt):
    """Return `txt` enclosed between the `red` and `native` escape codes."""
    pieces = (red, txt, native)
    return ''.join(pieces)
|
|
|
|
|
def num_char(filename):
    """
    Return the number of characters in `filename`, as a float.

    The counting is delegated to the external ``wc -m`` command; the first
    whitespace-separated field of its output is converted to float so that
    callers can divide two counts without integer truncation.

    Raises subprocess.CalledProcessError if ``wc`` exits with a non-zero
    status (e.g. the file cannot be read), and OSError if ``wc`` itself
    cannot be executed.
    """
    # check_output waits for the child to exit and closes its pipe, so no
    # process or file descriptor is left dangling after the call.
    output = subprocess.check_output(["wc", "-m", filename])
    return float(output.split()[0])
|
|
|
|
|
def gacha_mean(sourcefile, targetfile):
    """
    Counts the global character mean between source and target language as
    in Gale-Church (1993).

    The mean is the character count of `sourcefile` divided by the character
    count of `targetfile`, both obtained through num_char(). Progress
    messages, including the computed mean, are written to stderr.

    Returns the mean as a float. Raises ZeroDivisionError when `targetfile`
    contains no characters.
    """
    # `sys` must be imported at module level: this function is also reachable
    # when the file is imported as a module, not only when run as a script.
    sys.stderr.write(err_msg('Calculating Gacha mean, please wait ...\n'))
    c = num_char(sourcefile) / num_char(targetfile)
    sys.stderr.write(err_msg('Gacha mean = ' + str(c) + '\n'))
    sys.stderr.write(err_msg('Filtering starts ...\n'))
    return c
|
|
|
|
|
def io_open(path):
    """Return a read-only text handle on `path`, decoded as UTF-8."""
    handle = io.open(path, mode='r', encoding='utf8')
    return handle
|
|
|
|
|
def main(sourcefile, targetfile, threshold=0.2):
    """
    Print the sentence pairs whose length ratio is close to the Gacha mean.

    The global mean `c` comes from gacha_mean(). `threshold` is converted to
    float (so it may be passed as a string) and defines the open interval
    ((1 - threshold) * c, (1 + threshold) * c). The two files are read line
    by line in lockstep, stopping at the end of the shorter one. A pair is
    kept when len(source line) / len(target line) falls strictly inside the
    interval; the lengths are those of the raw lines as read from the files,
    before stripping.

    Each kept pair is printed to stdout as the stripped source and target
    separated by a tab; the format string itself ends with a newline in
    addition to the one print() appends.
    """
    mean_ratio = gacha_mean(sourcefile, targetfile)

    tolerance = float(threshold)
    low = (1 - tolerance) * mean_ratio
    high = (1 + tolerance) * mean_ratio

    with io_open(sourcefile) as src_lines, io_open(targetfile) as trg_lines:
        for src_line, trg_line in zip(src_lines, trg_lines):
            ratio = len(src_line) / float(len(trg_line))
            if not (low < ratio < high):
                # Too far from the global mean: drop this pair.
                continue
            print(u"{}\t{}\n".format(src_line.strip(), trg_line.strip()))
|
|
|
|
|
if __name__ == '__main__':
    # Command-line entry point. Expects the source file, the target file and
    # an optional threshold, i.e. two or three arguments after the script
    # name; anything else prints the usage to stderr and exits with status 1.
    if not 3 <= len(sys.argv) <= 4:
        usage_msg = err_msg(
            "Usage: python %s srcfile trgfile (threshold)\n"
            % sys.argv[0])

        # The example text has no conversion specifier, so it must not be
        # %-formatted: doing so raised TypeError before any help was shown.
        example_msg = err_msg(
            "Example: "
            "gacha_cleaning.py ~/Europarl.de-en.de ~/Europarl.de-en.en 0.4\n")
        sys.stderr.write(usage_msg)
        sys.stderr.write(example_msg)
        sys.exit(1)

    # Arguments are forwarded as strings; main() converts the threshold.
    main(*sys.argv[1:])
|
|