|
import gzip |
|
import argparse |
|
from string import punctuation |
|
|
|
def len_no_punc(s, punc): |
|
return len([ch for ch in s if ch in punc]) |
|
|
|
def filter_overpunc(len_npunc, len_sen): |
|
return len_npunc < 0.5*len_sen |
|
|
|
def main(args): |
|
punc = punctuation + "β|β" |
|
print('Processing file {}'.format(args.input)) |
|
with gzip.open(args.input, 'rt', encoding=args.encoding) as tsv: |
|
with open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc: |
|
with open(args.bitext + '.' + args.tgt_lang, 'wt', encoding=args.encoding) as ftgt: |
|
line = tsv.readline() |
|
fields = line.split('\t') |
|
|
|
src, tgt = fields[1], fields[2] |
|
|
|
nchar_npunc_src = len_no_punc(src, punc) |
|
nchar_npunc_tgt = len_no_punc(tgt, punc) |
|
|
|
if filter_overpunc(nchar_npunc_src, len(src)) and filter_overpunc(nchar_npunc_tgt, len(tgt)): |
|
fsrc.write(src.strip() + '\n') |
|
ftgt.write(tgt.strip() + '\n') |
|
|
|
if __name__ == '__main__': |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("--input", required=True, type=str) |
|
parser.add_argument('--encoding', default='utf-8', help='character encoding for input/output') |
|
parser.add_argument('--bitext', type=str, required=True, help='language direction') |
|
parser.add_argument('--src-lang', type=str, required=True, help='Source language') |
|
parser.add_argument('--tgt-lang', type=str, required=True, help='Target language') |
|
main(parser.parse_args()) |
|
|