|
import argparse |
|
|
|
parser = argparse.ArgumentParser() |
|
parser.add_argument('--src', type=str, help='Source language') |
|
parser.add_argument('--tgt', type=str, help='Target language') |
|
parser.add_argument('--src-file', type=str, help='Input source file') |
|
parser.add_argument('--tgt-file', type=str, help='Input target file') |
|
parser.add_argument('--src-output-file', type=str, help='Output source file') |
|
parser.add_argument('--tgt-output-file', type=str, help='Output target file') |
|
parser.add_argument('--threshold', type=float, default=0.5, help='Threshold') |
|
parser.add_argument('--threshold-character', type=str, default=']', help='Threshold character') |
|
parser.add_argument('--histograms', type=str, help='Path to histograms') |
|
|
|
args = parser.parse_args() |
|
|
|
|
|
def read_hist(f): |
|
ch = [] |
|
for line in f: |
|
c = line[0] |
|
if c == args.threshold_character: |
|
break |
|
ch.append(c) |
|
return ch |
|
|
|
|
|
with(open("{}/{}".format(args.histograms, args.src), 'r', encoding='utf8')) as f: |
|
ch1 = read_hist(f) |
|
|
|
with(open("{}/{}".format(args.histograms, args.tgt), 'r', encoding='utf8')) as f: |
|
ch2 = read_hist(f) |
|
|
|
print("Accepted characters for {}: {}".format(args.src, ch1)) |
|
print("Accepted characters for {}: {}".format(args.tgt, ch2)) |
|
|
|
with open(args.src_file, 'r', encoding='utf8') as fs1, open(args.tgt_file, 'r', encoding='utf8') as fs2, open(args.src_output_file, 'w', encoding='utf8') as fos1, open(args.tgt_output_file, 'w', encoding='utf8') as fos2: |
|
ls1 = fs1.readline() |
|
ls2 = fs2.readline() |
|
|
|
while ls1 or ls2: |
|
cnt1 = len([c for c in ls1.strip() if c in ch1]) |
|
cnt2 = len([c for c in ls2.strip() if c in ch2]) |
|
|
|
if cnt1 / len(ls1) > args.threshold and cnt2 / len(ls2) > args.threshold: |
|
fos1.write(ls1) |
|
fos2.write(ls2) |
|
else: |
|
print("{} {} {} \n{} {} {}".format(args.src, cnt1 / len(ls1), ls1.strip(), args.tgt, cnt2 / len(ls2), ls2.strip())) |
|
|
|
ls1 = fs1.readline() |
|
ls2 = fs2.readline() |
|
|