Spaces:
Running
Running
import argparse | |
# Command-line interface. Every language/path option below is used
# unconditionally later in the script, so each is marked required: a missing
# option now produces an argparse usage error instead of a failure on a
# None-valued path further down.
parser = argparse.ArgumentParser()
parser.add_argument('--src', type=str, required=True, help='Source language')
parser.add_argument('--tgt', type=str, required=True, help='Target language')
parser.add_argument('--src-file', type=str, required=True, help='Input source file')
parser.add_argument('--tgt-file', type=str, required=True, help='Input target file')
parser.add_argument('--src-output-file', type=str, required=True, help='Output source file')
parser.add_argument('--tgt-output-file', type=str, required=True, help='Output target file')
# A sentence pair is kept only when, on both sides, the share of accepted
# characters is strictly greater than this value.
parser.add_argument('--threshold', type=float, default=0.5, help='Threshold')
# Reading of a histogram file stops at the first line that starts with this
# character. It is compared against a single character, so a multi-character
# value never matches.
parser.add_argument('--threshold-character', type=str, default=']', help='Threshold character')
# Directory holding one histogram file per language, named after the
# --src / --tgt values.
parser.add_argument('--histograms', type=str, required=True, help='Path to histograms')
args = parser.parse_args()
def read_hist(f, threshold_character=None):
    """Collect the first character of each line until a sentinel line.

    Args:
        f: Iterable of text lines, e.g. an open histogram file. Only the
            first character of each line is inspected.
        threshold_character: Character that stops the scan; the line that
            starts with it is not included in the result. When omitted, the
            value of the ``--threshold-character`` command-line option
            (``args.threshold_character``) is used.

    Returns:
        list: First characters of the lines seen before the sentinel, in
        input order. All lines are consumed if the sentinel never appears.
    """
    if threshold_character is None:
        # Keep the original single-argument call working off the CLI option.
        threshold_character = args.threshold_character

    ch = []
    for line in f:
        if not line:
            # An empty string has no first character; skip it rather than
            # raising IndexError.
            continue
        c = line[0]
        if c == threshold_character:
            break
        ch.append(c)
    return ch
# Load the accepted-character list for each language from the file
# "<histograms>/<language>", source side first, then target side.
src_hist_path = "{}/{}".format(args.histograms, args.src)
with open(src_hist_path, 'r', encoding='utf8') as src_hist:
    ch1 = read_hist(src_hist)

tgt_hist_path = "{}/{}".format(args.histograms, args.tgt)
with open(tgt_hist_path, 'r', encoding='utf8') as tgt_hist:
    ch2 = read_hist(tgt_hist)

# Report what was loaded, one line per language.
for lang, accepted in ((args.src, ch1), (args.tgt, ch2)):
    print("Accepted characters for {}: {}".format(lang, accepted))
def _accepted_ratio(line, accepted):
    """Return the share of ``line`` made up of characters in ``accepted``.

    The numerator counts accepted characters in the stripped line; the
    denominator is the length of the raw line, which includes the trailing
    newline and any surrounding whitespace. An empty string (the other file
    still has lines but this one is at end of file) yields 0.0 instead of
    dividing by zero.
    """
    if not line:
        return 0.0
    return sum(1 for c in line.strip() if c in accepted) / len(line)


# Build the lookup sets once; membership tests against the lists would
# rescan them for every character of every line.
accepted_src = set(ch1)
accepted_tgt = set(ch2)

# Walk both input files in lockstep. A pair is written to the output files
# only when both sides exceed the threshold; otherwise the pair is printed
# together with its ratios.
with open(args.src_file, 'r', encoding='utf8') as fs1, \
        open(args.tgt_file, 'r', encoding='utf8') as fs2, \
        open(args.src_output_file, 'w', encoding='utf8') as fos1, \
        open(args.tgt_output_file, 'w', encoding='utf8') as fos2:
    ls1 = fs1.readline()
    ls2 = fs2.readline()
    # Continue until BOTH files are exhausted. If one file is shorter, its
    # side reads as '' with ratio 0.0, so the unpaired lines of the longer
    # file are rejected and reported rather than crashing the run.
    while ls1 or ls2:
        ratio1 = _accepted_ratio(ls1, accepted_src)
        ratio2 = _accepted_ratio(ls2, accepted_tgt)
        if ratio1 > args.threshold and ratio2 > args.threshold:
            fos1.write(ls1)
            fos2.write(ls2)
        else:
            print("{} {} {} \n{} {} {}".format(args.src, ratio1, ls1.strip(), args.tgt, ratio2, ls2.strip()))
        ls1 = fs1.readline()
        ls2 = fs2.readline()