jiang
init commit
650c5f6
raw
history blame
2.01 kB
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--src', type=str, help='Source language')
parser.add_argument('--tgt', type=str, help='Target language')
parser.add_argument('--src-file', type=str, help='Input source file')
parser.add_argument('--tgt-file', type=str, help='Input target file')
parser.add_argument('--src-output-file', type=str, help='Output source file')
parser.add_argument('--tgt-output-file', type=str, help='Output target file')
parser.add_argument('--threshold', type=float, default=0.5, help='Threshold')
parser.add_argument('--threshold-character', type=str, default=']', help='Threshold character')
parser.add_argument('--histograms', type=str, help='Path to histograms')
args = parser.parse_args()
def read_hist(f):
ch = []
for line in f:
c = line[0]
if c == args.threshold_character:
break
ch.append(c)
return ch
with(open("{}/{}".format(args.histograms, args.src), 'r', encoding='utf8')) as f:
ch1 = read_hist(f)
with(open("{}/{}".format(args.histograms, args.tgt), 'r', encoding='utf8')) as f:
ch2 = read_hist(f)
print("Accepted characters for {}: {}".format(args.src, ch1))
print("Accepted characters for {}: {}".format(args.tgt, ch2))
with open(args.src_file, 'r', encoding='utf8') as fs1, open(args.tgt_file, 'r', encoding='utf8') as fs2, open(args.src_output_file, 'w', encoding='utf8') as fos1, open(args.tgt_output_file, 'w', encoding='utf8') as fos2:
ls1 = fs1.readline()
ls2 = fs2.readline()
while ls1 or ls2:
cnt1 = len([c for c in ls1.strip() if c in ch1])
cnt2 = len([c for c in ls2.strip() if c in ch2])
if cnt1 / len(ls1) > args.threshold and cnt2 / len(ls2) > args.threshold:
fos1.write(ls1)
fos2.write(ls2)
else:
print("{} {} {} \n{} {} {}".format(args.src, cnt1 / len(ls1), ls1.strip(), args.tgt, cnt2 / len(ls2), ls2.strip()))
ls1 = fs1.readline()
ls2 = fs2.readline()