|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import glob |
|
import argparse |
|
from utils.dedup import deup |
|
import sys |
|
|
|
# Working directory root, taken from the WORKDIR_ROOT environment variable.
# The module terminates the process at import time when it is unset or blank.
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)

if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    # Diagnostics go to stderr so they never mix with the tab-separated
    # report that main() writes to stdout.
    print(
        'please specify your working directory root in OS environment '
        'variable WORKDIR_ROOT. Exiting...',
        file=sys.stderr,
    )
    sys.exit(-1)
|
|
|
def get_directions(folder):
    """Collect the language directions of the training files in *folder*.

    Every entry of ``folder`` whose name starts with ``train`` is inspected,
    and the second dot-separated component of its name is taken as the
    direction (``train.en-fr.en`` yields ``en-fr``).

    Args:
        folder: path of the data folder to scan.

    Returns:
        A list with one direction string per usable entry, in the order
        ``glob.glob`` returned them. Duplicates are not removed.
    """
    raw_files = glob.glob(f'{folder}/train*')
    directions = []
    for file_path in raw_files:
        name_parts = os.path.basename(file_path).split('.')
        # A name without any dot (e.g. a bare ``train`` directory) has no
        # direction component; skip it rather than fail with IndexError.
        if len(name_parts) < 2:
            continue
        directions.append(name_parts[1])
    return directions
|
|
|
def diff_list(lhs, rhs):
    """Return the set of items that occur in ``lhs`` but not in ``rhs``."""
    excluded = set(rhs)
    return {item for item in lhs if item not in excluded}
|
|
|
def _iter_sentence_pairs(src_file, tgt_file):
    """Yield aligned (source, target) lines of two files, without line endings."""
    with open(src_file, encoding='utf-8') as fsrc, \
            open(tgt_file, encoding='utf-8') as ftgt:
        for s, t in zip(fsrc, ftgt):
            # Strip the terminator so that a final line lacking a trailing
            # newline still compares equal to the same sentence elsewhere.
            yield s.rstrip('\n'), t.rstrip('\n')


def check_diff(
    from_src_file, from_tgt_file,
    to_src_file, to_tgt_file,
):
    """Count how much of the "from" parallel corpus also appears in the "to" one.

    Both corpora are read as line-aligned source/target file pairs (UTF-8).
    Reading stops at the end of the shorter file of each pair.

    Args:
        from_src_file: source-side file of the "from" corpus.
        from_tgt_file: target-side file of the "from" corpus.
        to_src_file: source-side file of the "to" corpus.
        to_tgt_file: target-side file of the "to" corpus.

    Returns:
        A tuple ``(common, common_src, common_tgt, from_count, to_count)``:
        ``common`` is the number of distinct "to" pairs that also occur in
        "from"; ``common_src`` / ``common_tgt`` are the numbers of distinct
        "from" source / target lines that occur in "to"; ``from_count`` and
        ``to_count`` are the numbers of line pairs read from each corpus.
    """
    seen_in_from = set()
    seen_src_in_from = set()
    seen_tgt_in_from = set()
    from_count = 0
    for s, t in _iter_sentence_pairs(from_src_file, from_tgt_file):
        seen_in_from.add((s, t))
        seen_src_in_from.add(s)
        seen_tgt_in_from.add(t)
        from_count += 1

    common = 0
    common_src = 0
    common_tgt = 0
    to_count = 0
    seen = set()

    for s, t in _iter_sentence_pairs(to_src_file, to_tgt_file):
        to_count += 1
        if (s, t) in seen:
            # Repeated pairs in the "to" corpus are counted only once.
            continue
        if (s, t) in seen_in_from:
            common += 1
        if s in seen_src_in_from:
            common_src += 1
            # Remove so the same source line is not counted again.
            seen_src_in_from.remove(s)
        if t in seen_tgt_in_from:
            common_tgt += 1
            # Remove so the same target line is not counted again.
            seen_tgt_in_from.remove(t)
        seen.add((s, t))
    return common, common_src, common_tgt, from_count, to_count
|
|
|
def main():
    """Report, per language direction, how much split data occurs in training.

    Command-line options:
        --folder      data folder holding the ``train.*`` and split files.
        --split       split name to check against training (default ``test``).
        --directions  comma-separated ``src-tgt`` directions; when omitted,
                      they are derived from the ``train*`` files in the folder.

    For every direction, the files ``{split}.{src}-{tgt}.{src|tgt}`` are
    compared against ``train.{src}-{tgt}.{src|tgt}`` with ``check_diff``.
    A header line and one tab-separated row per direction go to stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", type=str, required=True,
                        help="the data folder ")
    parser.add_argument("--split", type=str, default='test',
                        help="split (valid, test) to check against training data")
    parser.add_argument('--directions', type=str, default=None, required=False)

    args = parser.parse_args()

    if args.directions is None:
        directions = get_directions(args.folder)
    else:
        directions = args.directions.split(',')
    # De-duplicate and process in a stable, sorted order.
    directions = sorted(set(directions))

    print(f'checking where {args.split} split data are in training')
    print('direction\tcommon_count\tsrc common\ttgt common\tfrom_size\tto_size')

    for direction in directions:
        parts = direction.split('-')
        if len(parts) != 2:
            # Not of the form "src-tgt" (e.g. a stray "train.log" file or an
            # empty item in --directions): skip it instead of crashing.
            print(f'skipping malformed direction {direction!r}', file=sys.stderr)
            continue
        src, tgt = parts
        from_src_file = f'{args.folder}/{args.split}.{src}-{tgt}.{src}'
        from_tgt_file = f'{args.folder}/{args.split}.{src}-{tgt}.{tgt}'
        if not os.path.exists(from_src_file):
            # The split files may be stored under the reversed direction name.
            from_src_file = f'{args.folder}/{args.split}.{tgt}-{src}.{src}'
            from_tgt_file = f'{args.folder}/{args.split}.{tgt}-{src}.{tgt}'
        to_src_file = f'{args.folder}/train.{src}-{tgt}.{src}'
        to_tgt_file = f'{args.folder}/train.{src}-{tgt}.{tgt}'
        if not os.path.exists(to_src_file) or not os.path.exists(from_src_file):
            # Nothing to compare for this direction.
            continue
        r = check_diff(from_src_file, from_tgt_file, to_src_file, to_tgt_file)
        # Join every field with a single tab; passing two arguments to print
        # would insert a stray space after the direction column.
        print('\t'.join([direction, *map(str, r)]))
|
|
|
|
|
# Script entry point: run the overlap report only on direct execution,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|