|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os |
|
import glob |
|
import argparse |
|
from utils.dedup import deup |
|
|
|
import sys |
|
# Working-directory root supplied through the environment. The code visible
# in this script only validates it; it is kept as a module-level constant so
# that anything importing it from here keeps working.
WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None)

# Refuse to run when the variable is unset or contains only whitespace.
# The notice goes to stderr so it does not get mixed into captured stdout,
# and the exit status stays -1 exactly as before.
if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip():
    print(
        'please specify your working directory root in OS environment '
        'variable WORKDIR_ROOT. Exiting...',
        file=sys.stderr,
    )
    sys.exit(-1)
|
|
|
|
|
def _split_direction(direction):
    """Split a ``<src>-<tgt>`` tag into ``(src, tgt)``; return None if malformed."""
    parts = direction.split('-')
    if len(parts) != 2 or not all(parts):
        return None
    return parts[0], parts[1]


def _discover_directions(from_folder):
    """Collect direction tags from ``train.<src>-<tgt>.<lang>`` files in a folder."""
    # Escape the folder so characters such as '[' in its name are matched
    # literally instead of being interpreted as glob syntax.
    pattern = os.path.join(glob.escape(from_folder), 'train.*')
    found = set()
    for file_path in glob.glob(pattern):
        parts = os.path.basename(file_path).split('.')
        # Ignore anything that does not follow the expected naming scheme
        # (e.g. "train.bpe" or "train.notes.txt") rather than crashing on it.
        if len(parts) >= 3 and _split_direction(parts[1]) is not None:
            found.add(parts[1])
    return found


def main(argv=None):
    """Dedup the parallel training files of every requested direction.

    For each ``<src>-<tgt>`` direction, the files ``train.<src>-<tgt>.<src>``
    and ``train.<src>-<tgt>.<tgt>`` in ``--from-folder`` are passed to
    ``deup`` together with same-named output paths in ``--to-folder``.
    Directions come from ``--directions`` (comma separated) or, when that
    option is omitted, from the training files found in ``--from-folder``.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        SystemExit: Through ``parser.error`` (status 2) when the input and
            output folders are the same directory or a direction is not of
            the form ``<src>-<tgt>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--from-folder", type=str, required=True,
                        help="the data folder to be dedup")
    parser.add_argument("--to-folder", type=str, required=True,
                        help="the data folder to save deduped data")
    parser.add_argument('--directions', type=str, default=None, required=False,
                        help="comma separated <src>-<tgt> pairs; default: "
                             "discovered from the train files in --from-folder")
    args = parser.parse_args(argv)

    # Compare resolved paths: a plain string comparison misses aliases such as
    # "data" vs "data/", and an assert would disappear under "python -O".
    if os.path.realpath(args.from_folder) == os.path.realpath(args.to_folder):
        parser.error('--from-folder and --to-folder must be different directories')

    if args.directions is None:
        directions = _discover_directions(args.from_folder)
    else:
        # Tolerate spaces around commas and empty entries ("a-b, c-d,").
        directions = {
            item.strip() for item in args.directions.split(',') if item.strip()
        }

    # Validate every direction before touching the filesystem.
    pairs = []
    for direction in sorted(directions):
        langs = _split_direction(direction)
        if langs is None:
            parser.error(f"invalid direction '{direction}': expected <src>-<tgt>")
        pairs.append(langs)

    if not pairs:
        print('no directions to dedup; nothing to do')
        return

    # deup is imported from utils.dedup and not visible here; create the
    # output folder up front in case it does not do so itself.
    os.makedirs(args.to_folder, exist_ok=True)

    for src, tgt in pairs:
        stem = f'train.{src}-{tgt}'
        src_file = os.path.join(args.from_folder, f'{stem}.{src}')
        tgt_file = os.path.join(args.from_folder, f'{stem}.{tgt}')
        src_file_out = os.path.join(args.to_folder, f'{stem}.{src}')
        tgt_file_out = os.path.join(args.to_folder, f'{stem}.{tgt}')
        print(f'deduping {src_file}, {tgt_file}')
        deup(src_file, tgt_file, src_file_out, tgt_file_out)
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|