File size: 3,742 Bytes
d44849f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
import sys
from tqdm import tqdm
from typing import List


def concat_data(
    data_dir: str,
    out_dir: str,
    lang_pair_list: List[List[str]],
    out_src_lang: str = "SRC",
    out_tgt_lang: str = "TGT",
    split: str = "train",
):
    """
    Concatenate data files from different language pairs and write the output to a specified directory.

    For every `[src_lang, tgt_lang]` pair, the files
    `<data_dir>/<src_lang>-<tgt_lang>/<split>.<src_lang>` and
    `<data_dir>/<src_lang>-<tgt_lang>/<split>.<tgt_lang>` are appended, in list order, to
    `<out_dir>/<split>.<out_src_lang>` and `<out_dir>/<split>.<out_tgt_lang>` respectively.
    A pair is skipped entirely when either of its two files is missing. Afterwards
    `corpus_stats` is called to write the per-pair line counts.

    Args:
        data_dir (str): path of the directory containing the data files for language pairs.
        out_dir (str): path of the directory where the output files will be saved (created if needed).
        lang_pair_list (List[List[str]]): a list of language pairs, where each pair is a list of two strings.
        out_src_lang (str, optional): suffix to use for the source language (default: "SRC").
        out_tgt_lang (str, optional): suffix to use for the target language (default: "TGT").
        split (str, optional): name of the split (e.g. "train", "dev", "test") to concatenate (default: "train").

    Raises:
        OSError: if an existing input file cannot be read or an output file cannot be written.
    """
    # Imported locally so this function does not rely on a new module-level import.
    import shutil

    os.makedirs(out_dir, exist_ok=True)

    out_src_fname = os.path.join(out_dir, f"{split}.{out_src_lang}")
    out_tgt_fname = os.path.join(out_dir, f"{split}.{out_tgt_lang}")

    print()
    print(out_src_fname)
    print(out_tgt_fname)

    # Remove outputs left over from a previous run, because the loop below appends.
    for stale_fname in (out_src_fname, out_tgt_fname):
        if os.path.isfile(stale_fname):
            os.unlink(stale_fname)

    # concatenate data for different language pairs
    for src_lang, tgt_lang in tqdm(lang_pair_list):
        print("src: {}, tgt:{}".format(src_lang, tgt_lang))

        pair_dir = os.path.join(data_dir, f"{src_lang}-{tgt_lang}")
        in_src_fname = os.path.join(pair_dir, f"{split}.{src_lang}")
        in_tgt_fname = os.path.join(pair_dir, f"{split}.{tgt_lang}")

        # Both sides must be present, otherwise the two outputs would go out of alignment.
        if not os.path.exists(in_src_fname) or not os.path.exists(in_tgt_fname):
            continue

        # Copy raw bytes in-process instead of shelling out to `cat`: this is safe for
        # paths containing spaces or shell metacharacters, works on any platform, and
        # raises on failure instead of silently ignoring a non-zero exit status.
        for in_fname, out_fname in (
            (in_src_fname, out_src_fname),
            (in_tgt_fname, out_tgt_fname),
        ):
            print(in_fname)
            with open(in_fname, "rb") as infile, open(out_fname, "ab") as outfile:
                shutil.copyfileobj(infile, outfile)

    corpus_stats(data_dir, out_dir, lang_pair_list, split)


def corpus_stats(data_dir: str, out_dir: str, lang_pair_list: List[List[str]], split: str):
    """
    Computes statistics for the given language pairs in a corpus and
    writes the results to a file in the output directory.

    The output file is `<out_dir>/<split>_lang_pairs.txt`. It receives one
    tab-separated line per language pair, `src_lang<TAB>tgt_lang<TAB>num_lines`,
    where `num_lines` is the number of lines in
    `<data_dir>/<src_lang>-<tgt_lang>/<split>.<src_lang>`.

    A pair is skipped when either its source-side or its target-side file is missing.
    This is the same rule `concat_data` uses when it concatenates, so the pairs listed
    here are exactly the pairs that contributed to the concatenated files.

    Args:
        data_dir (str): path of the directory containing the corpus data.
        out_dir (str): path of the directory where the output file should be written.
        lang_pair_list (List[List[str]]): a list of language pairs as lists of strings in the form "`[src_lang, tgt_lang]`".
        split (str): a string indicating the split (e.g. 'train', 'dev', 'test') of the corpus to consider.
    """
    meta_fname = os.path.join(out_dir, f"{split}_lang_pairs.txt")
    with open(meta_fname, "w", encoding="utf-8") as lp_file:

        for src_lang, tgt_lang in tqdm(lang_pair_list):
            print("src: {}, tgt:{}".format(src_lang, tgt_lang))

            pair_dir = os.path.join(data_dir, f"{src_lang}-{tgt_lang}")
            in_src_fname = os.path.join(pair_dir, f"{split}.{src_lang}")
            in_tgt_fname = os.path.join(pair_dir, f"{split}.{tgt_lang}")

            # Checking only the source side would record a pair (and its size) that
            # concat_data skipped because the target file was missing.
            if not os.path.exists(in_src_fname) or not os.path.exists(in_tgt_fname):
                continue

            print(in_src_fname)

            # Count lines lazily so the file is never held in memory.
            with open(in_src_fname, "r", encoding="utf-8") as infile:
                corpus_size = sum(1 for _ in infile)

            lp_file.write(f"{src_lang}\t{tgt_lang}\t{corpus_size}\n")


if __name__ == "__main__":

    # Expected invocation: <script> <in_dir> <out_dir> <split>
    # Extra trailing arguments are ignored.
    if len(sys.argv) < 4:
        sys.exit("Usage: python {} <in_dir> <out_dir> <split>".format(sys.argv[0]))

    in_dir = sys.argv[1]
    out_dir = sys.argv[2]
    split = sys.argv[3]
    lang_pair_list = []

    # os.listdir returns entries in arbitrary order; sort them so the concatenated
    # output is laid out the same way on every run and every machine.
    pairs = sorted(os.listdir(in_dir))
    for pair in pairs:
        parts = pair.split("-")
        # Only sub-directories named "<src_lang>-<tgt_lang>" are language pairs. Stray
        # entries (hidden files, READMEs, ...) are skipped instead of aborting the run
        # with an unpacking error.
        if len(parts) != 2 or not os.path.isdir(os.path.join(in_dir, pair)):
            print("skipping unexpected entry: {}".format(pair), file=sys.stderr)
            continue
        src_lang, tgt_lang = parts
        lang_pair_list.append([src_lang, tgt_lang])

    concat_data(in_dir, out_dir, lang_pair_list, split=split)