"""tokenize mrtydi files and save in original format"""

from tqdm import tqdm
import os
import json

from datasets import load_dataset
from tools import get_mbert_tokenize_fn
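# get_mbert_tokenize_fn (from the local `tools` module) is assumed to return a callable that
# maps a raw string to its mBERT tokenization; see tools.py for the exact return format.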


# The 11 Mr. TyDi v1.1 languages.
LANGS = "arabic bengali english finnish indonesian japanese korean russian swahili telugu thai".split()

n_proc = 15
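# NOTE: n_proc is not used anywhere below; presumably reserved for parallel processing.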

token_type = "mbert"
assert token_type in {"mbert", "whitespace"}
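# Only the "mbert" path is implemented below; "whitespace" passes the assert but is not handled.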
print(f"Preparing tokenized mrtydi with {token_type} tokenizer.")


def gen_mrtydi(lang, set_name):
    """Yield topic entries (query plus positive/negative passages) for one language split."""
    dataset = load_dataset("castorini/mr-tydi", lang, set_name)
    for entry in tqdm(dataset[set_name], desc=f"{lang}-topics-{set_name}"):
        yield entry


def gen_mrtydi_corpus(lang):
    """Yield corpus passages (docid, title, text); the collection ships as a single "train" split."""
    dataset = load_dataset("castorini/mr-tydi-corpus", lang)
    for entry in tqdm(dataset["train"], desc=f"{lang}-documents"):
        yield entry


def tokenize_single_lang(lang, outp_dir):
    """Tokenize the topics and corpus of one language and write JSONL files under `outp_dir`."""
    mbert_tokenize = get_mbert_tokenize_fn()

    def _tokenize_psgs(psgs):
        # Tokenize each passage's title and text, keeping the original docid.
        return [{
            "docid": psg["docid"],
            "title": mbert_tokenize(psg["title"]),
            "text": mbert_tokenize(psg["text"]),
        } for psg in psgs]

    mrtydi_dir = os.path.join(outp_dir, "mr-tydi", f"mr-tydi-v1.1-mbert-tokenize-{lang}")
    os.makedirs(mrtydi_dir, exist_ok=True)

    # tokenize "mr-tydi"
    for set_name in ["train", "dev", "test"]:
        outp_fn = os.path.join(mrtydi_dir, f"{set_name}.jsonl")
        if os.path.exists(outp_fn):
            print(f"Found existing file: {outp_fn}.")
            continue

        with open(outp_fn, "w") as fout:
            for entry in gen_mrtydi(lang=lang, set_name=set_name):
                query = entry["query"]
                pos_psgs = entry["positive_passages"]
                neg_psgs = entry["negative_passages"]

                # Tokenize passages for the train split only; dev/test entries keep
                # their raw passages, and only the query is tokenized below.
                if set_name == "train":
                    pos_psgs = _tokenize_psgs(pos_psgs)
                    neg_psgs = _tokenize_psgs(neg_psgs)

                # Mirror the original entry schema, replacing raw text with tokens.
                mbert_entry = {
                    "query_id": entry["query_id"],
                    "query": mbert_tokenize(query),
                    "positive_passages": pos_psgs,
                    "negative_passages": neg_psgs,
                }
                line = json.dumps(mbert_entry, ensure_ascii=False)
                fout.write(line + "\n")

    # tokenize "mr-tydi-corpus"
    mrtydi_corpus_dir = os.path.join(outp_dir, "mr-tydi-corpus", f"mr-tydi-v1.1-mbert-tokenize-{lang}")
    os.makedirs(mrtydi_corpus_dir, exist_ok=True)
    outp_fn = os.path.join(mrtydi_corpus_dir, "corpus.jsonl")
    if os.path.exists(outp_fn):
        print(f"Found existing file: {outp_fn}.")
        return

    with open(outp_fn, "w") as fout:
        for entry in gen_mrtydi_corpus(lang):
            mbert_entry = {
                "docid": entry["docid"],
                "title": mbert_tokenize(entry["title"]),
                "text": mbert_tokenize(entry["text"]),
            }
            line = json.dumps(mbert_entry, ensure_ascii=False)
            fout.write(line + "\n")



def main():
    outp_dir = "mbert-mrtydi/"
    for lang in LANGS:
        # Each language gets its own output sub-directory, e.g. mbert-mrtydi/arabic/.
        tokenize_single_lang(lang, outp_dir + lang)


if __name__ == "__main__":
    main()