ierhon committed on
Commit
29e9c37
·
1 Parent(s): 12b0423

Upload todset.py

Browse files
Files changed (1) hide show
  1. todset.py +25 -0
todset.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Convert a "prompt→response" text corpus into two files:
#   * dataset{lang}.json   - maps each prompt to the integer id of its response
#   * responses{lang}.txt  - the distinct responses, written in id order
#
# Input format (dset{lang}.txt): one sample per line, prompt and response
# separated by "→"; a literal backslash-n ("\n" as two characters) inside a
# field stands for a real newline. All text is lower-cased.
import json

from tqdm import tqdm

# Suffix inserted into every file name (e.g. "_en"); empty means no suffix.
lang = ""

# The corpus contains non-ASCII text (at minimum the "→" separator), so the
# encoding is pinned instead of relying on the platform default.
with open(f"dset{lang}.txt", "r", encoding="utf-8") as f:
    raw_lines = f.readlines()

lines = []
for line_no, raw in enumerate(tqdm(raw_lines), start=1):
    # Lower-casing happens before un-escaping, so "\N" is also treated as an
    # escaped newline.
    fields = raw.rstrip("\n").lower().split("→")
    if len(fields) < 2:
        raise ValueError(
            f"dset{lang}.txt line {line_no}: expected 'prompt→response', "
            f"got {raw!r}"
        )
    # Only the first two fields are used; anything after a second "→" is
    # ignored.
    lines.append(
        (fields[0].replace("\\n", "\n"), fields[1].replace("\\n", "\n"))
    )

# Distinct responses in first-seen order; a response's id is its position.
responses = []
# response text -> id, so de-duplication is O(1) per sample instead of a
# linear scan of `responses`.
response_ids = {}
# prompt -> response id. If a prompt occurs more than once, the last
# occurrence wins.
dset = {}
for prompt, response in tqdm(lines):
    if response not in response_ids:
        response_ids[response] = len(responses)
        responses.append(response)
    dset[prompt] = response_ids[response]

with open(f"dataset{lang}.json", "w", encoding="utf-8") as f:
    json.dump(dset, f, ensure_ascii=False)

# NOTE(review): responses may contain real newlines (un-escaped above), so a
# multi-line response occupies several lines of this file. Confirm that the
# consumer does not assume "line number == response id".
with open(f"responses{lang}.txt", "w", encoding="utf-8") as f:
    for response in tqdm(responses):
        f.write(response + "\n")