# File size: 976 Bytes
# 35c1cfd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

import sys
import re
import string

# Literal marker strings removed from every transcript before punctuation is
# stripped.  They are matched as plain substrings rather than regular
# expressions: "|" is the regex alternation operator, so a pattern such as
# "<|" means '"<" or the empty string', not the two-character token.
_MARKERS = ("<|", "|>", "—")

# Translation table that deletes every ASCII punctuation character except the
# apostrophe (kept so contractions such as "IT'S" survive).  Built once here
# because it does not depend on the line being processed.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation.replace("'", ""))


def _normalize_text(text):
    """Return *text* with markers and punctuation removed, upper-cased.

    Whitespace is left untouched, so removing a marker that was surrounded
    by spaces leaves both spaces in place.
    """
    for marker in _MARKERS:
        text = text.replace(marker, "")
    return text.translate(_PUNCTUATION_TABLE).upper()


def _convert_line(line):
    """Convert one tab-separated input line into one output line.

    The line is stripped of surrounding whitespace and split on the first
    tab into an id and a text field.  The result is the id, a single space,
    the normalised text and a newline.  When the stripped line contains no
    tab, the whole line is used as the id and the text is a single space.
    """
    fields = line.strip().split("\t", 1)
    if len(fields) == 2:
        idx, text = fields
        text = _normalize_text(text)
    else:
        # No text field (or it was only whitespace removed by strip()):
        # emit a single-space placeholder so the id still gets a line.
        idx, text = fields[0], " "
    return "{} {}\n".format(idx, text)


def main(argv=None):
    """Normalise the file named by argv[0] and write it to argv[1].

    Args:
        argv: Command-line arguments without the program name.  Defaults to
            ``sys.argv[1:]``.  Only the first two entries are used; any
            further entries are ignored.

    Raises:
        SystemExit: If fewer than two arguments are supplied.
    """
    if argv is None:
        argv = sys.argv[1:]
    if len(argv) < 2:
        raise SystemExit("usage: {} IN_FILE OUT_FILE".format(sys.argv[0]))
    in_f, out_f = argv[0], argv[1]

    # Read the whole input before opening the output so that passing the
    # same path for both still works.
    with open(in_f, "r", encoding="utf-8") as f:
        lines = f.readlines()

    with open(out_f, "w", encoding="utf-8") as f:
        f.writelines(_convert_line(line) for line in lines)


if __name__ == "__main__":
    main()