File size: 1,830 Bytes
9346eed |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
# Setup (run in a shell, not in Python):
#   pip --cert /etc/pki/ca-trust/source/anchors/tri-ace-CA-2015.cer install --trusted-host pypi.org --trusted-host files.pythonhosted.org beautifulsoup4
from bs4 import BeautifulSoup
# Read the saved HTML page into memory and parse it.
# The context manager closes the handle as soon as the text is read; the
# original left it open until `f` was rebound further down.
with open("./amitaro.htm", "r") as f:
    txt = f.read()

# NOTE(review): no parser is named here, so Beautiful Soup chooses one itself
# (and warns about it). Left as-is because naming a parser could change how
# the page is parsed; consider pinning one explicitly.
soup = BeautifulSoup(txt)

# Dump the parsed document for visual inspection.
print(soup.prettify())
import json
# Load the metadata table. Later code treats `file_list` as a dict whose
# values are themselves dicts carrying "path" and "kana" entries.
# The context manager closes the handle once parsing finishes; the original
# never closed it.
with open('amitaro.json') as f:
    file_list = json.load(f)
# For every <td> whose first child is a key of `file_list`, take the text of
# the first <a> found in the first child of the *preceding* <td> and store it
# as that entry's "kana" value.
td = soup.find_all('td')
for i, val in enumerate(td):
    # Empty cells cannot name an entry.
    if len(val.contents) == 0:
        continue
    key = val.contents[0]
    if key not in file_list:
        continue
    # The reading is looked up in the cell before this one. Without this guard
    # i == 0 would index td[-1] and silently read the last cell of the page.
    if i == 0:
        continue
    prev_contents = td[i - 1].contents
    if len(prev_contents) == 0:
        continue
    # Re-parse the first child of the preceding cell and search it for links.
    temp = BeautifulSoup(str(prev_contents[0]))
    a = temp.find_all('a')
    # NOTE(review): this still raises IndexError when the preceding cell holds
    # no <a> (or an empty one); confirm whether such rows can occur.
    print(a[0].contents[0])
    file_list[key]["kana"] = str(a[0].contents[0])

# Save the enriched table; ensure_ascii=False keeps non-ASCII text readable
# in the output instead of \u escapes.
with open("./amitaro_with_kana.json", "w") as outfile:
    outfile.write(json.dumps(file_list, indent=4,ensure_ascii=False))
# Prepend the dataset directory to every entry's "path" in place, then
# overwrite the JSON file with the updated table.
# NOTE: the prefix is applied unconditionally, so running this step twice on
# the same in-memory table prefixes the paths twice.
for entry in file_list.values():
    entry["path"] = "./data_amitaro22k/" + entry["path"]

with open("./amitaro_with_kana.json", "w") as outfile:
    payload = json.dumps(file_list, indent=4, ensure_ascii=False)
    outfile.write(payload)
# Build one "path|10|kana" line per usable entry.
# Entries are dropped when their kana is empty or contains "(".
# NOTE(review): the middle field is the constant 10 — presumably an id the
# downstream consumer expects; confirm its meaning.
file = [
    f"{entry['path']}|10|{entry['kana']}"
    for entry in file_list.values()
    if len(entry['kana']) != 0 and entry['kana'].find("(") == -1
]
import random

# The training list is a copy of every line in `file`.
amitaro_train = list(file)

# The validation list is ceil(len(file) / 10) distinct lines drawn at random
# from `file`. random.sample picks unique indices directly, replacing the
# original retry loop that rejected duplicates via a linear list search.
# Integer arithmetic gives the same count as the original
# `len(rands) < len(file)/10` loop condition, including 0 for an empty list.
val_count = (len(file) + 9) // 10
rands = random.sample(range(len(file)), val_count)

# NOTE(review): validation lines are drawn from `file` but are NOT removed
# from amitaro_train, so the two sets overlap. This matches the original
# behavior; confirm the overlap is intended.
amitaro_val = [file[idx] for idx in rands]
# Write the training and validation lists, one entry per line.
# Context managers guarantee each file is flushed and closed even if a write
# fails; the original relied on a manual close() after the loop.
with open("amitaro_train.txt", "w") as f:
    f.writelines(f"{val}\n" for val in amitaro_train)

with open("amitaro_val.txt", "w") as f:
    f.writelines(f"{val}\n" for val in amitaro_val)