Spaces:
Build error
Build error
Sam Passaglia
committed on
Commit
·
8061dc1
1
Parent(s):
37b2b22
minor
Browse files
app.py
CHANGED
@@ -11,143 +11,142 @@ from yomikata import utils
|
|
11 |
from yomikata.dictionary import Dictionary
|
12 |
from yomikata.utils import parse_furigana
|
13 |
|
14 |
-
st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
|
15 |
-
|
16 |
-
# @st.cache
|
17 |
-
# def add_border(html: str):
|
18 |
-
# WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
|
19 |
-
# html = html.replace("\n", " ")
|
20 |
-
# return WRAPPER.format(html)
|
21 |
-
|
22 |
-
|
23 |
-
# def get_random_sentence():
|
24 |
-
# from config.config import TEST_DATA_DIR
|
25 |
|
26 |
-
|
27 |
-
|
|
|
|
|
|
|
28 |
|
29 |
|
30 |
-
|
31 |
-
|
32 |
-
# from yomikata.dbert import dBert
|
33 |
|
34 |
-
|
35 |
-
|
36 |
|
37 |
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
# from yomikata.utils import load_dict
|
42 |
|
43 |
-
|
|
|
44 |
|
45 |
-
# global_accuracy = stats["test"]["accuracy"]
|
46 |
|
47 |
-
|
48 |
-
|
|
|
|
|
49 |
|
50 |
-
|
51 |
|
52 |
-
|
53 |
-
# "、".join(
|
54 |
-
# [
|
55 |
-
# "{reading} ({correct}/{n})".format(
|
56 |
-
# reading=reading,
|
57 |
-
# correct=stats[heteronym]["readings"][reading]["found"][reading],
|
58 |
-
# n=stats[heteronym]["readings"][reading]["n"],
|
59 |
-
# )
|
60 |
-
# for reading in stats[heteronym]["readings"].keys()
|
61 |
-
# if (
|
62 |
-
# stats[heteronym]["readings"][reading]["found"][reading] != 0
|
63 |
-
# or reading != "<OTHER>"
|
64 |
-
# )
|
65 |
-
# ]
|
66 |
-
# )
|
67 |
-
# for heteronym in heteronyms
|
68 |
-
# ]
|
69 |
|
70 |
-
|
|
|
71 |
|
72 |
-
|
73 |
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
75 |
|
76 |
-
#
|
77 |
|
78 |
-
|
79 |
|
80 |
-
|
81 |
|
82 |
-
|
83 |
|
84 |
-
|
85 |
|
|
|
86 |
|
87 |
-
|
88 |
-
# def furigana_to_spacy(text_with_furigana):
|
89 |
-
# tokens = parse_furigana(text_with_furigana)
|
90 |
-
# ents = []
|
91 |
-
# output_text = ""
|
92 |
-
# heteronym_count = 0
|
93 |
-
# for token in tokens.groups:
|
94 |
-
# if isinstance(token, ttlig.RubyFrag):
|
95 |
-
# if heteronym_count != 0:
|
96 |
-
# output_text += ", "
|
97 |
|
98 |
-
|
99 |
-
# {
|
100 |
-
# "start": len(output_text),
|
101 |
-
# "end": len(output_text) + len(token.text),
|
102 |
-
# "label": token.furi,
|
103 |
-
# }
|
104 |
-
# )
|
105 |
|
106 |
-
# output_text += token.text
|
107 |
-
# heteronym_count += 1
|
108 |
-
# else:
|
109 |
-
# pass
|
110 |
-
# return {
|
111 |
-
# "text": output_text,
|
112 |
-
# "ents": ents,
|
113 |
-
# "title": None,
|
114 |
-
# }
|
115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
116 |
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
# label_visibility="collapsed",
|
129 |
-
# )
|
130 |
|
131 |
-
# # Yomikata prediction
|
132 |
-
# dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)
|
133 |
|
134 |
-
|
135 |
-
# colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
|
136 |
-
# spacy_dict = furigana_to_spacy(dbert_prediction)
|
137 |
-
# label_colors = {
|
138 |
-
# reading: colors[i % len(colors)]
|
139 |
-
# for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
|
140 |
-
# }
|
141 |
-
# html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
|
142 |
|
143 |
-
#
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
151 |
|
152 |
# # Dictionary + Yomikata prediction
|
153 |
# st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
|
|
|
11 |
from yomikata.dictionary import Dictionary
|
12 |
from yomikata.utils import parse_furigana
|
13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
+
@st.cache
def add_border(html: str):
    """Wrap an HTML fragment in a bordered, rounded, inline-block ``<div>``.

    Every newline in ``html`` is replaced by a single space before the
    fragment is inserted into the wrapper.

    Args:
        html: The HTML markup to wrap.

    Returns:
        The wrapper ``<div>`` markup containing the flattened fragment.
    """
    single_line = " ".join(html.split("\n"))
    template = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
    return template.format(single_line)
|
20 |
|
21 |
|
22 |
+
def get_random_sentence():
    """Return the ``sentence`` field of one randomly sampled test-set row.

    The rows are read from ``test_optimized_strict_heteronyms.csv`` inside
    ``TEST_DATA_DIR`` (imported lazily from the project configuration).
    """
    from config.config import TEST_DATA_DIR

    csv_path = Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv")
    test_rows = pd.read_csv(csv_path)
    picked_row = test_rows.sample(1).iloc[0]
    return picked_row.sentence
|
27 |
|
28 |
|
29 |
+
@st.cache
def get_dbert_prediction_and_heteronym_list(text):
    """Run a freshly constructed ``dBert`` reader over ``text``.

    Args:
        text: The input passed to ``dBert.furigana``.

    Returns:
        A tuple ``(prediction, heteronyms)`` where ``prediction`` is the
        result of ``reader.furigana(text)`` and ``heteronyms`` is the
        reader's ``heteronyms`` attribute.
    """
    # Imported lazily, inside the cached function, rather than at module load.
    from yomikata.dbert import dBert

    model = dBert()
    prediction = model.furigana(text)
    return prediction, model.heteronyms
|
35 |
|
|
|
36 |
|
37 |
+
@st.cache
def get_stats():
    """Load the stored dBert test performance and tabulate it per heteronym.

    Reads ``dbert/training_performance.json`` from ``config.STORES_DIR``.

    Returns:
        A tuple ``(global_accuracy, df)``:

        * ``global_accuracy`` is ``stats["test"]["accuracy"]``.
        * ``df`` is a DataFrame with columns ``heteronym``, ``accuracy`` and
          ``readings (test corr./total)``, sorted by descending accuracy and
          indexed from 1. Only heteronyms whose readings cell lists more
          than one reading are kept.
    """
    from config import config
    from yomikata.utils import load_dict

    stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))

    global_accuracy = stats["test"]["accuracy"]
    per_heteronym = stats["test"]["heteronym_performance"]

    # Ideographic comma (U+3001) separates the readings inside one cell.
    # The literal had been corrupted by a bad encoding round-trip.
    separator = "、"

    heteronyms = list(per_heteronym)
    accuracy = [per_heteronym[heteronym]["accuracy"] for heteronym in heteronyms]

    readings = []
    for heteronym in heteronyms:
        reading_stats = per_heteronym[heteronym]["readings"]
        readings.append(
            separator.join(
                "{reading} ({correct}/{n})".format(
                    reading=reading,
                    correct=info["found"][reading],
                    n=info["n"],
                )
                for reading, info in reading_stats.items()
                # Drop the catch-all "<OTHER>" bucket only when it was never
                # correctly found; every real reading is always listed.
                if info["found"][reading] != 0 or reading != "<OTHER>"
            )
        )

    df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})

    # A cell without the separator lists a single reading; skip those rows.
    # .copy() so the column assignment below does not write into a view of
    # the unfiltered frame.
    df = df[df["readings"].str.contains(separator, regex=False)].copy()

    # Literal (non-regex) replacement of the catch-all label.
    df["readings"] = df["readings"].str.replace("<OTHER>", "Other", regex=False)

    df = df.rename(columns={"readings": "readings (test corr./total)"})
    df = df.sort_values("accuracy", ascending=False, ignore_index=True)

    # Rank rows starting from 1 instead of 0.
    df.index += 1

    return global_accuracy, df
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
|
86 |
+
@st.cache
def furigana_to_spacy(text_with_furigana):
    """Convert furigana-annotated text into a displaCy "manual" entity dict.

    Only ``ttlig.RubyFrag`` groups from ``parse_furigana`` are used. Their
    ``text`` values are joined with ``", "`` to form the output text, and
    each one becomes an entity labelled with its ``furi`` value.

    Args:
        text_with_furigana: Annotated text accepted by ``parse_furigana``.

    Returns:
        A dict with keys ``"text"``, ``"ents"`` (dicts holding ``start``,
        ``end`` and ``label``) and ``"title"`` (always ``None``).
    """
    parsed = parse_furigana(text_with_furigana)
    ruby_frags = [frag for frag in parsed.groups if isinstance(frag, ttlig.RubyFrag)]

    entities = []
    rendered = ""
    for frag in ruby_frags:
        # Every fragment after the first is preceded by a separator.
        if entities:
            rendered += ", "

        begin = len(rendered)
        entities.append(
            {
                "start": begin,
                "end": begin + len(frag.text),
                "label": frag.furi,
            }
        )
        rendered += frag.text

    return {"text": rendered, "ents": entities, "title": None}
|
|
|
|
|
114 |
|
|
|
|
|
115 |
|
116 |
+
st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")

# Input text box
st.markdown("Input a Japanese sentence:")

# Seed the default sentence once per session. The literal is stored with
# {surface/reading} furigana markup, which is removed before it is shown in
# the text area. (Restored from a mis-encoded copy of the string.)
if "default_sentence" not in st.session_state:
    st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"

input_text = st.text_area(
    "Input a Japanese sentence:",
    utils.remove_furigana(st.session_state.default_sentence),
    label_visibility="collapsed",
)

# Yomikata prediction
dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)

# spacy-style output for the predictions
colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
spacy_dict = furigana_to_spacy(dbert_prediction)

if spacy_dict["ents"]:
    # dict.fromkeys de-duplicates labels while keeping first-appearance
    # order, so each reading gets the same colour on every run (iterating a
    # set of strings can change order between processes).
    unique_labels = dict.fromkeys(item["label"] for item in spacy_dict["ents"])
    label_colors = {
        reading: colors[i % len(colors)] for i, reading in enumerate(unique_labels)
    }
    # Render only when there is at least one entity to display.
    html = spacy.displacy.render(
        spacy_dict, style="ent", manual=True, options={"colors": label_colors}
    )

    st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
    st.write(
        f"{add_border(html)}",
        unsafe_allow_html=True,
    )
else:
    st.markdown("**Yomikata** found no heteronyms in the input text.")
|
150 |
|
151 |
# # Dictionary + Yomikata prediction
|
152 |
# st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
|