Sam Passaglia committed on
Commit
8061dc1
·
1 Parent(s): 37b2b22
Files changed (1) hide show
  1. app.py +107 -108
app.py CHANGED
@@ -11,143 +11,142 @@ from yomikata import utils
11
  from yomikata.dictionary import Dictionary
12
  from yomikata.utils import parse_furigana
13
 
14
- st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
15
-
16
- # @st.cache
17
- # def add_border(html: str):
18
- # WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
19
- # html = html.replace("\n", " ")
20
- # return WRAPPER.format(html)
21
-
22
-
23
- # def get_random_sentence():
24
- # from config.config import TEST_DATA_DIR
25
 
26
- # df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
27
- # return df.sample(1).iloc[0].sentence
 
 
 
28
 
29
 
30
- # @st.cache
31
- # def get_dbert_prediction_and_heteronym_list(text):
32
- # from yomikata.dbert import dBert
33
 
34
- # reader = dBert()
35
- # return reader.furigana(text), reader.heteronyms
36
 
37
 
38
- # @st.cache
39
- # def get_stats():
40
- # from config import config
41
- # from yomikata.utils import load_dict
42
 
43
- # stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
 
44
 
45
- # global_accuracy = stats["test"]["accuracy"]
46
 
47
- # stats = stats["test"]["heteronym_performance"]
48
- # heteronyms = stats.keys()
 
 
49
 
50
- # accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
51
 
52
- # readings = [
53
- # "、".join(
54
- # [
55
- # "{reading} ({correct}/{n})".format(
56
- # reading=reading,
57
- # correct=stats[heteronym]["readings"][reading]["found"][reading],
58
- # n=stats[heteronym]["readings"][reading]["n"],
59
- # )
60
- # for reading in stats[heteronym]["readings"].keys()
61
- # if (
62
- # stats[heteronym]["readings"][reading]["found"][reading] != 0
63
- # or reading != "<OTHER>"
64
- # )
65
- # ]
66
- # )
67
- # for heteronym in heteronyms
68
- # ]
69
 
70
- # # if reading != '<OTHER>'
 
71
 
72
- # df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})
73
 
74
- # df = df[df["readings"].str.contains("、")]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- # df["readings"] = df["readings"].str.replace("<OTHER>", "Other")
77
 
78
- # df = df.rename(columns={"readings": "readings (test corr./total)"})
79
 
80
- # df = df.sort_values("accuracy", ascending=False, ignore_index=True)
81
 
82
- # df.index += 1
83
 
84
- # return global_accuracy, df
85
 
 
86
 
87
- # @st.cache
88
- # def furigana_to_spacy(text_with_furigana):
89
- # tokens = parse_furigana(text_with_furigana)
90
- # ents = []
91
- # output_text = ""
92
- # heteronym_count = 0
93
- # for token in tokens.groups:
94
- # if isinstance(token, ttlig.RubyFrag):
95
- # if heteronym_count != 0:
96
- # output_text += ", "
97
 
98
- # ents.append(
99
- # {
100
- # "start": len(output_text),
101
- # "end": len(output_text) + len(token.text),
102
- # "label": token.furi,
103
- # }
104
- # )
105
 
106
- # output_text += token.text
107
- # heteronym_count += 1
108
- # else:
109
- # pass
110
- # return {
111
- # "text": output_text,
112
- # "ents": ents,
113
- # "title": None,
114
- # }
115
 
 
 
 
 
 
 
 
 
 
 
116
 
117
- # st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
 
 
 
 
 
 
118
 
119
- # # Input text box
120
- # st.markdown("Input a Japanese sentence:")
121
-
122
- # if "default_sentence" not in st.session_state:
123
- # st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"
124
-
125
- # input_text = st.text_area(
126
- # "Input a Japanese sentence:",
127
- # utils.remove_furigana(st.session_state.default_sentence),
128
- # label_visibility="collapsed",
129
- # )
130
 
131
- # # Yomikata prediction
132
- # dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)
133
 
134
- # # spacy-style output for the predictions
135
- # colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
136
- # spacy_dict = furigana_to_spacy(dbert_prediction)
137
- # label_colors = {
138
- # reading: colors[i % len(colors)]
139
- # for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
140
- # }
141
- # html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
142
 
143
- # if len(spacy_dict["ents"]) > 0:
144
- # st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
145
- # st.write(
146
- # f"{add_border(html)}",
147
- # unsafe_allow_html=True,
148
- # )
149
- # else:
150
- # st.markdown("**Yomikata** found no heteronyms in the input text.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  # # Dictionary + Yomikata prediction
153
  # st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")
 
11
  from yomikata.dictionary import Dictionary
12
  from yomikata.utils import parse_furigana
13
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
@st.cache
def add_border(html: str):
    """Wrap an HTML fragment in a bordered, horizontally scrollable ``<div>``.

    Args:
        html: HTML markup to embed. Every newline in it is replaced by a
            single space before wrapping.

    Returns:
        The wrapper ``<div>`` markup with the flattened fragment inside it.
    """
    flattened = " ".join(html.split("\n"))
    opening_tag = (
        '<div style="overflow-x: auto; border: 1px solid #e6e9ef; '
        "border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; "
        'display: inline-block">'
    )
    return opening_tag + flattened + "</div>"
20
 
21
 
22
def get_random_sentence():
    """Return the ``sentence`` value of one randomly sampled test-set row.

    The CSV ``test_optimized_strict_heteronyms.csv`` is read from
    ``TEST_DATA_DIR`` on every call.
    """
    # Imported inside the function, as in the original code.
    from config.config import TEST_DATA_DIR

    csv_path = Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv")
    sampled_row = pd.read_csv(csv_path).sample(1).iloc[0]
    return sampled_row.sentence
27
 
28
 
29
@st.cache
def get_dbert_prediction_and_heteronym_list(text):
    """Run a freshly constructed ``dBert`` reader over ``text``.

    Args:
        text: Input passed unchanged to ``dBert.furigana``.

    Returns:
        A 2-tuple ``(prediction, heteronyms)``: the value returned by
        ``furigana(text)`` and the reader's ``heteronyms`` attribute.
    """
    # Imported inside the function, as in the original code.
    from yomikata.dbert import dBert

    model = dBert()
    prediction = model.furigana(text)
    return prediction, model.heteronyms
35
 
 
36
 
37
def _format_reading_counts(reading_stats):
    """Render one heteronym's readings as ``reading (correct/total)`` items joined by "、"."""
    return "、".join(
        "{reading} ({correct}/{n})".format(
            reading=reading,
            correct=counts["found"][reading],
            n=counts["n"],
        )
        for reading, counts in reading_stats.items()
        # The "<OTHER>" bucket is listed only when it has correct hits.
        if reading != "<OTHER>" or counts["found"][reading] != 0
    )


@st.cache
def get_stats():
    """Load the dBert test statistics and tabulate per-heteronym performance.

    Reads ``dbert/training_performance.json`` under ``config.STORES_DIR``
    and uses its ``["test"]["accuracy"]`` and
    ``["test"]["heteronym_performance"]`` entries.

    Returns:
        A 2-tuple ``(global_accuracy, df)``. ``df`` has the columns
        ``heteronym``, ``accuracy`` and ``readings (test corr./total)``. It
        keeps only heteronyms whose readings string lists more than one
        reading, is sorted by accuracy in descending order, and is indexed
        from 1.
    """
    from config import config
    from yomikata.utils import load_dict

    stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))

    global_accuracy = stats["test"]["accuracy"]

    stats = stats["test"]["heteronym_performance"]
    # Materialise the keys so the DataFrame gets a plain list, not a dict view.
    heteronyms = list(stats)

    accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
    readings = [
        _format_reading_counts(stats[heteronym]["readings"])
        for heteronym in heteronyms
    ]

    df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})

    # A separator is present only when at least two readings were listed.
    # ``.copy()`` makes the filtered frame independent, so the column
    # assignment below is not a chained assignment on a slice.
    df = df[df["readings"].str.contains("、", regex=False)].copy()

    # ``regex=False`` pins literal matching regardless of the pandas default.
    df["readings"] = df["readings"].str.replace("<OTHER>", "Other", regex=False)

    df = df.rename(columns={"readings": "readings (test corr./total)"})

    df = df.sort_values("accuracy", ascending=False, ignore_index=True)

    # Rank rows starting at 1 instead of 0.
    df.index += 1

    return global_accuracy, df
 
 
 
 
 
 
84
 
 
 
 
 
 
 
 
 
 
85
 
86
@st.cache
def furigana_to_spacy(text_with_furigana):
    """Convert furigana-annotated text into a displaCy-style manual entity dict.

    Only fragments that are ``ttlig.RubyFrag`` instances are kept. Their
    ``text`` values are joined with ``", "`` and each becomes one entity
    labelled with the fragment's ``furi`` value.

    Args:
        text_with_furigana: Annotated text handed to ``parse_furigana``.

    Returns:
        A dict with ``"text"`` (the joined fragment texts), ``"ents"`` (a
        list of ``{"start", "end", "label"}`` dicts holding character offsets
        into ``"text"``) and ``"title"`` (always ``None``).
    """
    separator = ", "
    surfaces = []
    entities = []
    cursor = 0  # length of the output text assembled so far

    for fragment in parse_furigana(text_with_furigana).groups:
        if not isinstance(fragment, ttlig.RubyFrag):
            continue

        # Every fragment after the first is preceded by the separator.
        if surfaces:
            cursor += len(separator)

        surface = fragment.text
        entities.append(
            {
                "start": cursor,
                "end": cursor + len(surface),
                "label": fragment.furi,
            }
        )
        surfaces.append(surface)
        cursor += len(surface)

    return {
        "text": separator.join(surfaces),
        "ents": entities,
        "title": None,
    }
 
 
114
 
 
 
115
 
116
st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")

# Input text box
st.markdown("Input a Japanese sentence:")

# Seed the default sentence once per session; it is stored with furigana
# annotations, which are stripped before being shown in the text area.
if "default_sentence" not in st.session_state:
    st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"

input_text = st.text_area(
    "Input a Japanese sentence:",
    utils.remove_furigana(st.session_state.default_sentence),
    label_visibility="collapsed",
)

# Yomikata prediction
dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)

# spacy-style output for the predictions
colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
spacy_dict = furigana_to_spacy(dbert_prediction)

# De-duplicate readings in order of first appearance. Iterating a set of
# strings depends on hash randomisation, so the reading-to-colour mapping
# would otherwise change between interpreter restarts.
unique_readings = dict.fromkeys(item["label"] for item in spacy_dict["ents"])
label_colors = {
    reading: colors[i % len(colors)]
    for i, reading in enumerate(unique_readings)
}
html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})

if spacy_dict["ents"]:
    st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
    st.write(
        add_border(html),
        unsafe_allow_html=True,
    )
else:
    st.markdown("**Yomikata** found no heteronyms in the input text.")
150
 
151
  # # Dictionary + Yomikata prediction
152
  # st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")