Spaces:

passaglia
/

yomikata-demo

Build error

App Files Files Community

Sam Passaglia committed on Feb 20, 2023

Commit

8061dc1

1 Parent(s): 37b2b22

minor

Browse files

Files changed (1) hide show

app.py +107 -108

app.py CHANGED Viewed

@@ -11,143 +11,142 @@ from yomikata import utils
 from yomikata.dictionary import Dictionary
 from yomikata.utils import parse_furigana
-st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
-# @st.cache
-# def add_border(html: str):
-#     WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">{}</div>"""
-#     html = html.replace("\n", " ")
-#     return WRAPPER.format(html)
-# def get_random_sentence():
-#     from config.config import TEST_DATA_DIR
-#     df = pd.read_csv(Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv"))
-#     return df.sample(1).iloc[0].sentence
-# @st.cache
-# def get_dbert_prediction_and_heteronym_list(text):
-#     from yomikata.dbert import dBert
-#     reader = dBert()
-#     return reader.furigana(text), reader.heteronyms
-# @st.cache
-# def get_stats():
-#     from config import config
-#     from yomikata.utils import load_dict
-#     stats = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
-#     global_accuracy = stats["test"]["accuracy"]
-#     stats = stats["test"]["heteronym_performance"]
-#     heteronyms = stats.keys()
-#     accuracy = [stats[heteronym]["accuracy"] for heteronym in heteronyms]
-#     readings = [
-#         "、".join(
-#             [
-#                 "{reading} ({correct}/{n})".format(
-#                     reading=reading,
-#                     correct=stats[heteronym]["readings"][reading]["found"][reading],
-#                     n=stats[heteronym]["readings"][reading]["n"],
-#                 )
-#                 for reading in stats[heteronym]["readings"].keys()
-#                 if (
-#                     stats[heteronym]["readings"][reading]["found"][reading] != 0
-#                     or reading != "<OTHER>"
-#                 )
-#             ]
-#         )
-#         for heteronym in heteronyms
-#     ]
-#     # if reading != '<OTHER>'
-#     df = pd.DataFrame({"heteronym": heteronyms, "accuracy": accuracy, "readings": readings})
-#     df = df[df["readings"].str.contains("、")]
-#     df["readings"] = df["readings"].str.replace("<OTHER>", "Other")
-#     df = df.rename(columns={"readings": "readings (test corr./total)"})
-#     df = df.sort_values("accuracy", ascending=False, ignore_index=True)
-#     df.index += 1
-#     return global_accuracy, df
-# @st.cache
-# def furigana_to_spacy(text_with_furigana):
-#     tokens = parse_furigana(text_with_furigana)
-#     ents = []
-#     output_text = ""
-#     heteronym_count = 0
-#     for token in tokens.groups:
-#         if isinstance(token, ttlig.RubyFrag):
-#             if heteronym_count != 0:
-#                 output_text += ", "
-#             ents.append(
-#                 {
-#                     "start": len(output_text),
-#                     "end": len(output_text) + len(token.text),
-#                     "label": token.furi,
-#                 }
-#             )
-#             output_text += token.text
-#             heteronym_count += 1
-#         else:
-#             pass
-#     return {
-#         "text": output_text,
-#         "ents": ents,
-#         "title": None,
-#     }
-# st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
-# # Input text box
-# st.markdown("Input a Japanese sentence:")
-# if "default_sentence" not in st.session_state:
-#     st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"
-# input_text = st.text_area(
-#     "Input a Japanese sentence:",
-#     utils.remove_furigana(st.session_state.default_sentence),
-#     label_visibility="collapsed",
-# )
-# # Yomikata prediction
-# dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)
-# # spacy-style output for the predictions
-# colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
-# spacy_dict = furigana_to_spacy(dbert_prediction)
-# label_colors = {
-#     reading: colors[i % len(colors)]
-#     for i, reading in enumerate(set([item["label"] for item in spacy_dict["ents"]]))
-# }
-# html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
-# if len(spacy_dict["ents"]) > 0:
-#     st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
-#     st.write(
-#         f"{add_border(html)}",
-#         unsafe_allow_html=True,
-#     )
-# else:
-#     st.markdown("**Yomikata** found no heteronyms in the input text.")
 # # Dictionary + Yomikata prediction
 # st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")

 from yomikata.dictionary import Dictionary
 from yomikata.utils import parse_furigana
+@st.cache
+def add_border(html: str):
+    """Wrap an HTML snippet in a bordered, horizontally scrollable box.
+
+    Every newline in ``html`` is replaced by a single space before the
+    snippet is placed inside the wrapper ``<div>``. Returns the markup string.
+    """
+    flattened = " ".join(html.split("\n"))
+    opening_tag = '<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.5rem; padding: 1rem; margin-bottom: 1.0rem; display: inline-block">'
+    return opening_tag + flattened + "</div>"
+def get_random_sentence():
+    """Return the ``sentence`` value of one randomly sampled test-set row.
+
+    The rows are read from ``test_optimized_strict_heteronyms.csv`` inside
+    ``TEST_DATA_DIR``.
+    """
+    from config.config import TEST_DATA_DIR
+    csv_path = Path(TEST_DATA_DIR, "test_optimized_strict_heteronyms.csv")
+    sampled_rows = pd.read_csv(csv_path).sample(1)
+    chosen_row = sampled_rows.iloc[0]
+    return chosen_row.sentence
+@st.cache(allow_output_mutation=True)
+def _load_dbert_reader():
+    """Construct the dBert reader once and reuse it for every input text."""
+    from yomikata.dbert import dBert
+    # allow_output_mutation=True makes Streamlit return the same object
+    # without hashing it on each call; the reader is a shared resource,
+    # not immutable data.
+    return dBert()
+@st.cache
+def get_dbert_prediction_and_heteronym_list(text):
+    """Run the dBert reader on ``text``.
+
+    Returns a tuple ``(reader.furigana(text), reader.heteronyms)``. Results
+    are cached per input text; the reader itself comes from
+    ``_load_dbert_reader`` so it is not rebuilt for each new text.
+    """
+    reader = _load_dbert_reader()
+    return reader.furigana(text), reader.heteronyms
+@st.cache
+def get_stats():
+    """Load the stored dBert test performance and tabulate it per heteronym.
+
+    Reads ``dbert/training_performance.json`` under ``config.STORES_DIR``.
+
+    Returns:
+        A tuple ``(global_accuracy, df)``. ``global_accuracy`` is the value
+        at ``["test"]["accuracy"]``. ``df`` has one row per heteronym with
+        columns ``heteronym``, ``accuracy`` and
+        ``readings (test corr./total)``, sorted by descending accuracy and
+        indexed from 1. Heteronyms whose readings summary contains no "、"
+        separator (fewer than two listed readings) are left out.
+    """
+    from config import config
+    from yomikata.utils import load_dict
+    performance = load_dict(Path(config.STORES_DIR, "dbert/training_performance.json"))
+    global_accuracy = performance["test"]["accuracy"]
+    stats = performance["test"]["heteronym_performance"]
+    def summarize_readings(reading_stats):
+        # One "reading (correct/total)" entry per reading; the "<OTHER>"
+        # bucket is listed only when it was correctly found at least once.
+        return "、".join(
+            f"{reading} ({info['found'][reading]}/{info['n']})"
+            for reading, info in reading_stats.items()
+            if info["found"][reading] != 0 or reading != "<OTHER>"
+        )
+    # Materialise the keys so the DataFrame column is a plain list, not a view.
+    heteronyms = list(stats)
+    df = pd.DataFrame(
+        {
+            "heteronym": heteronyms,
+            "accuracy": [stats[heteronym]["accuracy"] for heteronym in heteronyms],
+            "readings": [summarize_readings(stats[heteronym]["readings"]) for heteronym in heteronyms],
+        }
+    )
+    # .copy() detaches the filtered rows so the assignment below does not
+    # write into a slice of the original frame (SettingWithCopyWarning).
+    df = df[df["readings"].str.contains("、")].copy()
+    # regex=False: "<OTHER>" is a literal; do not depend on the pandas default.
+    df["readings"] = df["readings"].str.replace("<OTHER>", "Other", regex=False)
+    df = df.rename(columns={"readings": "readings (test corr./total)"})
+    df = df.sort_values("accuracy", ascending=False, ignore_index=True)
+    df.index += 1
+    return global_accuracy, df
+@st.cache
+def furigana_to_spacy(text_with_furigana):
+    """Convert furigana-annotated text into a displaCy-style entity dict.
+
+    Only ``ttlig.RubyFrag`` groups from ``parse_furigana`` are kept. Their
+    base texts are joined with ", " into the output text, and each one
+    becomes an entity whose label is its furigana. Returns a dict with the
+    keys ``text``, ``ents`` and ``title`` (always ``None``).
+    """
+    entities = []
+    rendered = ""
+    for fragment in parse_furigana(text_with_furigana).groups:
+        if not isinstance(fragment, ttlig.RubyFrag):
+            # Fragments without ruby are not shown in the output.
+            continue
+        if entities:
+            # Separate this fragment from the one emitted before it.
+            rendered += ", "
+        span_start = len(rendered)
+        entities.append(
+            {
+                "start": span_start,
+                "end": span_start + len(fragment.text),
+                "label": fragment.furi,
+            }
+        )
+        rendered += fragment.text
+    return {"text": rendered, "ents": entities, "title": None}
+st.title("Yomikata: Disambiguate Japanese Heteronyms with a BERT model")
+# Input text box, pre-filled with the default sentence stripped of its furigana
+st.markdown("Input a Japanese sentence:")
+if "default_sentence" not in st.session_state:
+    st.session_state.default_sentence = "え、{人間/にんげん}というものかい? {人間/にんげん}というものは{角/つの}の{生/は}えない、{生白/なまじろ}い{顔/かお}や{手足/てあし}をした、{何/なん}ともいわれず{気味/きみ}の{悪/わる}いものだよ。"
+input_text = st.text_area(
+    "Input a Japanese sentence:",
+    utils.remove_furigana(st.session_state.default_sentence),
+    label_visibility="collapsed",
+)
+# Yomikata prediction
+dbert_prediction, heteronyms = get_dbert_prediction_and_heteronym_list(input_text)
+# spacy-style output for the predictions
+colors = ["#85DCDF", "#DF85DC", "#DCDF85", "#85ABDF"]
+spacy_dict = furigana_to_spacy(dbert_prediction)
+# dict.fromkeys de-duplicates the labels in order of first appearance, so a
+# given input always gets the same colours (set iteration order is not stable).
+unique_readings = dict.fromkeys(item["label"] for item in spacy_dict["ents"])
+label_colors = {
+    reading: colors[i % len(colors)]
+    for i, reading in enumerate(unique_readings)
+}
+html = spacy.displacy.render(spacy_dict, style="ent", manual=True, options={"colors": label_colors})
+if spacy_dict["ents"]:
+    st.markdown("**Yomikata** found and disambiguated the following heteronyms:")
+    st.write(
+        add_border(html),
+        unsafe_allow_html=True,
+    )
+else:
+    st.markdown("**Yomikata** found no heteronyms in the input text.")
 # # Dictionary + Yomikata prediction
 # st.markdown("**Yomikata** can be coupled with a dictionary to get full furigana:")