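"""Streamlit app: AI-assisted language learning for Japanese.

Analyzes Japanese text with spaCy/GiNZA, extracts keywords with YAKE,
looks up word senses and example sentences via the Jisho API, and
visualizes named entities and token attributes with spacy-streamlit.
"""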
from jisho_api.word import Word
from jisho_api.sentence import Sentence
import pandas as pd
import re
import requests
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
#from spacy.language import Language
from spacy.tokens import Doc
import spacy_ke
import streamlit as st
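
# Environment note (assumption, not declared in this file): the "ja_ginza"
# model typically comes from the ginza/ja_ginza pip packages, and importing
# spacy_ke is what makes the "yake" keyword-extraction pipe available below.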
# Global variables
DEFAULT_TEXT = """それまで、ぼくはずっとひとりぼっちだった。だれともうちとけられないまま、6年まえ、ちょっとおかしくなって、サハラさばくに下りた。ぼくのエンジンのなかで、なにかがこわれていた。ぼくには、みてくれるひとも、おきゃくさんもいなかったから、なおすのはむずかしいけど、ぜんぶひとりでなんとかやってみることにした。それでぼくのいのちがきまってしまう。のみ水は、たった7日ぶんしかなかった。
1日めの夜、ぼくはすなの上でねむった。ひとのすむところは、はるかかなただった。海のどまんなか、いかだでさまよっているひとよりも、もっとひとりぼっち。だから、ぼくがびっくりしたのも、みんなわかってくれるとおもう。じつは、あさ日がのぼるころ、ぼくは、ふしぎなかわいいこえでおこされたんだ。
「ごめんください……ヒツジの絵をかいて!」
「えっ?」
「ぼくにヒツジの絵をかいて……」
『星の王子さま』"""
DESCRIPTION = "AI模型輔助語言學習:日語"
TOK_SEP = " | "
MODEL_NAME = "ja_ginza"

# External API callers
def parse_jisho_senses(word):
    """Look up a word on Jisho and show up to three senses of its first common entry."""
    res = Word.request(word)
    response = res.dict()
    if response["meta"]["status"] == 200:
        data = response["data"]
        commons = [d for d in data if d["is_common"]]
        if commons:
            common = commons[0]  # Only keep the first entry that is common
            senses = common["senses"]
            if len(senses) > 3:
                senses = senses[:3]
            with st.container():
                for idx, sense in enumerate(senses):
                    eng_def = "; ".join(sense["english_definitions"])
                    pos = "/".join(sense["parts_of_speech"])
                    st.write(f"Sense {idx+1}: {eng_def} ({pos})")
        else:
            st.info("Found no common words on Jisho!")
    else:
        st.error("Can't get response from Jisho!")

def parse_jisho_sentences(word):
    """Look up a word on Jisho and show up to three example sentences with translations."""
    res = Sentence.request(word)
    try:
        response = res.dict()
        data = response["data"]
        if len(data) > 3:
            sents = data[:3]
        else:
            sents = data
        with st.container():
            for idx, sent in enumerate(sents):
                eng = sent["en_translation"]
                jap = sent["japanese"]
                st.write(f"Sentence {idx+1}: {jap}")
                st.write(f"({eng})")
    except Exception:  # the request may return nothing for rare words
        st.info("Found no results on Jisho!")

# Utility functions
def create_jap_df(tokens):
    """Render a table of surface forms, readings, inflections, and lemmas, with a CSV download."""
    seen_texts = []
    filtered_tokens = []
    for tok in tokens:
        if tok.text not in seen_texts:
            filtered_tokens.append(tok)
            seen_texts.append(tok.text)  # remember the surface form so duplicates are skipped
    df = pd.DataFrame(
        {
            "單詞": [tok.text for tok in filtered_tokens],
            "發音": ["/".join(tok.morph.get("Reading")) for tok in filtered_tokens],
            "詞形變化": ["/".join(tok.morph.get("Inflection")) for tok in filtered_tokens],
            "原形": [tok.lemma_ for tok in filtered_tokens],
            # "正規形": [tok.norm_ for tok in filtered_tokens],
        }
    )
    st.dataframe(df)
    csv = df.to_csv().encode('utf-8')
    st.download_button(
        label="下載表格",
        data=csv,
        file_name='jap_forms.csv',
    )

def filter_tokens(doc):
    """Drop punctuation, symbols, emails, URLs, numbers, and whitespace tokens."""
    clean_tokens = [tok for tok in doc if tok.pos_ not in ["PUNCT", "SYM"]]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_email]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_url]
    clean_tokens = [tok for tok in clean_tokens if not tok.like_num]
    clean_tokens = [tok for tok in clean_tokens if not tok.is_punct]
    clean_tokens = [tok for tok in clean_tokens if not tok.is_space]
    return clean_tokens

def create_kw_section(doc):
    """Show a slider-controlled list of YAKE keywords ranked by score."""
    st.markdown("## 關鍵詞分析")
    kw_num = st.slider("請選擇關鍵詞數量", 1, 10, 3)
    kws2scores = {keyword: score for keyword, score in doc._.extract_keywords(n=kw_num)}
    kws2scores = sorted(kws2scores.items(), key=lambda x: x[1], reverse=True)
    count = 1
    for keyword, score in kws2scores:
        rounded_score = round(score, 3)
        st.write(f"{count} >>> {keyword} ({rounded_score})")
        count += 1

# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}")
# Load the model
nlp = spacy.load(MODEL_NAME)
# Add pipelines to spaCy
nlp.add_pipe("yake") # keyword extraction
# nlp.add_pipe("merge_entities") # Merge entity spans to tokens
# Page starts from here
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")
st.info("請勾選以下至少一項功能")
keywords_extraction = st.checkbox("關鍵詞分析", False)
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
morphology = st.checkbox("詞形變化", False)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)
if keywords_extraction:
    create_kw_section(doc)

if analyzed_text:
    st.markdown("## 分析後文本")
    for idx, sent in enumerate(doc.sents):
        clean_tokens = [tok for tok in sent if tok.pos_ not in ["PUNCT", "SYM"]]
        tokens_text = [tok.text for tok in clean_tokens]
        readings = ["/".join(tok.morph.get("Reading")) for tok in clean_tokens]
        display = [f"{tok_text} [{reading}]" for tok_text, reading in zip(tokens_text, readings)]
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解釋與例句")
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_lemmas = [tok.lemma_ for tok in clean_tokens if not alphanum_pattern.search(tok.lemma_)]
    vocab = list(set(clean_lemmas))
    if vocab:
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[0:3])
        for w in selected_words:
            st.write(f"### {w}")
            with st.expander("點擊 + 檢視結果"):
                parse_jisho_senses(w)
                parse_jisho_sentences(w)

if morphology:
    st.markdown("## 詞形變化")
    # Collect inflected forms
    inflected_forms = [tok for tok in doc if tok.tag_.startswith("動詞") or tok.tag_.startswith("形")]
    if inflected_forms:
        create_jap_df(inflected_forms)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")

if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")
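
# To run this app locally (assuming the file is saved as app.py):
#   streamlit run app.py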