# "Spaces: Runtime error" — Hugging Face Spaces status banner captured when
# this file was scraped; not part of the program.
from collections import Counter | |
from dragonmapper import hanzi, transcriptions | |
import jieba | |
import pandas as pd | |
import plotly.express as px | |
import re | |
import requests | |
import spacy | |
from spacy_streamlit import visualize_ner, visualize_tokens | |
#from spacy.language import Language | |
from spacy.tokens import Doc | |
import streamlit as st | |
# Global variables
# Sample passage from the Traditional-Chinese translation of "The Little
# Prince" (《小王子》), used to pre-fill the analysis text area.
DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
# App title shown at the top of the page ("AI-assisted language learning: Mandarin").
DESCRIPTION = "AI模型輔助語言學習:華語"
# Separator placed between tokens in the "enhanced text" display.
TOK_SEP = " | "
# spaCy POS tags filtered out everywhere (punctuation and symbols).
PUNCT_SYM = ["PUNCT", "SYM"]
# spaCy pipeline to load (small Chinese web model).
MODEL_NAME = "zh_core_web_sm"
# External API callers | |
def moedict_caller(word):
    """Look up *word* in the MOE dictionary web API and render the result.

    Fetches ``https://www.moedict.tw/uni/<word>.json``, normalizes the first
    heteronym's definitions into a table with columns 解釋/例句/同義詞/反義詞,
    and displays it inside a Streamlit expander.  Writes 查無結果 ("no result")
    when the lookup or parsing fails for any reason.
    """
    st.write(f"### {word}")
    try:
        # Moved inside the try (originally outside it, so any network failure
        # crashed the app) and bounded with a timeout so a slow API cannot
        # hang the page indefinitely.
        req = requests.get(f"https://www.moedict.tw/uni/{word}.json", timeout=10)
        definitions = req.json().get('heteronyms')[0].get('definitions')
        df = pd.DataFrame(definitions)
        df.fillna("---", inplace=True)
        # The API omits these fields when a sense has no data; add placeholder
        # columns so the selection below never raises KeyError.
        for col in ('example', 'synonyms', 'antonyms'):
            if col not in df.columns:
                df[col] = '---'
        df = df[['def', 'example', 'synonyms', 'antonyms']]
        df.rename(columns={
            'def': '解釋',
            'example': '例句',
            'synonyms': '同義詞',
            'antonyms': '反義詞',
        }, inplace=True)
        with st.expander("點擊 + 查看結果"):
            st.table(df)
    except Exception:
        # Narrowed from a bare except (which also swallowed SystemExit /
        # KeyboardInterrupt).  Covers network errors, non-JSON payloads,
        # and missing 'heteronyms' alike — deliberate best-effort fallback.
        st.write("查無結果")
# Custom tokenizer class | |
class JiebaTokenizer:
    """spaCy-compatible tokenizer backed by jieba word segmentation."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        # jieba.cut yields segments lazily; materialize them for Doc().
        segments = [segment for segment in jieba.cut(text)]
        # Chinese text has no whitespace between tokens.
        no_spaces = [False] * len(segments)
        return Doc(self.vocab, words=segments, spaces=no_spaces)
# Utility functions | |
def filter_tokens(doc):
    """Return *doc*'s tokens minus punctuation/symbols, emails, numbers,
    URLs and whitespace tokens."""
    def _keep(tok):
        if tok.pos_ in PUNCT_SYM:
            return False
        return not (tok.like_email or tok.like_num or tok.like_url or tok.is_space)

    return [tok for tok in doc if _keep(tok)]
def get_vocab(doc):
    """Return the unique Chinese token texts of *doc*, in first-seen order.

    Tokens containing ASCII letters or digits are dropped.  Uses
    ``dict.fromkeys`` (insertion-ordered) instead of ``list(set(...))`` so the
    vocabulary order is deterministic across runs — the UI uses ``vocab[-1]``
    as a default multiselect choice, which previously changed at random.
    """
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    texts = (
        tok.text
        for tok in filter_tokens(doc)
        if not alphanum_pattern.search(tok.text)
    )
    return list(dict.fromkeys(texts))
def get_counter(doc):
    """Count occurrences of each filtered token's text in *doc*."""
    return Counter(tok.text for tok in filter_tokens(doc))
def get_freq_fig(doc):
    """Build a Plotly bar chart of word frequencies, most frequent first."""
    counts = get_counter(doc)
    freq_df = pd.DataFrame.from_dict(counts, orient='index')
    freq_df = freq_df.reset_index()
    # from_dict puts the counts in column 0 and the words in 'index'.
    freq_df = freq_df.rename(columns={0: 'count', 'index': 'word'})
    freq_df = freq_df.sort_values(by='count', ascending=False)
    return px.bar(freq_df, x='word', y='count')
def get_level_pie(tocfl_result):
    """Build a pie chart of the TOCFL 詞條分級 (word-level) distribution."""
    level_counts = tocfl_result['詞條分級'].value_counts()
    fig = px.pie(
        tocfl_result,
        values=level_counts.values,
        names=level_counts.index,
        title='詞彙分級圓餅圖',
    )
    return fig
def load_tocfl_table(filename="./tocfl_wordlist.csv"):
    """Load the TOCFL wordlist CSV, keeping only the columns the app uses."""
    wanted = ["詞彙", "漢語拼音", "注音", "任務領域", "詞條分級"]
    return pd.read_csv(filename)[wanted]
# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}")

# Load the model
nlp = spacy.load(MODEL_NAME)

# Add pipelines to spaCy
# nlp.add_pipe("yake") # keyword extraction
# nlp.add_pipe("merge_entities") # Merge entity spans to tokens

# Select a tokenizer if the Chinese model is chosen
selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
if selected_tokenizer == "jieba-TW":
    # Swap only the tokenizer; the rest of the spaCy pipeline
    # (tagger, parser, NER) is kept as-is.
    nlp.tokenizer = JiebaTokenizer(nlp.vocab)

# Page starts from here
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)  # run the pipeline once; every feature below reuses `doc`
st.markdown("---")
st.info("請勾選以下至少一項功能")

# Feature toggles — each checkbox gates one analysis section below.
# keywords_extraction = st.sidebar.checkbox("關鍵詞分析", False) # YAKE doesn't work for Chinese texts
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
# morphology = st.sidebar.checkbox("詞形變化", True)
freq_count = st.checkbox("詞頻統計", True)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)
# Enhanced text: each sentence re-rendered with a per-token pronunciation aid
# (pinyin, zhuyin, or IPA).
if analyzed_text:
    st.markdown("## 增強文本")
    pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
    for idx, sent in enumerate(doc.sents):
        # Drop punctuation/symbols so every remaining token gets a reading.
        tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
        pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
        # Zhuyin and IPA are derived from the pinyin reading.
        sounds = pinyins
        if pronunciation == "注音符號":
            sounds = [transcriptions.pinyin_to_zhuyin(p) for p in pinyins]
        elif pronunciation == "國際音標":
            sounds = [transcriptions.pinyin_to_ipa(p) for p in pinyins]
        # BUG FIX: the pairing loop variable was named `text`, clobbering the
        # module-level `text` read from the text area; renamed to `tok_text`.
        display = [f"{tok_text} [{sound}]" for tok_text, sound in zip(tokens_text, sounds)]
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")
# Word analysis: TOCFL level breakdown plus MOE-dictionary definitions
# for user-selected words.
if defs_examples:
    st.markdown("## 單詞解析")
    vocab = get_vocab(doc)
    if vocab:
        # Keep only wordlist rows whose 詞彙 appears in this document.
        tocfl_table = load_tocfl_table()
        filt = tocfl_table['詞彙'].isin(vocab)
        tocfl_res = tocfl_table[filt]
        st.markdown("### 華語詞彙分級")
        fig = get_level_pie(tocfl_res)
        st.plotly_chart(fig, use_container_width=True)
        with st.expander("點擊 + 查看結果"):
            st.table(tocfl_res)
        st.markdown("---")
        st.markdown("### 單詞解釋與例句")
        # Default multiselect choice is the last vocabulary item.
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[-1])
        for w in selected_words:
            # One dictionary lookup (network call) per selected word.
            moedict_caller(w)
# Word-frequency section: top-K list plus a frequency bar chart.
if freq_count:
    st.markdown("## 詞頻統計")
    counter = get_counter(doc)
    if counter:
        # BUG FIX: st.slider errored when the hard-coded default (5) exceeded
        # the max (len(counter)), or when the document had no countable
        # tokens at all (max of 0 < min of 1).
        default_k = min(5, len(counter))
        topK = st.slider('請選擇前K個高頻詞', 1, len(counter), default_k)
        most_common = counter.most_common(topK)
        st.write(most_common)
        st.markdown("---")
        fig = get_freq_fig(doc)
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.write("查無結果")
# Named-entity section: displaCy-style highlighting via spacy-streamlit.
if ner_viz:
    entity_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=entity_labels, show_table=False, title="命名實體")
# Token-feature section: per-token table of POS, fine-grained tag,
# dependency relation, and syntactic head.
if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")