from collections import Counter
import re

from dragonmapper import hanzi, transcriptions
import jieba
import pandas as pd
import plotly.express as px
import requests
import spacy
from spacy.tokens import Doc
from spacy_streamlit import visualize_ner, visualize_tokens
import streamlit as st
# Global variables
# Sample passage pre-filled in the input box.
DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
# Page title.
DESCRIPTION = "AI模型輔助語言學習:華語"
# Separator placed between annotated tokens when a sentence is displayed.
TOK_SEP = " | "
# Coarse part-of-speech tags for punctuation and symbols. Tokens carrying one
# of these tags are skipped by filter_tokens and by the annotated-text view.
PUNCT_SYM = ("PUNCT", "SYM")
# Name of the spaCy pipeline loaded by the page.
MODEL_NAME = "zh_core_web_sm"
# External API callers
def moedict_caller(word):
    """Look up ``word`` in MoeDict and render its definitions on the page.

    Writes a heading for the word, requests the dictionary entry as JSON and
    shows the definitions of the first heteronym as a table inside an
    expander. If the request fails, the response is not JSON, or the entry
    has no definitions, a short notice is written instead of raising.

    Args:
        word: The headword to look up.
    """
    # NOTE(review): the request originally had no scheme or host at all; this
    # is assumed to be MoeDict's public JSON endpoint — confirm before release.
    base_url = "https://www.moedict.tw/uni"
    # Source field -> column header shown to the user, in display order.
    display_names = {
        'def': '解釋',
        'example': '例句',
        'synonyms': '同義詞',
        'antonyms': '反義詞',
    }

    st.write(f"### {word}")
    try:
        req = requests.get(f"{base_url}/{word}.json", timeout=10)
        req.raise_for_status()
        heteronyms = req.json().get('heteronyms') or []
    except (requests.RequestException, ValueError):
        # Network failure, HTTP error status, or a body that is not JSON.
        heteronyms = []

    definitions = heteronyms[0].get('definitions') if heteronyms else None
    if not definitions:
        st.write("查無結果")
        return

    df = pd.DataFrame(definitions)
    # Not every definition carries every field: create the missing columns
    # and fill the remaining gaps with the same placeholder.
    df = df.reindex(columns=list(display_names), fill_value="---")
    df = df.fillna("---")
    df = df.rename(columns=display_names)
    with st.expander("點擊 + 查看結果"):
        st.table(df)
# Custom tokenizer class
class JiebaTokenizer:
    """Tokenizer that segments text with jieba and returns a spaCy ``Doc``."""

    def __init__(self, vocab):
        # Vocabulary passed to every Doc this tokenizer creates.
        self.vocab = vocab

    def __call__(self, text):
        """Segment ``text`` with jieba and wrap the pieces in a ``Doc``."""
        # jieba.cut returns a generator; materialise it so it can be measured
        # and handed to Doc.
        pieces = list(jieba.cut(text))
        # Every token is marked as not followed by a space.
        no_space_after = [False for _ in pieces]
        return Doc(self.vocab, words=pieces, spaces=no_space_after)
# Utility functions
def filter_tokens(doc, excluded_pos=None):
    """Return the tokens of ``doc`` that should be treated as words.

    A token is dropped when its ``pos_`` is one of ``excluded_pos`` or when
    any of its ``like_email``, ``like_num``, ``like_url`` or ``is_space``
    flags is set.

    Args:
        doc: Iterable of tokens exposing the attributes listed above.
        excluded_pos: Collection of ``pos_`` values to drop. Defaults to the
            module-level ``PUNCT_SYM``.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    if excluded_pos is None:
        excluded_pos = PUNCT_SYM
    return [
        tok for tok in doc
        if tok.pos_ not in excluded_pos
        and not (tok.like_email or tok.like_num or tok.like_url or tok.is_space)
    ]
def get_vocab(doc):
    """Return the distinct word texts of ``doc``, leaving out alphanumerics.

    Tokens are first reduced with ``filter_tokens``; any remaining token
    whose text contains an ASCII letter or digit is skipped.

    Args:
        doc: The analysed document.

    Returns:
        A list of unique token texts in order of first appearance.
    """
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    words = [
        tok.text for tok in filter_tokens(doc)
        if not alphanum_pattern.search(tok.text)
    ]
    # dict.fromkeys removes duplicates while keeping first-appearance order,
    # so the list handed to the word picker is the same on every run.
    return list(dict.fromkeys(words))
def get_counter(doc):
    """Count how often each token text occurs among the filtered tokens of ``doc``."""
    return Counter(tok.text for tok in filter_tokens(doc))
def get_freq_fig(doc):
    """Build a bar chart of word frequencies for ``doc``.

    Args:
        doc: The analysed document.

    Returns:
        A Plotly figure with one bar per word, most frequent first.
    """
    counter = get_counter(doc)
    # most_common() yields (word, count) pairs already sorted by descending
    # count, and naming the columns up front keeps an empty counter valid.
    counter_df = pd.DataFrame(counter.most_common(), columns=['word', 'count'])
    fig = px.bar(counter_df, x='word', y='count')
    return fig
def get_level_pie(tocfl_result):
    """Build a pie chart of how many matched words fall into each level.

    Args:
        tocfl_result: DataFrame with a '詞條分級' column.

    Returns:
        A Plotly figure with one sector per distinct '詞條分級' value, sized
        by the number of rows carrying that value.
    """
    level = tocfl_result['詞條分級'].value_counts()
    # The counts carry everything the chart needs, so the labels and sizes
    # are passed directly instead of the full table.
    fig = px.pie(names=level.index, values=level.values)
    return fig
def load_tocfl_table(filename="./tocfl_wordlist.csv"):
    """Load the word list from ``filename``, keeping only the columns used here.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame with the columns 詞彙, 漢語拼音, 注音, 任務領域 and 詞條分級,
        in that order.
    """
    cols = "詞彙 漢語拼音 注音 任務領域 詞條分級".split()
    # Only parse the needed columns; re-select afterwards because usecols
    # keeps the order found in the file rather than the order requested.
    table = pd.read_csv(filename, usecols=cols)
    return table[cols]
# Page setting
# Page title
st.markdown(f"# {DESCRIPTION}")

# Load the model
nlp = spacy.load(MODEL_NAME)

# Add pipelines to spaCy
# nlp.add_pipe("merge_entities") # Merge entity spans to tokens

# Let the user choose between jieba segmentation and the tokenizer that ships
# with the loaded pipeline; only the jieba choice replaces nlp.tokenizer.
selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
if selected_tokenizer == "jieba-TW":
    nlp.tokenizer = JiebaTokenizer(nlp.vocab)
# Page starts from here
# Input area: heading, usage hint, and the text box pre-filled with the sample.
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)

# One checkbox per page section; each section below is rendered only when
# its box is ticked.
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
# morphology = st.sidebar.checkbox("詞形變化", True)
freq_count = st.checkbox("詞頻統計", True)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)
# Annotated text: every sentence is shown token by token, each token followed
# by its pronunciation in the selected transcription.
if analyzed_text:
    st.markdown("## 增強文本")
    pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
    for idx, sent in enumerate(doc.sents):
        tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
        # Pinyin is the base form; the other two transcriptions are derived
        # from it.
        pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
        if pronunciation == "注音符號":
            sounds = [transcriptions.pinyin_to_zhuyin(word) for word in pinyins]
        elif pronunciation == "國際音標":
            sounds = [transcriptions.pinyin_to_ipa(word) for word in pinyins]
        else:
            sounds = pinyins

        display = [
            f"{tok_text} [{sound}]"
            for tok_text, sound in zip(tokens_text, sounds)
        ]
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            # Nothing is left once punctuation and symbols are removed.
            st.write(f"{idx+1} >>> EMPTY LINE")
# Vocabulary: level breakdown of the words found in the word list, followed
# by dictionary look-ups for the words the user picks.
if defs_examples:
    st.markdown("## 單詞解析")
    vocab = get_vocab(doc)
    if vocab:
        tocfl_table = load_tocfl_table()
        filt = tocfl_table['詞彙'].isin(vocab)
        tocfl_res = tocfl_table[filt]
        st.markdown("### 華語詞彙分級")
        fig = get_level_pie(tocfl_res)
        st.plotly_chart(fig, use_container_width=True)
        # The matched rows are available on demand below the chart.
        with st.expander("點擊 + 查看結果"):
            st.table(tocfl_res)

        st.markdown("### 單詞解釋與例句")
        # The last vocabulary entry is pre-selected.
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, [vocab[-1]])
        for w in selected_words:
            moedict_caller(w)
# Word frequency: the top-K list chosen with the slider, then a chart of all
# word counts.
if freq_count:
    st.markdown("## 詞頻統計")
    counter = get_counter(doc)
    n_words = len(counter)
    if n_words:
        if n_words > 1:
            # Cap the default so it never exceeds the slider's maximum.
            topK = st.slider('請選擇前K個高頻詞', 1, n_words, min(5, n_words))
        else:
            # A slider needs a range; with a single word there is nothing
            # to choose.
            topK = 1
        most_common = counter.most_common(topK)
        st.write(most_common)
        fig = get_freq_fig(doc)
        st.plotly_chart(fig, use_container_width=True)
# Named-entity view, restricted to the labels of the pipeline's "ner" component.
if ner_viz:
    visualize_ner(
        doc,
        labels=nlp.get_pipe("ner").labels,
        show_table=False,
        title="命名實體",
    )

# Per-token attribute table.
if tok_table:
    token_attrs = ["text", "pos_", "tag_", "dep_", "head"]
    visualize_tokens(doc, attrs=token_attrs, title="斷詞特徵")