# spacy-streamlit-haowenchiang / pages / 01_🍊Mandarin.py
# jeffeux · Migrate to HF Space · d825710
from collections import Counter
from dragonmapper import hanzi, transcriptions
import jieba
import pandas as pd
import plotly.express as px
import re
import requests
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
#from spacy.language import Language
from spacy.tokens import Doc
import streamlit as st
# Global variables
# Sample passage pre-filled in the input text area.
DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
# Text rendered as the page's top-level heading.
DESCRIPTION = "AI模型輔助語言學習:華語"
# Separator placed between annotated tokens in the enhanced-text output.
TOK_SEP = " | "
# Coarse POS tags whose tokens are excluded from the analyses.
PUNCT_SYM = ["PUNCT", "SYM"]
# Name of the spaCy pipeline passed to spacy.load().
MODEL_NAME = "zh_core_web_sm"
# External API callers
def moedict_caller(word):
    """Look up *word* on moedict.tw and render its definitions on the page.

    Writes the word as a heading, requests
    ``https://www.moedict.tw/uni/<word>.json`` and shows the ``definitions``
    of the first ``heteronyms`` entry as a table inside an expander. Missing
    cells and missing optional columns are filled with ``"---"``. When the
    request fails or the response does not have the expected structure,
    "查無結果" is written instead.

    Args:
        word: The word to look up.
    """
    st.write(f"### {word}")
    optional_cols = ('example', 'synonyms', 'antonyms')
    try:
        # A timeout keeps a stalled connection from hanging the whole page.
        req = requests.get(f"https://www.moedict.tw/uni/{word}.json", timeout=10)
        definitions = req.json().get('heteronyms')[0].get('definitions')
        df = pd.DataFrame(definitions).fillna("---")
        for col in optional_cols:
            if col not in df.columns:
                df[col] = '---'
        # Selecting 'def' raises KeyError when the entry carries no definitions.
        df = df[['def', *optional_cols]].rename(columns={
            'def': '解釋',
            'example': '例句',
            'synonyms': '同義詞',
            'antonyms': '反義詞',
        })
    except (requests.RequestException, ValueError, TypeError,
            KeyError, IndexError, AttributeError):
        # Network failure, non-JSON body, or unexpected JSON shape.
        st.write("查無結果")
        return
    # Rendering stays outside the try so genuine display errors are not hidden.
    with st.expander("點擊 + 查看結果"):
        st.table(df)
# Custom tokenizer class
class JiebaTokenizer:
    """Callable tokenizer that segments text with jieba and returns a spaCy ``Doc``."""

    def __init__(self, vocab):
        # Vocabulary handed to every Doc this tokenizer builds.
        self.vocab = vocab

    def __call__(self, text):
        """Segment *text* with ``jieba.cut`` and wrap the pieces in a ``Doc``."""
        # jieba.cut returns a generator; materialise it because the number of
        # segments is needed to build the matching ``spaces`` list.
        segments = [*jieba.cut(text)]
        # Every token is declared as not followed by a space.
        no_trailing_space = [False for _ in segments]
        return Doc(self.vocab, words=segments, spaces=no_trailing_space)
# Utility functions
def filter_tokens(doc):
    """Return the tokens of *doc* that survive the filters, in document order.

    A token is dropped when its ``pos_`` is listed in ``PUNCT_SYM`` or when any
    of ``like_email``, ``like_num``, ``like_url`` or ``is_space`` is truthy.
    """
    kept = []
    for tok in doc:
        if tok.pos_ in PUNCT_SYM:
            continue
        if tok.like_email or tok.like_num or tok.like_url or tok.is_space:
            continue
        kept.append(tok)
    return kept
def get_vocab(doc):
    """Return the distinct token texts of *doc* that contain no ASCII letter or digit.

    Tokens are first reduced with ``filter_tokens``. The result keeps each
    text once, in order of first appearance, so the list is the same on every
    run for the same input (a plain ``set`` would order strings differently
    from one process to the next).

    Returns:
        A list of unique token strings.
    """
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique_texts = dict.fromkeys(
        tok.text for tok in clean_tokens
        if not alphanum_pattern.search(tok.text)
    )
    return list(unique_texts)
def get_counter(doc):
    """Count how often each token text occurs among the filtered tokens of *doc*.

    Returns:
        A ``Counter`` mapping token text to its number of occurrences.
    """
    return Counter(tok.text for tok in filter_tokens(doc))
def get_freq_fig(doc):
    """Build a bar chart of token frequencies for *doc*, most frequent first.

    The frame is built from ``Counter.most_common()`` with explicit column
    names, so an input with no countable tokens yields an empty frame that
    still has ``word`` and ``count`` columns instead of raising ``KeyError``.

    Returns:
        The Plotly Express bar figure with ``word`` on x and ``count`` on y.
    """
    counter = get_counter(doc)
    # most_common() with no argument returns every (word, count) pair,
    # ordered from the highest count to the lowest.
    counter_df = pd.DataFrame(counter.most_common(), columns=['word', 'count'])
    return px.bar(counter_df, x='word', y='count')
def get_level_pie(tocfl_result):
    """Build a pie chart of how many rows of *tocfl_result* fall in each '詞條分級' value.

    Returns:
        The Plotly Express pie figure.
    """
    level_counts = tocfl_result['詞條分級'].value_counts()
    return px.pie(
        tocfl_result,
        values=level_counts.values,
        names=level_counts.index,
        title='詞彙分級圓餅圖',
    )
@st.cache
def load_tocfl_table(filename="./tocfl_wordlist.csv"):
    """Read the CSV at *filename* and keep only the five columns used by the page.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A DataFrame with the columns 詞彙, 漢語拼音, 注音, 任務領域, 詞條分級,
        in that order.
    """
    wanted_columns = "詞彙 漢語拼音 注音 任務領域 詞條分級".split()
    return pd.read_csv(filename)[wanted_columns]
# Page setting
st.set_page_config(page_icon="🤠", layout="wide", initial_sidebar_state="auto")
st.markdown(f"# {DESCRIPTION}")

# Load the spaCy pipeline named by MODEL_NAME.
nlp = spacy.load(MODEL_NAME)

# Optional pipeline components, currently disabled:
# nlp.add_pipe("yake")            # keyword extraction
# nlp.add_pipe("merge_entities")  # merge entity spans into tokens

# Let the user choose the word segmenter; the pipeline's tokenizer is only
# replaced when jieba is selected.
selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
if selected_tokenizer == "jieba-TW":
    nlp.tokenizer = JiebaTokenizer(nlp.vocab)

# Text input and analysis of the entered text.
st.markdown("## 待分析文本")
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

# Feature toggles; each flag gates one section further down the page.
st.info("請勾選以下至少一項功能")
# Disabled: keyword extraction checkbox (YAKE doesn't work for Chinese texts).
# Disabled: morphology checkbox.
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
freq_count = st.checkbox("詞頻統計", True)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)
# Enhanced text: print each sentence with a pronunciation guide after every token.
if analyzed_text:
    st.markdown("## 增強文本")
    pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
    for sent_no, sent in enumerate(doc.sents, start=1):
        # Tokens tagged as punctuation or symbols are not annotated.
        tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
        # Pinyin is always computed first; the other transcriptions are derived from it.
        pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
        if pronunciation == "注音符號":
            sounds = [transcriptions.pinyin_to_zhuyin(p) for p in pinyins]
        elif pronunciation == "國際音標":
            sounds = [transcriptions.pinyin_to_ipa(p) for p in pinyins]
        else:
            sounds = pinyins
        # Comprehension variables stay local, so the module-level ``text``
        # (the user's input) is no longer overwritten by a loop variable.
        display = [
            f"{token_text} [{sound}]"
            for token_text, sound in zip(tokens_text, sounds)
        ]
        if display:
            st.write(f"{sent_no} >>> {TOK_SEP.join(display)}")
        else:
            st.write(f"{sent_no} >>> EMPTY LINE")
# Word analysis: level breakdown from the TOCFL word list plus dictionary look-ups.
if defs_examples:
    st.markdown("## 單詞解析")
    vocab = get_vocab(doc)
    if vocab:
        # Keep only the word-list rows whose '詞彙' entry occurs in the text.
        tocfl_table = load_tocfl_table()
        tocfl_res = tocfl_table[tocfl_table['詞彙'].isin(vocab)]
        st.markdown("### 華語詞彙分級")
        st.plotly_chart(get_level_pie(tocfl_res), use_container_width=True)
        with st.expander("點擊 + 查看結果"):
            st.table(tocfl_res)
        st.markdown("---")

        # Dictionary look-up for the chosen words; the last vocabulary item
        # is preselected.
        st.markdown("### 單詞解釋與例句")
        for w in st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[-1]):
            moedict_caller(w)
# Word frequency: top-K list plus a bar chart of all counted words.
if freq_count:
    st.markdown("## 詞頻統計")
    counter = get_counter(doc)
    distinct_words = len(counter)
    if distinct_words == 0:
        # Nothing to count (e.g. empty input or punctuation only); a slider
        # with max 0 would be invalid, so show a notice instead.
        st.info("沒有可統計的詞彙")
    else:
        if distinct_words > 1:
            # Clamp the default so it never exceeds the slider's maximum
            # when the text has fewer than 5 distinct words.
            topK = st.slider('請選擇前K個高頻詞', 1, distinct_words, min(5, distinct_words))
        else:
            # A slider needs a range; with a single word there is no choice to make.
            topK = 1
        most_common = counter.most_common(topK)
        st.write(most_common)
        st.markdown("---")
        fig = get_freq_fig(doc)
        st.plotly_chart(fig, use_container_width=True)
# Named entities: visualise using the labels of the pipeline's "ner" component.
if ner_viz:
    visualize_ner(
        doc,
        labels=nlp.get_pipe("ner").labels,
        show_table=False,
        title="命名實體",
    )

# Token table: one row per token with the listed attributes.
if tok_table:
    token_attrs = ["text", "pos_", "tag_", "dep_", "head"]
    visualize_tokens(doc, attrs=token_attrs, title="斷詞特徵")