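# AI-assisted language learning app for Mandarin, built with Streamlit, spaCy, and jieba.
# Typical launch (filename is a placeholder): streamlit run app.py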
from collections import Counter
from dragonmapper import hanzi, transcriptions
import jieba
import pandas as pd
import plotly.express as px
import re
import requests 
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
#from spacy.language import Language
from spacy.tokens import Doc
import streamlit as st

# Global variables
DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
DESCRIPTION = "AI模型輔助語言學習:華語"
TOK_SEP = " | "
PUNCT_SYM = ["PUNCT", "SYM"]
MODEL_NAME = "zh_core_web_sm"
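# Note: the model must be installed first, e.g. `python -m spacy download zh_core_web_sm`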

# External API callers
def moedict_caller(word):
    """Query the MOE dictionary (moedict.tw) for a word and display its definitions as a table."""
    st.write(f"### {word}")
    try:
        req = requests.get(f"https://www.moedict.tw/uni/{word}.json", timeout=10)
        definitions = req.json().get('heteronyms')[0].get('definitions')
        df = pd.DataFrame(definitions)
        df.fillna("---", inplace=True)
        # the API omits these columns when a word has no examples/synonyms/antonyms
        for col in ['example', 'synonyms', 'antonyms']:
            if col not in df.columns:
                df[col] = '---'
        df = df[['def', 'example', 'synonyms', 'antonyms']]
        df.rename(columns={
            'def': '解釋',
            'example': '例句',
            'synonyms': '同義詞',
            'antonyms': '反義詞',
        }, inplace=True)
        with st.expander("點擊 + 查看結果"):
            st.table(df)
    except Exception:
        st.write("查無結果")
            
# Custom tokenizer class
class JiebaTokenizer:
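    """A drop-in replacement for spaCy's tokenizer that segments text with jieba."""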
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = jieba.cut(text)  # returns a generator
        tokens = list(words)  # convert the generator to a list
        spaces = [False] * len(tokens)
        doc = Doc(self.vocab, words=tokens, spaces=spaces)
        return doc
    
# Utility functions
def filter_tokens(doc):
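    """Remove punctuation/symbol tokens as well as email-, number-, URL-like and whitespace tokens."""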
    clean_tokens = [
        tok for tok in doc
        if tok.pos_ not in PUNCT_SYM
        and not tok.like_email
        and not tok.like_num
        and not tok.like_url
        and not tok.is_space
    ]
    return clean_tokens

def get_vocab(doc):
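    """Return the unique token strings, excluding any token that contains Latin letters or digits."""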
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_tokens_text = [tok.text for tok in clean_tokens if not alphanum_pattern.search(tok.text)]
    vocab = list(set(clean_tokens_text))
    return vocab

def get_counter(doc):
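    """Count the occurrences of each filtered token's surface form."""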
    clean_tokens = filter_tokens(doc)
    tokens = [token.text for token in clean_tokens]
    counter = Counter(tokens)
    return counter

def get_freq_fig(doc):
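    """Build a bar chart of token frequencies, sorted from most to least frequent."""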
    counter = get_counter(doc)
    counter_df = (
        pd.DataFrame.from_dict(counter, orient='index')
        .reset_index()
        .rename(columns={0: 'count', 'index': 'word'})
        .sort_values(by='count', ascending=False)
    )
    fig = px.bar(counter_df, x='word', y='count')
    return fig

def get_level_pie(tocfl_result):
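    """Build a pie chart of the TOCFL proficiency levels ('詞條分級') among the matched words."""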
    level = tocfl_result['詞條分級'].value_counts()
    fig = px.pie(values=level.values,
                 names=level.index,
                 title='詞彙分級圓餅圖')
    return fig

@st.cache  # deprecated in newer Streamlit releases, where st.cache_data is the replacement
def load_tocfl_table(filename="./tocfl_wordlist.csv"):
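    """Load the TOCFL word list and keep only the columns shown in the app."""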
    table = pd.read_csv(filename)
    cols = "詞彙 漢語拼音 注音 任務領域 詞條分級".split()
    table = table[cols]
    return table
       
# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}") 

# Load the model
nlp = spacy.load(MODEL_NAME)
          
# Add pipelines to spaCy
# nlp.add_pipe("yake") # keyword extraction
# nlp.add_pipe("merge_entities") # Merge entity spans to tokens

# Let the user choose a tokenizer (jieba vs. spaCy's default Chinese tokenizer)
selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
if selected_tokenizer == "jieba-TW":
    nlp.tokenizer = JiebaTokenizer(nlp.vocab)

# Page starts from here
st.markdown("## 待分析文本")     
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

st.info("請勾選以下至少一項功能")
# keywords_extraction = st.sidebar.checkbox("關鍵詞分析", False) # YAKE doesn't work for Chinese texts
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
# morphology = st.sidebar.checkbox("詞形變化", True)
freq_count = st.checkbox("詞頻統計", True)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)

if analyzed_text:
    st.markdown("## 增強文本") 
    pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
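    # dragonmapper converts hanzi to pinyin; zhuyin and IPA are then derived from the pinyin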
    for idx, sent in enumerate(doc.sents):
        tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
        pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
        sounds = pinyins
        if pronunciation == "注音符號":
            zhuyins = [transcriptions.pinyin_to_zhuyin(word) for word in pinyins]
            sounds = zhuyins
        elif pronunciation == "國際音標":
            ipas = [transcriptions.pinyin_to_ipa(word) for word in pinyins]
            sounds = ipas

        display = []
        for word, sound in zip(tokens_text, sounds):
            res = f"{word} [{sound}]"
            display.append(res)
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解析")
    vocab = get_vocab(doc)
    if vocab:
        tocfl_table = load_tocfl_table()
        filt = tocfl_table['詞彙'].isin(vocab)
        tocfl_res = tocfl_table[filt]
        st.markdown("### 華語詞彙分級")
        fig = get_level_pie(tocfl_res)
        st.plotly_chart(fig, use_container_width=True)

        with st.expander("點擊 + 查看結果"):
            st.table(tocfl_res)
        st.markdown("---")
        st.markdown("### 單詞解釋與例句")
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[-1])
        for w in selected_words:
            moedict_caller(w)                        

if freq_count:  
    st.markdown("## 詞頻統計")  
    counter = get_counter(doc)
    topK = st.slider('請選擇前K個高頻詞', 1, len(counter), min(5, len(counter)))
    most_common = counter.most_common(topK)
    st.write(most_common)
    st.markdown("---")

    fig = get_freq_fig(doc)
    st.plotly_chart(fig, use_container_width=True)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")
    
if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")