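# AI-assisted language learning app for Mandarin, built with Streamlit, spaCy, and jieba.
# Typical launch (filename is a placeholder): streamlit run app.py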
from collections import Counter
from dragonmapper import hanzi, transcriptions
import jieba
import pandas as pd
import plotly.express as px
import re
import requests 
import spacy
from spacy_streamlit import visualize_ner, visualize_tokens
#from spacy.language import Language
from spacy.tokens import Doc
import streamlit as st

# Global variables
DEFAULT_TEXT = "我如此的過著孤單的生活,我沒有一個可以真正跟他談話的人,一直到六年前,我在撒哈拉沙漠飛機故障的時候。我的發動機裡有些東西壞了。而由於我身邊沒有機械師,也沒有乘客,我準備獨自去嘗試一次困難的修理。這對我是生死問題。我連足夠喝八天的水都沒有。頭一天晚上我在離開有人居住的地方一千英里的沙地上睡覺。我比一位漂流在汪洋大海裡的木筏上面的遇難者更孤單。當天剛破曉的時候,我被一種奇異的小聲音叫醒,你可以想像到,這時我是多麼的驚訝。那聲音說:「請你﹒﹒﹒給我畫一隻綿羊!」「哪!」「給我畫一隻綿羊!」《小王子》"
DESCRIPTION = "AI模型輔助語言學習:華語"
TOK_SEP = " | "
PUNCT_SYM = ["PUNCT", "SYM"]
MODEL_NAME = "zh_core_web_sm"
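# Note: the model must be installed first, e.g. `python -m spacy download zh_core_web_sm`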

# External API callers
def moedict_caller(word):
    """Query the MOE dictionary (moedict.tw) for a word and display its definitions as a table."""
    st.write(f"### {word}")
    try:
        req = requests.get(f"https://www.moedict.tw/uni/{word}.json", timeout=10)
        definitions = req.json().get('heteronyms')[0].get('definitions')
        df = pd.DataFrame(definitions)
        df.fillna("---", inplace=True)
        # the API omits these columns when a word has no examples/synonyms/antonyms
        for col in ['example', 'synonyms', 'antonyms']:
            if col not in df.columns:
                df[col] = '---'
        df = df[['def', 'example', 'synonyms', 'antonyms']]
        df.rename(columns={
            'def': '解釋',
            'example': '例句',
            'synonyms': '同義詞',
            'antonyms': '反義詞',
        }, inplace=True)
        with st.expander("點擊 + 查看結果"):
            st.table(df)
    except Exception:
        st.write("查無結果")
            
# Custom tokenizer class
class JiebaTokenizer:
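    """A drop-in replacement for spaCy's tokenizer that segments text with jieba."""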
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = jieba.cut(text)  # returns a generator
        tokens = list(words)  # convert the generator to a list
        spaces = [False] * len(tokens)
        doc = Doc(self.vocab, words=tokens, spaces=spaces)
        return doc
    
# Utility functions
def filter_tokens(doc):
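    """Remove punctuation/symbol tokens as well as email-, number-, URL-like and whitespace tokens."""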
    clean_tokens = [
        tok for tok in doc
        if tok.pos_ not in PUNCT_SYM
        and not tok.like_email
        and not tok.like_num
        and not tok.like_url
        and not tok.is_space
    ]
    return clean_tokens

def get_vocab(doc):
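    """Return the unique token strings, excluding any token that contains Latin letters or digits."""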
    clean_tokens = filter_tokens(doc)
    alphanum_pattern = re.compile(r"[a-zA-Z0-9]")
    clean_tokens_text = [tok.text for tok in clean_tokens if not alphanum_pattern.search(tok.text)]
    vocab = list(set(clean_tokens_text))
    return vocab

def get_counter(doc):
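    """Count the occurrences of each filtered token's surface form."""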
    clean_tokens = filter_tokens(doc)
    tokens = [token.text for token in clean_tokens]
    counter = Counter(tokens)
    return counter

def get_freq_fig(doc):
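    """Build a bar chart of token frequencies, sorted from most to least frequent."""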
    counter = get_counter(doc)
    counter_df = (
        pd.DataFrame.from_dict(counter, orient='index')
        .reset_index()
        .rename(columns={0: 'count', 'index': 'word'})
        .sort_values(by='count', ascending=False)
    )
    fig = px.bar(counter_df, x='word', y='count')
    return fig

def get_level_pie(tocfl_result):
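    """Build a pie chart of the TOCFL proficiency levels ('詞條分級') among the matched words."""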
    level = tocfl_result['詞條分級'].value_counts()
    fig = px.pie(values=level.values,
                 names=level.index,
                 title='詞彙分級圓餅圖')
    return fig

@st.cache  # deprecated in newer Streamlit releases, where st.cache_data is the replacement
def load_tocfl_table(filename="./tocfl_wordlist.csv"):
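    """Load the TOCFL word list and keep only the columns shown in the app."""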
    table = pd.read_csv(filename)
    cols = "詞彙 漢語拼音 注音 任務領域 詞條分級".split()
    table = table[cols]
    return table
       
# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}") 

# Load the model
nlp = spacy.load(MODEL_NAME)
          
# Add pipelines to spaCy
# nlp.add_pipe("yake") # keyword extraction
# nlp.add_pipe("merge_entities") # Merge entity spans to tokens

# Let the user choose a tokenizer (jieba vs. spaCy's default Chinese tokenizer)
selected_tokenizer = st.radio("請選擇斷詞模型", ["jieba-TW", "spaCy"])
if selected_tokenizer == "jieba-TW":
    nlp.tokenizer = JiebaTokenizer(nlp.vocab)

# Page starts from here
st.markdown("## 待分析文本")     
st.info("請在下面的文字框輸入文本並按下Ctrl + Enter以更新分析結果")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")

st.info("請勾選以下至少一項功能")
# keywords_extraction = st.sidebar.checkbox("關鍵詞分析", False) # YAKE doesn't work for Chinese texts
analyzed_text = st.checkbox("增強文本", True)
defs_examples = st.checkbox("單詞解析", True)
# morphology = st.sidebar.checkbox("詞形變化", True)
freq_count = st.checkbox("詞頻統計", True)
ner_viz = st.checkbox("命名實體", True)
tok_table = st.checkbox("斷詞特徵", False)

if analyzed_text:
    st.markdown("## 增強文本") 
    pronunciation = st.radio("請選擇輔助發音類型", ["漢語拼音", "注音符號", "國際音標"])
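    # dragonmapper converts hanzi to pinyin; zhuyin and IPA are then derived from the pinyin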
    for idx, sent in enumerate(doc.sents):
        tokens_text = [tok.text for tok in sent if tok.pos_ not in PUNCT_SYM]
        pinyins = [hanzi.to_pinyin(word) for word in tokens_text]
        sounds = pinyins
        if pronunciation == "注音符號":
            zhuyins = [transcriptions.pinyin_to_zhuyin(word) for word in pinyins]
            sounds = zhuyins
        elif pronunciation == "國際音標":
            ipas = [transcriptions.pinyin_to_ipa(word) for word in pinyins]
            sounds = ipas

        display = []
        for word, sound in zip(tokens_text, sounds):
            res = f"{word} [{sound}]"
            display.append(res)
        if display:
            display_text = TOK_SEP.join(display)
            st.write(f"{idx+1} >>> {display_text}")
        else:
            st.write(f"{idx+1} >>> EMPTY LINE")

if defs_examples:
    st.markdown("## 單詞解析")
    vocab = get_vocab(doc)
    if vocab:
        tocfl_table = load_tocfl_table()
        filt = tocfl_table['詞彙'].isin(vocab)
        tocfl_res = tocfl_table[filt]
        st.markdown("### 華語詞彙分級")
        fig = get_level_pie(tocfl_res)
        st.plotly_chart(fig, use_container_width=True)

        with st.expander("點擊 + 查看結果"):
            st.table(tocfl_res)
        st.markdown("---")
        st.markdown("### 單詞解釋與例句")
        selected_words = st.multiselect("請選擇要查詢的單詞: ", vocab, vocab[-1])
        for w in selected_words:
            moedict_caller(w)                        

if freq_count:  
    st.markdown("## 詞頻統計")  
    counter = get_counter(doc)
    topK = st.slider('請選擇前K個高頻詞', 1, len(counter), min(5, len(counter)))
    most_common = counter.most_common(topK)
    st.write(most_common)
    st.markdown("---")

    fig = get_freq_fig(doc)
    st.plotly_chart(fig, use_container_width=True)

if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="命名實體")
    
if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="斷詞特徵")