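"""Streamlit app: AI-assisted English language learning.

Runs a spaCy pipeline over user-supplied text, extracts keywords with
spacy_ke (YAKE), enriches verbs with synonyms from the Free Dictionary API,
and visualizes named entities and token features via spacy_streamlit.
"""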
import re

import pandas as pd
import requests
import spacy
import spacy_ke  # registers the "yake" keyword-extraction factory with spaCy
import streamlit as st
from spacy_streamlit import visualize_ner, visualize_tokens
# Global variables
DEFAULT_TEXT = """So I lived my life alone, without anyone that I could really talk to, until I had an accident with my plane in the Desert of Sahara, six years ago. Something was broken in my engine. And as I had with me neither a mechanic nor any passengers, I set myself to attempt the difficult repairs all alone. It was a question of life or death for me: I had scarcely enough drinking water to last a week. The first night, then, I went to sleep on the sand, a thousand miles from any human habitation. I was more isolated than a shipwrecked sailor on a raft in the middle of the ocean. Thus you can imagine my amazement, at sunrise, when I was awakened by an odd little voice. It said:
"If you please-- draw me a sheep!"
"What!"
"Draw me a sheep!"
The Little Prince
"""
DESCRIPTION = "AI-Assisted Language Learning: English"
TOK_SEP = " | "
MODEL_NAME = "en_core_web_sm"
API_LOOKUP = {}  # in-memory cache of Free Dictionary API responses, keyed by word
MAX_SYN_NUM = 5  # maximum number of synonyms shown per verb
# External API caller
def free_dict_caller(word):
    try:
        req = requests.get(f"https://api.dictionaryapi.dev/api/v2/entries/en/{word}", timeout=10)
        req.raise_for_status()
        result = req.json()[0]
    except (requests.RequestException, ValueError, KeyError, IndexError):
        return  # network error or no entry found; leave the cache untouched
    API_LOOKUP[word] = result
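# A successful lookup returns a list of entries; the fields read below look
# roughly like this (a sketch inferred from how this app uses the payload):
#   [{"meanings": [{"partOfSpeech": "noun",
#                   "synonyms": [...],
#                   "definitions": [{"definition": "...", "example": "..."}]}]}]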
def show_definitions_and_examples(word, pos):
    if word not in API_LOOKUP:
        free_dict_caller(word)
    result = API_LOOKUP.get(word)
    if not result:
        st.info("Found no matching result on Free Dictionary!")
        return
    definitions = []
    for meaning in result.get('meanings', []):
        if meaning['partOfSpeech'] == pos.lower():
            definitions = meaning.get('definitions', [])
    definitions = definitions[:3]  # show at most three definitions
    for definition in definitions:
        df = definition.get("definition")
        ex = definition.get("example")
        st.markdown(f" - {df}")
        if ex:
            st.markdown(f"   Example: *{ex}*")
        st.markdown("---")
def get_synonyms(word, pos):
    if word not in API_LOOKUP:
        free_dict_caller(word)
    result = API_LOOKUP.get(word)
    if not result:
        return []
    synonyms = []
    for meaning in result.get('meanings', []):
        if meaning['partOfSpeech'] == pos.lower():
            synonyms = meaning.get('synonyms', [])
    return synonyms
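# Hypothetical usage (actual output depends on live Free Dictionary data):
#   get_synonyms("draw", "VERB")  # -> e.g. ["sketch", "depict", ...]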
# Utility functions
def create_eng_df(tokens):
    seen_lemmas = []
    filtered_tokens = []
    for tok in tokens:
        if tok.lemma_ not in seen_lemmas:
            seen_lemmas.append(tok.lemma_)  # dedupe tokens by lemma
            filtered_tokens.append(tok)
    df = pd.DataFrame(
        {
            "Word": [tok.text.lower() for tok in filtered_tokens],
            "POS": [tok.pos_ for tok in filtered_tokens],
            "Lemma": [tok.lemma_ for tok in filtered_tokens],
        }
    )
    st.dataframe(df)
    csv = df.to_csv().encode('utf-8')
    st.download_button(
        label="Download table",
        data=csv,
        file_name='eng_forms.csv',
    )
def filter_tokens(doc):
    # Drop punctuation, symbols, whitespace, numbers, emails, and URLs
    return [
        tok for tok in doc
        if tok.pos_ not in ("PUNCT", "SYM")
        and not (tok.is_punct or tok.is_space)
        and not (tok.like_email or tok.like_url or tok.like_num)
    ]
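# For example, filter_tokens(nlp("Email me at a@b.com, it costs $5!")) keeps
# roughly ["Email", "me", "at", "it", "costs"] (exact tokens depend on the model).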
def create_kw_section(doc):
    st.markdown("## Keyword Analysis")
    kw_num = st.slider("Select the number of keywords", 1, 10, 3)
    kws2scores = {keyword: score for keyword, score in doc._.extract_keywords(n=kw_num)}
    kws2scores = sorted(kws2scores.items(), key=lambda x: x[1], reverse=True)
    for count, (keyword, score) in enumerate(kws2scores, start=1):
        rounded_score = round(score, 3)
        st.write(f"{count} >>> {keyword} ({rounded_score})")
# Page setting
st.set_page_config(
    page_icon="🤠",
    layout="wide",
    initial_sidebar_state="auto",
)
st.markdown(f"# {DESCRIPTION}")
# Load the language model
nlp = spacy.load(MODEL_NAME)
# Add pipelines to spaCy
nlp.add_pipe("yake")  # keyword extraction
# nlp.add_pipe("merge_entities")  # merge entity spans into single tokens
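# Importing spacy_ke above is what makes the "yake" factory available; the
# component exposes keyword extraction through doc._.extract_keywords, as
# used in create_kw_section.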
# Page starts from here
st.markdown("## Text to Analyze")
st.info("Enter your text in the box below and press Ctrl + Enter to update the results")
text = st.text_area("", DEFAULT_TEXT, height=200)
doc = nlp(text)
st.markdown("---")
st.info("Please check at least one of the functions below")
keywords_extraction = st.checkbox("Keyword analysis", False)
analyzed_text = st.checkbox("Enhanced text", True)
defs_examples = st.checkbox("Word definitions", True)
morphology = st.checkbox("Morphology", False)
ner_viz = st.checkbox("Named entities", True)
tok_table = st.checkbox("Token features", False)
if keywords_extraction:
    create_kw_section(doc)
if analyzed_text:
    st.markdown("## Analyzed Text")
    for idx, sent in enumerate(doc.sents):
        enriched_sentence = []
        for tok in sent:
            if tok.pos_ != "VERB":
                enriched_sentence.append(tok.text)
            else:
                synonyms = get_synonyms(tok.text, tok.pos_)
                if synonyms:
                    if len(synonyms) > MAX_SYN_NUM:
                        synonyms = synonyms[:MAX_SYN_NUM]
                    added_verbs = TOK_SEP.join(synonyms)
                    enriched_tok = f"{tok.text} (cf. {added_verbs})"
                    enriched_sentence.append(enriched_tok)
                else:
                    enriched_sentence.append(tok.text)
        display_text = " ".join(enriched_sentence)
        st.write(f"{idx+1} >>> {display_text}")
if defs_examples:
    st.markdown("## Definitions and Examples")
    clean_tokens = filter_tokens(doc)
    num_pattern = re.compile(r"[0-9]")
    clean_tokens = [tok for tok in clean_tokens if not num_pattern.search(tok.lemma_)]
    selected_pos = ["VERB", "NOUN", "ADJ", "ADV"]
    clean_tokens = [tok for tok in clean_tokens if tok.pos_ in selected_pos]
    tokens_lemma_pos = [tok.lemma_ + TOK_SEP + tok.pos_ for tok in clean_tokens]
    vocab = list(set(tokens_lemma_pos))
    if vocab:
        selected_words = st.multiselect("Select words to look up: ", vocab, vocab[0:3])
        for w in selected_words:
            word_pos = w.split(TOK_SEP)
            word = word_pos[0].strip()
            pos = word_pos[1].strip()
            st.write(f"### {w}")
            with st.expander("Click + to view the results"):
                show_definitions_and_examples(word, pos)
if morphology:
    st.markdown("## Morphology")
    # Collect inflected forms (tokens whose surface form differs from the lemma)
    inflected_forms = [tok for tok in doc if tok.text.lower() != tok.lemma_.lower()]
    if inflected_forms:
        create_eng_df(inflected_forms)
if ner_viz:
    ner_labels = nlp.get_pipe("ner").labels
    visualize_ner(doc, labels=ner_labels, show_table=False, title="Named Entities")
if tok_table:
    visualize_tokens(doc, attrs=["text", "pos_", "tag_", "dep_", "head"], title="Token Features")