jeffeux's picture
GOGOGO
4be708d
raw
history blame
2.74 kB
#!/usr/bin/env python3
# Student ID: R09942097; Name: 陳建成; Streamlit Cloud link: https://jeffeuxmartin-assignment-1-jeffeuxmartin-twnlp-appsrcapp-8mil4y.streamlitapp.com/
from pathlib import Path
import streamlit as st, pandas as pd, re, time
from views.components.spinner import dowload_ckip_package, download_cwn_drivers
def load_corpus(path):
    """Load the article corpus from a JSON file.

    Args:
        path: Location of the corpus JSON, handed straight to
            ``pandas.read_json``. The data must provide ``index``,
            ``title`` and ``web_url`` columns.

    Returns:
        A ``(df, full_df)`` tuple. ``full_df`` is the whole corpus ordered
        by its ``index`` column, highest value first; ``df`` is the
        ``title``/``web_url`` projection of it, in the same row order.
    """
    print(path)  # trace which corpus file is being read
    full_df = pd.read_json(path)
    # sort_values returns a new frame instead of sorting in place, so the
    # result has to be rebound or the ordering is silently thrown away.
    full_df = full_df.sort_values('index', ascending=False)
    df = full_df[['title', 'web_url']]
    return df, full_df
def make_clickable(url, text):
    """Return an HTML anchor that opens *url* in a new tab, labelled *text*.

    Both values are HTML-escaped before being interpolated, so characters
    such as ``"``, ``<`` or ``&`` in a URL or label cannot break out of the
    attribute or inject markup. Inputs without such characters produce
    exactly the same string as plain interpolation.

    Args:
        url: Link target; converted with ``str`` before escaping.
        text: Visible link label; converted with ``str`` before escaping.

    Returns:
        The ``<a target="_blank" ...>`` element as a string.
    """
    # Ref.: https://discuss.streamlit.io/t/display-urls-in-dataframe-column-as-a-clickable-hyperlink/743/7
    import html  # local import keeps this helper self-contained

    safe_url = html.escape(str(url), quote=True)
    safe_text = html.escape(str(text))
    return f'<a target="_blank" href="{safe_url}">{safe_text}</a>'
def _is_valid_regex(pattern):
    """Return True when *pattern* compiles as a regular expression."""
    try:
        re.compile(pattern)
    except re.error:
        return False
    return True


def _filter_titles(df, queries, max_articles):
    """Keep rows whose title matches every query, capped at *max_articles*."""
    for query in queries:
        # A query that compiles keeps its regex meaning; one that does not
        # (e.g. a lone "(") is matched literally instead of crashing the app.
        # na=False treats a missing title as "no match" so the boolean mask
        # never contains NaN.
        matches = df["title"].str.contains(
            query, regex=_is_valid_regex(query), na=False)
        df = df[matches]
    return df.iloc[:max_articles]


def _unique_cleaned_titles(titles):
    """Strip leading prefixes from titles and drop duplicates, order kept."""
    # Inner substitution removes a leading "R:" marker, the outer one a
    # leading "[...]" tag, each together with any following spaces.
    # NOTE(review): the inner pattern matches "R:" only, not "Re:" — confirm
    # against the prefix format actually used in the corpus titles.
    cleaned = (
        re.sub(r'^\[[^]]*\] *', '', re.sub(r'^R\: *', '', title))
        for title in titles
    )
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(cleaned))


def run_app(path, ckip_nlp_models, cwn_upgrade) -> None:
    """Render the corpus search page and run the selected analysis pipeline.

    Downloads the CWN and CKIP resources, draws the title, data form, result
    slider and sidebar, then — once ``st.session_state`` holds
    ``"input_data"`` — filters the corpus titles by those queries, lists the
    matching articles as links and hands the cleaned, de-duplicated titles to
    the display function chosen in the sidebar.

    Args:
        path: Corpus JSON location, forwarded to ``load_corpus``.
        ckip_nlp_models: CKIP model names, forwarded to the CKIP downloader
            and to the sidebar.
        cwn_upgrade: Forwarded to ``download_cwn_drivers``.
    """
    # Need to download first because CWN packages will first check whether
    # there is a .cwn_graph folder in the root directory; the view imports
    # are therefore deferred until after the downloads.
    download_cwn_drivers(cwn_upgrade)
    dowload_ckip_package(ckip_nlp_models)
    from views.components.sidebar import visualize_side_bar
    from views.containers import display_cwn, display_ckip, display_data_form

    st.title("PTT 語料庫搜尋分析工具 (ver. 0.1)")
    # The return value is not used here; the queries are read back from
    # st.session_state["input_data"] below.
    display_data_form()
    max_articles = st.slider('最多標題數:', min_value=0, max_value=30, step=1, value=3)
    model, pipeline, active_visualizers = visualize_side_bar(ckip_nlp_models)
    display_factories = {
        "CKIP": display_ckip,
        "CWN": display_cwn,
    }
    df, full_df = load_corpus(path)

    if "input_data" in st.session_state:
        # presumably an iterable of query strings; verify against
        # display_data_form.
        queries = st.session_state["input_data"]
        df = _filter_titles(df, queries, max_articles)
        if not df.empty:
            st.markdown("#### 搜尋文章標題結果 ####")
            st.markdown('\n'.join(
                f"1. [{it.title}]({it.web_url})"
                for it in df.itertuples())
            )
            display_factories[pipeline](
                model, active_visualizers,
                _unique_cleaned_titles(df['title']),
            )
        else:
            st.markdown("## No results match! Q_Q... ##")
if __name__ == "__main__":
    # CKIP model names handed to the app.
    ckip_nlp_models = ["bert-base", "albert-tiny", "bert-tiny", "albert-base"]
    # The corpus file is resolved relative to this script's own directory.
    script_dir = Path(__file__).parent.resolve()
    run_app(
        str(script_dir / 'data/corpus.json'),
        ckip_nlp_models,
        cwn_upgrade=False,
    )