#!/usr/bin/env python3
# Student ID: R09942097, Name: 陳建成
# Streamlit Cloud link: https://jeffeuxmartin-assignment-1-jeffeuxmartin-twnlp-appsrcapp-8mil4y.streamlitapp.com/

from pathlib import Path
import re

import pandas as pd
import streamlit as st
from views.components.spinner import dowload_ckip_package, download_cwn_drivers

def load_corpus(path):
    """Load the PTT corpus and return a (title, web_url) view plus the full dataframe."""
    full_df = pd.read_json(path)
    # sort_values returns a new dataframe; reassign so the sort actually takes effect
    full_df = full_df.sort_values('index', ascending=False)
    df = full_df[['title', 'web_url']]
    return df, full_df
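
# A minimal sketch of the corpus.json layout this loader assumes (inferred
# from the columns used above; the real file may carry additional fields):
#   [
#     {"index": 1, "title": "[問卦] ...", "web_url": "https://www.ptt.cc/..."},
#     ...
#   ]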

def make_clickable(url, text):
    # Ref.: https://discuss.streamlit.io/t/display-urls-in-dataframe-column-as-a-clickable-hyperlink/743/7
    return f'<a target="_blank" href="{url}">{text}</a>'
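
# Hypothetical usage of make_clickable (it is not called elsewhere in this file):
#   df['title'] = [make_clickable(u, t) for u, t in zip(df['web_url'], df['title'])]
#   st.markdown(df.to_html(escape=False), unsafe_allow_html=True)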

def run_app(path, ckip_nlp_models, cwn_upgrade) -> None:
    # Download the drivers first: the CWN package checks for a .cwn_graph
    # folder in the root directory before it can be used.
    download_cwn_drivers(cwn_upgrade)
    dowload_ckip_package(ckip_nlp_models)

    from views.components.sidebar import visualize_side_bar
    from views.containers import display_cwn, display_ckip, display_data_form

    st.title("PTT 語料庫搜尋分析工具 (ver. 0.1)")
    input_data = display_data_form()
    max_articles = st.slider('最多標題數:', min_value=0, max_value=30, step=1, value=3)
    model, pipeline, active_visualizers = visualize_side_bar(ckip_nlp_models)
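    # Dispatch table: the pipeline chosen in the sidebar ("CKIP" or "CWN")
    # selects which container renders the analysis.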
    display_factories = {
        "CKIP": display_ckip,
        "CWN": display_cwn,
    }
    df, full_df = load_corpus(path)
    if "input_data" in st.session_state:
        queries = st.session_state["input_data"]
        for query in queries:
            df = df[df["title"].str.contains(query)]
            df = df.iloc[:max_articles]
            
        if len(df) > 0:
            st.markdown("#### 搜尋文章標題結果 ####")  # "matching article titles"
            st.markdown('\n'.join(
                f"1. [{it.title}]({it.web_url})"  # markdown renumbers the list items
                for it in df.itertuples()))
            
            # Strip a leading "R:" reply marker and a leading "[分類]" category
            # tag before sending titles to the NLP pipeline.
            _cleaned_titles = [
                re.sub(r'^\[[^]]*\] *', '',
                       re.sub(r'^R: *', '', title))
                for title in df['title']]
            # Deduplicate while preserving order.
            cleaned_titles = list(dict.fromkeys(_cleaned_titles))

            display_factories[pipeline](
                model, active_visualizers,
                cleaned_titles,
            )
        else:
            st.markdown("## No results match! Q_Q... ##")


if __name__ == "__main__":
    ckip_nlp_models = ["bert-base", "albert-tiny", "bert-tiny", "albert-base"]
    run_app(
        str(Path(__file__).parent.resolve() / 'data/corpus.json'),
        ckip_nlp_models,
        cwn_upgrade=False,
    )
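
# Assuming this file is saved as app.py (the filename is an assumption) and
# Streamlit plus the CKIP/CWN dependencies are installed, the app can be
# launched locally with:
#   streamlit run app.py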