aicodingfun committed on
Commit
b257d13
·
verified ·
1 Parent(s): 2d54fce

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +333 -0
app.py ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import plotly.graph_objects as go
4
+ from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
5
+ import numpy as np
6
+ from tqdm.auto import tqdm
7
+ import warnings
8
+ import os
9
+ from datetime import datetime, timedelta
10
+ from scipy.stats import pearsonr
11
+ import ast
12
+ warnings.simplefilter(action='ignore', category=FutureWarning)
13
+
14
+ DEVELOPER_NAME = "汪于捷、李哲弘、黃千宥、陳奕瑄、洪寓澤"
15
+
16
+ NEWS_CSV_PATH = 'cryptonews.csv'
17
+ BTC_CSV_PATH = 'BTC.csv'
18
+ PROCESSED_DATA_PATH = 'processed_btc_sentiment_data.csv'
19
+
20
+ SENTIMENT_PIPELINE = None
21
+
22
def initialize_pipeline():
    """Load the Hugging Face sentiment-analysis pipeline once and cache it.

    The pipeline is stored in the module-level ``SENTIMENT_PIPELINE``. When it
    is already loaded this function does nothing. If loading fails the error is
    printed and ``SENTIMENT_PIPELINE`` is left as ``None``, so callers must
    check it before use.
    """
    global SENTIMENT_PIPELINE
    if SENTIMENT_PIPELINE is not None:
        return

    try:
        print("⏳ 正在載入情緒分析模型 (Hugging Face)...")
        MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"

        # Only ask for GPU 0 when CUDA is really there; a hard-coded device=0
        # makes the load fail on CPU-only hosts. -1 selects the CPU.
        try:
            import torch
            device = 0 if torch.cuda.is_available() else -1
        except ImportError:
            device = -1

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
        SENTIMENT_PIPELINE = pipeline(
            "sentiment-analysis", model=model, tokenizer=tokenizer, device=device
        )
        print("✅ 模型載入成功!")
    except Exception as e:
        # Best-effort boundary: report the failure and leave the cache empty.
        print(f"❌ 載入模型時發生錯誤: {e}")
        SENTIMENT_PIPELINE = None
38
+
39
def safe_literal_eval(val):
    """Parse a Python-literal string into a dict, falling back to ``{}``.

    Args:
        val: The value to parse, normally the text of a dict literal.

    Returns:
        The parsed dict. An empty dict is returned when ``val`` cannot be
        parsed (including non-string input such as NaN or ``None``) or when
        it parses to something that is not a dict, so callers can always
        use ``.get()`` on the result.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input.
        return {}
    return parsed if isinstance(parsed, dict) else {}
45
+
46
def preprocess_and_cache_data():
    """Run the one-off preprocessing and cache the result as a CSV file.

    Each news item is scored by the model on its title and body text joined
    together. The pre-computed ``sentiment`` column is parsed into class,
    polarity and subjectivity values. All four metrics are averaged per day
    and inner-joined with the BTC price table, the per-day lists of titles
    and texts are attached, and the result is written to
    ``PROCESSED_DATA_PATH``.

    Returns:
        The merged DataFrame, with ``date`` as a regular column.

    Raises:
        FileNotFoundError: If either input CSV is missing.
        RuntimeError: If the sentiment pipeline could not be initialised.
    """
    if not (os.path.exists(NEWS_CSV_PATH) and os.path.exists(BTC_CSV_PATH)):
        raise FileNotFoundError(f"請確認 '{NEWS_CSV_PATH}' 和 '{BTC_CSV_PATH}' 檔案存在。")

    initialize_pipeline()
    if SENTIMENT_PIPELINE is None:
        raise RuntimeError("情緒分析模型未能成功初始化。")

    print(f"⏳ 正在讀取原始資料: '{NEWS_CSV_PATH}'...")
    news_df = pd.read_csv(NEWS_CSV_PATH)
    news_df.dropna(subset=['title', 'text', 'sentiment'], inplace=True)
    news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date
    news_df.dropna(subset=['date'], inplace=True)

    print("⏳ 正在合併新聞標題與內文...")
    news_df['full_text'] = news_df['title'] + ". " + news_df['text']

    print("⏳ 正在對新聞完整內容 (標題+內文) 進行模型情緒分析...")
    model_results = SENTIMENT_PIPELINE(
        news_df['full_text'].tolist(),
        batch_size=256,
        truncation=True,
        max_length=512,
    )
    # Map the model's labels onto a signed score; unknown labels count as 0.
    score_map_model = {'LABEL_2': 1, 'LABEL_1': 0, 'LABEL_0': -1}
    news_df['model_sentiment_score'] = [
        score_map_model.get(result['label'], 0) for result in model_results
    ]

    print("⏳ 正在解析預存的情緒欄位 (class, polarity, subjectivity)...")
    sentiment_dicts = news_df['sentiment'].apply(safe_literal_eval)

    class_score_map = {'positive': 1, 'neutral': 0, 'negative': -1}
    news_df['class_sentiment_score'] = sentiment_dicts.apply(
        lambda x: class_score_map.get(x.get('class', 'neutral'), 0)
    )
    news_df['polarity'] = sentiment_dicts.apply(lambda x: x.get('polarity', 0.0))
    news_df['subjectivity'] = sentiment_dicts.apply(lambda x: x.get('subjectivity', 0.0))

    print("⏳ 正在計算每日平均情緒指標...")
    daily_metrics = news_df.groupby('date').agg(
        avg_model_sentiment=('model_sentiment_score', 'mean'),
        avg_class_sentiment=('class_sentiment_score', 'mean'),
        avg_polarity=('polarity', 'mean'),
        avg_subjectivity=('subjectivity', 'mean'),
    ).reset_index()

    print(f"⏳ 正在讀取比特幣價格資料: '{BTC_CSV_PATH}'...")
    btc_df = pd.read_csv(BTC_CSV_PATH)
    # Reduce to calendar days, then back to datetime64 so the merge keys match.
    btc_df['date'] = pd.to_datetime(
        pd.to_datetime(btc_df['date'], errors='coerce').dt.date
    )
    # Rows with an unparseable date can never match a news day, and the
    # day-over-day change is only meaningful on chronologically ordered rows,
    # so drop the former and sort before calling pct_change().
    btc_df = (
        btc_df.dropna(subset=['date'])
        .sort_values('date')
        .assign(price_change_pct=lambda frame: frame['close'].pct_change() * 100)
    )

    print("⏳ 正在合併所有資料...")
    daily_metrics['date'] = pd.to_datetime(daily_metrics['date'])
    merged_df = pd.merge(btc_df, daily_metrics, on='date', how='inner')

    # Keep each day's raw titles and texts so single days can be inspected later.
    news_content_df = news_df.groupby('date').agg(
        titles=('title', list),
        texts=('text', list),
    ).reset_index()
    news_content_df['date'] = pd.to_datetime(news_content_df['date'])

    final_df = pd.merge(merged_df, news_content_df, on='date', how='left')

    print(f"✅ 資料預處理完成!正在將結果儲存至 '{PROCESSED_DATA_PATH}'...")
    final_df.to_csv(PROCESSED_DATA_PATH, index=False)
    return final_df
115
+
116
def load_data():
    """Return the processed dataset, building the cache on first use.

    When ``PROCESSED_DATA_PATH`` exists it is read back and its ``titles`` and
    ``texts`` columns, stored as list literals, are parsed into real lists.
    Otherwise ``preprocess_and_cache_data()`` is run and its result returned.

    Returns:
        The processed DataFrame with a datetime ``date`` column.
    """
    if not os.path.exists(PROCESSED_DATA_PATH):
        print("⚠️ 未發現已處理的資料,將執行首次預處理...")
        return preprocess_and_cache_data()

    def _parse_list_cell(cell):
        # A NaN or damaged cell must not abort start-up; treat it as
        # "no news for that day" instead of letting literal_eval raise.
        try:
            parsed = ast.literal_eval(cell)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return []
        return parsed if isinstance(parsed, list) else []

    print(f"✅ 發現已處理的資料快取,正在從 '{PROCESSED_DATA_PATH}' 載入...")
    df = pd.read_csv(PROCESSED_DATA_PATH)
    df['date'] = pd.to_datetime(df['date'])
    for column in ('titles', 'texts'):
        df[column] = df[column].apply(_parse_list_cell)
    return df
128
+
129
# Load the dataset once at import time, order it chronologically and
# index it by date so the helpers below can filter on the index.
df = load_data()
df = df.sort_values(by='date').set_index('date')
133
+
134
def get_filtered_df(start_date, end_date):
    """Return a copy of the rows of ``df`` dated within [start_date, end_date].

    Both bounds are inclusive. An empty DataFrame is returned when either
    bound is ``None``.
    """
    if start_date is not None and end_date is not None:
        lower = pd.to_datetime(start_date)
        upper = pd.to_datetime(end_date)
        in_range = (df.index >= lower) & (df.index <= upper)
        return df[in_range].copy()
    return pd.DataFrame()
139
+
140
def plot_price_and_sentiment(filtered_df, sentiment_col, sentiment_name, color):
    """Build a dual-axis figure of the BTC close price and one sentiment series.

    Args:
        filtered_df: Date-indexed frame with a ``close`` column and the
            column named by ``sentiment_col``.
        sentiment_col: Name of the sentiment column to plot.
        sentiment_name: Legend label for the sentiment trace.
        color: Colour used for the sentiment trace and the right-hand axis.

    Returns:
        The figure, with price on the left axis and sentiment on the right
        axis (fixed to the range [-1, 1]).
    """
    dates = filtered_df.index
    price_trace = go.Scatter(
        x=dates,
        y=filtered_df['close'],
        name='BTC 收盤價',
        line={'color': 'deepskyblue'},
        yaxis='y1',
    )
    sentiment_trace = go.Scatter(
        x=dates,
        y=filtered_df[sentiment_col],
        name=sentiment_name,
        line={'color': color, 'dash': 'dash'},
        yaxis='y2',
    )

    fig = go.Figure()
    fig.add_trace(price_trace)
    fig.add_trace(sentiment_trace)

    layout_options = {
        'xaxis_title': '日期',
        'yaxis': {'title': '價格 (USD)', 'color': 'deepskyblue'},
        'yaxis2': {
            'title': '情緒分數',
            'overlaying': 'y',
            'side': 'right',
            'color': color,
            'range': [-1, 1],
        },
        'legend': {'x': 0.01, 'y': 0.99, 'orientation': 'h'},
        'template': 'plotly_dark',
        'paper_bgcolor': 'rgba(0,0,0,0)',
        'plot_bgcolor': 'rgba(0,0,0,0.2)',
    }
    fig.update_layout(**layout_options)
    return fig
153
+
154
def plot_subjectivity_trend(filtered_df):
    """Build a line figure of the daily average news subjectivity.

    Args:
        filtered_df: Date-indexed frame with an ``avg_subjectivity`` column.

    Returns:
        The figure, with the y axis fixed to the range [0, 1].
    """
    subjectivity_trace = go.Scatter(
        x=filtered_df.index,
        y=filtered_df['avg_subjectivity'],
        name='每日新聞主觀性',
        line={'color': 'lightgreen'},
    )

    fig = go.Figure()
    fig.add_trace(subjectivity_trace)

    layout_options = {
        'xaxis_title': '日期',
        'yaxis': {
            'title': '主觀性分數 (0=客觀, 1=主觀)',
            'color': 'lightgreen',
            'range': [0, 1],
        },
        'template': 'plotly_dark',
        'paper_bgcolor': 'rgba(0,0,0,0)',
        'plot_bgcolor': 'rgba(0,0,0,0.2)',
    }
    fig.update_layout(**layout_options)
    return fig
164
+
165
def plot_correlation(filtered_df, sentiment_col, lag_days):
    """Scatter a sentiment series against the price change ``lag_days`` rows later.

    Args:
        filtered_df: Frame containing ``sentiment_col`` and ``price_change_pct``.
        sentiment_col: Name of the sentiment column to correlate.
        lag_days: Number of rows the price change is shifted back by; it is
            converted to ``int`` before use.

    Returns:
        A ``(figure, correlation, p_value)`` tuple. ``(0, 1)`` is reported for
        the statistics when fewer than two rows remain or when either series
        is constant, because no Pearson coefficient is defined in those cases.
    """
    # shift() needs an integer number of periods; a UI slider may hand over a float.
    lag_days = int(lag_days)

    df_corr = filtered_df[[sentiment_col, 'price_change_pct']].copy()
    df_corr['price_change_pct_lagged'] = df_corr['price_change_pct'].shift(-lag_days)
    df_corr.dropna(inplace=True)

    sentiment_values = df_corr[sentiment_col]
    lagged_change = df_corr['price_change_pct_lagged']

    # pearsonr yields NaN for constant input; use the same neutral fallback as
    # for too little data rather than showing "nan" in the UI.
    can_correlate = (
        len(df_corr) >= 2
        and sentiment_values.nunique() > 1
        and lagged_change.nunique() > 1
    )
    if can_correlate:
        correlation, p_value = pearsonr(sentiment_values, lagged_change)
    else:
        correlation, p_value = 0, 1

    fig = go.Figure(data=go.Scatter(
        x=sentiment_values,
        y=lagged_change,
        mode='markers',
        marker=dict(color='mediumpurple', opacity=0.7),
    ))
    fig.update_layout(
        title=f'🔗 情緒與 {lag_days} 天後價格變化的關聯性 (相關係數: {correlation:.3f})',
        xaxis_title='每日平均情緒分數', yaxis_title=f'{lag_days} 天後價格變化 (%)',
        template='plotly_dark', paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0.2)'
    )
    return fig, correlation, p_value
180
+
181
def get_top_bottom_news(date_obj):
    """Return the most positive and most negative news titles for one day.

    The day's titles and texts are re-scored with the sentiment pipeline, each
    item gets a signed score (label direction times model confidence), and the
    three strongest titles on each side are rendered as HTML lists.

    Args:
        date_obj: The day to look up. Any time-of-day or time-zone part is
            discarded before the lookup.

    Returns:
        A ``(top_html, bottom_html)`` tuple of ``<ul>`` fragments. Both hold an
        explanatory item when the date is unknown, the data is malformed, the
        model is unavailable, or the day has no news.
    """
    from html import escape  # local import keeps this block self-contained

    no_data_html = "<ul><li>無此日期資料</li></ul>"
    invalid_html = "<ul><li>模型未載入或新聞資料格式錯誤</li></ul>"
    no_news_html = "<ul><li>當日無新聞</li></ul>"

    date_ts = pd.to_datetime(date_obj)
    if date_ts is None or pd.isna(date_ts):
        return no_data_html, no_data_html
    # The index holds naive midnight timestamps, so a picker value such as
    # "2023-05-01 14:30" has to be reduced to its calendar day to match.
    if date_ts.tzinfo is not None:
        date_ts = date_ts.tz_localize(None)
    date_ts = date_ts.normalize()
    if date_ts not in df.index:
        return no_data_html, no_data_html

    day_data = df.loc[date_ts]
    if isinstance(day_data, pd.DataFrame):
        # Duplicate dates give several rows; use the first one.
        day_data = day_data.iloc[0]
    titles, texts = day_data.get('titles', []), day_data.get('texts', [])

    if not isinstance(titles, list) or not isinstance(texts, list) or len(titles) != len(texts):
        return invalid_html, invalid_html
    if not titles:
        # Nothing to score, so there is no need to load the model.
        return no_news_html, no_news_html

    initialize_pipeline()
    if SENTIMENT_PIPELINE is None:
        return invalid_html, invalid_html

    full_texts_for_day = [f"{title}. {text}" for title, text in zip(titles, texts)]
    sentiments = SENTIMENT_PIPELINE(full_texts_for_day, batch_size=8, truncation=True, max_length=512)
    score_map = {'LABEL_2': 1, 'LABEL_1': 0, 'LABEL_0': -1}

    scored_titles = [
        (title, score_map.get(sentiment['label'], 0) * sentiment['score'])
        for title, sentiment in zip(titles, sentiments)
    ]
    positive_news = sorted(
        (item for item in scored_titles if item[1] > 0), key=lambda x: x[1], reverse=True
    )
    negative_news = sorted(
        (item for item in scored_titles if item[1] < 0), key=lambda x: x[1]
    )

    def _as_items(ranked, fallback):
        # Titles come from an external dataset, so escape them before
        # embedding them in HTML.
        if not ranked:
            return fallback
        return "".join(f"<li>{escape(str(title))}</li>" for title, _ in ranked[:3])

    top_news_html = _as_items(positive_news, "<li>當日無正面情緒新聞</li>")
    bottom_news_html = _as_items(negative_news, "<li>當日無負面情緒新聞</li>")

    return f"<ul>{top_news_html}</ul>", f"<ul>{bottom_news_html}</ul>"
222
+
223
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="sky", secondary_hue="orange"),
    css=".gradio-container {background: #0B0F19}",
) as app:
    # Page banner with the dashboard title and credits.
    gr.Markdown(f"""<div style='text-align: center; padding: 20px; color: white;'><h1 style='font-size: 3em; color: #00BFFF;'>📈 Crypto Pulse</h1><p style='font-size: 1.2em; color: #A9A9A9;'>比特幣新聞情緒與價格分析儀表板</p><p style='font-size: 0.9em; color: #888;'>Designed by: {DEVELOPER_NAME}</p></div>""")

    # Default window: the last 360 rows when there are more than 360 rows,
    # otherwise the whole dataset.
    max_date_dt = df.index.max()
    min_date_dt = df.index[-360] if len(df) > 360 else df.index.min()

    with gr.Row():
        start_date_input = gr.DateTime(label="📅 開始日期", type="datetime", value=min_date_dt)
        end_date_input = gr.DateTime(label="📅 結束日期", type="datetime", value=max_date_dt)

    with gr.Tabs() as tabs:
        with gr.TabItem("📊 模型情緒總覽", id=0):
            plot_overview = gr.Plot(label="模型情緒 vs. 價格趨勢圖")
            gr.Markdown("此圖展示了由 `twitter-roberta-base-sentiment` 模型分析出的**新聞內容(標題+內文)**情緒分數(右軸)與比特幣價格(左軸)的對比。")

        with gr.TabItem("🔬 多維度情緒分析", id=1):
            # Built from per-line literals so the text does not depend on
            # how this code is indented.
            gr.Markdown(
                "\n"
                "### 指標說明\n"
                "此處的情緒指標來自資料集 `cryptonews.csv` 中預先計算好的 `sentiment` 欄位。\n"
                "* **資料集預設情緒分類**: 將資料集內建的 `positive`, `neutral`, `negative` 類別轉換為 `1, 0, -1` 的數值分數。\n"
                "* **情感極性 (Polarity)**: 衡量文本的正面或負面程度。值域從 -1 (非常負面) 到 +1 (非常正面)。\n"
                "* **主觀性 (Subjectivity)**: 衡量文本是偏向客觀事實還是主觀意見。值域從 0 (非常客觀) 到 1 (非常主觀)。\n"
            )
            plot_class_sentiment = gr.Plot(label="資料集預設情緒 vs. 價格趨勢圖")
            plot_polarity = gr.Plot(label="情感極性 vs. 價格趨勢圖")
            plot_subjectivity = gr.Plot(label="新聞主觀性趨勢圖")

        with gr.TabItem("🔍 關聯性深掘", id=2):
            with gr.Row():
                with gr.Column(scale=1, min_width=200):
                    sentiment_type_radio = gr.Radio(
                        ["模型情緒分數", "資料集預設情緒分類", "情感極性 (Polarity)"],
                        label="選擇分析的情緒指標", value="模型情緒分數"
                    )
                    lag_slider = gr.Slider(minimum=0, maximum=14, value=1, step=1, label="🕒 情緒延遲天數 (Lag Days)")
                    correlation_output = gr.Textbox(label="Pearson 相關係數", interactive=False)
                    p_value_output = gr.Textbox(label="P-Value", interactive=False)
                with gr.Column(scale=3):
                    plot_corr = gr.Plot(label="情緒 vs. 價格變化 散點圖")

        with gr.TabItem("📰 新聞瀏覽器", id=3):
            gr.Markdown("在此處選擇特定日期,即可查看當天的熱點新聞。")
            news_date_input = gr.DateTime(label="🗓️ 選擇查詢日期", type="datetime", value=max_date_dt)
            with gr.Row():
                gr.Markdown("### 👍 當日最正面新聞 Top 3")
                gr.Markdown("### 👎 當日最負面新聞 Top 3")
            with gr.Row():
                top_news_output = gr.HTML()
                bottom_news_output = gr.HTML()

    def update_all(start_date, end_date, lag_days, sentiment_type):
        """Recompute the five figures and two statistics for the chosen range.

        Returns a 7-tuple matching ``outputs_for_main_update``. When the range
        is invalid or contains no rows, a warning is shown and blank figures
        plus "N/A" strings are returned.
        """
        def _blank_outputs():
            # One shared empty figure for all five plots.
            blank = go.Figure()
            return blank, blank, blank, blank, blank, "N/A", "N/A"

        if start_date is None or end_date is None or start_date > end_date:
            gr.Warning("請選擇有效的開始與結束日期。")
            return _blank_outputs()

        start_date = pd.to_datetime(start_date)
        end_date = pd.to_datetime(end_date)
        filtered_df = get_filtered_df(start_date, end_date)
        if filtered_df.empty:
            gr.Warning("此日期範圍內無資料,請擴大範圍。")
            return _blank_outputs()

        overview_fig = plot_price_and_sentiment(filtered_df, 'avg_model_sentiment', '模型情緒分數', 'crimson')
        class_sentiment_fig = plot_price_and_sentiment(filtered_df, 'avg_class_sentiment', '資料集預設情緒分類', 'yellow')
        polarity_fig = plot_price_and_sentiment(filtered_df, 'avg_polarity', '情感極性 (Polarity)', 'orange')
        subjectivity_fig = plot_subjectivity_trend(filtered_df)

        # Any choice other than the two listed here falls back to polarity.
        column_by_choice = {
            "模型情緒分數": 'avg_model_sentiment',
            "資料集預設情緒分類": 'avg_class_sentiment',
        }
        sentiment_col = column_by_choice.get(sentiment_type, 'avg_polarity')

        corr_fig, corr_val, p_val = plot_correlation(filtered_df, sentiment_col, lag_days)

        return (
            overview_fig,
            class_sentiment_fig,
            polarity_fig,
            subjectivity_fig,
            corr_fig,
            f"{corr_val:.4f}",
            f"{p_val:.4f}",
        )

    def update_news_browser(date_obj):
        """Return the (top, bottom) news HTML for the selected date."""
        if date_obj is None:
            return "請選擇日期", "無"
        return get_top_bottom_news(date_obj)

    inputs_for_main_update = [start_date_input, end_date_input, lag_slider, sentiment_type_radio]
    outputs_for_main_update = [
        plot_overview,
        plot_class_sentiment,
        plot_polarity,
        plot_subjectivity,
        plot_corr,
        correlation_output,
        p_value_output,
    ]

    # A change to any of the four controls refreshes every main output.
    for component in inputs_for_main_update:
        component.change(fn=update_all, inputs=inputs_for_main_update, outputs=outputs_for_main_update)

    news_date_input.change(
        fn=update_news_browser,
        inputs=[news_date_input],
        outputs=[top_news_output, bottom_news_output]
    )

    def load_app():
        """Produce the initial values for every output when the page loads."""
        return (
            *update_all(min_date_dt, max_date_dt, 1, "模型情緒分數"),
            *update_news_browser(max_date_dt),
        )

    app.load(
        fn=load_app,
        inputs=None,
        outputs=outputs_for_main_update + [top_news_output, bottom_news_output]
    )
332
+
333
# Start the Gradio app with the settings below.
app.launch(
    debug=False,
    share=True,
    show_error=True,
    show_api=False,
)