# Spaces: Running  (page-capture header; not part of the program)
import ast
import html
import os
import warnings
from datetime import datetime, timedelta

import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import torch
from scipy.stats import pearsonr
from tqdm.auto import tqdm
from transformers import pipeline
# Suppress FutureWarning messages for the whole process.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Credits string interpolated into the dashboard header.
DEVELOPER_NAME = "汪于捷、李哲弘、黃千宥、陳奕瑄、洪寓澤"

# Raw inputs, plus the cache file written after the first preprocessing run.
NEWS_CSV_PATH = "cryptonews.csv"
BTC_CSV_PATH = "BTC.csv"
PROCESSED_DATA_PATH = "processed_btc_sentiment_data.csv"

# Plotly template applied to every figure built in this module.
PLOTLY_TEMPLATE = "plotly_dark"

# Sentiment pipeline instance; stays None until initialize_pipeline() succeeds.
SENTIMENT_PIPELINE = None
def initialize_pipeline():
    """Load the sentiment-analysis model into ``SENTIMENT_PIPELINE`` on first use.

    Calls made while the global is already populated return immediately.
    If loading raises, the error is printed and the global is left as
    ``None`` so that callers can detect the failure.
    """
    global SENTIMENT_PIPELINE
    if SENTIMENT_PIPELINE is not None:
        return

    model_id = "cardiffnlp/twitter-roberta-base-sentiment"
    try:
        print("⏳ 正在載入情緒分析模型 (Hugging Face)...")
        SENTIMENT_PIPELINE = pipeline(
            "sentiment-analysis",
            model=model_id,
            tokenizer=model_id,
            device=-1,  # -1 selects the CPU in the transformers pipeline API
        )
        print("✅ 模型載入成功!")
    except Exception as exc:
        # Best-effort load: report and leave the global unset.
        print(f"❌ 載入模型時發生錯誤: {exc}")
        SENTIMENT_PIPELINE = None
def safe_literal_eval(val):
    """Parse a Python-literal string into a dict, falling back to ``{}``.

    Args:
        val: Text expected to hold a dict literal such as
            ``"{'class': 'positive', 'polarity': 0.5}"``. Non-string values
            (for example a NaN cell) are tolerated.

    Returns:
        dict: The parsed dictionary, or an empty dict when ``val`` cannot be
        parsed or parses to something that is not a dict.
    """
    try:
        parsed = ast.literal_eval(val)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval can raise any of these on malformed input, e.g.
        # TypeError for an unhashable dict key such as "{[1]: 2}".
        return {}
    # Callers use .get() on the result, so a list/str/number literal must not
    # leak through.
    return parsed if isinstance(parsed, dict) else {}
def preprocess_and_cache_data():
    """Build the daily BTC price / news sentiment table and cache it as CSV.

    The function
      1. reads the news CSV, drops rows missing title/text/sentiment/date,
      2. scores ``title + ". " + text`` with the sentiment pipeline and maps
         the labels to -1 / 0 / 1,
      3. parses the pre-computed ``sentiment`` column (class, polarity,
         subjectivity),
      4. averages all four indicators per calendar day,
      5. reads the BTC CSV, computes the day-over-day percentage change of
         ``close`` and inner-joins it with the daily indicators,
      6. attaches the per-day lists of titles and texts, and
      7. writes the result to ``PROCESSED_DATA_PATH``.

    Returns:
        pandas.DataFrame: The merged table, with ``date`` as a datetime
        column and ``titles`` / ``texts`` as list-valued columns.

    Raises:
        FileNotFoundError: If either input CSV is missing.
        RuntimeError: If the sentiment pipeline could not be initialised.
    """
    if not (os.path.exists(NEWS_CSV_PATH) and os.path.exists(BTC_CSV_PATH)):
        raise FileNotFoundError(f"請確認 '{NEWS_CSV_PATH}' 和 '{BTC_CSV_PATH}' 檔案存在。")

    initialize_pipeline()
    if SENTIMENT_PIPELINE is None:
        raise RuntimeError("情緒分析模型未能成功初始化。")

    # --- News: load and clean -------------------------------------------
    print(f"⏳ 正在讀取原始資料: '{NEWS_CSV_PATH}'...")
    news_df = pd.read_csv(NEWS_CSV_PATH)
    news_df.dropna(subset=['title', 'text', 'sentiment'], inplace=True)
    # Keep only the calendar day; unparseable dates become missing and are dropped.
    news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce').dt.date
    news_df.dropna(subset=['date'], inplace=True)

    print("⏳ 正在合併新聞標題與內文...")
    news_df['full_text'] = news_df['title'] + ". " + news_df['text']

    # --- Model sentiment -------------------------------------------------
    print("⏳ 正在對新聞完整內容 (標題+內文) 進行模型情緒分析...")
    texts_to_analyze = news_df['full_text'].tolist()
    sentiments_model = SENTIMENT_PIPELINE(
        texts_to_analyze,
        batch_size=256,
        truncation=True,
        max_length=512,
    )
    # Unknown labels fall back to a score of 0.
    score_map_model = {'LABEL_2': 1, 'LABEL_1': 0, 'LABEL_0': -1}
    news_df['model_sentiment_score'] = [
        score_map_model.get(result['label'], 0) for result in sentiments_model
    ]

    # --- Pre-computed sentiment column ----------------------------------
    print("⏳ 正在解析預存的情緒欄位 (class, polarity, subjectivity)...")

    def _as_dict(raw):
        # A cell that parses to a non-dict literal would break .get() below.
        parsed = safe_literal_eval(raw)
        return parsed if isinstance(parsed, dict) else {}

    sentiment_dicts = news_df['sentiment'].apply(_as_dict)
    class_score_map = {'positive': 1, 'neutral': 0, 'negative': -1}
    news_df['class_sentiment_score'] = sentiment_dicts.apply(
        lambda d: class_score_map.get(d.get('class', 'neutral'), 0)
    )
    news_df['polarity'] = sentiment_dicts.apply(lambda d: d.get('polarity', 0.0))
    news_df['subjectivity'] = sentiment_dicts.apply(lambda d: d.get('subjectivity', 0.0))

    # --- Daily aggregation ----------------------------------------------
    print("⏳ 正在計算每日平均情緒指標...")
    daily_metrics = news_df.groupby('date').agg(
        avg_model_sentiment=('model_sentiment_score', 'mean'),
        avg_class_sentiment=('class_sentiment_score', 'mean'),
        avg_polarity=('polarity', 'mean'),
        avg_subjectivity=('subjectivity', 'mean'),
    ).reset_index()

    # --- BTC prices -------------------------------------------------------
    print(f"⏳ 正在讀取比特幣價格資料: '{BTC_CSV_PATH}'...")
    btc_df = pd.read_csv(BTC_CSV_PATH)
    btc_df['date'] = pd.to_datetime(btc_df['date'], errors='coerce').dt.date
    # pct_change() compares each row with the previous one, so the rows must
    # be valid and in chronological order before it runs.
    btc_df.dropna(subset=['date'], inplace=True)
    btc_df['date'] = pd.to_datetime(btc_df['date'])
    btc_df = btc_df.sort_values('date', kind='stable').reset_index(drop=True)
    btc_df['price_change_pct'] = btc_df['close'].pct_change() * 100

    # --- Merge ------------------------------------------------------------
    print("⏳ 正在合併所有資料...")
    daily_metrics['date'] = pd.to_datetime(daily_metrics['date'])
    merged_df = pd.merge(btc_df, daily_metrics, on='date', how='inner')

    news_content_df = news_df.groupby('date').agg(
        titles=('title', list),
        texts=('text', list),
    ).reset_index()
    news_content_df['date'] = pd.to_datetime(news_content_df['date'])
    final_df = pd.merge(merged_df, news_content_df, on='date', how='left')

    print(f"✅ 資料預處理完成!正在將結果儲存至 '{PROCESSED_DATA_PATH}'...")
    final_df.to_csv(PROCESSED_DATA_PATH, index=False)
    return final_df
def load_data():
    """Return the processed dataset, using the CSV cache when it exists.

    When ``PROCESSED_DATA_PATH`` is present it is read back, ``date`` is
    converted to datetime, and the ``titles`` / ``texts`` columns (stored as
    list reprs) are parsed into lists. Otherwise the full preprocessing run
    is executed and its result returned.
    """
    if not os.path.exists(PROCESSED_DATA_PATH):
        print("⚠️ 未發現已處理的資料,將執行首次預處理...")
        return preprocess_and_cache_data()

    print(f"✅ 發現已處理的資料快取,正在從 '{PROCESSED_DATA_PATH}' 載入...")
    cached = pd.read_csv(PROCESSED_DATA_PATH)
    cached['date'] = pd.to_datetime(cached['date'])
    for list_column in ('titles', 'texts'):
        cached[list_column] = cached[list_column].apply(ast.literal_eval)
    return cached
# Load the dataset once at import time, order it chronologically and index
# it by date so the helpers below can slice and look rows up by timestamp.
df = load_data().sort_values(by='date').set_index('date')
def get_filtered_df(start_date, end_date):
    """Return a copy of the rows of ``df`` dated within [start_date, end_date].

    Both bounds are inclusive. An empty DataFrame is returned when either
    bound is ``None``.
    """
    if start_date is None or end_date is None:
        return pd.DataFrame()

    lower_bound = pd.to_datetime(start_date)
    upper_bound = pd.to_datetime(end_date)
    within_range = (df.index >= lower_bound) & (df.index <= upper_bound)
    return df[within_range].copy()
def plot_price_and_sentiment(filtered_df, sentiment_col, sentiment_name, color):
    """Plot BTC close price and one sentiment series against the date index.

    Price uses the left axis; the sentiment column uses a second, right-hand
    axis fixed to the range [-1, 1] and is drawn dashed in ``color``.

    Args:
        filtered_df: Date-indexed frame containing ``close`` and ``sentiment_col``.
        sentiment_col: Name of the sentiment column to draw.
        sentiment_name: Legend label for the sentiment trace.
        color: Colour for the sentiment trace and its axis.

    Returns:
        plotly.graph_objects.Figure
    """
    dates = filtered_df.index
    price_trace = go.Scatter(
        x=dates,
        y=filtered_df['close'],
        name='BTC 收盤價',
        line=dict(color='deepskyblue'),
        yaxis='y1',
    )
    sentiment_trace = go.Scatter(
        x=dates,
        y=filtered_df[sentiment_col],
        name=sentiment_name,
        line=dict(color=color, dash='dash'),
        yaxis='y2',
    )

    fig = go.Figure(data=[price_trace, sentiment_trace])
    layout_options = dict(
        xaxis_title='日期',
        yaxis=dict(title='價格 (USD)', color='deepskyblue'),
        yaxis2=dict(title='情緒分數', overlaying='y', side='right', color=color, range=[-1, 1]),
        legend=dict(x=0.01, y=0.99, orientation='h'),
        template=PLOTLY_TEMPLATE,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0.2)',
    )
    fig.update_layout(**layout_options)
    return fig
def plot_subjectivity_trend(filtered_df):
    """Plot the ``avg_subjectivity`` column against the date index.

    The y axis is fixed to [0, 1].

    Returns:
        plotly.graph_objects.Figure
    """
    subjectivity_trace = go.Scatter(
        x=filtered_df.index,
        y=filtered_df['avg_subjectivity'],
        name='每日新聞主觀性',
        line=dict(color='lightgreen'),
    )
    fig = go.Figure(data=[subjectivity_trace])
    layout_options = dict(
        xaxis_title='日期',
        yaxis=dict(title='主觀性分數 (0=客觀, 1=主觀)', color='lightgreen', range=[0, 1]),
        template=PLOTLY_TEMPLATE,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0.2)',
    )
    fig.update_layout(**layout_options)
    return fig
def plot_correlation(filtered_df, sentiment_col, lag_days):
    """Scatter a sentiment column against the price change ``lag_days`` later.

    ``price_change_pct`` is shifted up by ``lag_days`` rows so each row pairs
    a day's sentiment with the price change that many rows after it; rows
    left without a partner are dropped before the correlation is computed.

    Args:
        filtered_df: Frame containing ``sentiment_col`` and ``price_change_pct``.
        sentiment_col: Name of the sentiment column to correlate.
        lag_days: Number of rows to look ahead; coerced to ``int``.

    Returns:
        tuple: ``(figure, correlation, p_value)``. When Pearson's r is not
        defined (fewer than two paired rows, or either series is constant)
        the placeholder ``(0, 1)`` is returned for the statistics.
    """
    # Series.shift needs an integer period; UI widgets may hand over a float.
    lag_days = int(lag_days)

    paired = filtered_df[[sentiment_col, 'price_change_pct']].copy()
    paired['price_change_pct_lagged'] = paired['price_change_pct'].shift(-lag_days)
    paired.dropna(inplace=True)

    sentiment_values = paired[sentiment_col]
    future_changes = paired['price_change_pct_lagged']

    # pearsonr returns NaN for a constant input, so use the placeholder there too.
    undefined = (
        len(paired) < 2
        or sentiment_values.nunique() < 2
        or future_changes.nunique() < 2
    )
    if undefined:
        correlation, p_value = 0, 1
    else:
        correlation, p_value = pearsonr(sentiment_values, future_changes)

    fig = go.Figure(
        data=go.Scatter(
            x=sentiment_values,
            y=future_changes,
            mode='markers',
            marker=dict(color='mediumpurple', opacity=0.7),
        )
    )
    fig.update_layout(
        title=f'🔗 情緒與 {lag_days} 天後價格變化的關聯性 (相關係數: {correlation:.3f})',
        xaxis_title='每日平均情緒分數',
        yaxis_title=f'{lag_days} 天後價格變化 (%)',
        template=PLOTLY_TEMPLATE,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0.2)',
    )
    return fig, correlation, p_value
def get_top_bottom_news(date_obj):
    """Return HTML lists of the most positive and most negative news of a day.

    Each article of the day (title + text) is scored with the sentiment
    pipeline; the label is mapped to -1 / 0 / 1 and multiplied by the model
    confidence. The three highest positive and three lowest negative titles
    are rendered as ``<ul>`` fragments.

    Args:
        date_obj: Date to look up. Any time-of-day or timezone part is
            discarded so that it matches the midnight-based index of ``df``.

    Returns:
        tuple[str, str]: ``(positive_html, negative_html)``. Both contain a
        single explanatory ``<li>`` when the date is unknown, the model is
        unavailable, the stored news is malformed, or nothing qualifies.
    """
    date_ts = pd.to_datetime(date_obj)
    # The date picker supplies a full datetime, while the index holds one
    # midnight timestamp per day; without this, 12:30 would never match.
    if date_ts.tzinfo is not None:
        date_ts = date_ts.tz_localize(None)
    date_ts = date_ts.normalize()

    if date_ts not in df.index:
        no_data = "<ul><li>無此日期資料</li></ul>"
        return no_data, no_data

    day_data = df.loc[date_ts]
    titles, texts = day_data.get('titles', []), day_data.get('texts', [])

    initialize_pipeline()
    news_is_valid = (
        isinstance(titles, list)
        and isinstance(texts, list)
        and len(titles) == len(texts)
    )
    if SENTIMENT_PIPELINE is None or not news_is_valid:
        bad_input = "<ul><li>模型未載入或新聞資料格式錯誤</li></ul>"
        return bad_input, bad_input

    full_texts_for_day = [f"{title}. {text}" for title, text in zip(titles, texts)]
    if not full_texts_for_day:
        no_news = "<ul><li>當日無新聞</li></ul>"
        return no_news, no_news

    sentiments = SENTIMENT_PIPELINE(
        full_texts_for_day, batch_size=8, truncation=True, max_length=512
    )
    score_map = {'LABEL_2': 1, 'LABEL_1': 0, 'LABEL_0': -1}
    # Signed score: direction from the label, magnitude from the confidence.
    scored_titles = [
        (title, score_map.get(result['label'], 0) * result['score'])
        for title, result in zip(titles, sentiments)
    ]

    positive_news = sorted(
        (item for item in scored_titles if item[1] > 0),
        key=lambda item: item[1],
        reverse=True,
    )
    negative_news = sorted(
        (item for item in scored_titles if item[1] < 0),
        key=lambda item: item[1],
    )

    def _render(items, empty_message):
        if not items:
            return f"<li>{empty_message}</li>"
        # Titles come from an external CSV and land in a raw-HTML component,
        # so they must be escaped.
        return "".join(f"<li>{html.escape(str(title))}</li>" for title, _ in items[:3])

    top_news_html = _render(positive_news, "當日無正面情緒新聞")
    bottom_news_html = _render(negative_news, "當日無負面情緒新聞")
    return f"<ul>{top_news_html}</ul>", f"<ul>{bottom_news_html}</ul>"
# Dashboard layout and event wiring. The `js` snippet reloads the page with
# `__theme=dark` in the query string whenever that parameter is not already set.
with gr.Blocks(
    theme=gr.themes.Soft(
        primary_hue="sky",
        secondary_hue="orange",
        font=["Arial", "sans-serif"],
    ),
    js="""
    function refresh() {
        const url = new URL(window.location);
        if (url.searchParams.get('__theme') !== 'dark') {
            url.searchParams.set('__theme', 'dark');
            window.location.href = url.href;
        }
    }
    """,
) as app:
    gr.Markdown(f"""<div style='text-align: center; padding: 20px; color: white;'><h1 style='font-size: 3em; color: #00BFFF;'>📈 Crypto Pulse</h1><p style='font-size: 1.2em; color: #A9A9A9;'>比特幣新聞情緒與價格分析儀表板</p><p style='font-size: 0.9em; color: #888;'>Designed by: {DEVELOPER_NAME}</p></div>""")

    # Default window: the last 360 rows when that many exist, else everything.
    max_date_dt = df.index.max()
    min_date_dt = df.index[-360] if len(df) > 360 else df.index.min()

    with gr.Row():
        start_date_input = gr.DateTime(label="📅 開始日期", type="datetime", value=min_date_dt)
        end_date_input = gr.DateTime(label="📅 結束日期", type="datetime", value=max_date_dt)

    with gr.Tabs() as tabs:
        with gr.TabItem("📊 模型情緒總覽", id=0):
            plot_overview = gr.Plot(label="模型情緒 vs. 價格趨勢圖")
            gr.Markdown("此圖展示了由 `twitter-roberta-base-sentiment` 模型分析出的**新聞內容(標題+內文)**情緒分數(右軸)與比特幣價格(左軸)的對比。")

        with gr.TabItem("🔬 多維度情緒分析", id=1):
            gr.Markdown("""
### 指標說明
此處的情緒指標來自資料集 `cryptonews.csv` 中預先計算好的 `sentiment` 欄位。
* **資料集預設情緒分類**: 將資料集內建的 `positive`, `neutral`, `negative` 類別轉換為 `1, 0, -1` 的數值分數。
* **情感極性 (Polarity)**: 衡量文本的正面或負面程度。值域從 -1 (非常負面) 到 +1 (非常正面)。
* **主觀性 (Subjectivity)**: 衡量文本是偏向客觀事實還是主觀意見。值域從 0 (非常客觀) 到 1 (非常主觀)。
""")
            plot_class_sentiment = gr.Plot(label="資料集預設情緒 vs. 價格趨勢圖")
            plot_polarity = gr.Plot(label="情感極性 vs. 價格趨勢圖")
            plot_subjectivity = gr.Plot(label="新聞主觀性趨勢圖")

        with gr.TabItem("🔍 關聯性深掘", id=2):
            with gr.Row():
                with gr.Column(scale=2, min_width=250):
                    sentiment_type_radio = gr.Radio(
                        ["模型情緒分數", "資料集預設情緒分類", "情感極性 (Polarity)"],
                        label="選擇分析的情緒指標",
                        value="模型情緒分數",
                    )
                    lag_slider = gr.Slider(minimum=0, maximum=14, value=1, step=1, label="🕒 情緒延遲天數 (Lag Days)")
                    correlation_output = gr.Textbox(label="Pearson 相關係數", interactive=False)
                    p_value_output = gr.Textbox(label="P-Value", interactive=False)
                with gr.Column(scale=3):
                    plot_corr = gr.Plot(label="情緒 vs. 價格變化 散點圖")

        with gr.TabItem("📰 新聞瀏覽器", id=3):
            gr.Markdown("在此處選擇特定日期,即可查看當天的熱點新聞。")
            news_date_input = gr.DateTime(label="🗓️ 選擇查詢日期", type="datetime", value=max_date_dt)
            with gr.Row():
                gr.Markdown("### 👍 當日最正面新聞 Top 3")
                gr.Markdown("### 👎 當日最負面新聞 Top 3")
            with gr.Row():
                top_news_output = gr.HTML()
                bottom_news_output = gr.HTML()

    def update_all(start_date, end_date, lag_days, sentiment_type):
        """Rebuild the five figures and two statistic strings for the chosen range.

        Returns seven values in the order of ``outputs_for_main_update``.
        When the range is invalid or contains no rows, a warning is raised
        in the UI and blank figures with "N/A" statistics are returned.
        """

        def _blank_outputs():
            blank = go.Figure()
            return blank, blank, blank, blank, blank, "N/A", "N/A"

        if start_date is None or end_date is None or start_date > end_date:
            gr.Warning("請選擇有效的開始與結束日期。")
            return _blank_outputs()

        start_date, end_date = pd.to_datetime(start_date), pd.to_datetime(end_date)
        filtered_df = get_filtered_df(start_date, end_date)
        if filtered_df.empty:
            gr.Warning("此日期範圍內無資料,請擴大範圍。")
            return _blank_outputs()

        overview_fig = plot_price_and_sentiment(filtered_df, 'avg_model_sentiment', '模型情緒分數', 'crimson')
        class_sentiment_fig = plot_price_and_sentiment(filtered_df, 'avg_class_sentiment', '資料集預設情緒分類', 'yellow')
        polarity_fig = plot_price_and_sentiment(filtered_df, 'avg_polarity', '情感極性 (Polarity)', 'orange')
        subjectivity_fig = plot_subjectivity_trend(filtered_df)

        # Any choice other than the first two falls back to polarity.
        column_by_choice = {
            "模型情緒分數": 'avg_model_sentiment',
            "資料集預設情緒分類": 'avg_class_sentiment',
        }
        sentiment_col = column_by_choice.get(sentiment_type, 'avg_polarity')

        corr_fig, corr_val, p_val = plot_correlation(filtered_df, sentiment_col, lag_days)
        return (
            overview_fig,
            class_sentiment_fig,
            polarity_fig,
            subjectivity_fig,
            corr_fig,
            f"{corr_val:.4f}",
            f"{p_val:.4f}",
        )

    def update_news_browser(date_obj):
        """Return the (positive, negative) news HTML for the selected date."""
        if date_obj is None:
            return "請選擇日期", "無"
        top_news, bottom_news = get_top_bottom_news(date_obj)
        return top_news, bottom_news

    inputs_for_main_update = [start_date_input, end_date_input, lag_slider, sentiment_type_radio]
    outputs_for_main_update = [
        plot_overview,
        plot_class_sentiment,
        plot_polarity,
        plot_subjectivity,
        plot_corr,
        correlation_output,
        p_value_output,
    ]

    # A change to any of the four controls refreshes every main output.
    for component in inputs_for_main_update:
        component.change(fn=update_all, inputs=inputs_for_main_update, outputs=outputs_for_main_update)

    news_date_input.change(
        fn=update_news_browser,
        inputs=[news_date_input],
        outputs=[top_news_output, bottom_news_output],
    )

    def load_app():
        """Produce the initial values for all outputs when the page loads."""
        main_outputs = update_all(min_date_dt, max_date_dt, 1, "模型情緒分數")
        news_outputs = update_news_browser(max_date_dt)
        return main_outputs + news_outputs

    app.load(
        fn=load_app,
        inputs=None,
        outputs=outputs_for_main_update + [top_news_output, bottom_news_output],
    )
# Start the Gradio server, requesting a public share link and surfacing
# errors in the UI.
app.launch(
    debug=False,
    share=True,
    show_error=True,
    show_api=False,
)