import gradio as gr
import pandas as pd
from googleapiclient.discovery import build
import plotly.express as px
import base64
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import openai
from datetime import datetime, timedelta
def get_video_stats(api_key, video_id):
    """Fetch title, channel and engagement statistics for a single video."""
    youtube = build("youtube", "v3", developerKey=api_key)
    video_response = youtube.videos().list(
        part="snippet,statistics",
        id=video_id
    ).execute()
    video = video_response["items"][0]
    title = video["snippet"]["title"]
    channel_id = video["snippet"]["channelId"]
    publish_time = video["snippet"]["publishedAt"]
    # Some statistics can be missing (e.g. comments disabled), so default to 0.
    view_count = int(video["statistics"].get("viewCount", 0))
    like_count = int(video["statistics"].get("likeCount", 0))
    comment_count = int(video["statistics"].get("commentCount", 0))
    return {
        "Video ID": video_id,
        "Title": title,
        "publishedAt": publish_time,
        "Channel ID": channel_id,
        "View Count": view_count,
        "Like Count": like_count,
        "Comment Count": comment_count
    }
def get_channel_stats(api_key, channel_id):
    """Fetch the subscriber count for a channel (0 if unavailable)."""
    youtube = build("youtube", "v3", developerKey=api_key)
    channel_response = youtube.channels().list(
        part="statistics",
        id=channel_id
    ).execute()
    if channel_response["items"]:
        channel = channel_response["items"][0]
        # subscriberCount is absent when the channel hides its subscriber count.
        subscriber_count = int(channel["statistics"].get("subscriberCount", 0))
    else:
        subscriber_count = 0
    return subscriber_count
def get_video_data(api_key, query, max_results, published_after, published_before):
    """Search for videos matching the query and collect per-video statistics."""
    youtube = build("youtube", "v3", developerKey=api_key)
    max_results = int(max_results)  # Gradio sliders can return floats
    video_ids = []
    next_page_token = None
    # Page through the search results (50 per page, the API maximum)
    # until enough video IDs have been collected.
    while len(video_ids) < max_results:
        search_response = youtube.search().list(
            q=query,
            type="video",
            part="id",
            maxResults=50,
            pageToken=next_page_token,
            order="viewCount",
            publishedAfter=published_after,
            publishedBefore=published_before
        ).execute()
        video_ids.extend([item["id"]["videoId"] for item in search_response["items"]])
        next_page_token = search_response.get("nextPageToken")
        if not next_page_token:
            break
    video_ids = video_ids[:max_results]
    video_stats = []
    for video_id in video_ids:
        stats = get_video_stats(api_key, video_id)
        channel_id = stats["Channel ID"]
        subscriber_count = get_channel_stats(api_key, channel_id)
        stats["Subscriber Count"] = subscriber_count
        video_stats.append(stats)
    video_stats_df = pd.DataFrame(video_stats)
    return video_stats_df
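# A minimal usage sketch of the helper above (hypothetical key, query and dates,
# not part of the original app). Note that each search page costs roughly 100
# YouTube Data API quota units, plus about 2 units per video for the
# stats/subscriber lookups, so large max_results values drain quota quickly.
#
#   df = get_video_data("YOUR_YOUTUBE_API_KEY", "python tutorial", 10,
#                       "2024-01-01T00:00:00Z", "2024-02-01T00:00:00Z")
#   print(df[["Title", "View Count", "Subscriber Count"]].head())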
def download_csv(df, filename):
    """Return an HTML link that embeds the DataFrame as a base64-encoded CSV."""
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download {filename} CSV</a>'
    return href
def visualize_video_ranking(video_stats_df):
    """Rank videos by views relative to channel size ("Active Index")."""
    # Avoid division by zero when a channel hides its subscriber count.
    subscribers = video_stats_df["Subscriber Count"].replace(0, np.nan)
    video_stats_df["Active_Index"] = video_stats_df["View Count"] / subscribers
    csv_download_link = download_csv(video_stats_df, "video_stats")
    fig = px.bar(video_stats_df, x="Video ID", y="Active_Index", color="View Count",
                 labels={"Video ID": "Video ID", "Active_Index": "Active_Index"},
                 title="Video Active Index")
    fig.update_layout(height=500, width=500)
    return video_stats_df, fig, csv_download_link
def analyze_titles(video_stats_df, openai_key, n_clusters=5):
    """Cluster video titles with TF-IDF + KMeans and summarize each cluster."""
    titles = video_stats_df['Title'].tolist()
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(titles)
    # KMeans cannot form more clusters than there are titles.
    n_clusters = min(int(n_clusters), len(titles))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    labels = kmeans.labels_
    video_stats_df["Cluster"] = labels
    cluster_summaries = []
    for i in range(n_clusters):
        cluster_titles = video_stats_df[video_stats_df["Cluster"] == i]['Title'].tolist()
        cluster_text = ' '.join(cluster_titles)
        summary = summarize_cluster(cluster_text, openai_key, i)
        cluster_summaries.append(summary)
    cluster_summary_df = pd.DataFrame({'Cluster': range(n_clusters), 'Summary': cluster_summaries})
    return cluster_summary_df
def summarize_cluster(cluster_text, openai_key, cluster_num):
    """Ask the OpenAI chat API for a Japanese summary of one title cluster."""
    openai.api_key = openai_key
    prompt = f"これらの動画を日本語で徹底解析して要約し、動画の特徴・人気要因を500文字以内で解説してください: {cluster_text}"
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "あなたは世界中の人気動画や大規模データを解析してきた天才AI・データサイエンティストです"},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        n=1,
        stop=None,
        temperature=0.7,
    )
    summary = response['choices'][0]['message']['content'].strip()
    return summary
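# Note: openai.ChatCompletion.create is the pre-1.0 openai-python interface and
# was removed in openai>=1.0, so this Space assumes an older openai package
# (e.g. openai==0.28) is pinned in its requirements.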
def main(api_key, openai_key, query, max_results, period, page, n_clusters=5):
    if not query:
        # Nothing to search for: return empty outputs so Gradio does not error.
        return pd.DataFrame(), None, None
    # Build the search window from the selected period.
    now = datetime.utcnow()
    published_before = now.isoformat("T") + "Z"
    if period == "1週間":
        published_after = (now - timedelta(days=7)).isoformat("T") + "Z"
    elif period == "1か月":
        published_after = (now - timedelta(days=30)).isoformat("T") + "Z"
    elif period == "3か月":
        published_after = (now - timedelta(days=90)).isoformat("T") + "Z"
    else:
        published_after = (now - timedelta(days=30)).isoformat("T") + "Z"  # default: one month
    video_stats_df = get_video_data(api_key, query, max_results, published_after, published_before)
    if page == "Video Ranking":
        video_stats_df, fig, csv_download_link = visualize_video_ranking(video_stats_df)
        return video_stats_df, fig, csv_download_link
    elif page == "Title Analysis":
        cluster_summary_df = analyze_titles(video_stats_df, openai_key, n_clusters)
        return cluster_summary_df, None, None
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.components.Textbox(label="YouTube API Keyを入力してください", type="password"),
        gr.components.Textbox(label="OpenAI API Keyを入力してください", type="password"),
        gr.components.Textbox(label="Search query"),
        gr.components.Slider(minimum=1, maximum=1000, value=5, step=1, label="Max results"),
        gr.components.Dropdown(["1週間", "1か月", "3か月"], label="Period"),
        gr.components.Dropdown(["Video Ranking", "Title Analysis"], label="Page"),
        gr.components.Slider(minimum=2, maximum=10, value=5, step=1, label="Number of clusters")
    ],
    outputs=[
        gr.components.Dataframe(label="Results"),
        gr.components.Plot(label="Plot"),
        gr.components.HTML(label="CSV Download Link")
    ],
    live=False,
    title="YouTube Analysis Tool"
)
if __name__ == "__main__":
    iface.launch()
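# Dependencies implied by the imports above (a sketch, not pinned in this file):
# gradio, pandas, google-api-python-client, plotly, numpy, scikit-learn, openai.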