import base64
from datetime import datetime, timedelta

import gradio as gr
import numpy as np
import openai
import pandas as pd
import plotly.express as px
from googleapiclient.discovery import build
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer


def get_video_stats(api_key, video_id):
    """Fetch snippet and statistics for a single video."""
    youtube = build("youtube", "v3", developerKey=api_key)
    video_response = youtube.videos().list(
        part="snippet,statistics",
        id=video_id,
    ).execute()

    video = video_response["items"][0]
    statistics = video["statistics"]
    return {
        "Video ID": video_id,
        "Title": video["snippet"]["title"],
        "publishedAt": video["snippet"]["publishedAt"],
        "Channel ID": video["snippet"]["channelId"],
        "View Count": int(statistics.get("viewCount", 0)),
        "Like Count": int(statistics.get("likeCount", 0)),
        "Comment Count": int(statistics.get("commentCount", 0)),
    }


def get_channel_stats(api_key, channel_id):
    """Return the subscriber count for a channel (0 if unavailable)."""
    youtube = build("youtube", "v3", developerKey=api_key)
    channel_response = youtube.channels().list(
        part="statistics",
        id=channel_id,
    ).execute()

    if channel_response["items"]:
        # subscriberCount is absent when the channel hides it.
        return int(channel_response["items"][0]["statistics"].get("subscriberCount", 0))
    return 0


def get_video_data(api_key, query, max_results, published_after, published_before):
    """Search for videos matching the query and collect per-video statistics."""
    youtube = build("youtube", "v3", developerKey=api_key)
    video_ids = []
    next_page_token = None

    # Page through search results (50 per page, the API maximum) until we
    # have enough IDs or run out of pages.
    while len(video_ids) < max_results:
        search_response = youtube.search().list(
            q=query,
            type="video",
            part="id",
            maxResults=50,
            pageToken=next_page_token,
            order="viewCount",
            publishedAfter=published_after,
            publishedBefore=published_before,
        ).execute()
        video_ids.extend(item["id"]["videoId"] for item in search_response["items"])
        next_page_token = search_response.get("nextPageToken")
        if not next_page_token:
            break

    video_ids = video_ids[:max_results]

    video_stats = []
    for video_id in video_ids:
        stats = get_video_stats(api_key, video_id)
        stats["Subscriber Count"] = get_channel_stats(api_key, stats["Channel ID"])
        video_stats.append(stats)

    return pd.DataFrame(video_stats)


def download_csv(df, filename):
    """Return an HTML anchor that downloads the DataFrame as a CSV file."""
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = (
        f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">'
        f"Download {filename} CSV</a>"
    )
    return href


def visualize_video_ranking(video_stats_df):
    """Compute views per subscriber ("Active_Index") and plot it per video."""
    # Guard against channels with a hidden (zero) subscriber count to avoid
    # division by zero; those videos get NaN and are skipped in the plot.
    subscribers = video_stats_df["Subscriber Count"].replace(0, np.nan)
    video_stats_df["Active_Index"] = video_stats_df["View Count"] / subscribers

    csv_download_link = download_csv(video_stats_df, "video_stats")
    fig = px.bar(
        video_stats_df,
        x="Video ID",
        y="Active_Index",
        color="View Count",
        labels={"Video ID": "Video ID", "Active_Index": "Active_Index"},
        title="Video Active Index",
    )
    fig.update_layout(height=500, width=500)
    return video_stats_df, fig, csv_download_link


def analyze_titles(video_stats_df, openai_key, n_clusters=5):
    """Cluster video titles with TF-IDF + KMeans and summarize each cluster."""
    titles = video_stats_df["Title"].tolist()
    # KMeans requires n_clusters <= number of samples.
    n_clusters = min(n_clusters, len(titles))

    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(titles)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    video_stats_df["Cluster"] = kmeans.labels_

    cluster_summaries = []
    for i in range(n_clusters):
        cluster_titles = video_stats_df[video_stats_df["Cluster"] == i]["Title"].tolist()
        cluster_text = " ".join(cluster_titles)
        cluster_summaries.append(summarize_cluster(cluster_text, openai_key, i))

    return pd.DataFrame({"Cluster": range(n_clusters), "Summary": cluster_summaries})


def summarize_cluster(cluster_text, openai_key, cluster_num):
    """Summarize one cluster of titles via the legacy (openai<1.0) chat API."""
    openai.api_key = openai_key
    prompt = (
        "Thoroughly analyze and summarize these videos, explaining their "
        "characteristics and the drivers of their popularity in 500 "
        f"characters or fewer: {cluster_text}"
    )
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a genius AI data scientist who has analyzed "
                    "popular videos and large-scale data from around the world."
                ),
            },
            {"role": "user", "content": prompt},
        ],
        max_tokens=500,
        n=1,
        stop=None,
        temperature=0.7,
    )
    return response["choices"][0]["message"]["content"].strip()


def main(api_key, openai_key, query, max_results, period, page, n_clusters=5):
    if not query:
        # Nothing to search for: return empty outputs so Gradio can render.
        return pd.DataFrame(), None, ""

    # Gradio sliders can deliver floats; the APIs below expect integers.
    max_results = int(max_results)
    n_clusters = int(n_clusters)

    # Set the search window from the selected period (default: 1 month).
    now = datetime.utcnow()
    published_before = now.isoformat("T") + "Z"
    period_days = {"1 week": 7, "1 month": 30, "3 months": 90}
    days = period_days.get(period, 30)
    published_after = (now - timedelta(days=days)).isoformat("T") + "Z"

    video_stats_df = get_video_data(
        api_key, query, max_results, published_after, published_before
    )

    if page == "Video Ranking":
        return visualize_video_ranking(video_stats_df)
    elif page == "Title Analysis":
        cluster_summary_df = analyze_titles(video_stats_df, openai_key, n_clusters)
        return cluster_summary_df, None, None


iface = gr.Interface(
    fn=main,
    inputs=[
        gr.components.Textbox(label="Enter your YouTube API Key", type="password"),
        gr.components.Textbox(label="Enter your OpenAI API Key", type="password"),
        gr.components.Textbox(label="Search query"),
        gr.components.Slider(minimum=1, maximum=1000, value=5, step=1, label="Max results"),
        gr.components.Dropdown(["1 week", "1 month", "3 months"], label="Period"),
        gr.components.Dropdown(["Video Ranking", "Title Analysis"], label="Page"),
        gr.components.Slider(minimum=2, maximum=10, value=5, step=1, label="Number of clusters"),
    ],
    outputs=[
        gr.components.Dataframe(label="Results"),
        gr.components.Plot(label="Plot"),
        gr.components.HTML(label="CSV Download Link"),
    ],
    live=False,
    title="YouTube Analysis Tool",
)

if __name__ == "__main__":
    iface.launch()
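
# ---------------------------------------------------------------------------
# A minimal usage sketch: the functions above can also be exercised from a
# REPL without launching the Gradio UI. The key, query, and dates below are
# placeholder values, not part of this project.
# ---------------------------------------------------------------------------
#   df = get_video_data(
#       api_key="YOUR_YOUTUBE_API_KEY",
#       query="python tutorial",
#       max_results=5,
#       published_after="2024-01-01T00:00:00Z",
#       published_before="2024-02-01T00:00:00Z",
#   )
#   df, fig, link = visualize_video_ranking(df)
#   print(df[["Title", "View Count", "Subscriber Count", "Active_Index"]])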