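"""YouTube Analysis Tool (Gradio app).

Searches YouTube for videos matching a query, ranks them by an "active index"
(views per subscriber), and optionally clusters video titles with TF-IDF + KMeans,
summarizing each cluster via the OpenAI API.
"""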
import gradio as gr
import pandas as pd
from googleapiclient.discovery import build
import plotly.express as px
import base64
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import openai
from datetime import datetime, timedelta
def get_video_stats(api_key, video_id):
    """Fetch snippet and statistics for a single video."""
    youtube = build("youtube", "v3", developerKey=api_key)
    video_response = youtube.videos().list(
        part="snippet,statistics",
        id=video_id
    ).execute()
    video = video_response["items"][0]
    title = video["snippet"]["title"]
    channel_id = video["snippet"]["channelId"]
    publish_time = video["snippet"]["publishedAt"]
    view_count = int(video["statistics"].get("viewCount", 0))
    like_count = int(video["statistics"].get("likeCount", 0))
    comment_count = int(video["statistics"].get("commentCount", 0))
    return {
        "Video ID": video_id,
        "Title": title,
        "publishedAt": publish_time,
        "Channel ID": channel_id,
        "View Count": view_count,
        "Like Count": like_count,
        "Comment Count": comment_count
    }
def get_channel_stats(api_key, channel_id):
    """Fetch the subscriber count for a channel (0 if hidden or unavailable)."""
    youtube = build("youtube", "v3", developerKey=api_key)
    channel_response = youtube.channels().list(
        part="statistics",
        id=channel_id
    ).execute()
    if channel_response["items"]:
        channel = channel_response["items"][0]
        # subscriberCount is absent when a channel hides its subscriber count
        subscriber_count = int(channel["statistics"].get("subscriberCount", 0))
    else:
        subscriber_count = 0
    return subscriber_count
def get_video_data(api_key, query, max_results, published_after, published_before):
    """Search for videos matching the query and collect per-video and per-channel statistics."""
    youtube = build("youtube", "v3", developerKey=api_key)
    video_ids = []
    next_page_token = None
    # Page through search results (50 per page, the API maximum) until enough IDs are collected
    while len(video_ids) < max_results:
        search_response = youtube.search().list(
            q=query,
            type="video",
            part="id",
            maxResults=50,
            pageToken=next_page_token,
            order="viewCount",
            publishedAfter=published_after,
            publishedBefore=published_before
        ).execute()
        video_ids.extend([item["id"]["videoId"] for item in search_response["items"]])
        next_page_token = search_response.get("nextPageToken")
        if not next_page_token:
            break
    video_ids = video_ids[:max_results]
    video_stats = []
    for video_id in video_ids:
        stats = get_video_stats(api_key, video_id)
        channel_id = stats["Channel ID"]
        subscriber_count = get_channel_stats(api_key, channel_id)
        stats["Subscriber Count"] = subscriber_count
        video_stats.append(stats)
    video_stats_df = pd.DataFrame(video_stats)
    return video_stats_df
def download_csv(df, filename):
    """Return an HTML link that downloads the DataFrame as a base64-encoded CSV."""
    csv = df.to_csv(index=False)
    b64 = base64.b64encode(csv.encode()).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="{filename}.csv">Download {filename} CSV</a>'
    return href
def visualize_video_ranking(video_stats_df):
    """Compute the views-per-subscriber active index and plot it as a bar chart."""
    # Avoid division by zero for channels with a hidden or zero subscriber count
    subscribers = video_stats_df["Subscriber Count"].replace(0, np.nan)
    video_stats_df["Active_Index"] = video_stats_df["View Count"] / subscribers
    csv_download_link = download_csv(video_stats_df, "video_stats")
    fig = px.bar(video_stats_df, x="Video ID", y="Active_Index", color="View Count",
                 labels={"Video ID": "Video ID", "Active_Index": "Active_Index"},
                 title="Video Active Index")
    fig.update_layout(height=500, width=500)
    return video_stats_df, fig, csv_download_link
def analyze_titles(video_stats_df, openai_key, n_clusters=5):
    """Cluster video titles with TF-IDF + KMeans and summarize each cluster via OpenAI."""
    titles = video_stats_df['Title'].tolist()
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(titles)
    # KMeans cannot form more clusters than there are titles
    n_clusters = min(int(n_clusters), len(titles))
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    labels = kmeans.labels_
    video_stats_df["Cluster"] = labels
    cluster_summaries = []
    for i in range(n_clusters):
        cluster_titles = video_stats_df[video_stats_df["Cluster"] == i]['Title'].tolist()
        cluster_text = ' '.join(cluster_titles)
        summary = summarize_cluster(cluster_text, openai_key, i)
        cluster_summaries.append(summary)
    cluster_summary_df = pd.DataFrame({'Cluster': range(n_clusters), 'Summary': cluster_summaries})
    return cluster_summary_df
def summarize_cluster(cluster_text, openai_key, cluster_num):
    """Ask the OpenAI API for a Japanese-language summary of a cluster of video titles."""
    openai.api_key = openai_key
    # Prompt (in Japanese): thoroughly analyze and summarize these videos, explaining their
    # characteristics and popularity factors in 500 characters or less
    prompt = f"これらの動画を日本語で徹底解析して要約し、動画の特徴・人気要因を500文字以内で解説してください: {cluster_text}"
    # Uses the legacy (pre-1.0) openai ChatCompletion interface
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            # System role (in Japanese): "You are a genius AI data scientist who has analyzed
            # popular videos and large-scale data from around the world"
            {"role": "system", "content": "あなたは世界中の人気動画や大規模データを解析してきた天才AI・データサイエンティストです"},
            {"role": "user", "content": prompt}
        ],
        max_tokens=500,
        n=1,
        stop=None,
        temperature=0.7,
    )
    summary = response['choices'][0]['message']['content'].strip()
    return summary
def main(api_key, openai_key, query, max_results, period, page, n_clusters=5):
    if not query:
        return None, None, None
    # Set the search period (RFC 3339 timestamps required by the YouTube API)
    now = datetime.utcnow()
    published_before = now.isoformat("T") + "Z"
    if period == "1週間":
        published_after = (now - timedelta(days=7)).isoformat("T") + "Z"
    elif period == "1か月":
        published_after = (now - timedelta(days=30)).isoformat("T") + "Z"
    elif period == "3か月":
        published_after = (now - timedelta(days=90)).isoformat("T") + "Z"
    else:
        published_after = (now - timedelta(days=30)).isoformat("T") + "Z"  # default: 1 month
    video_stats_df = get_video_data(api_key, query, int(max_results), published_after, published_before)
    if page == "Video Ranking":
        video_stats_df, fig, csv_download_link = visualize_video_ranking(video_stats_df)
        return video_stats_df, fig, csv_download_link
    elif page == "Title Analysis":
        cluster_summary_df = analyze_titles(video_stats_df, openai_key, int(n_clusters))
        return cluster_summary_df, None, None
    return None, None, None
iface = gr.Interface(
    fn=main,
    inputs=[
        gr.components.Textbox(label="YouTube API Keyを入力してください", type="password"),
        gr.components.Textbox(label="OpenAI API Keyを入力してください", type="password"),
        gr.components.Textbox(label="Search query"),
        gr.components.Slider(minimum=1, maximum=1000, value=5, step=1, label="Max results"),
        gr.components.Dropdown(["1週間", "1か月", "3か月"], label="Period"),
        gr.components.Dropdown(["Video Ranking", "Title Analysis"], label="Page"),
        gr.components.Slider(minimum=2, maximum=10, value=5, step=1, label="Number of clusters")
    ],
    outputs=[
        gr.components.Dataframe(label="Results"),
        gr.components.Plot(label="Plot"),
        gr.components.HTML(label="CSV Download Link")
    ],
    live=False,
    title="YouTube Analysis Tool"
)
if __name__ == "__main__":
    iface.launch()