Spaces:

shunwellbeing
/

youtube_analysis

Sleeping

App Files Files Community

shunwellbeing committed on Jul 10, 2024

Commit

add81e1

verified ·

1 Parent(s): a0882f2

Create app.py

Browse files

Files changed (1) hide show

app.py +188 -0

app.py ADDED Viewed

	@@ -0,0 +1,188 @@

+import gradio as gr
+import pandas as pd
+from googleapiclient.discovery import build
+import plotly.express as px
+import base64
+import numpy as np
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.cluster import KMeans
+import openai
+from datetime import datetime, timedelta
+def get_video_stats(api_key, video_id):
+    """Fetch title, channel, publish time and counters for a single video.
+
+    Args:
+        api_key: Key handed to ``build`` as ``developerKey``.
+        video_id: ID of the video to request.
+
+    Returns:
+        A dict with the keys "Video ID", "Title", "publishedAt",
+        "Channel ID", "View Count", "Like Count" and "Comment Count".
+        A counter that is missing from the response is reported as 0.
+
+    Raises:
+        IndexError: If the "items" list of the response is empty.
+    """
+    client = build("youtube", "v3", developerKey=api_key)
+    request = client.videos().list(part="snippet,statistics", id=video_id)
+    item = request.execute()["items"][0]
+    snippet = item["snippet"]
+    title = snippet["title"]
+    channel_id = snippet["channelId"]
+    published_at = snippet["publishedAt"]
+    statistics = item["statistics"]
+    # Each counter is normalized with int(); an absent counter becomes 0.
+    views, likes, comments = (
+        int(statistics.get(field, 0))
+        for field in ("viewCount", "likeCount", "commentCount")
+    )
+    return {
+        "Video ID": video_id,
+        "Title": title,
+        "publishedAt": published_at,
+        "Channel ID": channel_id,
+        "View Count": views,
+        "Like Count": likes,
+        "Comment Count": comments,
+    }
+def get_channel_stats(api_key, channel_id):
+    """Return the subscriber count of a channel, or 0 when it is unavailable.
+
+    Args:
+        api_key: Key handed to ``build`` as ``developerKey``.
+        channel_id: ID of the channel to look up.
+
+    Returns:
+        The subscriber count as an int. 0 is returned when the response
+        contains no channel or the channel's statistics carry no
+        "subscriberCount" entry.
+    """
+    youtube = build("youtube", "v3", developerKey=api_key)
+    channel_response = youtube.channels().list(
+        part="statistics",
+        id=channel_id
+    ).execute()
+    # Tolerate a response that has no "items" key at all, not just an empty list.
+    items = channel_response.get("items") or []
+    if not items:
+        return 0
+    statistics = items[0].get("statistics", {})
+    # "subscriberCount" can be absent (e.g. when a channel hides it); treating
+    # that as 0 keeps one such channel from aborting the whole analysis.
+    return int(statistics.get("subscriberCount", 0))
+def get_video_data(api_key, query, max_results, published_after, published_before):
+    """Search videos and collect per-video statistics into a DataFrame.
+
+    Args:
+        api_key: Key handed to ``build`` as ``developerKey``.
+        query: Search term.
+        max_results: Upper bound on the number of videos; coerced to int.
+        published_after: Lower bound passed to the search as ``publishedAfter``.
+        published_before: Upper bound passed to the search as ``publishedBefore``.
+
+    Returns:
+        A DataFrame with one row per video: the fields produced by
+        ``get_video_stats`` plus a "Subscriber Count" column. It is empty
+        when nothing was found or ``max_results`` is not positive.
+    """
+    # The limit may arrive as a float (it is fed from a UI slider); slicing
+    # below requires an int.
+    max_results = int(max_results)
+    if max_results <= 0:
+        return pd.DataFrame([])
+    youtube = build("youtube", "v3", developerKey=api_key)
+    video_ids = []
+    next_page_token = None
+    while len(video_ids) < max_results:
+        search_response = youtube.search().list(
+            q=query,
+            type="video",
+            part="id",
+            maxResults=50,
+            pageToken=next_page_token,
+            order="viewCount",
+            publishedAfter=published_after,
+            publishedBefore=published_before
+        ).execute()
+        page_items = search_response.get("items", [])
+        video_ids.extend(item["id"]["videoId"] for item in page_items)
+        next_page_token = search_response.get("nextPageToken")
+        # Stop on the last page, and also on an empty page so that a stream
+        # of empty pages cannot keep the loop running.
+        if not page_items or not next_page_token:
+            break
+    # Several videos often share a channel: look each channel up only once.
+    subscriber_cache = {}
+    video_stats = []
+    for video_id in video_ids[:max_results]:
+        stats = get_video_stats(api_key, video_id)
+        channel_id = stats["Channel ID"]
+        if channel_id not in subscriber_cache:
+            subscriber_cache[channel_id] = get_channel_stats(api_key, channel_id)
+        stats["Subscriber Count"] = subscriber_cache[channel_id]
+        video_stats.append(stats)
+    return pd.DataFrame(video_stats)
+def download_csv(df, filename):
+    """Build an HTML download link that embeds ``df`` as a base64 CSV data URI.
+
+    Args:
+        df: DataFrame to serialize (written without the index).
+        filename: Base name of the download; ".csv" is appended.
+
+    Returns:
+        An ``<a>`` element as a string. The CSV is UTF-8 encoded.
+    """
+    # Imported locally so this function carries its own dependency.
+    from html import escape
+    csv = df.to_csv(index=False)
+    b64 = base64.b64encode(csv.encode("utf-8")).decode("ascii")
+    # The name ends up inside an attribute and inside element text, so it is
+    # escaped to keep quotes or angle brackets from breaking the markup.
+    safe_name = escape(str(filename), quote=True)
+    # "text/csv" is the registered media type for CSV ("file/csv" is not one).
+    href = (
+        f'<a href="data:text/csv;charset=utf-8;base64,{b64}" '
+        f'download="{safe_name}.csv">Download {safe_name} CSV</a>'
+    )
+    return href
+def visualize_video_ranking(video_stats_df):
+    """Add an "Active_Index" column (views per subscriber) and chart it.
+
+    The column is added to ``video_stats_df`` in place.
+
+    Args:
+        video_stats_df: Frame with "Video ID", "View Count" and
+            "Subscriber Count" columns.
+
+    Returns:
+        A tuple ``(video_stats_df, fig, csv_download_link)``. For an empty
+        frame the result is ``(video_stats_df, None, None)``.
+    """
+    # A search without hits yields a frame with no columns at all; there is
+    # nothing to rank or plot in that case.
+    if video_stats_df.empty:
+        return video_stats_df, None, None
+    subscribers = video_stats_df["Subscriber Count"]
+    video_stats_df["Active_Index"] = video_stats_df["View Count"] / subscribers
+    # Dividing by a zero subscriber count produces inf; mark those rows as
+    # missing instead so the chart and the CSV do not carry infinite values.
+    video_stats_df.loc[subscribers <= 0, "Active_Index"] = float("nan")
+    csv_download_link = download_csv(video_stats_df, "video_stats")
+    fig = px.bar(video_stats_df, x="Video ID", y="Active_Index", color="View Count",
+                 labels={"Video ID": "Video ID", "Active_Index": "Active_Index"},
+                 title="Video Active Index")
+    fig.update_layout(height=500, width=500)
+    return video_stats_df, fig, csv_download_link
+def analyze_titles(video_stats_df, openai_key, n_clusters=5):
+    """Cluster video titles with TF-IDF + KMeans and summarize each cluster.
+
+    A "Cluster" column holding the KMeans label of every row is added to
+    ``video_stats_df`` in place.
+
+    Args:
+        video_stats_df: Frame with a "Title" column.
+        openai_key: Key forwarded to ``summarize_cluster``.
+        n_clusters: Requested number of clusters; coerced to int and capped
+            at the number of titles.
+
+    Returns:
+        A DataFrame with the columns "Cluster" and "Summary", one row per
+        cluster. It has no rows when there are no titles.
+    """
+    # A search without hits yields a frame with no columns, so check for
+    # emptiness before touching the "Title" column.
+    if video_stats_df.empty:
+        return pd.DataFrame({'Cluster': [], 'Summary': []})
+    titles = video_stats_df['Title'].tolist()
+    # The value may arrive as a float (it is fed from a UI slider), and KMeans
+    # cannot form more clusters than there are samples.
+    n_clusters = max(1, min(int(n_clusters), len(titles)))
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(titles)
+    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
+    kmeans.fit(tfidf_matrix)
+    video_stats_df["Cluster"] = kmeans.labels_
+    cluster_summaries = []
+    for i in range(n_clusters):
+        cluster_titles = video_stats_df[video_stats_df["Cluster"] == i]['Title'].tolist()
+        cluster_text = ' '.join(cluster_titles)
+        cluster_summaries.append(summarize_cluster(cluster_text, openai_key, i))
+    return pd.DataFrame({'Cluster': range(n_clusters), 'Summary': cluster_summaries})
+def summarize_cluster(cluster_text, openai_key, cluster_num):
+    """Ask the OpenAI chat API for a Japanese summary of one title cluster.
+
+    Args:
+        cluster_text: The cluster's titles joined into one string.
+        openai_key: OpenAI API key.
+        cluster_num: Index of the cluster; accepted for interface
+            compatibility and not used in the request.
+
+    Returns:
+        The summary with surrounding whitespace stripped, or "" when
+        ``cluster_text`` is blank or the reply carries no content.
+    """
+    # Nothing to summarize: skip the API request entirely.
+    if not cluster_text.strip():
+        return ""
+    prompt = f"これらの動画を日本語で徹底解析して要約し、動画の特徴・人気要因を500文字以内で解説してください: {cluster_text}"
+    request_args = {
+        "model": "gpt-3.5-turbo",
+        "messages": [
+            {"role": "system", "content": "あなたは世界中の人気動画や大規模データを解析してきた天才AI・データサイエンティストです"},
+            {"role": "user", "content": prompt}
+        ],
+        "max_tokens": 500,
+        "n": 1,
+        "stop": None,
+        "temperature": 0.7,
+    }
+    if hasattr(openai, "OpenAI"):
+        # openai>=1.0 removed openai.ChatCompletion; requests go through a
+        # client object, which also avoids setting the key module-wide.
+        client = openai.OpenAI(api_key=openai_key)
+        response = client.chat.completions.create(**request_args)
+        content = response.choices[0].message.content
+    else:
+        # Legacy (<1.0) module-level interface.
+        openai.api_key = openai_key
+        response = openai.ChatCompletion.create(**request_args)
+        content = response['choices'][0]['message']['content']
+    return (content or "").strip()
+def main(api_key, openai_key, query, max_results, period, page, n_clusters=5):
+    """Run the analysis selected by ``page`` for the given search query.
+
+    Args:
+        api_key: YouTube API key.
+        openai_key: OpenAI API key (used by "Title Analysis").
+        query: Search term; nothing is fetched when it is empty.
+        max_results: Maximum number of videos; coerced to int.
+        period: "1週間", "1か月" or "3か月"; any other value means 30 days.
+        page: "Video Ranking" or "Title Analysis".
+        n_clusters: Number of title clusters; coerced to int.
+
+    Returns:
+        Always a 3-tuple ``(dataframe, figure, html)``, matching the three
+        output components of the interface. Entries that do not apply are
+        None; all three are None for an empty query or an unknown page.
+    """
+    # Validate before fetching so that no API requests are spent on a call
+    # that cannot produce a result.
+    if not query or page not in ("Video Ranking", "Title Analysis"):
+        return None, None, None
+    # Set the search period; unknown values default to one month.
+    period_days = {"1週間": 7, "1か月": 30, "3か月": 90}
+    now = datetime.utcnow()
+    published_before = now.isoformat("T") + "Z"
+    published_after = (now - timedelta(days=period_days.get(period, 30))).isoformat("T") + "Z"
+    # Both numbers may arrive as floats (they are fed from UI sliders).
+    video_stats_df = get_video_data(api_key, query, int(max_results), published_after, published_before)
+    if page == "Video Ranking":
+        video_stats_df, fig, csv_download_link = visualize_video_ranking(video_stats_df)
+        return video_stats_df, fig, csv_download_link
+    cluster_summary_df = analyze_titles(video_stats_df, openai_key, int(n_clusters))
+    return cluster_summary_df, None, None
+# Gradio UI: seven inputs feed main(), which fills the three outputs below.
+iface = gr.Interface(
+    fn=main,
+    inputs=[
+        gr.components.Textbox(label="YouTube API Keyを入力してください", type="password"),
+        gr.components.Textbox(label="OpenAI API Keyを入力してください", type="password"),
+        gr.components.Textbox(label="Search query"),
+        # step=1 keeps both sliders on whole numbers; they are used as counts.
+        gr.components.Slider(minimum=1, maximum=1000, value=5, step=1, label="Max results"),
+        # Explicit defaults so neither dropdown submits an empty selection.
+        gr.components.Dropdown(["1週間", "1か月", "3か月"], value="1か月", label="Period"),
+        gr.components.Dropdown(["Video Ranking", "Title Analysis"], value="Video Ranking", label="Page"),
+        gr.components.Slider(minimum=2, maximum=10, value=5, step=1, label="Number of clusters")
+    ],
+    outputs=[
+        gr.components.Dataframe(label="Results"),
+        gr.components.Plot(label="Plot"),
+        gr.components.HTML(label="CSV Download Link")
+    ],
+    live=False,
+    title="YouTube Analysis Tool"
+)
+if __name__ == "__main__":
+    iface.launch()