Upload 4 files
- app.py +236 -0
- config.yaml +63 -0
- crawl4ai_scrapper.py +111 -0
- requirements.txt +10 -0
app.py
ADDED
@@ -0,0 +1,236 @@
import streamlit as st
import os
import asyncio  # needed for asyncio.run(...) in _trigger_analysis below
import tempfile
import gc
import base64
import time
import yaml

from tqdm import tqdm
from datetime import datetime
from typing import Optional

from crawl4ai_scrapper import scrape_multiple_channels
from crewai import Agent, Crew, Process, Task, LLM
from crewai_tools import FileReadTool
from dotenv import load_dotenv

load_dotenv()

# ===========================
# Cerebras LLM Integration
# ===========================
class CerebrasLLM(LLM):
    def __init__(self, model: str, api_key: str, base_url: str, **kwargs):
        from llama_index.llms.cerebras import Cerebras
        self.client = Cerebras(
            model=model,
            api_key=api_key,
            base_url=base_url,
            **kwargs
        )

    def generate(self, prompt: str, **kwargs) -> str:
        response = self.client.complete(prompt, **kwargs)
        return response.text

@st.cache_resource
def load_llm() -> CerebrasLLM:
    return CerebrasLLM(
        model="llama-3.3-70b",
        api_key=os.getenv("CEREBRAS_API_KEY"),
        base_url="https://api.cerebras.ai/v1",
        temperature=0.7,
        max_tokens=4096,
        top_p=0.95,
        timeout=30
    )

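# Example (sketch): assuming CEREBRAS_API_KEY is set in .env, the cached LLM can
# be exercised on its own before wiring it into a crew:
#
#   llm = load_llm()
#   print(llm.generate("Summarize the main themes of this transcript: ..."))
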
# ===========================
# Core Application Logic
# ===========================
class YouTubeAnalyzer:
    def __init__(self):
        self.docs_tool = FileReadTool()
        self.llm = load_llm()

    def create_crew(self):
        with open("config.yaml", 'r') as file:
            config = yaml.safe_load(file)

        analysis_agent = Agent(
            role=config["agents"][0]["role"],
            goal=config["agents"][0]["goal"],
            backstory=config["agents"][0]["backstory"],
            verbose=True,
            tools=[self.docs_tool],
            llm=self.llm,
            memory=True
        )

        synthesis_agent = Agent(
            role=config["agents"][1]["role"],
            goal=config["agents"][1]["goal"],
            backstory=config["agents"][1]["backstory"],
            verbose=True,
            llm=self.llm,
            allow_delegation=False
        )

        analysis_task = Task(
            description=config["tasks"][0]["description"],
            expected_output=config["tasks"][0]["expected_output"],
            agent=analysis_agent,
            output_file="analysis_raw.md"
        )

        synthesis_task = Task(
            description=config["tasks"][1]["description"],
            expected_output=config["tasks"][1]["expected_output"],
            agent=synthesis_agent,
            context=[analysis_task],
            output_file="final_report.md"
        )

        return Crew(
            agents=[analysis_agent, synthesis_agent],
            tasks=[analysis_task, synthesis_task],
            process=Process.sequential,
            verbose=2
        )

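# Example (sketch): the crew can also be driven outside Streamlit; the list of
# transcript paths is interpolated into the {file_paths} placeholder defined in
# config.yaml (the path below is illustrative):
#
#   crew = YouTubeAnalyzer().create_crew()
#   result = crew.kickoff(inputs={"file_paths": ["transcripts/dQw4w9WgXcQ.txt"]})
#   print(result)
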
# ===========================
# Streamlit Interface
# ===========================
class StreamlitApp:
    def __init__(self):
        self.analyzer = YouTubeAnalyzer()
        self._init_session_state()

    def _init_session_state(self):
        if "response" not in st.session_state:
            st.session_state.response = None
        if "crew" not in st.session_state:
            st.session_state.crew = None
        if "youtube_channels" not in st.session_state:
            st.session_state.youtube_channels = [""]

    def _setup_sidebar(self):
        with st.sidebar:
            st.header("YouTube Analysis Configuration")

            # Channel Management
            for i, channel in enumerate(st.session_state.youtube_channels):
                cols = st.columns([6, 1])
                with cols[0]:
                    url = st.text_input(
                        "Channel URL",
                        value=channel,
                        key=f"channel_{i}",
                        help="Example: https://www.youtube.com/@ChannelName"
                    )
                    # Keep the channel list in sync with the widget value so
                    # _trigger_analysis sees what the user actually typed
                    st.session_state.youtube_channels[i] = url
                with cols[1]:
                    if i > 0 and st.button("❌", key=f"remove_{i}"):
                        st.session_state.youtube_channels.pop(i)
                        st.rerun()

            st.button("Add Channel ➕", on_click=lambda: st.session_state.youtube_channels.append(""))

            # Date Selection
            st.divider()
            st.subheader("Analysis Period")
            self.start_date = st.date_input("Start Date", key="start_date")
            self.end_date = st.date_input("End Date", key="end_date")

            # Analysis Control
            st.divider()
            if st.button("🚀 Start Analysis", type="primary"):
                self._trigger_analysis()

    def _trigger_analysis(self):
        with st.spinner('Initializing deep content analysis...'):
            try:
                valid_urls = [
                    url for url in st.session_state.youtube_channels
                    if self._is_valid_youtube_url(url)
                ]

                if not valid_urls:
                    st.error("Please provide at least one valid YouTube channel URL")
                    return

                # Scrape and process data
                channel_data = asyncio.run(
                    scrape_multiple_channels(
                        valid_urls,
                        start_date=self.start_date.strftime("%Y-%m-%d"),
                        end_date=self.end_date.strftime("%Y-%m-%d")
                    )
                )

                # Save transcripts
                self._save_transcripts(channel_data)

                # Execute analysis
                with st.spinner('Running AI-powered analysis...'):
                    st.session_state.crew = self.analyzer.create_crew()
                    # The key must match the {file_paths} placeholder used in config.yaml
                    st.session_state.response = st.session_state.crew.kickoff(
                        inputs={"file_paths": st.session_state.all_files}
                    )

            except Exception as e:
                st.error(f"Analysis failed: {str(e)}")
                st.stop()

    def _save_transcripts(self, channel_data):
        st.session_state.all_files = []
        os.makedirs("transcripts", exist_ok=True)

        with tqdm(total=sum(len(ch) for ch in channel_data), desc="Processing Videos") as pbar:
            for channel in channel_data:
                for video in channel:
                    file_path = f"transcripts/{video['id']}.txt"
                    with open(file_path, "w") as f:
                        f.write("\n".join(
                            [f"[{seg['start']}-{seg['end']}] {seg['text']}"
                             for seg in video['transcript']]
                        ))
                    st.session_state.all_files.append(file_path)
                    pbar.update(1)

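    # For reference, each saved transcript file contains one line per (roughly
    # 5-second) segment produced by crawl4ai_scrapper._process_transcript, e.g.
    # (illustrative, not real output):
    #   [0-5] welcome back to the channel
    #   [5-10] today we are looking at ...
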
    def _display_results(self):
        st.markdown("## Analysis Report")
        with st.expander("View Full Technical Analysis"):
            # Cast to str in case kickoff returned a result object rather than plain text
            st.markdown(str(st.session_state.response))

        col1, col2 = st.columns([3, 1])
        with col1:
            st.download_button(
                label="📥 Download Full Report",
                data=str(st.session_state.response),
                file_name="youtube_analysis_report.md",
                mime="text/markdown"
            )
        with col2:
            if st.button("🔄 New Analysis"):
                gc.collect()
                st.session_state.response = None
                st.rerun()

    @staticmethod
    def _is_valid_youtube_url(url: str) -> bool:
        return any(pattern in url for pattern in ["youtube.com/", "youtu.be/"])

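    # A stricter check could validate the channel path explicitly, e.g. (sketch):
    #   re.match(r"https?://(www\.)?youtube\.com/(@[\w.-]+|channel/UC[\w-]{22})", url)
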
    def run(self):
        # st.set_page_config must be the first Streamlit command
        st.set_page_config(page_title="YouTube Intelligence System", layout="wide")

        st.title("YouTube Content Analysis Platform")
        st.markdown("---")

        self._setup_sidebar()

        if st.session_state.response:
            self._display_results()
        else:
            st.info("Configure analysis parameters in the sidebar to begin")

if __name__ == "__main__":
    StreamlitApp().run()
config.yaml
ADDED
@@ -0,0 +1,63 @@
agents:
  - name: analysis_agent
    role: "YouTube Transcript Deep Analyst"
    goal: >
      Perform multi-layered analysis of video transcripts from {file_paths} including:
      1. Thematic analysis with topic modeling
      2. Cross-video trend correlation
      3. Sentiment trajectory over time
      4. Keyword evolution analysis
      5. Speaker style and rhetorical patterns
    backstory: >
      As a PhD-level content analyst with 10+ years of experience in media analysis,
      you combine NLP techniques with deep contextual understanding to uncover
      hidden patterns in video content. Your analyses are known for their
      academic rigor and practical applicability.
    verbose: true

  - name: response_synthesizer_agent
    role: "Strategic Insights Synthesizer"
    goal: >
      Transform complex analyses into executive-ready reports with:
      1. Data visualization recommendations
      2. Actionable strategy frameworks
      3. Risk/opportunity matrices
      4. Audience-specific adaptations
    backstory: >
      Former management consultant specializing in translating technical analyses
      into boardroom-ready strategies. Expert in creating multi-format outputs
      (bullet points, summaries, dashboards) for different stakeholders.
    verbose: true

tasks:
  - name: analysis_task
    description: >
      Conduct temporal analysis of transcripts with:
      - Topic clustering (TF-IDF + k-means)
      - Sentiment progression graphs
      - Keyword co-occurrence networks
      - Cross-channel comparison matrices
    expected_output: >
      A technical report containing:
      1. Time-coded thematic analysis
      2. Sentiment heatmaps
      3. Keyword evolution charts
      4. Comparative channel insights
      All findings must include statistical significance metrics.
    agent: "analysis_agent"

  - name: response_task
    description: >
      Create multi-format outputs:
      - Executive summary (1 page)
      - Detailed technical appendix
      - Data visualization brief
      - Action plan template
    expected_output: >
      A packaged insights suite containing:
      1. Visual summary infographic
      2. Strategic recommendations matrix
      3. Risk assessment framework
      4. Implementation roadmap
      All elements must be production-ready.
    agent: "response_synthesizer_agent"
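# Usage note (sketch): app.py loads this file with yaml.safe_load and indexes
# agents and tasks by position (config["agents"][0], config["tasks"][1], ...),
# so the order of the entries above matters. The {file_paths} placeholder in the
# analysis agent's goal is filled in at runtime via Crew.kickoff(inputs=...).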
crawl4ai_scrapper.py
ADDED
@@ -0,0 +1,111 @@
import asyncio
import re
import logging
from datetime import datetime
from typing import List, Dict, Optional
from tqdm import tqdm

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class YouTubeScraper:
    YOUTUBE_SCHEMA = {
        "name": "YouTubeVideoData",
        "baseSelector": "ytd-rich-item-renderer",
        "fields": [
            {"name": "title", "selector": "#video-title", "type": "text"},
            {"name": "url", "selector": "#video-title", "type": "link"},
            {"name": "views", "selector": "#metadata-line span:first-child", "type": "text"},
            {"name": "upload_date", "selector": "#metadata-line span:last-child", "type": "text"},
            {"name": "transcript", "selector": "#segments-container", "type": "text"}
        ]
    }

    def __init__(self):
        self.crawler = AsyncWebCrawler(
            headless=True,
            browser="chromium",
            stealth=True,
            timeout=60
        )

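    # For reference, a single extracted item is expected to look roughly like
    # (values are illustrative):
    #   {"title": "...", "url": "/watch?v=dQw4w9WgXcQ", "views": "1,234 views",
    #    "upload_date": "Jan 1, 2024", "transcript": "..."}
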
    async def scrape_channel(self, url: str, start_date: str, end_date: str, max_videos: int = 10):
        """Scrape and process YouTube channel content"""
        try:
            logger.info(f"Scraping channel: {url}")
            result = await self.crawler.arun(
                url=url,
                extraction_strategy=JsonCssExtractionStrategy(self.YOUTUBE_SCHEMA),
                wait_for_selector="#video-title"
            )
            return self._process_results(result.data, start_date, end_date, max_videos)
        except Exception as e:
            logger.error(f"Failed to scrape {url}: {str(e)}")
            return []

    def _process_results(self, raw_data: List[Dict], start_date: str, end_date: str, max_videos: int):
        """Process and filter scraped data"""
        processed = []
        date_format = "%b %d, %Y"

        for item in raw_data[:max_videos]:
            try:
                if not item.get("url"):
                    continue

                upload_date = datetime.strptime(item["upload_date"], date_format)
                start = datetime.strptime(start_date, "%Y-%m-%d")
                end = datetime.strptime(end_date, "%Y-%m-%d")

                if not (start <= upload_date <= end):
                    continue

                processed.append({
                    "id": self._extract_video_id(item["url"]),
                    "title": item.get("title", "Untitled"),
                    "url": f"https://youtube.com{item['url']}",
                    "views": self._parse_views(item.get("views", "0")),
                    "upload_date": upload_date.strftime("%Y-%m-%d"),
                    "transcript": self._process_transcript(item.get("transcript", ""))
                })
            except Exception as e:
                logger.warning(f"Skipping invalid video data: {str(e)}")

        return processed

    @staticmethod
    def _parse_views(views_str: str) -> int:
        """Convert view count string to integer"""
        return int(re.sub(r"[^\d]", "", views_str.split()[0])) if views_str else 0

    @staticmethod
    def _process_transcript(raw: str) -> List[Dict]:
        """Structure raw transcript text"""
        return [{
            "start": i*5,
            "end": (i+1)*5,
            "text": line.strip()
        } for i, line in enumerate(raw.split("\n") if raw else [])]

    @staticmethod
    def _extract_video_id(url: str) -> Optional[str]:
        """Extract YouTube video ID from URL"""
        match = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
        return match.group(1) if match else None

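    # Example behaviour of the helpers above (illustrative values):
    #   _parse_views("1,234 views")               -> 1234
    #   _extract_video_id("/watch?v=dQw4w9WgXcQ") -> "dQw4w9WgXcQ"
    #   _process_transcript("hello\nworld")[0]    -> {"start": 0, "end": 5, "text": "hello"}
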
async def scrape_multiple_channels(urls: List[str], start_date: str, end_date: str, num_videos: int = 10):
    """Scrape multiple YouTube channels with progress tracking"""
    scraper = YouTubeScraper()
    tasks = [scraper.scrape_channel(url, start_date, end_date, num_videos) for url in urls]

    with tqdm(total=len(tasks), desc="Processing Channels") as pbar:
        results = []
        for future in asyncio.as_completed(tasks):
            results.append(await future)
            pbar.update(1)

    return results
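# Example (sketch): standalone run outside Streamlit, assuming Playwright's
# Chromium browser is installed for crawl4ai (URL and dates are illustrative):
#
#   if __name__ == "__main__":
#       data = asyncio.run(scrape_multiple_channels(
#           ["https://www.youtube.com/@ChannelName"],
#           start_date="2024-01-01",
#           end_date="2024-12-31",
#       ))
#       print(data)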
requirements.txt
ADDED
@@ -0,0 +1,10 @@
streamlit
crewai
crewai-tools
crawl4ai
playwright
python-dotenv
tqdm
pyyaml
llama-index-llms-cerebras
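# Note (assumption): asyncio is part of the Python standard library and is not
# pinned here; crawl4ai drives a Playwright browser, so after installing these
# packages you will likely also need to run `playwright install chromium`.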