muhammadsalmanalfaridzi committed on
Commit 381fba2 · verified · 1 Parent(s): edb44e8

Upload 4 files

Files changed (4)
  1. app.py +236 -0
  2. config.yaml +63 -0
  3. crawl4ai_scrapper.py +111 -0
  4. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,236 @@
+import streamlit as st
+import os
+import asyncio  # needed for asyncio.run() in _trigger_analysis
+import gc
+import base64
+import time
+import yaml
+
+from tqdm import tqdm
+from datetime import datetime
+from typing import Optional
+
+from crawl4ai_scrapper import scrape_multiple_channels
+from crewai import Agent, Crew, Process, Task, LLM
+from crewai_tools import FileReadTool
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# ===========================
+# Cerebras LLM Integration
+# ===========================
+class CerebrasLLM(LLM):
+    def __init__(self, model: str, api_key: str, base_url: str, **kwargs):
+        from llama_index.llms.cerebras import Cerebras
+        self.client = Cerebras(
+            model=model,
+            api_key=api_key,
+            base_url=base_url,
+            **kwargs
+        )
+
+    def generate(self, prompt: str, **kwargs) -> str:
+        response = self.client.complete(prompt, **kwargs)
+        return response.text
+
+@st.cache_resource
+def load_llm() -> CerebrasLLM:
+    return CerebrasLLM(
+        model="llama-3.3-70b",
+        api_key=os.getenv("CEREBRAS_API_KEY"),
+        base_url="https://api.cerebras.ai/v1",
+        temperature=0.7,
+        max_tokens=4096,
+        top_p=0.95,
+        timeout=30
+    )
+
+# ===========================
+# Core Application Logic
+# ===========================
+class YouTubeAnalyzer:
+    def __init__(self):
+        self.docs_tool = FileReadTool()
+        self.llm = load_llm()
+
+    def create_crew(self):
+        with open("config.yaml", 'r') as file:
+            config = yaml.safe_load(file)
+
+        analysis_agent = Agent(
+            role=config["agents"][0]["role"],
+            goal=config["agents"][0]["goal"],
+            backstory=config["agents"][0]["backstory"],
+            verbose=True,
+            tools=[self.docs_tool],
+            llm=self.llm,
+            memory=True
+        )
+
+        synthesis_agent = Agent(
+            role=config["agents"][1]["role"],
+            goal=config["agents"][1]["goal"],
+            backstory=config["agents"][1]["backstory"],
+            verbose=True,
+            llm=self.llm,
+            allow_delegation=False
+        )
+
+        analysis_task = Task(
+            description=config["tasks"][0]["description"],
+            expected_output=config["tasks"][0]["expected_output"],
+            agent=analysis_agent,
+            output_file="analysis_raw.md"
+        )
+
+        synthesis_task = Task(
+            description=config["tasks"][1]["description"],
+            expected_output=config["tasks"][1]["expected_output"],
+            agent=synthesis_agent,
+            context=[analysis_task],
+            output_file="final_report.md"
+        )
+
+        return Crew(
+            agents=[analysis_agent, synthesis_agent],
+            tasks=[analysis_task, synthesis_task],
+            process=Process.sequential,
+            verbose=True  # Crew.verbose expects a boolean
+        )
+
+# ===========================
+# Streamlit Interface
+# ===========================
+class StreamlitApp:
+    def __init__(self):
+        self.analyzer = YouTubeAnalyzer()
+        self._init_session_state()
+
+    def _init_session_state(self):
+        if "response" not in st.session_state:
+            st.session_state.response = None
+        if "crew" not in st.session_state:
+            st.session_state.crew = None
+        if "youtube_channels" not in st.session_state:
+            st.session_state.youtube_channels = [""]
+
+    def _setup_sidebar(self):
+        with st.sidebar:
+            st.header("YouTube Analysis Configuration")
+
+            # Channel Management
+            for i, channel in enumerate(st.session_state.youtube_channels):
+                cols = st.columns([6, 1])
+                with cols[0]:
+                    st.session_state.youtube_channels[i] = st.text_input(  # persist the typed URL
+                        "Channel URL",
+                        value=channel,
+                        key=f"channel_{i}",
+                        help="Example: https://www.youtube.com/@ChannelName"
+                    )
+                with cols[1]:
+                    if i > 0 and st.button("❌", key=f"remove_{i}"):
+                        st.session_state.youtube_channels.pop(i)
+                        st.rerun()
+
+            st.button("Add Channel ➕", on_click=lambda: st.session_state.youtube_channels.append(""))
+
+            # Date Selection
+            st.divider()
+            st.subheader("Analysis Period")
+            self.start_date = st.date_input("Start Date", key="start_date")
+            self.end_date = st.date_input("End Date", key="end_date")
+
+            # Analysis Control
+            st.divider()
+            if st.button("🚀 Start Analysis", type="primary"):
+                self._trigger_analysis()
+
+    def _trigger_analysis(self):
+        with st.spinner('Initializing deep content analysis...'):
+            try:
+                valid_urls = [
+                    url for url in st.session_state.youtube_channels
+                    if self._is_valid_youtube_url(url)
+                ]
+
+                if not valid_urls:
+                    st.error("Please provide at least one valid YouTube channel URL")
+                    return
+
+                # Scrape and process data
+                channel_data = asyncio.run(
+                    scrape_multiple_channels(
+                        valid_urls,
+                        start_date=self.start_date.strftime("%Y-%m-%d"),
+                        end_date=self.end_date.strftime("%Y-%m-%d")
+                    )
+                )
+
+                # Save transcripts
+                self._save_transcripts(channel_data)
+
+                # Execute analysis
+                with st.spinner('Running AI-powered analysis...'):
+                    st.session_state.crew = self.analyzer.create_crew()
+                    st.session_state.response = st.session_state.crew.kickoff(
+                        inputs={"file_paths": st.session_state.all_files}  # matches {file_paths} in config.yaml
+                    )
+
+            except Exception as e:
+                st.error(f"Analysis failed: {str(e)}")
+                st.stop()
+
+    def _save_transcripts(self, channel_data):
+        st.session_state.all_files = []
+        os.makedirs("transcripts", exist_ok=True)
+
+        with tqdm(total=sum(len(ch) for ch in channel_data), desc="Processing Videos") as pbar:
+            for channel in channel_data:
+                for video in channel:
+                    file_path = f"transcripts/{video['id']}.txt"
+                    with open(file_path, "w") as f:
+                        f.write("\n".join(
+                            [f"[{seg['start']}-{seg['end']}] {seg['text']}"
+                             for seg in video['transcript']]
+                        ))
+                    st.session_state.all_files.append(file_path)
+                    pbar.update(1)
+
+    def _display_results(self):
+        st.markdown("## Analysis Report")
+        with st.expander("View Full Technical Analysis"):
+            st.markdown(st.session_state.response)
+
+        col1, col2 = st.columns([3, 1])
+        with col1:
+            st.download_button(
+                label="📥 Download Full Report",
+                data=st.session_state.response,
+                file_name="youtube_analysis_report.md",
+                mime="text/markdown"
+            )
+        with col2:
+            if st.button("🔄 New Analysis"):
+                gc.collect()
+                st.session_state.response = None
+                st.rerun()
+
+    @staticmethod
+    def _is_valid_youtube_url(url: str) -> bool:
+        return any(pattern in url for pattern in ["youtube.com/", "youtu.be/"])
+
+    def run(self):
+        # st.set_page_config must be the first Streamlit command
+        st.set_page_config(page_title="YouTube Intelligence System", layout="wide")
+
+        st.title("YouTube Content Analysis Platform")
+        st.markdown("---")
+
+        self._setup_sidebar()
+
+        if st.session_state.response:
+            self._display_results()
+        else:
+            st.info("Configure analysis parameters in the sidebar to begin")
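Note that as committed, app.py defines StreamlitApp but never instantiates or runs it, so `streamlit run app.py` would render nothing. A minimal entry point (not part of this commit, names taken from the file above) would be:

# Hypothetical entry point for app.py, appended after the class definitions.
if __name__ == "__main__":
    StreamlitApp().run()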
config.yaml ADDED
@@ -0,0 +1,63 @@
+agents:
+  - name: analysis_agent
+    role: "YouTube Transcript Deep Analyst"
+    goal: >
+      Perform multi-layered analysis of video transcripts from {file_paths} including:
+      1. Thematic analysis with topic modeling
+      2. Cross-video trend correlation
+      3. Sentiment trajectory over time
+      4. Keyword evolution analysis
+      5. Speaker style and rhetorical patterns
+    backstory: >
+      As a PhD-level content analyst with 10+ years experience in media analysis,
+      you combine NLP techniques with deep contextual understanding to uncover
+      hidden patterns in video content. Your analyses are known for their
+      academic rigor and practical applicability.
+    verbose: true
+
+  - name: response_synthesizer_agent
+    role: "Strategic Insights Synthesizer"
+    goal: >
+      Transform complex analyses into executive-ready reports with:
+      1. Data visualization recommendations
+      2. Actionable strategy frameworks
+      3. Risk/opportunity matrices
+      4. Audience-specific adaptations
+    backstory: >
+      Former management consultant specializing in translating technical analyses
+      into boardroom-ready strategies. Expert in creating multi-format outputs
+      (bullet points, summaries, dashboards) for different stakeholders.
+    verbose: true
+
+tasks:
+  - name: analysis_task
+    description: >
+      Conduct temporal analysis of transcripts with:
+      - Topic clustering (TF-IDF + k-means)
+      - Sentiment progression graphs
+      - Keyword co-occurrence networks
+      - Cross-channel comparison matrices
+    expected_output: >
+      A technical report containing:
+      1. Time-coded thematic analysis
+      2. Sentiment heatmaps
+      3. Keyword evolution charts
+      4. Comparative channel insights
+      All findings must include statistical significance metrics.
+    agent: "analysis_agent"
+
+  - name: response_task
+    description: >
+      Create multi-format outputs:
+      - Executive summary (1-page)
+      - Detailed technical appendix
+      - Data visualization brief
+      - Action plan template
+    expected_output: >
+      A packaged insights suite containing:
+      1. Visual summary infographic
+      2. Strategic recommendations matrix
+      3. Risk assessment framework
+      4. Implementation roadmap
+      All elements must be production-ready.
+    agent: "response_synthesizer_agent"
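For context, `{file_paths}` in the analysis agent's goal is a CrewAI template variable: values passed to `Crew.kickoff(inputs=...)` are substituted into agent and task fields before execution (this is how app.py hands the saved transcript paths to the crew). A minimal sketch of that hand-off, with a made-up transcript path, assuming the YouTubeAnalyzer class from app.py:

# Sketch only: illustrates how kickoff inputs fill the {file_paths} placeholder above.
crew = YouTubeAnalyzer().create_crew()   # builds agents/tasks from config.yaml
result = crew.kickoff(inputs={"file_paths": ["transcripts/abc123def45.txt"]})  # example path
print(result)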
crawl4ai_scrapper.py ADDED
@@ -0,0 +1,111 @@
+import asyncio
+import re
+import logging
+from datetime import datetime
+from typing import List, Dict, Optional
+from tqdm import tqdm
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class YouTubeScraper:
+    YOUTUBE_SCHEMA = {
+        "name": "YouTubeVideoData",
+        "baseSelector": "ytd-rich-item-renderer",
+        "fields": [
+            {"name": "title", "selector": "#video-title", "type": "text"},
+            {"name": "url", "selector": "#video-title", "type": "link"},
+            {"name": "views", "selector": "#metadata-line span:first-child", "type": "text"},
+            {"name": "upload_date", "selector": "#metadata-line span:last-child", "type": "text"},
+            {"name": "transcript", "selector": "#segments-container", "type": "text"}
+        ]
+    }
+
+    def __init__(self):
+        self.crawler = AsyncWebCrawler(
+            headless=True,
+            browser="chromium",
+            stealth=True,
+            timeout=60
+        )
+
+    async def scrape_channel(self, url: str, start_date: str, end_date: str, max_videos: int = 10):
+        """Scrape and process YouTube channel content"""
+        try:
+            logger.info(f"Scraping channel: {url}")
+            result = await self.crawler.arun(
+                url=url,
+                extraction_strategy=JsonCssExtractionStrategy(self.YOUTUBE_SCHEMA),
+                wait_for_selector="#video-title"
+            )
+            return self._process_results(result.data, start_date, end_date, max_videos)
+        except Exception as e:
+            logger.error(f"Failed to scrape {url}: {str(e)}")
+            return []
+
+    def _process_results(self, raw_data: List[Dict], start_date: str, end_date: str, max_videos: int):
+        """Process and filter scraped data"""
+        processed = []
+        date_format = "%b %d, %Y"
+
+        for item in raw_data[:max_videos]:
+            try:
+                if not item.get("url"):
+                    continue
+
+                upload_date = datetime.strptime(item["upload_date"], date_format)
+                start = datetime.strptime(start_date, "%Y-%m-%d")
+                end = datetime.strptime(end_date, "%Y-%m-%d")
+
+                if not (start <= upload_date <= end):
+                    continue
+
+                processed.append({
+                    "id": self._extract_video_id(item["url"]),
+                    "title": item.get("title", "Untitled"),
+                    "url": f"https://youtube.com{item['url']}",
+                    "views": self._parse_views(item.get("views", "0")),
+                    "upload_date": upload_date.strftime("%Y-%m-%d"),
+                    "transcript": self._process_transcript(item.get("transcript", ""))
+                })
+            except Exception as e:
+                logger.warning(f"Skipping invalid video data: {str(e)}")
+
+        return processed
+
+    @staticmethod
+    def _parse_views(views_str: str) -> int:
+        """Convert view count string to integer"""
+        return int(re.sub(r"[^\d]", "", views_str.split()[0])) if views_str else 0
+
+    @staticmethod
+    def _process_transcript(raw: str) -> List[Dict]:
+        """Structure raw transcript text"""
+        return [{
+            "start": i*5,
+            "end": (i+1)*5,
+            "text": line.strip()
+        } for i, line in enumerate(raw.split("\n") if raw else [])]
+
+    @staticmethod
+    def _extract_video_id(url: str) -> Optional[str]:
+        """Extract YouTube video ID from URL"""
+        match = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
+        return match.group(1) if match else None
+
+async def scrape_multiple_channels(urls: List[str], start_date: str, end_date: str, num_videos: int = 10):
+    """Scrape multiple YouTube channels with progress tracking"""
+    scraper = YouTubeScraper()
+    tasks = [scraper.scrape_channel(url, start_date, end_date, num_videos) for url in urls]
+
+    with tqdm(total=len(tasks), desc="Processing Channels") as pbar:
+        results = []
+        for future in asyncio.as_completed(tasks):
+            results.append(await future)
+            pbar.update(1)
+
+    return results
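A standalone usage sketch for the module above, using only the `scrape_multiple_channels` API it exposes; the channel URL and date range are placeholders, not values from this commit:

# Illustration only: run the scraper outside the Streamlit app.
import asyncio
from crawl4ai_scrapper import scrape_multiple_channels

channels = ["https://www.youtube.com/@ChannelName/videos"]  # example channel
data = asyncio.run(scrape_multiple_channels(channels, start_date="2024-01-01", end_date="2024-12-31"))
for channel_videos in data:          # one list of video dicts per channel
    for video in channel_videos:
        print(video["upload_date"], video["views"], video["title"])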
requirements.txt ADDED
@@ -0,0 +1,9 @@
+streamlit
+crewai
+crewai-tools
+crawl4ai
+playwright
+python-dotenv
+tqdm
+pyyaml
+llama-index-llms-cerebras
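One practical note: crawl4ai drives a Playwright-managed browser, so after `pip install -r requirements.txt` the browser binaries still need to be installed once (typically `playwright install chromium`) before the headless scraper can start a session.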