muhammadsalmanalfaridzi committed on
Commit 381fba2 · verified · 1 Parent(s): edb44e8

Upload 4 files

Files changed (4)
  1. app.py +236 -0
  2. config.yaml +63 -0
  3. crawl4ai_scrapper.py +111 -0
  4. requirements.txt +9 -0
app.py ADDED
@@ -0,0 +1,236 @@
+import streamlit as st
+import os
+import asyncio  # needed for asyncio.run() in _trigger_analysis
+import gc
+import base64
+import time
+import yaml
+
+from tqdm import tqdm
+from datetime import datetime
+from typing import Optional
+
+from crawl4ai_scrapper import scrape_multiple_channels
+from crewai import Agent, Crew, Process, Task, LLM
+from crewai_tools import FileReadTool
+from dotenv import load_dotenv
+
+load_dotenv()
+
+# ===========================
+# Cerebras LLM Integration
+# ===========================
+class CerebrasLLM(LLM):
+    def __init__(self, model: str, api_key: str, base_url: str, **kwargs):
+        from llama_index.llms.cerebras import Cerebras
+        self.client = Cerebras(
+            model=model,
+            api_key=api_key,
+            base_url=base_url,
+            **kwargs
+        )
+
+    def generate(self, prompt: str, **kwargs) -> str:
+        response = self.client.complete(prompt, **kwargs)
+        return response.text
+
+@st.cache_resource
+def load_llm() -> CerebrasLLM:
+    return CerebrasLLM(
+        model="llama-3.3-70b",
+        api_key=os.getenv("CEREBRAS_API_KEY"),
+        base_url="https://api.cerebras.ai/v1",
+        temperature=0.7,
+        max_tokens=4096,
+        top_p=0.95,
+        timeout=30
+    )
+
+# ===========================
+# Core Application Logic
+# ===========================
+class YouTubeAnalyzer:
+    def __init__(self):
+        self.docs_tool = FileReadTool()
+        self.llm = load_llm()
+
+    def create_crew(self):
+        with open("config.yaml", 'r') as file:
+            config = yaml.safe_load(file)
+
+        analysis_agent = Agent(
+            role=config["agents"][0]["role"],
+            goal=config["agents"][0]["goal"],
+            backstory=config["agents"][0]["backstory"],
+            verbose=True,
+            tools=[self.docs_tool],
+            llm=self.llm,
+            memory=True
+        )
+
+        synthesis_agent = Agent(
+            role=config["agents"][1]["role"],
+            goal=config["agents"][1]["goal"],
+            backstory=config["agents"][1]["backstory"],
+            verbose=True,
+            llm=self.llm,
+            allow_delegation=False
+        )
+
+        analysis_task = Task(
+            description=config["tasks"][0]["description"],
+            expected_output=config["tasks"][0]["expected_output"],
+            agent=analysis_agent,
+            output_file="analysis_raw.md"
+        )
+
+        synthesis_task = Task(
+            description=config["tasks"][1]["description"],
+            expected_output=config["tasks"][1]["expected_output"],
+            agent=synthesis_agent,
+            context=[analysis_task],
+            output_file="final_report.md"
+        )
+
+        return Crew(
+            agents=[analysis_agent, synthesis_agent],
+            tasks=[analysis_task, synthesis_task],
+            process=Process.sequential,
+            verbose=True  # Crew.verbose expects a boolean
+        )
+
+# ===========================
+# Streamlit Interface
+# ===========================
+class StreamlitApp:
+    def __init__(self):
+        self.analyzer = YouTubeAnalyzer()
+        self._init_session_state()
+
+    def _init_session_state(self):
+        if "response" not in st.session_state:
+            st.session_state.response = None
+        if "crew" not in st.session_state:
+            st.session_state.crew = None
+        if "youtube_channels" not in st.session_state:
+            st.session_state.youtube_channels = [""]
+
+    def _setup_sidebar(self):
+        with st.sidebar:
+            st.header("YouTube Analysis Configuration")
+
+            # Channel Management
+            for i, channel in enumerate(st.session_state.youtube_channels):
+                cols = st.columns([6, 1])
+                with cols[0]:
+                    st.session_state.youtube_channels[i] = st.text_input(  # persist the typed URL
+                        "Channel URL",
+                        value=channel,
+                        key=f"channel_{i}",
+                        help="Example: https://www.youtube.com/@ChannelName"
+                    )
+                with cols[1]:
+                    if i > 0 and st.button("❌", key=f"remove_{i}"):
+                        st.session_state.youtube_channels.pop(i)
+                        st.rerun()
+
+            st.button("Add Channel ➕", on_click=lambda: st.session_state.youtube_channels.append(""))
+
+            # Date Selection
+            st.divider()
+            st.subheader("Analysis Period")
+            self.start_date = st.date_input("Start Date", key="start_date")
+            self.end_date = st.date_input("End Date", key="end_date")
+
+            # Analysis Control
+            st.divider()
+            if st.button("🚀 Start Analysis", type="primary"):
+                self._trigger_analysis()
+
+    def _trigger_analysis(self):
+        with st.spinner('Initializing deep content analysis...'):
+            try:
+                valid_urls = [
+                    url for url in st.session_state.youtube_channels
+                    if self._is_valid_youtube_url(url)
+                ]
+
+                if not valid_urls:
+                    st.error("Please provide at least one valid YouTube channel URL")
+                    return
+
+                # Scrape and process data
+                channel_data = asyncio.run(
+                    scrape_multiple_channels(
+                        valid_urls,
+                        start_date=self.start_date.strftime("%Y-%m-%d"),
+                        end_date=self.end_date.strftime("%Y-%m-%d")
+                    )
+                )
+
+                # Save transcripts
+                self._save_transcripts(channel_data)
+
+                # Execute analysis
+                with st.spinner('Running AI-powered analysis...'):
+                    st.session_state.crew = self.analyzer.create_crew()
+                    st.session_state.response = st.session_state.crew.kickoff(
+                        inputs={"file_paths": st.session_state.all_files}  # matches {file_paths} in config.yaml
+                    )
+
+            except Exception as e:
+                st.error(f"Analysis failed: {str(e)}")
+                st.stop()
+
+    def _save_transcripts(self, channel_data):
+        st.session_state.all_files = []
+        os.makedirs("transcripts", exist_ok=True)
+
+        with tqdm(total=sum(len(ch) for ch in channel_data), desc="Processing Videos") as pbar:
+            for channel in channel_data:
+                for video in channel:
+                    file_path = f"transcripts/{video['id']}.txt"
+                    with open(file_path, "w") as f:
+                        f.write("\n".join(
+                            [f"[{seg['start']}-{seg['end']}] {seg['text']}"
+                             for seg in video['transcript']]
+                        ))
+                    st.session_state.all_files.append(file_path)
+                    pbar.update(1)
+
+    def _display_results(self):
+        st.markdown("## Analysis Report")
+        with st.expander("View Full Technical Analysis"):
+            st.markdown(st.session_state.response)
+
+        col1, col2 = st.columns([3, 1])
+        with col1:
+            st.download_button(
+                label="📥 Download Full Report",
+                data=st.session_state.response,
+                file_name="youtube_analysis_report.md",
+                mime="text/markdown"
+            )
+        with col2:
+            if st.button("🔄 New Analysis"):
+                gc.collect()
+                st.session_state.response = None
+                st.rerun()
+
+    @staticmethod
+    def _is_valid_youtube_url(url: str) -> bool:
+        return any(pattern in url for pattern in ["youtube.com/", "youtu.be/"])
+
+    def run(self):
+        # st.set_page_config must be the first Streamlit command
+        st.set_page_config(page_title="YouTube Intelligence System", layout="wide")
+
+        st.title("YouTube Content Analysis Platform")
+        st.markdown("---")
+
+        self._setup_sidebar()
+
+        if st.session_state.response:
+            self._display_results()
+        else:
+            st.info("Configure analysis parameters in the sidebar to begin")
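Note that as committed, app.py defines StreamlitApp but never instantiates or runs it, so `streamlit run app.py` would render nothing. A minimal entry point (not part of this commit, names taken from the file above) would be:

# Hypothetical entry point for app.py, appended after the class definitions.
if __name__ == "__main__":
    StreamlitApp().run()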
config.yaml ADDED
@@ -0,0 +1,63 @@
+agents:
+  - name: analysis_agent
+    role: "YouTube Transcript Deep Analyst"
+    goal: >
+      Perform multi-layered analysis of video transcripts from {file_paths} including:
+      1. Thematic analysis with topic modeling
+      2. Cross-video trend correlation
+      3. Sentiment trajectory over time
+      4. Keyword evolution analysis
+      5. Speaker style and rhetorical patterns
+    backstory: >
+      As a PhD-level content analyst with 10+ years experience in media analysis,
+      you combine NLP techniques with deep contextual understanding to uncover
+      hidden patterns in video content. Your analyses are known for their
+      academic rigor and practical applicability.
+    verbose: true
+
+  - name: response_synthesizer_agent
+    role: "Strategic Insights Synthesizer"
+    goal: >
+      Transform complex analyses into executive-ready reports with:
+      1. Data visualization recommendations
+      2. Actionable strategy frameworks
+      3. Risk/opportunity matrices
+      4. Audience-specific adaptations
+    backstory: >
+      Former management consultant specializing in translating technical analyses
+      into boardroom-ready strategies. Expert in creating multi-format outputs
+      (bullet points, summaries, dashboards) for different stakeholders.
+    verbose: true
+
+tasks:
+  - name: analysis_task
+    description: >
+      Conduct temporal analysis of transcripts with:
+      - Topic clustering (TF-IDF + k-means)
+      - Sentiment progression graphs
+      - Keyword co-occurrence networks
+      - Cross-channel comparison matrices
+    expected_output: >
+      A technical report containing:
+      1. Time-coded thematic analysis
+      2. Sentiment heatmaps
+      3. Keyword evolution charts
+      4. Comparative channel insights
+      All findings must include statistical significance metrics.
+    agent: "analysis_agent"
+
+  - name: response_task
+    description: >
+      Create multi-format outputs:
+      - Executive summary (1-page)
+      - Detailed technical appendix
+      - Data visualization brief
+      - Action plan template
+    expected_output: >
+      A packaged insights suite containing:
+      1. Visual summary infographic
+      2. Strategic recommendations matrix
+      3. Risk assessment framework
+      4. Implementation roadmap
+      All elements must be production-ready.
+    agent: "response_synthesizer_agent"
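For context, `{file_paths}` in the analysis agent's goal is a CrewAI template variable: values passed to `Crew.kickoff(inputs=...)` are substituted into agent and task fields before execution (this is how app.py hands the saved transcript paths to the crew). A minimal sketch of that hand-off, with a made-up transcript path, assuming the YouTubeAnalyzer class from app.py:

# Sketch only: illustrates how kickoff inputs fill the {file_paths} placeholder above.
crew = YouTubeAnalyzer().create_crew()   # builds agents/tasks from config.yaml
result = crew.kickoff(inputs={"file_paths": ["transcripts/abc123def45.txt"]})  # example path
print(result)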
crawl4ai_scrapper.py ADDED
@@ -0,0 +1,111 @@
+import asyncio
+import re
+import logging
+from datetime import datetime
+from typing import List, Dict, Optional
+from tqdm import tqdm
+
+from crawl4ai import AsyncWebCrawler
+from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class YouTubeScraper:
+    YOUTUBE_SCHEMA = {
+        "name": "YouTubeVideoData",
+        "baseSelector": "ytd-rich-item-renderer",
+        "fields": [
+            {"name": "title", "selector": "#video-title", "type": "text"},
+            {"name": "url", "selector": "#video-title", "type": "link"},
+            {"name": "views", "selector": "#metadata-line span:first-child", "type": "text"},
+            {"name": "upload_date", "selector": "#metadata-line span:last-child", "type": "text"},
+            {"name": "transcript", "selector": "#segments-container", "type": "text"}
+        ]
+    }
+
+    def __init__(self):
+        self.crawler = AsyncWebCrawler(
+            headless=True,
+            browser="chromium",
+            stealth=True,
+            timeout=60
+        )
+
+    async def scrape_channel(self, url: str, start_date: str, end_date: str, max_videos: int = 10):
+        """Scrape and process YouTube channel content"""
+        try:
+            logger.info(f"Scraping channel: {url}")
+            result = await self.crawler.arun(
+                url=url,
+                extraction_strategy=JsonCssExtractionStrategy(self.YOUTUBE_SCHEMA),
+                wait_for_selector="#video-title"
+            )
+            return self._process_results(result.data, start_date, end_date, max_videos)
+        except Exception as e:
+            logger.error(f"Failed to scrape {url}: {str(e)}")
+            return []
+
+    def _process_results(self, raw_data: List[Dict], start_date: str, end_date: str, max_videos: int):
+        """Process and filter scraped data"""
+        processed = []
+        date_format = "%b %d, %Y"
+
+        for item in raw_data[:max_videos]:
+            try:
+                if not item.get("url"):
+                    continue
+
+                upload_date = datetime.strptime(item["upload_date"], date_format)
+                start = datetime.strptime(start_date, "%Y-%m-%d")
+                end = datetime.strptime(end_date, "%Y-%m-%d")
+
+                if not (start <= upload_date <= end):
+                    continue
+
+                processed.append({
+                    "id": self._extract_video_id(item["url"]),
+                    "title": item.get("title", "Untitled"),
+                    "url": f"https://youtube.com{item['url']}",
+                    "views": self._parse_views(item.get("views", "0")),
+                    "upload_date": upload_date.strftime("%Y-%m-%d"),
+                    "transcript": self._process_transcript(item.get("transcript", ""))
+                })
+            except Exception as e:
+                logger.warning(f"Skipping invalid video data: {str(e)}")
+
+        return processed
+
+    @staticmethod
+    def _parse_views(views_str: str) -> int:
+        """Convert view count string to integer"""
+        return int(re.sub(r"[^\d]", "", views_str.split()[0])) if views_str else 0
+
+    @staticmethod
+    def _process_transcript(raw: str) -> List[Dict]:
+        """Structure raw transcript text"""
+        return [{
+            "start": i*5,
+            "end": (i+1)*5,
+            "text": line.strip()
+        } for i, line in enumerate(raw.split("\n") if raw else [])]
+
+    @staticmethod
+    def _extract_video_id(url: str) -> Optional[str]:
+        """Extract YouTube video ID from URL"""
+        match = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
+        return match.group(1) if match else None
+
+async def scrape_multiple_channels(urls: List[str], start_date: str, end_date: str, num_videos: int = 10):
+    """Scrape multiple YouTube channels with progress tracking"""
+    scraper = YouTubeScraper()
+    tasks = [scraper.scrape_channel(url, start_date, end_date, num_videos) for url in urls]
+
+    with tqdm(total=len(tasks), desc="Processing Channels") as pbar:
+        results = []
+        for future in asyncio.as_completed(tasks):
+            results.append(await future)
+            pbar.update(1)
+
+    return results
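A standalone usage sketch for the module above, using only the `scrape_multiple_channels` API it exposes; the channel URL and date range are placeholders, not values from this commit:

# Illustration only: run the scraper outside the Streamlit app.
import asyncio
from crawl4ai_scrapper import scrape_multiple_channels

channels = ["https://www.youtube.com/@ChannelName/videos"]  # example channel
data = asyncio.run(scrape_multiple_channels(channels, start_date="2024-01-01", end_date="2024-12-31"))
for channel_videos in data:          # one list of video dicts per channel
    for video in channel_videos:
        print(video["upload_date"], video["views"], video["title"])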
requirements.txt ADDED
@@ -0,0 +1,9 @@
+streamlit
+crewai
+crewai-tools
+crawl4ai
+playwright
+python-dotenv
+tqdm
+pyyaml
+llama-index-llms-cerebras
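One practical note: crawl4ai drives a Playwright-managed browser, so after `pip install -r requirements.txt` the browser binaries still need to be installed once (typically `playwright install chromium`) before the headless scraper can start a session.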