import asyncio
import json
import re
import logging
from datetime import datetime
from typing import List, Dict, Optional
from tqdm import tqdm

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class YouTubeScraper:
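    # CSS extraction schema: each "ytd-rich-item-renderer" element on the channel's
    # videos page is mapped to one record with the fields below (title, link, view
    # count, upload date, and any transcript text found under "#segments-container").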
    YOUTUBE_SCHEMA = {
        "name": "YouTubeVideoData",
        "baseSelector": "ytd-rich-item-renderer",
        "fields": [
            {"name": "title", "selector": "#video-title", "type": "text"},
            {"name": "url", "selector": "#video-title", "type": "link"},
            {"name": "views", "selector": "#metadata-line span:first-child", "type": "text"},
            {"name": "upload_date", "selector": "#metadata-line span:last-child", "type": "text"},
            {"name": "transcript", "selector": "#segments-container", "type": "text"}
        ]
    }

    def __init__(self):
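        # Single headless, stealth-mode Chromium session shared by all scrape calls.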
        self.crawler = AsyncWebCrawler(
            headless=True,
            browser="chromium",
            stealth=True,
            timeout=60
        )

    async def scrape_channel(self, url: str, start_date: str, end_date: str, max_videos: int = 10):
        """Scrape and process YouTube channel content"""
        try:
            logger.info(f"Scraping channel: {url}")
            result = await self.crawler.arun(
                url=url,
                extraction_strategy=JsonCssExtractionStrategy(self.YOUTUBE_SCHEMA),
                wait_for_selector="#video-title"
            )
            # The extraction strategy returns its records as a JSON string in
            # result.extracted_content; decode it before filtering.
            raw_data = json.loads(result.extracted_content) if result.extracted_content else []
            return self._process_results(raw_data, start_date, end_date, max_videos)
        except Exception as e:
            logger.error(f"Failed to scrape {url}: {str(e)}")
            return []

    def _process_results(self, raw_data: List[Dict], start_date: str, end_date: str, max_videos: int):
        """Process and filter scraped data"""
        processed = []
        date_format = "%b %d, %Y"  # e.g. "Jan 02, 2024"
        # The date window is fixed for the whole batch, so parse its bounds once.
        start = datetime.strptime(start_date, "%Y-%m-%d")
        end = datetime.strptime(end_date, "%Y-%m-%d")

        for item in raw_data[:max_videos]:
            try:
                if not item.get("url"):
                    continue

                upload_date = datetime.strptime(item["upload_date"], date_format)
                if not (start <= upload_date <= end):
                    continue
                    
                processed.append({
                    "id": self._extract_video_id(item["url"]),
                    "title": item.get("title", "Untitled"),
                    "url": f"https://youtube.com{item['url']}",
                    "views": self._parse_views(item.get("views", "0")),
                    "upload_date": upload_date.strftime("%Y-%m-%d"),
                    "transcript": self._process_transcript(item.get("transcript", ""))
                })
            except Exception as e:
                logger.warning(f"Skipping invalid video data: {str(e)}")
        
        return processed

    @staticmethod
    def _parse_views(views_str: str) -> int:
        """Convert a view-count string (e.g. "1,234 views" or "1.2M views") to an integer."""
        if not views_str:
            return 0
        token = views_str.split()[0].replace(",", "").upper()
        scale = {"K": 1e3, "M": 1e6, "B": 1e9}.get(token[-1], 1)
        return int(float(re.sub(r"[^\d.]", "", token) or 0) * scale)

    @staticmethod
    def _process_transcript(raw: str) -> List[Dict]:
        """Split raw transcript text into segments with synthetic 5-second timestamps
        (the scraped text carries no real timing information)."""
        return [{
            "start": i*5,
            "end": (i+1)*5,
            "text": line.strip()
        } for i, line in enumerate(raw.split("\n") if raw else [])]

    @staticmethod
    def _extract_video_id(url: str) -> Optional[str]:
        """Extract YouTube video ID from URL"""
        match = re.search(r"v=([a-zA-Z0-9_-]{11})", url)
        return match.group(1) if match else None

async def scrape_multiple_channels(urls: List[str], start_date: str, end_date: str, num_videos: int = 10):
    """Scrape multiple YouTube channels with progress tracking"""
    scraper = YouTubeScraper()
    tasks = [scraper.scrape_channel(url, start_date, end_date, num_videos) for url in urls]
    
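    # as_completed yields results in completion order, so the returned lists are not
    # guaranteed to match the order of the input URLs.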
    with tqdm(total=len(tasks), desc="Processing Channels") as pbar:
        results = []
        for future in asyncio.as_completed(tasks):
            results.append(await future)
            pbar.update(1)
    
    return results
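
# Minimal usage sketch: run the scraper end-to-end. The channel URL, date range,
# and video count below are placeholders; substitute real values before running.
if __name__ == "__main__":
    example_channels = ["https://www.youtube.com/@example/videos"]  # hypothetical channel
    channel_results = asyncio.run(
        scrape_multiple_channels(example_channels, "2024-01-01", "2024-06-30", num_videos=5)
    )
    for videos in channel_results:
        for video in videos:
            logger.info(f"{video['upload_date']}  {video['views']:>10}  {video['title']}")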