fixed getting download clip
- ttv_web_scraper.py  +200 -237
- video_utils.py      +6 -4

ttv_web_scraper.py
CHANGED
@@ -5,33 +5,52 @@ import os
 import traceback
 from pyppeteer import launch
 from bs4 import BeautifulSoup, NavigableString
-import hashlib
 from ai_config_faiss import get_ai_assistant
 from video_utils import generate_clips
+from typing import Dict, List, Set, Optional
+from dataclasses import dataclass, asdict
+import logging

+# Set the TOKENIZERS_PARALLELISM environment variable
+os.environ["TOKENIZERS_PARALLELISM"] = "false"

+# Set up logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)

+CACHE_DIR = "cache/"
 DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
 SUBJECTS = [
-    " 5G ",
-    " TechCo ",
+    " 5G ", " AI ", " Innovation ", " Network ", " Enterprise ", " Open RAN ",
+    " TechCo ", " B2B ", " API ", " Infrastructure ", " Connectivity "
 ]

+os.makedirs(CACHE_DIR, exist_ok=True)
+
+
+@dataclass
+class TranscriptSegment:
+    metadata: Dict[str, Optional[str]]
+    text: str
+
+
+@dataclass
+class VideoInfo:
+    metadata: Dict[str, Optional[str]]
+    transcript: List[TranscriptSegment]
+
+
+async def get_client_rendered_content(url: str) -> str:
     browser = None
     try:
         browser = await launch()
         page = await browser.newPage()
         await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
         await asyncio.sleep(5)
-        return content
+        return await page.content()
     except Exception as e:
+        logger.error(f"Error fetching content for {url}: {str(e)}")
+        raise
     finally:
         if browser:
             await browser.close()
@@ -47,7 +66,7 @@ def extract_text_with_br(element):
     return ''.join(result).strip()


-def extract_info(html_content):
+def extract_info(html_content: str) -> VideoInfo:
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
         title = soup.title.string.strip() if soup.title else None
@@ -55,289 +74,233 @@ def extract_info(html_content):
         date = date_elem.find('span', class_='ng-binding').text.strip() if date_elem else None
         youtube_iframe = soup.find('iframe', src=lambda x: x and 'youtube.com' in x)
         youtube_url = youtube_iframe['src'] if youtube_iframe else None
-        youtube_id = None
-        if youtube_url:
-            match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
-            if match:
-                youtube_id = match.group(1)
+        youtube_id = re.search(r'youtube\.com/embed/([^?]+)', youtube_url).group(1) if youtube_url else None
         transcript_elem = soup.find(id='transcript0')
         transcript = extract_text_with_br(transcript_elem) if transcript_elem else None
-            },
-            'transcript': transcript
-        }
+
+        return VideoInfo(
+            metadata={'title': title, 'date': date, 'youtube_id': youtube_id},
+            transcript=parse_transcript(transcript) if transcript else []
+        )
     except Exception as e:
+        logger.error(f"Error extracting information: {str(e)}")
+        raise


+def read_file(filename: str) -> Optional[str]:
     try:
         if os.path.exists(filename):
             with open(filename, 'r', encoding='utf-8') as f:
                 return f.read()
         return None
     except Exception as e:
+        logger.error(f"Error reading file {filename}: {str(e)}")
+        raise


-def extract_subject_info(text):
-    # Convert text to lowercase for case-insensitive matching
-    lower_text = text.lower()
-        subject for subject in SUBJECTS if subject.lower() in lower_text]
-    return found_subjects
+def extract_subject_info(text: str) -> List[str]:
+    return [subject for subject in SUBJECTS if subject.lower() in text.lower()]


-    except Exception as e:
-        raise Exception(f"Error extracting speaker info: {str(e)}")
+def extract_speaker_info(segment: str) -> Optional[Dict[str, Optional[str]]]:
+    pattern = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
+
+    match = re.match(pattern, segment)
+    return {key: value.strip() if value else None for key, value in match.groupdict().items()} if match else None


-        saved_info = None
-
-        pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'
-        segments = re.split(pattern, content)
-        segments = [segment.strip() for segment in segments if segment.strip()]
-
-        for i, segment in enumerate(segments):
-            speaker_info = extract_speaker_info(segment)
-            if speaker_info:
-                if speaker_info['speaker']:
-                    # Full speaker, company, timestamp format
-                    if saved_info:
-                        text = segments[i-1] if i > 0 else ""
-                        subjects = extract_subject_info(text)
-                        parsed_segments.append({
-                            'metadata': {
-                                'speaker': saved_info['speaker'],
-                                'company': saved_info['company'],
-                                'start_timestamp': saved_info['timestamp'],
-                                'end_timestamp': speaker_info['timestamp'],
-                                'subjects': subjects
-                            },
-                            'text': text
-                        })
-                    saved_info = speaker_info
-                else:
-                    # Standalone timestamp format
-                    if saved_info:
-                        text = segments[i-1] if i > 0 else ""
-                        subjects = extract_subject_info(text)
-                        parsed_segments.append({
-                            'metadata': {
-                                'speaker': saved_info['speaker'],
-                                'company': saved_info['company'],
-                                'start_timestamp': saved_info['timestamp'],
-                                'end_timestamp': speaker_info['timestamp'],
-                                'subjects': subjects
-                            },
-                            'text': text
-                        })
-                        saved_info['timestamp'] = speaker_info['timestamp']
-            elif saved_info:
-                # Text segment
-                continue
-
-        # Add final entry
-        if saved_info:
-            text = segments[-1]
-            subjects = extract_subject_info(text)
-            parsed_segments.append({
-                'metadata': {
-                    'speaker': saved_info['speaker'],
-                    'company': saved_info['company'],
-                    'start_timestamp': saved_info['timestamp'],
-                    'end_timestamp': "00:00:00",
-                    'subjects': subjects
-                },
-                'text': text
-            })
-
-        return parsed_segments
-    except Exception as e:
-        raise Exception(f"Error parsing transcript: {str(e)}")
+def parse_transcript(content: str) -> List[TranscriptSegment]:
+    parsed_segments = []
+    saved_info = None
+
+    segments = [segment.strip() for segment in re.split(r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)', content) if segment.strip()]
+
+    for i, segment in enumerate(segments):
+        speaker_info = extract_speaker_info(segment)
+        if speaker_info:
+            if speaker_info['speaker']:
+                if saved_info:
+                    text = segments[i-1] if i > 0 else ""
+                    parsed_segments.append(TranscriptSegment(
+                        metadata={
+                            'speaker': saved_info['speaker'],
+                            'company': saved_info['company'],
+                            'start_timestamp': saved_info['timestamp'],
+                            'end_timestamp': speaker_info['timestamp'],
+                            'subjects': extract_subject_info(text)
+                        },
+                        text=text
+                    ))
+                saved_info = speaker_info
+                if not saved_info['company']:
+                    saved_info['company'] = "Unknown"
+            else:
+                if saved_info:
+                    text = segments[i-1] if i > 0 else ""
+                    parsed_segments.append(TranscriptSegment(
+                        metadata={
+                            'speaker': saved_info['speaker'],
+                            'company': saved_info['company'],
+                            'start_timestamp': saved_info['timestamp'],
+                            'end_timestamp': speaker_info['timestamp'],
+                            'subjects': extract_subject_info(text)
+                        },
+                        text=text
+                    ))
+                    saved_info['timestamp'] = speaker_info['timestamp']
+        elif saved_info:
+            continue
+
+    if saved_info:
+        text = segments[-1]
+        parsed_segments.append(TranscriptSegment(
+            metadata={
+                'speaker': saved_info['speaker'],
+                'company': saved_info['company'],
+                'start_timestamp': saved_info['timestamp'],
+                'end_timestamp': "00:00:00",
+                'subjects': extract_subject_info(text)
+            },
+            text=text
+        ))
+
+    return parsed_segments


+def get_cached_filename(url: str) -> str:
+    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}"
+
+
-    cached_filename = get_cached_filename(url)
-    json_filename = f"{cached_filename}.json"
-    info = read_json_from_file(json_filename)
-
-    if info:
-        return info
-
-        content = await get_client_rendered_content(url)
-        with open(cached_filename, 'w', encoding='utf-8') as f:
-            f.write(content)
-    else:
-        print(f"Using cached content from file for {url}...")
-
-    info = extract_info(content)
-    transcript = info['transcript']
-    if (transcript):
-        info['transcript'] = parse_transcript(transcript)
-        generate_clips(CACHE_DIR, info)
-        with open(json_filename, 'w', encoding='utf-8') as f:
-            json.dump(info, f, ensure_ascii=False, indent=4)
-        print(f"Information extracted and saved to {json_filename}")
-    else:
-        print(f"No transcript found for {url}")
-    return info
-
-        return None
+async def process_url(url: str) -> Optional[VideoInfo]:
+    try:
+        cached_filename = get_cached_filename(url)
+        html_filename = f"{cached_filename}.html"
+        json_filename = f"{cached_filename}.json"
+
+        if os.path.exists(json_filename):
+            logger.info(f"Using cached JSON for {url}")
+            with open(json_filename, 'r', encoding='utf-8') as f:
+                data = json.load(f)
+            return VideoInfo(
+                metadata=data['metadata'],
+                transcript=[TranscriptSegment(**segment) for segment in data['transcript']]
+            )
+
+        if os.path.exists(html_filename):
+            logger.info(f"Using cached HTML for {url}")
+            content = read_file(html_filename)
+        else:
+            logger.info(f"Fetching content from web for {url}")
+            content = await get_client_rendered_content(url)
+            with open(html_filename, 'w', encoding='utf-8') as f:
+                f.write(content)
+
+        info = extract_info(content)
+
+        if info.transcript:
+            logger.info(f"Generating clips for {url}")
+            info_dict = asdict(info)
+            info_dict['transcript'] = generate_clips(CACHE_DIR, info_dict)
+            info = VideoInfo(
+                metadata=info_dict['metadata'],
+                transcript=[TranscriptSegment(**segment) for segment in info_dict['transcript']]
+            )
+
+            with open(json_filename, 'w', encoding='utf-8') as f:
+                json.dump(asdict(info), f, ensure_ascii=False, indent=4)
+
+            logger.info(f"Information extracted and saved to {json_filename}")
+        else:
+            logger.warning(f"No transcript found for {url}")
+
+        return info
+
+    except Exception:
+        logger.error(f"Error processing URL {url}:\n{traceback.format_exc()}")
+        return None


-    tasks = [process_url(url) for url in urls]
-    return await asyncio.gather(*tasks)
+async def process_urls(urls: List[str]) -> List[Optional[VideoInfo]]:
+    return await asyncio.gather(*[process_url(url) for url in urls])


-def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
-    metadata = {
-        'content_hashes': list(content_hashes),
-        'speakers': list(speakers),
-        'companies': {company: list(speakers) for company, speakers in companies.items()},
-        'sentiments': list(sentiments),
-        'subjects': list(subjects)
-    }
-
-    with open(DB_METADATA_FILE, 'w') as f:
-        json.dump(metadata, f, indent=2)
+def save_metadata_sets(processed_urls: Set[str], speakers: Set[str], companies: Dict[str, Set[str]], sentiments: Set[str], subjects: Set[str]):
+    metadata = {
+        'processed_urls': list(processed_urls),
+        'speakers': list(speakers),
+        'companies': {company: list(speakers) for company, speakers in companies.items()},
+        'sentiments': list(sentiments),
+        'subjects': list(subjects)
+    }
+
+    with open(DB_METADATA_FILE, 'w') as f:
+        json.dump(metadata, f, indent=2)


-def db_load_metadata_sets():
-    content_hashes = set()
-    speakers = set()
-    companies = {}
-    sentiments = set()
-    subjects = set()
-
-    if os.path.exists(DB_METADATA_FILE):
-        with open(DB_METADATA_FILE, 'r') as f:
-            metadata = json.load(f)
-
-        content_hashes = set(metadata.get('content_hashes', []))
-        speakers = set(metadata.get('speakers', []))
-        companies = {company: set(speakers) for company, speakers in metadata.get(
-            'companies', {}).items()}
-        sentiments = set(metadata.get('sentiments', []))
-        subjects = set(metadata.get('subjects', SUBJECTS))
-
-    return content_hashes, speakers, companies, sentiments, subjects
+def db_load_metadata_sets() -> tuple:
+    if os.path.exists(DB_METADATA_FILE):
+        with open(DB_METADATA_FILE, 'r') as f:
+            metadata = json.load(f)
+
+        return (
+            set(metadata.get('processed_urls', [])),
+            set(metadata.get('speakers', [])),
+            {company: set(speakers) for company, speakers in metadata.get('companies', {}).items()},
+            set(metadata.get('sentiments', [])),
+            set(metadata.get('subjects', SUBJECTS))
+        )
+
+    return set(), set(), {}, set(), set(SUBJECTS)


-def main():
-    global assistant
+async def main():
     assistant = get_ai_assistant()
-    url_file = "dsp-urls-one.txt"  # File containing list of URLs
+    url_file = "dsp-urls-one.txt"

     if not os.path.exists(url_file):
+        logger.error(f"Error: {url_file} not found.")
         return

-    # Convert companies to a dictionary of speaker sets if it's not already
-    if not isinstance(companies, dict):
-        companies = {company: set() for company in companies}
+    processed_urls, speakers, companies, sentiments, subjects = db_load_metadata_sets()

     with open(url_file, 'r') as f:
         urls = [line.strip() for line in f if line.strip()]

-        if filename_hash in content_hashes:
-            print(f"{url} already added")
+    total_urls = len(urls)
+    for i, url in enumerate(urls, 1):
+        if url in processed_urls:
+            logger.info(f"[{i}/{total_urls}] {url} already processed")
             continue

+        logger.info(f"[{i}/{total_urls}] Processing {url}")
+        info = await process_url(url)
         if info is None:
+            logger.warning(f"[{i}/{total_urls}] Failed to process {url}")
             continue

-            company = metadata['company']
-            speaker = metadata['speaker']
-            entry_subjects = metadata['subjects']
-
-            speakers.add(speaker)
-            # Add new subjects to the master set
+        for entry in info.transcript:
+            metadata = {**info.metadata, **entry.metadata}
+            company = metadata.get('company')
+            speaker = metadata.get('speaker')
+            entry_subjects = metadata.get('subjects', [])
+
+            if speaker:
+                speakers.add(speaker)
             subjects.update(entry_subjects)

-            assistant.add_to_knowledge_base(
-                text, data_type='text', metadata=metadata.copy())
+            assistant.add_to_knowledge_base(entry.text, data_type='text', metadata=metadata.copy())

-                companies[company].add(speaker)
+            if company and speaker:
+                companies.setdefault(company, set()).add(speaker)

-            # Save updated hashes and metadata
-    save_metadata_sets(content_hashes, speakers,
-                       companies, sentiments, subjects)
+        processed_urls.add(url)
+        logger.info(f"[{i}/{total_urls}] Added new url: {url}")
+
+    save_metadata_sets(processed_urls, speakers, companies, sentiments, subjects)
     assistant.save()

+    logger.info("Processing complete. Check logs for any errors.")
+

 if __name__ == "__main__":
-    main()
+    asyncio.run(main())
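
A minimal driver sketch for exercising the refactored entry points outside of main(). It assumes only what the diff above introduces (process_urls returning VideoInfo dataclasses with TranscriptSegment entries) plus that this file is importable as ttv_web_scraper; the URL is a placeholder, not a real page.

import asyncio

from ttv_web_scraper import process_urls  # assumes ttv_web_scraper.py is on the import path


async def demo():
    # Placeholder URL; substitute a page that embeds a YouTube iframe and a #transcript0 element.
    urls = ["https://example.com/some-video-page"]
    results = await process_urls(urls)
    for info in results:
        if info is None:
            continue  # process_url logs the traceback and returns None on failure
        print(info.metadata.get('title'), info.metadata.get('youtube_id'))
        for segment in info.transcript:
            print(segment.metadata.get('speaker'), segment.metadata.get('start_timestamp'))


if __name__ == "__main__":
    asyncio.run(demo())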
video_utils.py
CHANGED
@@ -111,9 +111,9 @@ def main():
 def generate_clips(cache_dir, info):
     yt_id = info['metadata']['youtube_id']
     download_file = get_youtube_video(cache_dir, yt_id)
+    transcript = info['transcript']

     if download_file:
-        transcript = info['transcript']
         video = VideoFileClip(download_file)

         for entry in transcript:
@@ -127,9 +127,6 @@ def generate_clips(cache_dir, info):
             end_time = min(video.duration, end_time +
                            1) if end_time != 0 else video.duration

-            # Create clip
-            clip = video.subclip(start_time, end_time)
-
             # Generate output filename
             output_filename = (
                 f"{CLIP_DIR}{yt_id}-"
@@ -140,6 +137,9 @@ def generate_clips(cache_dir, info):

             if os.path.exists(output_filename):
                 continue
+
+            # Create clip
+            clip = video.subclip(start_time, end_time)

             # Write the clip to a file
             clip.write_videofile(
@@ -151,6 +151,8 @@ def generate_clips(cache_dir, info):
         video.close()
     else:
         print(f"Failed to download video for YouTube ID: {yt_id}")
+
+    return transcript


 if __name__ == "__main__":
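
The behavioural change here: the transcript is read before the download check, the subclip is only created after confirming the output file does not already exist, and generate_clips now returns the transcript so process_url can assign it back into info_dict['transcript']. A standalone sketch of that contract, assuming a previously cached JSON file; the path below is a placeholder, not a file this commit creates.

import json

from video_utils import generate_clips

# Placeholder path; real cache files are written by process_url() as "<cached name>.json".
with open("cache/cached_example.json", "r", encoding="utf-8") as f:
    info = json.load(f)  # expected shape: {'metadata': {'youtube_id': ...}, 'transcript': [...]}

# Returns the transcript list whether or not the video download succeeds.
transcript = generate_clips("cache/", info)
print(f"{len(transcript)} transcript segments handled")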