fixed speaker parsing error

ttv_web_scraper.py  +33 -26  CHANGED
@@ -2,14 +2,14 @@ import re
 import asyncio
 import json
 import os
-import gc
 import traceback
 from pyppeteer import launch
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 import hashlib
 from ai_config_faiss import get_ai_assistant
 from video_utils import generate_clips

+
 CACHE_DIR = "cache/"
 if not os.path.exists(CACHE_DIR):
     os.makedirs(CACHE_DIR)
@@ -37,15 +37,23 @@ async def get_client_rendered_content(url):
         await browser.close()


+def extract_text_with_br(element):
+    result = ['<br><br>']
+    for child in element.descendants:
+        if isinstance(child, NavigableString):
+            result.append(child.strip())
+        elif child.name == 'br':
+            result.append('<br>')
+    return ''.join(result).strip()
+
+
 def extract_info(html_content):
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
         title = soup.title.string.strip() if soup.title else None
         date_elem = soup.find('p', class_='content-date')
-        date = date_elem.find(
-            'span', class_='ng-binding').text.strip() if date_elem else None
-        youtube_iframe = soup.find(
-            'iframe', src=lambda x: x and 'youtube.com' in x)
+        date = date_elem.find('span', class_='ng-binding').text.strip() if date_elem else None
+        youtube_iframe = soup.find('iframe', src=lambda x: x and 'youtube.com' in x)
         youtube_url = youtube_iframe['src'] if youtube_iframe else None
         youtube_id = None
         if youtube_url:
@@ -53,8 +61,7 @@ def extract_info(html_content):
             if match:
                 youtube_id = match.group(1)
         transcript_elem = soup.find(id='transcript0')
-        transcript = transcript_elem.get_text(
-            strip=True) if transcript_elem else None
+        transcript = extract_text_with_br(transcript_elem) if transcript_elem else None
         return {
             'metadata': {
                 'title': title,
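The new extract_text_with_br helper flattens the transcript element into a single string while keeping the <br> tags as literal markers, and it seeds the result with a leading <br><br> so the first speaker header carries the same delimiter as every later one. A minimal sketch of what extract_info now stores as the transcript, using an invented fragment (only the id='transcript0' container comes from the scraped page):

from bs4 import BeautifulSoup, NavigableString

def extract_text_with_br(element):
    # Copied from the new helper in this commit: keep <br> tags as markers.
    result = ['<br><br>']
    for child in element.descendants:
        if isinstance(child, NavigableString):
            result.append(child.strip())
        elif child.name == 'br':
            result.append('<br>')
    return ''.join(result).strip()

# Invented sample fragment; the real page only guarantees the transcript0 id.
html = ('<div id="transcript0">Jane Doe, Acme Corp (00:00:05):<br>'
        'Hello and welcome.<br><br>'
        'John Smith (00:00:12):<br>Thanks for having me.</div>')
elem = BeautifulSoup(html, 'html.parser').find(id='transcript0')
print(extract_text_with_br(elem))
# <br><br>Jane Doe, Acme Corp (00:00:05):<br>Hello and welcome.<br><br>John Smith (00:00:12):<br>Thanks for having me.

Keeping the <br> markers is what lets parse_transcript and the new PATTERN further down split the transcript on structure instead of guessing at name shapes.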
@@ -89,22 +96,6 @@ def read_json_from_file(filename):
         raise Exception(f"Error reading file {filename}: {str(e)}")


-def extract_speaker_info(segment):
-    try:
-        pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
-        match = re.match(pattern, segment)
-        if match:
-            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
-        else:
-            timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
-            timestamp_match = re.match(timestamp_pattern, segment)
-            if timestamp_match:
-                return {'speaker': None, 'company': None, 'timestamp': timestamp_match.group('timestamp')}
-        return None
-    except Exception as e:
-        raise Exception(f"Error extracting speaker info: {str(e)}")
-
-
 def extract_subject_info(text):
     # Convert text to lowercase for case-insensitive matching
     lower_text = text.lower()
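For context on the commit title: the removed extract_speaker_info only matched headers shaped like 'Firstname Lastname, Company (HH:MM:SS):', so a header with a speaker but no company failed the name pattern and also failed the timestamp-only fallback (re.match anchors at the start of the segment, which does not begin with '('), and the function returned None. A quick check against the removed patterns, with an invented header:

import re

# Both patterns are copied verbatim from the removed function.
old_pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
fallback = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'

segment = 'Jane Doe (00:00:05):'       # invented header: speaker, no company
print(re.match(old_pattern, segment))  # None, the pattern insists on ", Company"
print(re.match(fallback, segment))     # None, the fallback only fires when the segment starts with "("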
@@ -116,11 +107,27 @@ def extract_subject_info(text):
     return found_subjects


+PATTERN = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
+
+
+def extract_speaker_info(segment):
+    try:
+        match = re.match(PATTERN, segment)
+        if match:
+            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
+        else:
+            return None
+
+    except Exception as e:
+        raise Exception(f"Error extracting speaker info: {str(e)}")
+
+
 def parse_transcript(content):
     try:
         parsed_segments = []
         saved_info = None
-
+
+        pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'
         segments = re.split(pattern, content)
         segments = [segment.strip() for segment in segments if segment.strip()]
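With the transcript flattened by extract_text_with_br, parse_transcript can now split on structure: the capture group in its pattern makes re.split keep each '<br><br>… (timestamp):<br>' header as its own segment, and the module-level PATTERN then pulls speaker, optional company, and timestamp out of that header. A rough sketch of the two steps on an invented transcript string, using the patterns from the diff above:

import re

# Copied from the diff: header matcher and split pattern.
PATTERN = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
split_pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'

content = ('<br><br>Jane Doe, Acme Corp (00:00:05):<br>Hello and welcome.'
           '<br><br>John Smith (00:00:12):<br>Thanks for having me.')

# The capture group keeps the matched headers in the result list.
segments = [s.strip() for s in re.split(split_pattern, content) if s.strip()]
# ['<br><br>Jane Doe, Acme Corp (00:00:05):<br>', 'Hello and welcome.',
#  '<br><br>John Smith (00:00:12):<br>', 'Thanks for having me.']

for segment in segments:
    match = re.match(PATTERN, segment)
    if match:
        print({k: v.strip() if v else None for k, v in match.groupdict().items()})
# {'speaker': 'Jane Doe', 'company': 'Acme Corp', 'timestamp': '00:00:05'}
# {'speaker': 'John Smith', 'company': None, 'timestamp': '00:00:12'}

Headers without a company now come back with company=None instead of failing the match entirely, which appears to be the speaker parsing error the commit title refers to.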
@@ -234,7 +241,7 @@ def main():
     global assistant
     assistant = get_ai_assistant()

-    url_file = "dsp-urls.txt"  # File containing list of URLs
+    url_file = "dsp-urls-one.txt"  # File containing list of URLs

     if not os.path.exists(url_file):
         print(f"Error: {url_file} not found.")