fixed speaker parsing error

ttv_web_scraper.py  +33 -26  CHANGED
@@ -2,14 +2,14 @@ import re
 import asyncio
 import json
 import os
-import gc
 import traceback
 from pyppeteer import launch
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 import hashlib
 from ai_config_faiss import get_ai_assistant
 from video_utils import generate_clips

+
 CACHE_DIR = "cache/"
 if not os.path.exists(CACHE_DIR):
     os.makedirs(CACHE_DIR)
@@ -37,15 +37,23 @@ async def get_client_rendered_content(url):
         await browser.close()


+def extract_text_with_br(element):
+    result = ['<br><br>']
+    for child in element.descendants:
+        if isinstance(child, NavigableString):
+            result.append(child.strip())
+        elif child.name == 'br':
+            result.append('<br>')
+    return ''.join(result).strip()
+
+
 def extract_info(html_content):
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
         title = soup.title.string.strip() if soup.title else None
         date_elem = soup.find('p', class_='content-date')
-        date = date_elem.find(
-            'span', class_='ng-binding').text.strip() if date_elem else None
-        youtube_iframe = soup.find(
-            'iframe', src=lambda x: x and 'youtube.com' in x)
+        date = date_elem.find('span', class_='ng-binding').text.strip() if date_elem else None
+        youtube_iframe = soup.find('iframe', src=lambda x: x and 'youtube.com' in x)
         youtube_url = youtube_iframe['src'] if youtube_iframe else None
         youtube_id = None
         if youtube_url:
@@ -53,8 +61,7 @@ def extract_info(html_content):
             if match:
                 youtube_id = match.group(1)
         transcript_elem = soup.find(id='transcript0')
-        transcript = transcript_elem.get_text(
-            strip=True) if transcript_elem else None
+        transcript = extract_text_with_br(transcript_elem) if transcript_elem else None
         return {
             'metadata': {
                 'title': title,
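The new extract_text_with_br helper flattens the transcript element into a single string while keeping the <br> tags as literal markers, and it seeds the result with a leading <br><br> so the first speaker header carries the same delimiter as every later one. A minimal sketch of what extract_info now stores as the transcript, using an invented fragment (only the id='transcript0' container comes from the scraped page):

from bs4 import BeautifulSoup, NavigableString

def extract_text_with_br(element):
    # Copied from the new helper in this commit: keep <br> tags as markers.
    result = ['<br><br>']
    for child in element.descendants:
        if isinstance(child, NavigableString):
            result.append(child.strip())
        elif child.name == 'br':
            result.append('<br>')
    return ''.join(result).strip()

# Invented sample fragment; the real page only guarantees the transcript0 id.
html = ('<div id="transcript0">Jane Doe, Acme Corp (00:00:05):<br>'
        'Hello and welcome.<br><br>'
        'John Smith (00:00:12):<br>Thanks for having me.</div>')
elem = BeautifulSoup(html, 'html.parser').find(id='transcript0')
print(extract_text_with_br(elem))
# <br><br>Jane Doe, Acme Corp (00:00:05):<br>Hello and welcome.<br><br>John Smith (00:00:12):<br>Thanks for having me.

Keeping the <br> markers is what lets parse_transcript and the new PATTERN further down split the transcript on structure instead of guessing at name shapes.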
@@ -89,22 +96,6 @@ def read_json_from_file(filename):
         raise Exception(f"Error reading file {filename}: {str(e)}")


-def extract_speaker_info(segment):
-    try:
-        pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
-        match = re.match(pattern, segment)
-        if match:
-            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
-        else:
-            timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
-            timestamp_match = re.match(timestamp_pattern, segment)
-            if timestamp_match:
-                return {'speaker': None, 'company': None, 'timestamp': timestamp_match.group('timestamp')}
-        return None
-    except Exception as e:
-        raise Exception(f"Error extracting speaker info: {str(e)}")
-
-
 def extract_subject_info(text):
     # Convert text to lowercase for case-insensitive matching
     lower_text = text.lower()
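For context on the commit title: the removed extract_speaker_info only matched headers shaped like 'Firstname Lastname, Company (HH:MM:SS):', so a header with a speaker but no company failed the name pattern and also failed the timestamp-only fallback (re.match anchors at the start of the segment, which does not begin with '('), and the function returned None. A quick check against the removed patterns, with an invented header:

import re

# Both patterns are copied verbatim from the removed function.
old_pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
fallback = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'

segment = 'Jane Doe (00:00:05):'       # invented header: speaker, no company
print(re.match(old_pattern, segment))  # None, the pattern insists on ", Company"
print(re.match(fallback, segment))     # None, the fallback only fires when the segment starts with "("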
@@ -116,11 +107,27 @@ def extract_subject_info(text):
     return found_subjects


+PATTERN = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
+
+
+def extract_speaker_info(segment):
+    try:
+        match = re.match(PATTERN, segment)
+        if match:
+            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
+        else:
+            return None
+
+    except Exception as e:
+        raise Exception(f"Error extracting speaker info: {str(e)}")
+
+
 def parse_transcript(content):
     try:
         parsed_segments = []
         saved_info = None
-
+
+        pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'
         segments = re.split(pattern, content)
         segments = [segment.strip() for segment in segments if segment.strip()]
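With the transcript flattened by extract_text_with_br, parse_transcript can now split on structure: the capture group in its pattern makes re.split keep each '<br><br>… (timestamp):<br>' header as its own segment, and the module-level PATTERN then pulls speaker, optional company, and timestamp out of that header. A rough sketch of the two steps on an invented transcript string, using the patterns from the diff above:

import re

# Copied from the diff: header matcher and split pattern.
PATTERN = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
split_pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'

content = ('<br><br>Jane Doe, Acme Corp (00:00:05):<br>Hello and welcome.'
           '<br><br>John Smith (00:00:12):<br>Thanks for having me.')

# The capture group keeps the matched headers in the result list.
segments = [s.strip() for s in re.split(split_pattern, content) if s.strip()]
# ['<br><br>Jane Doe, Acme Corp (00:00:05):<br>', 'Hello and welcome.',
#  '<br><br>John Smith (00:00:12):<br>', 'Thanks for having me.']

for segment in segments:
    match = re.match(PATTERN, segment)
    if match:
        print({k: v.strip() if v else None for k, v in match.groupdict().items()})
# {'speaker': 'Jane Doe', 'company': 'Acme Corp', 'timestamp': '00:00:05'}
# {'speaker': 'John Smith', 'company': None, 'timestamp': '00:00:12'}

Headers without a company now come back with company=None instead of failing the match entirely, which appears to be the speaker parsing error the commit title refers to.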
@@ -234,7 +241,7 @@ def main():
     global assistant
     assistant = get_ai_assistant()

-    url_file = "dsp-urls.txt"  # File containing list of URLs
+    url_file = "dsp-urls-one.txt"  # File containing list of URLs

     if not os.path.exists(url_file):
         print(f"Error: {url_file} not found.")