eusholli committed
Commit 47681bb · 1 Parent(s): 21ed5c5

fixed speaker parsing error

Files changed (1)
  1. ttv_web_scraper.py +33 -26
ttv_web_scraper.py CHANGED
@@ -2,14 +2,14 @@ import re
 import asyncio
 import json
 import os
-import gc
 import traceback
 from pyppeteer import launch
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 import hashlib
 from ai_config_faiss import get_ai_assistant
 from video_utils import generate_clips

+
 CACHE_DIR = "cache/"
 if not os.path.exists(CACHE_DIR):
     os.makedirs(CACHE_DIR)
@@ -37,15 +37,23 @@ async def get_client_rendered_content(url):
     await browser.close()


+def extract_text_with_br(element):
+    result = ['<br><br>']
+    for child in element.descendants:
+        if isinstance(child, NavigableString):
+            result.append(child.strip())
+        elif child.name == 'br':
+            result.append('<br>')
+    return ''.join(result).strip()
+
+
 def extract_info(html_content):
     try:
         soup = BeautifulSoup(html_content, 'html.parser')
         title = soup.title.string.strip() if soup.title else None
         date_elem = soup.find('p', class_='content-date')
-        date = date_elem.find(
-            'span', class_='ng-binding').text.strip() if date_elem else None
-        youtube_iframe = soup.find(
-            'iframe', src=lambda x: x and 'youtube.com' in x)
+        date = date_elem.find('span', class_='ng-binding').text.strip() if date_elem else None
+        youtube_iframe = soup.find('iframe', src=lambda x: x and 'youtube.com' in x)
         youtube_url = youtube_iframe['src'] if youtube_iframe else None
         youtube_id = None
         if youtube_url:
@@ -53,8 +61,7 @@ def extract_info(html_content):
         if match:
             youtube_id = match.group(1)
         transcript_elem = soup.find(id='transcript0')
-        transcript = transcript_elem.get_text(
-            strip=True) if transcript_elem else None
+        transcript = extract_text_with_br(transcript_elem) if transcript_elem else None
         return {
             'metadata': {
                 'title': title,
@@ -89,22 +96,6 @@ def read_json_from_file(filename):
         raise Exception(f"Error reading file {filename}: {str(e)}")


-def extract_speaker_info(segment):
-    try:
-        pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
-        match = re.match(pattern, segment)
-        if match:
-            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
-        else:
-            timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
-            timestamp_match = re.match(timestamp_pattern, segment)
-            if timestamp_match:
-                return {'speaker': None, 'company': None, 'timestamp': timestamp_match.group('timestamp')}
-        return None
-    except Exception as e:
-        raise Exception(f"Error extracting speaker info: {str(e)}")
-
-
 def extract_subject_info(text):
     # Convert text to lowercase for case-insensitive matching
     lower_text = text.lower()
@@ -116,11 +107,27 @@ def extract_subject_info(text):
     return found_subjects


+PATTERN = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'
+
+
+def extract_speaker_info(segment):
+    try:
+        match = re.match(PATTERN, segment)
+        if match:
+            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
+        else:
+            return None
+
+    except Exception as e:
+        raise Exception(f"Error extracting speaker info: {str(e)}")
+
+
 def parse_transcript(content):
     try:
         parsed_segments = []
         saved_info = None
-        pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
+
+        pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'
         segments = re.split(pattern, content)
         segments = [segment.strip() for segment in segments if segment.strip()]

@@ -234,7 +241,7 @@ def main():
     global assistant
     assistant = get_ai_assistant()

-    url_file = "dsp-urls.txt"  # File containing list of URLs
+    url_file = "dsp-urls-one.txt"  # File containing list of URLs

     if not os.path.exists(url_file):
         print(f"Error: {url_file} not found.")
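The fix replaces name-shape heuristics with explicit markers: the new extract_text_with_br helper flattens the transcript element but keeps its <br> tags as literal text, so later parsing can anchor on them. A minimal standalone sketch of that behavior, using an invented transcript snippet (the real input is the page's transcript0 element):

from bs4 import BeautifulSoup, NavigableString

def extract_text_with_br(element):
    # Seed with <br><br> so the first segment carries the same
    # delimiter as every later speaker header.
    result = ['<br><br>']
    for child in element.descendants:
        if isinstance(child, NavigableString):
            result.append(child.strip())
        elif child.name == 'br':
            result.append('<br>')
    return ''.join(result).strip()

# Hypothetical markup for illustration only.
html = "<div id='transcript0'>Jane Doe, Acme Corp (00:01:15):<br>Hello everyone.</div>"
elem = BeautifulSoup(html, 'html.parser').find(id='transcript0')
print(extract_text_with_br(elem))
# -> <br><br>Jane Doe, Acme Corp (00:01:15):<br>Hello everyone.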
 
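Why this presumably fixes the speaker parsing error: the old inline pattern only matched two to four capitalized words followed by an alphanumeric company, so headers with single-word or all-caps names, or companies containing punctuation, fell through. The new PATTERN accepts any non-comma, non-parenthesis speaker text between a <br><br> delimiter and the timestamp. A rough end-to-end sketch of the new split-and-match flow, with an invented two-segment transcript:

import re

PATTERN = r'<br><br>(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):<br>'

content = ('<br><br>Jane Doe, Acme Corp (00:01:15):<br>Hello everyone.'
           '<br><br>(00:02:30):<br>A reply with no speaker header.')

# Split on the same <br>-anchored headers parse_transcript uses; the
# capturing group keeps each header in the output list.
pattern = r'(<br><br>.*?\((?:\d{2}:)?\d{2}:\d{2}\):<br>)'
segments = [s.strip() for s in re.split(pattern, content) if s.strip()]

for seg in segments:
    m = re.match(PATTERN, seg)
    if m:
        print({k: (v.strip() if v else None) for k, v in m.groupdict().items()})
    else:
        print('text:', seg)

# Expected output (speaker/company are None for the headerless segment):
# {'speaker': 'Jane Doe', 'company': 'Acme Corp', 'timestamp': '00:01:15'}
# text: Hello everyone.
# {'speaker': None, 'company': None, 'timestamp': '00:02:30'}
# text: A reply with no speaker header.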