import re
import asyncio
import json
import os
import gc
import traceback
import hashlib

from pyppeteer import launch
from bs4 import BeautifulSoup

from ai_config_faiss import get_ai_assistant
from video_utils import generate_clips

CACHE_DIR = "cache/"
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)

DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")

SUBJECTS = [
    " 5G ",
    " AI ",
    " Innovation ",
    " Network ",
    " Enterprise ",
    " Open RAN ",
    " TechCo ",
    " B2B ",
    " API ",
    " Infrastructure ",
    " Connectivity ",
]


async def get_client_rendered_content(url):
    browser = None
    try:
        browser = await launch()
        page = await browser.newPage()
        await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
        await asyncio.sleep(5)
        content = await page.content()
        return content
    except Exception as e:
        raise Exception(f"Error fetching content: {str(e)}")
    finally:
        if browser:
            await browser.close()


def extract_info(html_content):
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.string.strip() if soup.title else None
        date_elem = soup.find('p', class_='content-date')
        date = date_elem.find(
            'span', class_='ng-binding').text.strip() if date_elem else None
        youtube_iframe = soup.find(
            'iframe', src=lambda x: x and 'youtube.com' in x)
        youtube_url = youtube_iframe['src'] if youtube_iframe else None
        youtube_id = None
        if youtube_url:
            match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
            if match:
                youtube_id = match.group(1)
        transcript_elem = soup.find(id='transcript0')
        transcript = transcript_elem.get_text(
            strip=True) if transcript_elem else None
        return {
            'metadata': {
                'title': title,
                'date': date,
                'youtube_id': youtube_id,
            },
            'transcript': transcript
        }
    except Exception as e:
        raise Exception(f"Error extracting information: {str(e)}")


def read_html_from_file(filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return f.read()
        return None
    except Exception as e:
        raise Exception(f"Error reading file {filename}: {str(e)}")


def read_json_from_file(filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None
    except json.JSONDecodeError as e:
        raise Exception(f"Error decoding JSON in file {filename}: {str(e)}")
    except Exception as e:
        raise Exception(f"Error reading file {filename}: {str(e)}")


def extract_speaker_info(segment):
    try:
        # Full header: "Firstname Lastname, Company (HH:MM:SS):"
        pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
        match = re.match(pattern, segment)
        if match:
            return {key: value.strip() if value else None
                    for key, value in match.groupdict().items()}
        # Standalone timestamp header: "(HH:MM:SS):"
        timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
        timestamp_match = re.match(timestamp_pattern, segment)
        if timestamp_match:
            return {'speaker': None, 'company': None,
                    'timestamp': timestamp_match.group('timestamp')}
        return None
    except Exception as e:
        raise Exception(f"Error extracting speaker info: {str(e)}")


def extract_subject_info(text):
    # Convert text to lowercase for case-insensitive matching
    lower_text = text.lower()
    # Find all subjects present in the text
    found_subjects = [
        subject for subject in SUBJECTS if subject.lower() in lower_text]
    return found_subjects
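

# Illustrative sketch of the transcript header formats that extract_speaker_info()
# above and parse_transcript() below are written to match. The name, company and
# timestamps are hypothetical, not taken from any real transcript:
#
#   Jane Doe, ExampleCorp (00:01:23):   -> speaker, company, start timestamp
#   (00:05:47):                         -> standalone timestamp, same speaker
#
# extract_speaker_info() returns a dict with 'speaker', 'company' and 'timestamp'
# for the first form, and 'speaker'/'company' set to None for the second.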


def parse_transcript(content):
    try:
        parsed_segments = []
        saved_info = None
        # Split on speaker headers and standalone timestamps, keeping the delimiters.
        pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
        segments = re.split(pattern, content)
        segments = [segment.strip() for segment in segments if segment.strip()]

        for i, segment in enumerate(segments):
            speaker_info = extract_speaker_info(segment)
            if speaker_info:
                if speaker_info['speaker']:
                    # Full speaker, company, timestamp format
                    if saved_info:
                        text = segments[i - 1] if i > 0 else ""
                        subjects = extract_subject_info(text)
                        parsed_segments.append({
                            'metadata': {
                                'speaker': saved_info['speaker'],
                                'company': saved_info['company'],
                                'start_timestamp': saved_info['timestamp'],
                                'end_timestamp': speaker_info['timestamp'],
                                'subjects': subjects
                            },
                            'text': text
                        })
                    saved_info = speaker_info
                else:
                    # Standalone timestamp format
                    if saved_info:
                        text = segments[i - 1] if i > 0 else ""
                        subjects = extract_subject_info(text)
                        parsed_segments.append({
                            'metadata': {
                                'speaker': saved_info['speaker'],
                                'company': saved_info['company'],
                                'start_timestamp': saved_info['timestamp'],
                                'end_timestamp': speaker_info['timestamp'],
                                'subjects': subjects
                            },
                            'text': text
                        })
                        saved_info['timestamp'] = speaker_info['timestamp']
            elif saved_info:
                # Text segment
                continue

        # Add final entry
        if saved_info:
            text = segments[-1]
            subjects = extract_subject_info(text)
            parsed_segments.append({
                'metadata': {
                    'speaker': saved_info['speaker'],
                    'company': saved_info['company'],
                    'start_timestamp': saved_info['timestamp'],
                    'end_timestamp': "00:00:00",
                    'subjects': subjects
                },
                'text': text
            })

        return parsed_segments
    except Exception as e:
        raise Exception(f"Error parsing transcript: {str(e)}")


def get_cached_filename(url):
    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}.html"


async def process_url(url):
    try:
        cached_filename = get_cached_filename(url)
        json_filename = f"{cached_filename}.json"

        info = read_json_from_file(json_filename)
        if info:
            return info

        content = read_html_from_file(cached_filename)
        if content is None:
            print(f"Fetching content from web for {url}...")
            content = await get_client_rendered_content(url)
            with open(cached_filename, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            print(f"Using cached content from file for {url}...")

        info = extract_info(content)
        transcript = info['transcript']
        if transcript:
            info['transcript'] = parse_transcript(transcript)
            generate_clips(CACHE_DIR, info)
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(info, f, ensure_ascii=False, indent=4)
            print(f"Information extracted and saved to {json_filename}")
        else:
            print(f"No transcript found for {url}")
        return info
    except Exception as e:
        print(f"Error processing URL {url}:")
        print(traceback.format_exc())
        print(f"Detailed error: {str(e)}")
        return None


async def process_urls(urls):
    tasks = [process_url(url) for url in urls]
    return await asyncio.gather(*tasks)
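

# Cache layout produced by process_url(), for reference. The URL below is a
# hypothetical example; real filenames are derived from the URLs in dsp-urls.txt:
#
#   cache/cached_https_example.com_page.html        client-rendered HTML
#   cache/cached_https_example.com_page.html.json   extracted metadata + parsed transcript
#   cache/db_metadata.json                          aggregated hashes/speakers/companies/subjects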


def main():
    global assistant
    assistant = get_ai_assistant()
    url_file = "dsp-urls.txt"  # File containing list of URLs

    if not os.path.exists(url_file):
        print(f"Error: {url_file} not found.")
        return

    content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()

    # Convert companies to a dictionary of speaker sets if it's not already
    if not isinstance(companies, dict):
        companies = {company: set() for company in companies}

    with open(url_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    for url in urls:
        # Generate a hash of the url
        filename_hash = hashlib.md5(url.encode()).hexdigest()

        # Check if this content has already been added
        if filename_hash in content_hashes:
            print(f"{url} already added")
            continue

        info = asyncio.run(process_url(url))
        if info is None:
            continue

        metadata = info['metadata']
        transcript = info['transcript']
        if transcript is None:
            continue

        for entry in transcript:
            metadata.update(entry['metadata'])
            company = metadata['company']
            speaker = metadata['speaker']
            entry_subjects = metadata['subjects']

            speakers.add(speaker)
            # Add new subjects to the master set
            subjects.update(entry_subjects)

            text = entry['text']
            assistant.add_to_knowledge_base(
                text, data_type='text', metadata=metadata.copy())

            if company not in companies:
                companies[company] = set()
            companies[company].add(speaker)

        content_hashes.add(filename_hash)
        print(f"Added new url: {url}")

    # Save updated hashes and metadata
    save_metadata_sets(content_hashes, speakers,
                       companies, sentiments, subjects)
    assistant.save()

    print("Processing complete. Check individual URL outputs for any errors.")


def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
    metadata = {
        'content_hashes': list(content_hashes),
        'speakers': list(speakers),
        'companies': {company: list(speakers)
                      for company, speakers in companies.items()},
        'sentiments': list(sentiments),
        'subjects': list(subjects)
    }
    with open(DB_METADATA_FILE, 'w') as f:
        json.dump(metadata, f, indent=2)


def db_load_metadata_sets():
    content_hashes = set()
    speakers = set()
    companies = {}
    sentiments = set()
    subjects = set()

    if os.path.exists(DB_METADATA_FILE):
        with open(DB_METADATA_FILE, 'r') as f:
            metadata = json.load(f)

        content_hashes = set(metadata.get('content_hashes', []))
        speakers = set(metadata.get('speakers', []))
        companies = {company: set(speakers)
                     for company, speakers in metadata.get('companies', {}).items()}
        sentiments = set(metadata.get('sentiments', []))
        subjects = set(metadata.get('subjects', SUBJECTS))

    return content_hashes, speakers, companies, sentiments, subjects


if __name__ == "__main__":
    main()
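

# Shape of cache/db_metadata.json as written by save_metadata_sets(). Values are
# illustrative placeholders, not real data:
#
#   {
#     "content_hashes": ["<md5 of url>", ...],
#     "speakers": ["Jane Doe", ...],
#     "companies": {"ExampleCorp": ["Jane Doe"]},
#     "sentiments": [],
#     "subjects": [" 5G ", " AI ", ...]
#   }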