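"""Scrape client-rendered pages, parse their video transcripts and index the
transcript segments into the AI assistant's knowledge base.

For each URL listed in dsp-urls.txt the script caches the rendered HTML and the
extracted JSON under CACHE_DIR, pulls out the title, date and YouTube id, splits
the transcript into speaker segments with timestamps and subject tags, generates
clips via video_utils.generate_clips(), and records what has already been
ingested in db_metadata.json.

ai_config_faiss and video_utils are assumed to be project-local modules that
provide get_ai_assistant() and generate_clips().
"""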
import asyncio
import hashlib
import json
import os
import re
import traceback

from bs4 import BeautifulSoup
from pyppeteer import launch

from ai_config_faiss import get_ai_assistant
from video_utils import generate_clips

CACHE_DIR = "cache/"
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)

DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")

# Subjects are padded with spaces so the substring match in
# extract_subject_info() only hits whole words (" AI " will not match inside a
# word such as "maintain").
SUBJECTS = [
    " 5G ", " AI ", " Innovation ", " Network ", " Enterprise ", " Open RAN ",
    " TechCo ", " B2B ", " API ", " Infrastructure ", " Connectivity "
]


async def get_client_rendered_content(url):
    """Render a page with headless Chromium (pyppeteer) and return its HTML."""
    browser = None
    try:
        browser = await launch()
        page = await browser.newPage()
        await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
        # Give client-side rendering a little extra time to settle.
        await asyncio.sleep(5)
        return await page.content()
    except Exception as e:
        raise Exception(f"Error fetching content: {str(e)}") from e
    finally:
        if browser:
            await browser.close()


def extract_info(html_content):
    """Extract title, date, YouTube video id and raw transcript text from a page."""
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else None
        date_elem = soup.find('p', class_='content-date')
        date_span = date_elem.find('span', class_='ng-binding') if date_elem else None
        date = date_span.text.strip() if date_span else None
        youtube_iframe = soup.find('iframe', src=lambda x: x and 'youtube.com' in x)
        youtube_url = youtube_iframe['src'] if youtube_iframe else None
        youtube_id = None
        if youtube_url:
            match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
            if match:
                youtube_id = match.group(1)
        transcript_elem = soup.find(id='transcript0')
        transcript = transcript_elem.get_text(strip=True) if transcript_elem else None
        return {
            'metadata': {
                'title': title,
                'date': date,
                'youtube_id': youtube_id,
            },
            'transcript': transcript
        }
    except Exception as e:
        raise Exception(f"Error extracting information: {str(e)}") from e


def read_html_from_file(filename):
    """Return the cached HTML for `filename`, or None if it does not exist."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return f.read()
        return None
    except Exception as e:
        raise Exception(f"Error reading file {filename}: {str(e)}") from e


def read_json_from_file(filename):
    """Return parsed JSON from `filename`, or None if the file does not exist."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None
    except json.JSONDecodeError as e:
        raise Exception(f"Error decoding JSON in file {filename}: {str(e)}") from e
    except Exception as e:
        raise Exception(f"Error reading file {filename}: {str(e)}") from e


def extract_speaker_info(segment):
    """Parse a segment header into speaker, company and timestamp (any may be None)."""
    try:
        pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
        match = re.match(pattern, segment)
        if match:
            return {key: value.strip() if value else None for key, value in match.groupdict().items()}
        # Fall back to headers that carry only a timestamp.
        timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
        timestamp_match = re.match(timestamp_pattern, segment)
        if timestamp_match:
            return {'speaker': None, 'company': None, 'timestamp': timestamp_match.group('timestamp')}
        return None
    except Exception as e:
        raise Exception(f"Error extracting speaker info: {str(e)}") from e
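
# Illustrative headers extract_speaker_info() recognises (names and companies
# below are made up):
#   "Jane Doe, Example Corp (00:01:23):"  -> speaker, company and timestamp
#   "(01:23):"                            -> timestamp only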


def extract_subject_info(text):
    """Return the SUBJECTS entries that occur (case-insensitively) in `text`."""
    lower_text = text.lower()
    found_subjects = [
        subject for subject in SUBJECTS if subject.lower() in lower_text]
    return found_subjects
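
# Illustrative call (the sentence is made up):
#   extract_subject_info("Our AI strategy builds on 5G rollouts")
#   -> [" 5G ", " AI "]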


def parse_transcript(content):
    """Split raw transcript text into per-speaker segments with timestamps."""
    try:
        parsed_segments = []
        saved_info = None
        # Split on segment headers: either "Name Surname, Company (hh:mm:ss):"
        # or a bare "(hh:mm:ss):" timestamp.
        pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
        segments = re.split(pattern, content)
        segments = [segment.strip() for segment in segments if segment.strip()]

        for i, segment in enumerate(segments):
            speaker_info = extract_speaker_info(segment)
            if speaker_info:
                if speaker_info['speaker']:
                    # New speaker header: close the previous speaker's segment.
                    if saved_info:
                        text = segments[i-1] if i > 0 else ""
                        subjects = extract_subject_info(text)
                        parsed_segments.append({
                            'metadata': {
                                'speaker': saved_info['speaker'],
                                'company': saved_info['company'],
                                'start_timestamp': saved_info['timestamp'],
                                'end_timestamp': speaker_info['timestamp'],
                                'subjects': subjects
                            },
                            'text': text
                        })
                    saved_info = speaker_info
                else:
                    # Timestamp-only header: close the current chunk and keep
                    # the same speaker with an updated start timestamp.
                    if saved_info:
                        text = segments[i-1] if i > 0 else ""
                        subjects = extract_subject_info(text)
                        parsed_segments.append({
                            'metadata': {
                                'speaker': saved_info['speaker'],
                                'company': saved_info['company'],
                                'start_timestamp': saved_info['timestamp'],
                                'end_timestamp': speaker_info['timestamp'],
                                'subjects': subjects
                            },
                            'text': text
                        })
                        saved_info['timestamp'] = speaker_info['timestamp']
            elif saved_info:
                # Plain text between headers; it is picked up as segments[i-1]
                # when the next header is processed.
                continue

        # Flush the final segment; there is no closing header, so the end
        # timestamp is unknown.
        if saved_info:
            text = segments[-1]
            subjects = extract_subject_info(text)
            parsed_segments.append({
                'metadata': {
                    'speaker': saved_info['speaker'],
                    'company': saved_info['company'],
                    'start_timestamp': saved_info['timestamp'],
                    'end_timestamp': "00:00:00",
                    'subjects': subjects
                },
                'text': text
            })

        return parsed_segments
    except Exception as e:
        raise Exception(f"Error parsing transcript: {str(e)}") from e
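
# Each parsed segment has the shape (values illustrative):
#   {'metadata': {'speaker': 'Jane Doe', 'company': 'Example Corp',
#                 'start_timestamp': '00:01:23', 'end_timestamp': '00:02:10',
#                 'subjects': [' AI ']},
#    'text': '...'}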


def get_cached_filename(url):
    """Map a URL to its HTML cache path under CACHE_DIR."""
    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}.html"
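
# e.g. the (hypothetical) URL "https://example.com/videos/keynote" maps to
#   cache/cached_https_example.com_videos_keynote.html
# and process_url() stores the parsed result alongside it with ".json" appended.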


async def process_url(url):
    """Fetch, extract, parse and cache everything for a single URL.

    Returns the extracted info dict, or None if processing failed.
    """
    try:
        cached_filename = get_cached_filename(url)
        json_filename = f"{cached_filename}.json"
        info = read_json_from_file(json_filename)

        # Fully processed before: reuse the cached JSON.
        if info:
            return info

        content = read_html_from_file(cached_filename)

        if content is None:
            print(f"Fetching content from web for {url}...")
            content = await get_client_rendered_content(url)
            with open(cached_filename, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            print(f"Using cached content from file for {url}...")

        info = extract_info(content)
        transcript = info['transcript']
        if transcript:
            info['transcript'] = parse_transcript(transcript)
            generate_clips(CACHE_DIR, info)
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(info, f, ensure_ascii=False, indent=4)
            print(f"Information extracted and saved to {json_filename}")
        else:
            print(f"No transcript found for {url}")
        return info

    except Exception as e:
        print(f"Error processing URL {url}:")
        print(traceback.format_exc())
        print(f"Detailed error: {str(e)}")
        return None


async def process_urls(urls):
    """Process several URLs concurrently."""
    tasks = [process_url(url) for url in urls]
    return await asyncio.gather(*tasks)
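
# Note: main() below processes URLs one at a time with asyncio.run(); an
# equivalent concurrent run (illustrative) would be:
#   results = asyncio.run(process_urls(urls))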


def main():
    """Ingest every transcript from the URLs in dsp-urls.txt into the knowledge base."""
    global assistant
    assistant = get_ai_assistant()

    url_file = "dsp-urls.txt"

    if not os.path.exists(url_file):
        print(f"Error: {url_file} not found.")
        return

    content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()

    # Defensive: if companies came back as a plain list of names, normalise it
    # to a dict mapping company -> set of speakers.
    if not isinstance(companies, dict):
        companies = {company: set() for company in companies}

    with open(url_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    for url in urls:
        filename_hash = hashlib.md5(url.encode()).hexdigest()

        if filename_hash in content_hashes:
            print(f"{url} already added")
            continue

        info = asyncio.run(process_url(url))
        if info is None:
            continue

        metadata = info['metadata']
        transcript = info['transcript']

        if transcript is None:
            continue

        for entry in transcript:
            metadata.update(entry['metadata'])
            company = metadata['company']
            speaker = metadata['speaker']
            entry_subjects = metadata['subjects']

            speakers.add(speaker)
            subjects.update(entry_subjects)

            text = entry['text']
            assistant.add_to_knowledge_base(
                text, data_type='text', metadata=metadata.copy())

            if company not in companies:
                companies[company] = set()
            companies[company].add(speaker)

        content_hashes.add(filename_hash)
        print(f"Added new url: {url}")

    save_metadata_sets(content_hashes, speakers,
                       companies, sentiments, subjects)

    assistant.save()

    print("Processing complete. Check individual URL outputs for any errors.")


def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
    """Persist the metadata sets to DB_METADATA_FILE as JSON-serialisable lists."""
    metadata = {
        'content_hashes': list(content_hashes),
        'speakers': list(speakers),
        'companies': {company: list(company_speakers)
                      for company, company_speakers in companies.items()},
        'sentiments': list(sentiments),
        'subjects': list(subjects)
    }

    with open(DB_METADATA_FILE, 'w') as f:
        json.dump(metadata, f, indent=2)
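
# db_metadata.json then looks roughly like this (values illustrative):
#   {
#     "content_hashes": ["9e107d9d372bb6826bd81d3542a419d6"],
#     "speakers": ["Jane Doe"],
#     "companies": {"Example Corp": ["Jane Doe"]},
#     "sentiments": [],
#     "subjects": [" 5G ", " AI "]
#   }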


def db_load_metadata_sets():
    """Load the metadata sets from DB_METADATA_FILE, with defaults when absent."""
    content_hashes = set()
    speakers = set()
    companies = {}
    sentiments = set()
    subjects = set()

    if os.path.exists(DB_METADATA_FILE):
        with open(DB_METADATA_FILE, 'r') as f:
            metadata = json.load(f)

        content_hashes = set(metadata.get('content_hashes', []))
        speakers = set(metadata.get('speakers', []))
        companies = {company: set(company_speakers)
                     for company, company_speakers in metadata.get('companies', {}).items()}
        sentiments = set(metadata.get('sentiments', []))
        subjects = set(metadata.get('subjects', SUBJECTS))

    return content_hashes, speakers, companies, sentiments, subjects


if __name__ == "__main__":
    main()