# ttv-ec / ttv_web_scraper.py
import re
import asyncio
import json
import os
import gc
import traceback
from pyppeteer import launch
from bs4 import BeautifulSoup
import hashlib
from ai_config_faiss import get_ai_assistant
from video_utils import generate_clips
CACHE_DIR = "cache/"
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
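# Each subject is padded with spaces so the substring match in
# extract_subject_info() behaves like a crude whole-word search (a subject at
# the very start/end of a segment, or one followed by punctuation, is missed).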
SUBJECTS = [
" 5G ", " AI ", " Innovation ", " Network ", " Enterprise ", " Open RAN ",
" TechCo ", " B2B ", " API ", " Infrastructure ", " Connectivity "
]
async def get_client_rendered_content(url):
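    """Fetch the fully client-rendered HTML for a URL using headless Chromium.

    Waits for network idle plus an extra 5 seconds so that client-side
    JavaScript (the pages use Angular ng-binding elements) has time to render
    the transcript before the HTML is captured.
    """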
browser = None
try:
browser = await launch()
page = await browser.newPage()
await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
await asyncio.sleep(5)
content = await page.content()
return content
except Exception as e:
raise Exception(f"Error fetching content: {str(e)}")
finally:
if browser:
await browser.close()
def extract_info(html_content):
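    """Extract title, date, YouTube video id and raw transcript text.

    Returns a dict with a 'metadata' sub-dict (title, date, youtube_id) and
    the plain text of the element with id 'transcript0' (None if absent).
    """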
try:
soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else None
        date_elem = soup.find('p', class_='content-date')
        date_span = date_elem.find('span', class_='ng-binding') if date_elem else None
        date = date_span.text.strip() if date_span else None
youtube_iframe = soup.find(
'iframe', src=lambda x: x and 'youtube.com' in x)
youtube_url = youtube_iframe['src'] if youtube_iframe else None
youtube_id = None
if youtube_url:
match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
if match:
youtube_id = match.group(1)
transcript_elem = soup.find(id='transcript0')
transcript = transcript_elem.get_text(
strip=True) if transcript_elem else None
return {
'metadata': {
'title': title,
'date': date,
'youtube_id': youtube_id,
},
'transcript': transcript
}
except Exception as e:
raise Exception(f"Error extracting information: {str(e)}")
def read_html_from_file(filename):
try:
if os.path.exists(filename):
with open(filename, 'r', encoding='utf-8') as f:
return f.read()
return None
except Exception as e:
raise Exception(f"Error reading file {filename}: {str(e)}")
def read_json_from_file(filename):
try:
if os.path.exists(filename):
with open(filename, 'r', encoding='utf-8') as f:
return json.load(f)
return None
except json.JSONDecodeError as e:
raise Exception(f"Error decoding JSON in file {filename}: {str(e)}")
except Exception as e:
raise Exception(f"Error reading file {filename}: {str(e)}")
def extract_speaker_info(segment):
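    """Parse a transcript header segment into speaker, company and timestamp.

    Two header shapes are recognised (illustrative examples):
        "Jane Doe, Acme Networks (00:01:23):"  -> speaker, company, timestamp
        "(01:02:03):"                          -> timestamp only
    Returns a dict (speaker/company may be None), or None for non-header text.
    """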
try:
pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
match = re.match(pattern, segment)
if match:
return {key: value.strip() if value else None for key, value in match.groupdict().items()}
else:
timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
timestamp_match = re.match(timestamp_pattern, segment)
if timestamp_match:
return {'speaker': None, 'company': None, 'timestamp': timestamp_match.group('timestamp')}
return None
except Exception as e:
raise Exception(f"Error extracting speaker info: {str(e)}")
def extract_subject_info(text):
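    """Return the subset of SUBJECTS that appear (case-insensitively) in text."""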
# Convert text to lowercase for case-insensitive matching
lower_text = text.lower()
# Find all subjects present in the text
found_subjects = [
subject for subject in SUBJECTS if subject.lower() in lower_text]
return found_subjects
def parse_transcript(content):
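    """Split a transcript into per-speaker segments.

    The text is split on the header pattern (keeping the headers); each text
    chunk is then paired with the header that precedes it, producing entries
    with speaker, company, start/end timestamps and detected subjects.
    """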
try:
parsed_segments = []
saved_info = None
pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
segments = re.split(pattern, content)
segments = [segment.strip() for segment in segments if segment.strip()]
        for i, segment in enumerate(segments):
            speaker_info = extract_speaker_info(segment)
            if speaker_info:
                # A header (full "Speaker, Company (ts):" or bare "(ts):")
                # closes the previous speaker's segment; its text is the
                # element just before this header in the split list.
                if saved_info:
                    text = segments[i-1] if i > 0 else ""
                    subjects = extract_subject_info(text)
                    parsed_segments.append({
                        'metadata': {
                            'speaker': saved_info['speaker'],
                            'company': saved_info['company'],
                            'start_timestamp': saved_info['timestamp'],
                            'end_timestamp': speaker_info['timestamp'],
                            'subjects': subjects
                        },
                        'text': text
                    })
                if speaker_info['speaker']:
                    # Full speaker, company, timestamp format
                    saved_info = speaker_info
                elif saved_info:
                    # Standalone timestamp format: keep the current speaker,
                    # advance the start timestamp
                    saved_info['timestamp'] = speaker_info['timestamp']
            elif saved_info:
                # Plain text segment; handled when the next header is seen
                continue
        # Add the final entry: the text after the last header has no closing
        # header, so a placeholder end timestamp is used
        if saved_info:
text = segments[-1]
subjects = extract_subject_info(text)
parsed_segments.append({
'metadata': {
'speaker': saved_info['speaker'],
'company': saved_info['company'],
'start_timestamp': saved_info['timestamp'],
'end_timestamp': "00:00:00",
'subjects': subjects
},
'text': text
})
return parsed_segments
except Exception as e:
raise Exception(f"Error parsing transcript: {str(e)}")
def get_cached_filename(url):
return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}.html"
async def process_url(url):
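    """Fetch, extract and parse a single URL, caching every stage on disk.

    Order of preference: previously saved JSON -> cached HTML -> live fetch.
    When a transcript is found it is parsed, clips are generated via
    generate_clips(), and the combined result is saved alongside the cache.
    """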
try:
cached_filename = get_cached_filename(url)
json_filename = f"{cached_filename}.json"
info = read_json_from_file(json_filename)
if info:
return info
content = read_html_from_file(cached_filename)
if content is None:
print(f"Fetching content from web for {url}...")
content = await get_client_rendered_content(url)
with open(cached_filename, 'w', encoding='utf-8') as f:
f.write(content)
else:
print(f"Using cached content from file for {url}...")
info = extract_info(content)
transcript = info['transcript']
        if transcript:
info['transcript'] = parse_transcript(transcript)
generate_clips(CACHE_DIR, info)
with open(json_filename, 'w', encoding='utf-8') as f:
json.dump(info, f, ensure_ascii=False, indent=4)
print(f"Information extracted and saved to {json_filename}")
else:
print(f"No transcript found for {url}")
return info
except Exception as e:
print(f"Error processing URL {url}:")
print(traceback.format_exc())
print(f"Detailed error: {str(e)}")
return None
async def process_urls(urls):
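    """Process several URLs concurrently.

    Not called by main(), which runs process_url() one URL at a time.
    """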
tasks = [process_url(url) for url in urls]
return await asyncio.gather(*tasks)
def main():
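    """Scrape every URL in dsp-urls.txt that has not been indexed yet.

    Each new transcript segment is added to the FAISS-backed knowledge base,
    and the accumulated metadata sets (content hashes, speakers, companies,
    sentiments, subjects) are persisted to db_metadata.json.
    """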
global assistant
assistant = get_ai_assistant()
url_file = "dsp-urls.txt" # File containing list of URLs
if not os.path.exists(url_file):
print(f"Error: {url_file} not found.")
return
content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
# Convert companies to a dictionary of speaker sets if it's not already
if not isinstance(companies, dict):
companies = {company: set() for company in companies}
with open(url_file, 'r') as f:
urls = [line.strip() for line in f if line.strip()]
for url in urls:
# Generate a hash of the url
filename_hash = hashlib.md5(url.encode()).hexdigest()
# Check if this content has already been added
if filename_hash in content_hashes:
print(f"{url} already added")
continue
info = asyncio.run(process_url(url))
if info is None:
continue
metadata = info['metadata']
transcript = info['transcript']
if transcript is None:
continue
for entry in transcript:
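            # Merge the per-segment fields (speaker, company, timestamps,
            # subjects) over the video-level metadata before indexing.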
metadata.update(entry['metadata'])
company = metadata['company']
speaker = metadata['speaker']
entry_subjects = metadata['subjects']
speakers.add(speaker)
# Add new subjects to the master set
subjects.update(entry_subjects)
text = entry['text']
assistant.add_to_knowledge_base(
text, data_type='text', metadata=metadata.copy())
if company not in companies:
companies[company] = set()
companies[company].add(speaker)
content_hashes.add(filename_hash)
print(f"Added new url: {url}")
# Save updated hashes and metadata
save_metadata_sets(content_hashes, speakers,
companies, sentiments, subjects)
assistant.save()
print("Processing complete. Check individual URL outputs for any errors.")
def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
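    """Persist the in-memory metadata to JSON, converting sets to lists."""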
metadata = {
'content_hashes': list(content_hashes),
'speakers': list(speakers),
        'companies': {company: list(members) for company, members in companies.items()},
'sentiments': list(sentiments),
'subjects': list(subjects)
}
with open(DB_METADATA_FILE, 'w') as f:
json.dump(metadata, f, indent=2)
def db_load_metadata_sets():
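    """Load saved metadata, converting lists back to sets.

    Falls back to empty sets (and the default SUBJECTS list) when no metadata
    file exists yet. 'sentiments' is loaded and saved but never populated by
    this script.
    """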
content_hashes = set()
speakers = set()
companies = {}
sentiments = set()
subjects = set()
if os.path.exists(DB_METADATA_FILE):
with open(DB_METADATA_FILE, 'r') as f:
metadata = json.load(f)
content_hashes = set(metadata.get('content_hashes', []))
speakers = set(metadata.get('speakers', []))
            companies = {company: set(members)
                         for company, members in metadata.get('companies', {}).items()}
sentiments = set(metadata.get('sentiments', []))
subjects = set(metadata.get('subjects', SUBJECTS))
return content_hashes, speakers, companies, sentiments, subjects
if __name__ == "__main__":
main()