# ttv-ec / ttv_web_scraper.py
import re
import asyncio
import json
import os
import gc
import traceback
from pyppeteer import launch
from bs4 import BeautifulSoup
import hashlib
from ai_config_faiss import get_ai_assistant
from video_utils import generate_clips
CACHE_DIR = "cache/"
if not os.path.exists(CACHE_DIR):
os.makedirs(CACHE_DIR)
DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
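# Each subject is padded with spaces so the substring match in
# extract_subject_info() behaves like a crude whole-word search (a subject at
# the very start/end of a segment, or one followed by punctuation, is missed).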
SUBJECTS = [
" 5G ", " AI ", " Innovation ", " Network ", " Enterprise ", " Open RAN ",
" TechCo ", " B2B ", " API ", " Infrastructure ", " Connectivity "
]
async def get_client_rendered_content(url):
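    """Fetch the fully client-rendered HTML for a URL using headless Chromium.

    Waits for network idle plus an extra 5 seconds so that client-side
    JavaScript (the pages use Angular ng-binding elements) has time to render
    the transcript before the HTML is captured.
    """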
browser = None
try:
browser = await launch()
page = await browser.newPage()
await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
await asyncio.sleep(5)
content = await page.content()
return content
except Exception as e:
raise Exception(f"Error fetching content: {str(e)}")
finally:
if browser:
await browser.close()
def extract_info(html_content):
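    """Extract title, date, YouTube video id and raw transcript text.

    Returns a dict with a 'metadata' sub-dict (title, date, youtube_id) and
    the plain text of the element with id 'transcript0' (None if absent).
    """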
try:
soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.string.strip() if soup.title and soup.title.string else None
        date_elem = soup.find('p', class_='content-date')
        date_span = date_elem.find('span', class_='ng-binding') if date_elem else None
        date = date_span.text.strip() if date_span else None
youtube_iframe = soup.find(
'iframe', src=lambda x: x and 'youtube.com' in x)
youtube_url = youtube_iframe['src'] if youtube_iframe else None
youtube_id = None
if youtube_url:
match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
if match:
youtube_id = match.group(1)
transcript_elem = soup.find(id='transcript0')
transcript = transcript_elem.get_text(
strip=True) if transcript_elem else None
return {
'metadata': {
'title': title,
'date': date,
'youtube_id': youtube_id,
},
'transcript': transcript
}
except Exception as e:
raise Exception(f"Error extracting information: {str(e)}")
def read_html_from_file(filename):
try:
if os.path.exists(filename):
with open(filename, 'r', encoding='utf-8') as f:
return f.read()
return None
except Exception as e:
raise Exception(f"Error reading file {filename}: {str(e)}")
def read_json_from_file(filename):
try:
if os.path.exists(filename):
with open(filename, 'r', encoding='utf-8') as f:
return json.load(f)
return None
except json.JSONDecodeError as e:
raise Exception(f"Error decoding JSON in file {filename}: {str(e)}")
except Exception as e:
raise Exception(f"Error reading file {filename}: {str(e)}")
def extract_speaker_info(segment):
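    """Parse a transcript header segment into speaker, company and timestamp.

    Two header shapes are recognised (illustrative examples):
        "Jane Doe, Acme Networks (00:01:23):"  -> speaker, company, timestamp
        "(01:02:03):"                          -> timestamp only
    Returns a dict (speaker/company may be None), or None for non-header text.
    """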
try:
pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
match = re.match(pattern, segment)
if match:
return {key: value.strip() if value else None for key, value in match.groupdict().items()}
else:
timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
timestamp_match = re.match(timestamp_pattern, segment)
if timestamp_match:
return {'speaker': None, 'company': None, 'timestamp': timestamp_match.group('timestamp')}
return None
except Exception as e:
raise Exception(f"Error extracting speaker info: {str(e)}")
def extract_subject_info(text):
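    """Return the subset of SUBJECTS that appear (case-insensitively) in text."""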
# Convert text to lowercase for case-insensitive matching
lower_text = text.lower()
# Find all subjects present in the text
found_subjects = [
subject for subject in SUBJECTS if subject.lower() in lower_text]
return found_subjects
def parse_transcript(content):
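    """Split a transcript into per-speaker segments.

    The text is split on the header pattern (keeping the headers); each text
    chunk is then paired with the header that precedes it, producing entries
    with speaker, company, start/end timestamps and detected subjects.
    """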
try:
parsed_segments = []
saved_info = None
pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
segments = re.split(pattern, content)
segments = [segment.strip() for segment in segments if segment.strip()]
        for i, segment in enumerate(segments):
            speaker_info = extract_speaker_info(segment)
            if speaker_info:
                # A header (full "Speaker, Company (ts):" or bare "(ts):")
                # closes the previous speaker's segment; its text is the
                # element just before this header in the split list.
                if saved_info:
                    text = segments[i-1] if i > 0 else ""
                    subjects = extract_subject_info(text)
                    parsed_segments.append({
                        'metadata': {
                            'speaker': saved_info['speaker'],
                            'company': saved_info['company'],
                            'start_timestamp': saved_info['timestamp'],
                            'end_timestamp': speaker_info['timestamp'],
                            'subjects': subjects
                        },
                        'text': text
                    })
                if speaker_info['speaker']:
                    # Full speaker, company, timestamp format
                    saved_info = speaker_info
                elif saved_info:
                    # Standalone timestamp format: keep the current speaker,
                    # advance the start timestamp
                    saved_info['timestamp'] = speaker_info['timestamp']
            elif saved_info:
                # Plain text segment; handled when the next header is seen
                continue
        # Add the final entry: the text after the last header has no closing
        # header, so a placeholder end timestamp is used
        if saved_info:
text = segments[-1]
subjects = extract_subject_info(text)
parsed_segments.append({
'metadata': {
'speaker': saved_info['speaker'],
'company': saved_info['company'],
'start_timestamp': saved_info['timestamp'],
'end_timestamp': "00:00:00",
'subjects': subjects
},
'text': text
})
return parsed_segments
except Exception as e:
raise Exception(f"Error parsing transcript: {str(e)}")
def get_cached_filename(url):
return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}.html"
async def process_url(url):
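    """Fetch, extract and parse a single URL, caching every stage on disk.

    Order of preference: previously saved JSON -> cached HTML -> live fetch.
    When a transcript is found it is parsed, clips are generated via
    generate_clips(), and the combined result is saved alongside the cache.
    """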
try:
cached_filename = get_cached_filename(url)
json_filename = f"{cached_filename}.json"
info = read_json_from_file(json_filename)
if info:
return info
content = read_html_from_file(cached_filename)
if content is None:
print(f"Fetching content from web for {url}...")
content = await get_client_rendered_content(url)
with open(cached_filename, 'w', encoding='utf-8') as f:
f.write(content)
else:
print(f"Using cached content from file for {url}...")
info = extract_info(content)
transcript = info['transcript']
        if transcript:
info['transcript'] = parse_transcript(transcript)
generate_clips(CACHE_DIR, info)
with open(json_filename, 'w', encoding='utf-8') as f:
json.dump(info, f, ensure_ascii=False, indent=4)
print(f"Information extracted and saved to {json_filename}")
else:
print(f"No transcript found for {url}")
return info
except Exception as e:
print(f"Error processing URL {url}:")
print(traceback.format_exc())
print(f"Detailed error: {str(e)}")
return None
async def process_urls(urls):
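    """Process several URLs concurrently.

    Not called by main(), which runs process_url() one URL at a time.
    """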
tasks = [process_url(url) for url in urls]
return await asyncio.gather(*tasks)
def main():
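    """Scrape every URL in dsp-urls.txt that has not been indexed yet.

    Each new transcript segment is added to the FAISS-backed knowledge base,
    and the accumulated metadata sets (content hashes, speakers, companies,
    sentiments, subjects) are persisted to db_metadata.json.
    """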
global assistant
assistant = get_ai_assistant()
url_file = "dsp-urls.txt" # File containing list of URLs
if not os.path.exists(url_file):
print(f"Error: {url_file} not found.")
return
content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()
# Convert companies to a dictionary of speaker sets if it's not already
if not isinstance(companies, dict):
companies = {company: set() for company in companies}
with open(url_file, 'r') as f:
urls = [line.strip() for line in f if line.strip()]
for url in urls:
# Generate a hash of the url
filename_hash = hashlib.md5(url.encode()).hexdigest()
# Check if this content has already been added
if filename_hash in content_hashes:
print(f"{url} already added")
continue
info = asyncio.run(process_url(url))
if info is None:
continue
metadata = info['metadata']
transcript = info['transcript']
if transcript is None:
continue
for entry in transcript:
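            # Merge the per-segment fields (speaker, company, timestamps,
            # subjects) over the video-level metadata before indexing.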
metadata.update(entry['metadata'])
company = metadata['company']
speaker = metadata['speaker']
entry_subjects = metadata['subjects']
speakers.add(speaker)
# Add new subjects to the master set
subjects.update(entry_subjects)
text = entry['text']
assistant.add_to_knowledge_base(
text, data_type='text', metadata=metadata.copy())
if company not in companies:
companies[company] = set()
companies[company].add(speaker)
content_hashes.add(filename_hash)
print(f"Added new url: {url}")
# Save updated hashes and metadata
save_metadata_sets(content_hashes, speakers,
companies, sentiments, subjects)
assistant.save()
print("Processing complete. Check individual URL outputs for any errors.")
def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
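    """Persist the in-memory metadata to JSON, converting sets to lists."""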
metadata = {
'content_hashes': list(content_hashes),
'speakers': list(speakers),
        'companies': {company: list(members) for company, members in companies.items()},
'sentiments': list(sentiments),
'subjects': list(subjects)
}
with open(DB_METADATA_FILE, 'w') as f:
json.dump(metadata, f, indent=2)
def db_load_metadata_sets():
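    """Load saved metadata, converting lists back to sets.

    Falls back to empty sets (and the default SUBJECTS list) when no metadata
    file exists yet. 'sentiments' is loaded and saved but never populated by
    this script.
    """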
content_hashes = set()
speakers = set()
companies = {}
sentiments = set()
subjects = set()
if os.path.exists(DB_METADATA_FILE):
with open(DB_METADATA_FILE, 'r') as f:
metadata = json.load(f)
content_hashes = set(metadata.get('content_hashes', []))
speakers = set(metadata.get('speakers', []))
            companies = {company: set(members)
                         for company, members in metadata.get('companies', {}).items()}
sentiments = set(metadata.get('sentiments', []))
subjects = set(metadata.get('subjects', SUBJECTS))
return content_hashes, speakers, companies, sentiments, subjects
if __name__ == "__main__":
main()