import asyncio
import json
import logging
import os
import re
import traceback
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional, Set

from bs4 import BeautifulSoup, NavigableString
from pyppeteer import launch

from ai_config_faiss import get_ai_assistant
from video_utils import get_youtube_video, generate_clips

# Set the TOKENIZERS_PARALLELISM environment variable
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Configure logging to suppress MoviePy's console output
logging.getLogger("moviepy").setLevel(logging.WARNING)
CACHE_DIR = "cache/"
DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")
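# Note: subject strings are padded with spaces so that extract_subject_info()
# only matches them as whole words (e.g. " AI " does not match inside "maintain").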
SUBJECTS = [
    " 5G ", " AI ", " Innovation ", " Network ", " Enterprise ", " Open RAN ",
    " TechCo ", " B2B ", " API ", " Infrastructure ", " Connectivity "
]
os.makedirs(CACHE_DIR, exist_ok=True)

@dataclass
class TranscriptSegment:
    metadata: Dict[str, Optional[str]]
    text: str


@dataclass
class VideoInfo:
    metadata: Dict[str, Optional[str]]
    transcript: List[TranscriptSegment]


async def get_client_rendered_content(url: str) -> str:
    """Return the fully client-rendered HTML of `url` using headless Chromium (pyppeteer)."""
    browser = None
    try:
        browser = await launch()
        page = await browser.newPage()
        await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
        await asyncio.sleep(5)
        return await page.content()
    except Exception as e:
        logger.error(f"Error fetching content for {url}: {str(e)}")
        raise
    finally:
        if browser:
            await browser.close()
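# Illustrative standalone usage (process_url() is the normal entry point):
#   html = asyncio.run(get_client_rendered_content("https://example.com"))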


def extract_text_with_br(element):
    """Return the element's text content with <br> tags converted to newlines."""
    result = ['\n']
    for child in element.descendants:
        if isinstance(child, NavigableString):
            result.append(child.strip())
        elif child.name == 'br':
            result.append('\n')
    return ''.join(result).strip()
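# Illustrative example:
#   extract_text_with_br(BeautifulSoup("<p>Hello<br>World</p>", "html.parser").p)
#   -> "Hello\nWorld"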


def extract_info(html_content: str) -> Optional[VideoInfo]:
    """Extract title, date, YouTube id and transcript from a rendered video page.

    Returns None when the YouTube video cannot be retrieved.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.string.strip() if soup.title else None
        date_elem = soup.find('p', class_='content-date')
        date = date_elem.find('span', class_='ng-binding').text.strip() if date_elem else None
        youtube_iframe = soup.find('iframe', src=lambda x: x and 'youtube.com' in x)
        youtube_url = youtube_iframe['src'] if youtube_iframe else None
        youtube_id = re.search(r'youtube\.com/embed/([^?]+)', youtube_url).group(1) if youtube_url else None
        if get_youtube_video(CACHE_DIR, youtube_id):
            transcript_elem = soup.find(id='transcript0')
            transcript = extract_text_with_br(transcript_elem) if transcript_elem else None
            return VideoInfo(
                metadata={'title': title, 'date': date, 'youtube_id': youtube_id},
                transcript=parse_transcript(transcript) if transcript else []
            )
        return None
    except Exception as e:
        logger.error(f"Error extracting information: {str(e)}")
        raise


def read_file(filename: str) -> Optional[str]:
    """Return the file contents, or None if the file does not exist."""
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return f.read()
        return None
    except Exception as e:
        logger.error(f"Error reading file {filename}: {str(e)}")
        raise


def extract_subject_info(text: str) -> List[str]:
    """Return the SUBJECTS entries that appear (case-insensitively) in `text`."""
    return [subject for subject in SUBJECTS if subject.lower() in text.lower()]
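# Illustrative example:
#   extract_subject_info("The 5G and AI rollout") -> [" 5G ", " AI "]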


def extract_speaker_info(segment: str) -> Optional[Dict[str, Optional[str]]]:
    """Parse a speaker line such as "Name, Company (HH:MM:SS):" into its parts."""
    pattern = (r'(?:(?P<speaker>[^,(]+?)(?:,\s*(?P<company>[^(]+?))?)?'
               r'\s*\((?P<timestamp>\d{2}:\d{2}:\d{2}|\d{2}:\d{2})\):')
    match = re.match(pattern, segment)
    return {key: value.strip() if value else None for key, value in match.groupdict().items()} if match else None
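# Illustrative example (hypothetical names):
#   extract_speaker_info("Jane Doe, Acme Corp (00:01:23):")
#   -> {'speaker': 'Jane Doe', 'company': 'Acme Corp', 'timestamp': '00:01:23'}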


def parse_transcript(content: str) -> List[TranscriptSegment]:
    """Split a transcript into per-speaker segments with start/end timestamps."""
    parsed_segments = []
    saved_info = None
    segments = [segment.strip() for segment in
                re.split(r'(\n.*?\((?:\d{2}:)?\d{2}:\d{2}\):\n)', content)
                if segment.strip()]
    for i, segment in enumerate(segments):
        speaker_info = extract_speaker_info(segment)
        if speaker_info:
            if speaker_info['speaker']:
                if saved_info:
                    text = segments[i - 1] if i > 0 else ""
                    parsed_segments.append(TranscriptSegment(
                        metadata={
                            'speaker': saved_info['speaker'],
                            'company': saved_info['company'],
                            'start_timestamp': saved_info['timestamp'],
                            'end_timestamp': speaker_info['timestamp'],
                            'subjects': extract_subject_info(text)
                        },
                        text=text
                    ))
                saved_info = speaker_info
                if not saved_info['company']:
                    saved_info['company'] = "Unknown"
            else:
                if saved_info:
                    text = segments[i - 1] if i > 0 else ""
                    parsed_segments.append(TranscriptSegment(
                        metadata={
                            'speaker': saved_info['speaker'],
                            'company': saved_info['company'],
                            'start_timestamp': saved_info['timestamp'],
                            'end_timestamp': speaker_info['timestamp'],
                            'subjects': extract_subject_info(text)
                        },
                        text=text
                    ))
                    saved_info['timestamp'] = speaker_info['timestamp']
        elif saved_info:
            continue
    if saved_info:
        text = segments[-1]
        parsed_segments.append(TranscriptSegment(
            metadata={
                'speaker': saved_info['speaker'],
                'company': saved_info['company'],
                'start_timestamp': saved_info['timestamp'],
                'end_timestamp': "00:00:00",
                'subjects': extract_subject_info(text)
            },
            text=text
        ))
    return parsed_segments
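# Expected raw transcript layout (illustrative):
#   Jane Doe, Acme Corp (00:01:23):
#   Spoken text for the first segment...
#   John Smith (00:02:45):
#   Text for the next segment...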


def get_cached_filename(url: str) -> str:
    """Map a URL to its cache file path (without extension)."""
    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}"


async def process_url(url: str) -> Optional[VideoInfo]:
    """Fetch, parse and cache a single video page, returning its VideoInfo (or None on failure)."""
    try:
        cached_filename = get_cached_filename(url)
        html_filename = f"{cached_filename}.html"
        json_filename = f"{cached_filename}.json"
        if os.path.exists(json_filename):
            logger.info(f"Using cached JSON for {url}")
            with open(json_filename, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return VideoInfo(
                metadata=data['metadata'],
                transcript=[TranscriptSegment(**segment) for segment in data['transcript']]
            )
        if os.path.exists(html_filename):
            logger.info(f"Using cached HTML for {url}")
            content = read_file(html_filename)
        else:
            logger.info(f"Fetching content from web for {url}")
            content = await get_client_rendered_content(url)
            with open(html_filename, 'w', encoding='utf-8') as f:
                f.write(content)
        info = extract_info(content)
        if info is None:
            logger.warning(f"No retrievable video found for {url}")
            return None
        if info.transcript:
            logger.info(f"Generating clips for {url}")
            info_dict = asdict(info)
            info_dict['transcript'] = generate_clips(CACHE_DIR, info_dict)
            info = VideoInfo(
                metadata=info_dict['metadata'],
                transcript=[TranscriptSegment(**segment) for segment in info_dict['transcript']]
            )
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(asdict(info), f, ensure_ascii=False, indent=4)
            logger.info(f"Information extracted and saved to {json_filename}")
        else:
            logger.warning(f"No transcript found for {url}")
        return info
    except Exception:
        logger.error(f"Error processing URL {url}:\n{traceback.format_exc()}")
        return None


async def process_urls(urls: List[str]) -> List[Optional[VideoInfo]]:
    """Process multiple URLs concurrently."""
    return await asyncio.gather(*[process_url(url) for url in urls])


def db_save_metadata_sets(processed_urls: Set[str], speakers: Set[str],
                          companies: Dict[str, Set[str]],
                          sentiments: Set[str], subjects: Set[str]):
    """Persist the metadata sets to DB_METADATA_FILE as JSON."""
    metadata = {
        'processed_urls': list(processed_urls),
        'speakers': list(speakers),
        'companies': {company: list(speakers) for company, speakers in companies.items()},
        'sentiments': list(sentiments),
        'subjects': list(subjects)
    }
    with open(DB_METADATA_FILE, 'w') as f:
        json.dump(metadata, f, indent=2)
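# Resulting db_metadata.json shape (illustrative values):
#   {
#     "processed_urls": ["https://example.com/videos/1"],
#     "speakers": ["Jane Doe"],
#     "companies": {"Acme Corp": ["Jane Doe"]},
#     "sentiments": [],
#     "subjects": [" 5G ", " AI "]
#   }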


def db_load_metadata_sets() -> tuple:
    """Load the metadata sets, falling back to empty sets (and the default SUBJECTS) on first run."""
    if os.path.exists(DB_METADATA_FILE):
        with open(DB_METADATA_FILE, 'r') as f:
            metadata = json.load(f)
        return (
            set(metadata.get('processed_urls', [])),
            set(metadata.get('speakers', [])),
            {company: set(speakers) for company, speakers in metadata.get('companies', {}).items()},
            set(metadata.get('sentiments', [])),
            set(metadata.get('subjects', SUBJECTS))
        )
    return set(), set(), {}, set(), set(SUBJECTS)
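# The returned tuple unpacks as (see main()):
#   processed_urls, speakers, companies, sentiments, subjects = db_load_metadata_sets()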


async def main():
    """Process every URL in dsp-urls.txt and add its transcript segments to the knowledge base."""
    assistant = get_ai_assistant()
    url_file = "dsp-urls.txt"
    if not os.path.exists(url_file):
        logger.error(f"Error: {url_file} not found.")
        return
    processed_urls, speakers, companies, sentiments, subjects = db_load_metadata_sets()
    with open(url_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]
    total_urls = len(urls)
    for i, url in enumerate(urls, 1):
        if url in processed_urls:
            logger.info(f"[{i}/{total_urls}] {url} already processed")
            continue
        logger.info(f"[{i}/{total_urls}] Processing {url}")
        info = await process_url(url)
        if info is None:
            logger.warning(f"[{i}/{total_urls}] Failed to process {url}")
            continue
        for entry in info.transcript:
            metadata = {**info.metadata, **entry.metadata}
            company = metadata.get('company')
            speaker = metadata.get('speaker')
            entry_subjects = metadata.get('subjects', [])
            if speaker:
                speakers.add(speaker)
            subjects.update(entry_subjects)
            assistant.add_to_knowledge_base(entry.text, data_type='text', metadata=metadata.copy())
            if company and speaker:
                companies.setdefault(company, set()).add(speaker)
        processed_urls.add(url)
        logger.info(f"[{i}/{total_urls}] Added new url: {url}")
        # Checkpoint after each URL so progress survives interruptions.
        db_save_metadata_sets(processed_urls, speakers, companies, sentiments, subjects)
        assistant.save()
    logger.info("Processing complete. Check logs for any errors.")


if __name__ == "__main__":
    asyncio.run(main())