import re
import asyncio
import json
import os
import gc
import traceback
import hashlib

from pyppeteer import launch
from bs4 import BeautifulSoup

from ai_config_faiss import get_ai_assistant
from video_utils import generate_clips

CACHE_DIR = "cache/"
if not os.path.exists(CACHE_DIR):
    os.makedirs(CACHE_DIR)

DB_METADATA_FILE = os.path.join(CACHE_DIR, "db_metadata.json")

SUBJECTS = [
    " 5G ",
    " AI ",
    " Innovation ",
    " Network ",
    " Enterprise ",
    " Open RAN ",
    " TechCo ",
    " B2B ",
    " API ",
    " Infrastructure ",
    " Connectivity ",
]


async def get_client_rendered_content(url):
    browser = None
    try:
        browser = await launch()
        page = await browser.newPage()
        await page.goto(url, {'waitUntil': 'networkidle0', 'timeout': 60000})
        await asyncio.sleep(5)
        content = await page.content()
        return content
    except Exception as e:
        raise Exception(f"Error fetching content: {str(e)}")
    finally:
        if browser:
            await browser.close()


def extract_info(html_content):
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.title.string.strip() if soup.title else None
        date_elem = soup.find('p', class_='content-date')
        date = date_elem.find(
            'span', class_='ng-binding').text.strip() if date_elem else None
        youtube_iframe = soup.find(
            'iframe', src=lambda x: x and 'youtube.com' in x)
        youtube_url = youtube_iframe['src'] if youtube_iframe else None
        youtube_id = None
        if youtube_url:
            match = re.search(r'youtube\.com/embed/([^?]+)', youtube_url)
            if match:
                youtube_id = match.group(1)
        transcript_elem = soup.find(id='transcript0')
        transcript = transcript_elem.get_text(
            strip=True) if transcript_elem else None
        return {
            'metadata': {
                'title': title,
                'date': date,
                'youtube_id': youtube_id,
            },
            'transcript': transcript
        }
    except Exception as e:
        raise Exception(f"Error extracting information: {str(e)}")


def read_html_from_file(filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return f.read()
        return None
    except Exception as e:
        raise Exception(f"Error reading file {filename}: {str(e)}")


def read_json_from_file(filename):
    try:
        if os.path.exists(filename):
            with open(filename, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None
    except json.JSONDecodeError as e:
        raise Exception(f"Error decoding JSON in file {filename}: {str(e)}")
    except Exception as e:
        raise Exception(f"Error reading file {filename}: {str(e)}")


def extract_speaker_info(segment):
    try:
        # Full header: "Firstname Lastname, Company (HH:MM:SS):"
        pattern = r'(?P<speaker>(?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+), (?P<company>[A-Za-z0-9\s]+)\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
        match = re.match(pattern, segment)
        if match:
            return {key: value.strip() if value else None
                    for key, value in match.groupdict().items()}
        # Standalone timestamp header: "(HH:MM:SS):"
        timestamp_pattern = r'\((?P<timestamp>(?:\d{2}:)?\d{2}:\d{2})\):'
        timestamp_match = re.match(timestamp_pattern, segment)
        if timestamp_match:
            return {'speaker': None, 'company': None,
                    'timestamp': timestamp_match.group('timestamp')}
        return None
    except Exception as e:
        raise Exception(f"Error extracting speaker info: {str(e)}")


def extract_subject_info(text):
    # Convert text to lowercase for case-insensitive matching
    lower_text = text.lower()
    # Find all subjects present in the text
    found_subjects = [
        subject for subject in SUBJECTS if subject.lower() in lower_text]
    return found_subjects
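

# Illustrative sketch of the transcript header formats that extract_speaker_info()
# above and parse_transcript() below are written to match. The name, company and
# timestamps are hypothetical, not taken from any real transcript:
#
#   Jane Doe, ExampleCorp (00:01:23):   -> speaker, company, start timestamp
#   (00:05:47):                         -> standalone timestamp, same speaker
#
# extract_speaker_info() returns a dict with 'speaker', 'company' and 'timestamp'
# for the first form, and 'speaker'/'company' set to None for the second.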


def parse_transcript(content):
    try:
        parsed_segments = []
        saved_info = None
        # Split on speaker headers and standalone timestamps, keeping the delimiters.
        pattern = r'((?:[A-Z][a-zöäüß]+ ){1,3}[A-Z][a-zöäüß]+,\s+[A-Za-z0-9\s]+\s+\((?:\d{2}:)?\d{2}:\d{2}\):|\((?:\d{2}:)?\d{2}:\d{2}\):)'
        segments = re.split(pattern, content)
        segments = [segment.strip() for segment in segments if segment.strip()]

        for i, segment in enumerate(segments):
            speaker_info = extract_speaker_info(segment)
            if speaker_info:
                if speaker_info['speaker']:
                    # Full speaker, company, timestamp format
                    if saved_info:
                        text = segments[i - 1] if i > 0 else ""
                        subjects = extract_subject_info(text)
                        parsed_segments.append({
                            'metadata': {
                                'speaker': saved_info['speaker'],
                                'company': saved_info['company'],
                                'start_timestamp': saved_info['timestamp'],
                                'end_timestamp': speaker_info['timestamp'],
                                'subjects': subjects
                            },
                            'text': text
                        })
                    saved_info = speaker_info
                else:
                    # Standalone timestamp format
                    if saved_info:
                        text = segments[i - 1] if i > 0 else ""
                        subjects = extract_subject_info(text)
                        parsed_segments.append({
                            'metadata': {
                                'speaker': saved_info['speaker'],
                                'company': saved_info['company'],
                                'start_timestamp': saved_info['timestamp'],
                                'end_timestamp': speaker_info['timestamp'],
                                'subjects': subjects
                            },
                            'text': text
                        })
                        saved_info['timestamp'] = speaker_info['timestamp']
            elif saved_info:
                # Text segment
                continue

        # Add final entry
        if saved_info:
            text = segments[-1]
            subjects = extract_subject_info(text)
            parsed_segments.append({
                'metadata': {
                    'speaker': saved_info['speaker'],
                    'company': saved_info['company'],
                    'start_timestamp': saved_info['timestamp'],
                    'end_timestamp': "00:00:00",
                    'subjects': subjects
                },
                'text': text
            })

        return parsed_segments
    except Exception as e:
        raise Exception(f"Error parsing transcript: {str(e)}")


def get_cached_filename(url):
    return f"{CACHE_DIR}cached_{url.replace('://', '_').replace('/', '_')}.html"


async def process_url(url):
    try:
        cached_filename = get_cached_filename(url)
        json_filename = f"{cached_filename}.json"

        info = read_json_from_file(json_filename)
        if info:
            return info

        content = read_html_from_file(cached_filename)
        if content is None:
            print(f"Fetching content from web for {url}...")
            content = await get_client_rendered_content(url)
            with open(cached_filename, 'w', encoding='utf-8') as f:
                f.write(content)
        else:
            print(f"Using cached content from file for {url}...")

        info = extract_info(content)
        transcript = info['transcript']
        if transcript:
            info['transcript'] = parse_transcript(transcript)
            generate_clips(CACHE_DIR, info)
            with open(json_filename, 'w', encoding='utf-8') as f:
                json.dump(info, f, ensure_ascii=False, indent=4)
            print(f"Information extracted and saved to {json_filename}")
        else:
            print(f"No transcript found for {url}")
        return info
    except Exception as e:
        print(f"Error processing URL {url}:")
        print(traceback.format_exc())
        print(f"Detailed error: {str(e)}")
        return None


async def process_urls(urls):
    tasks = [process_url(url) for url in urls]
    return await asyncio.gather(*tasks)
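

# Cache layout produced by process_url(), for reference. The URL below is a
# hypothetical example; real filenames are derived from the URLs in dsp-urls.txt:
#
#   cache/cached_https_example.com_page.html        client-rendered HTML
#   cache/cached_https_example.com_page.html.json   extracted metadata + parsed transcript
#   cache/db_metadata.json                          aggregated hashes/speakers/companies/subjects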


def main():
    global assistant
    assistant = get_ai_assistant()
    url_file = "dsp-urls.txt"  # File containing list of URLs

    if not os.path.exists(url_file):
        print(f"Error: {url_file} not found.")
        return

    content_hashes, speakers, companies, sentiments, subjects = db_load_metadata_sets()

    # Convert companies to a dictionary of speaker sets if it's not already
    if not isinstance(companies, dict):
        companies = {company: set() for company in companies}

    with open(url_file, 'r') as f:
        urls = [line.strip() for line in f if line.strip()]

    for url in urls:
        # Generate a hash of the url
        filename_hash = hashlib.md5(url.encode()).hexdigest()

        # Check if this content has already been added
        if filename_hash in content_hashes:
            print(f"{url} already added")
            continue

        info = asyncio.run(process_url(url))
        if info is None:
            continue

        metadata = info['metadata']
        transcript = info['transcript']
        if transcript is None:
            continue

        for entry in transcript:
            metadata.update(entry['metadata'])
            company = metadata['company']
            speaker = metadata['speaker']
            entry_subjects = metadata['subjects']

            speakers.add(speaker)
            # Add new subjects to the master set
            subjects.update(entry_subjects)

            text = entry['text']
            assistant.add_to_knowledge_base(
                text, data_type='text', metadata=metadata.copy())

            if company not in companies:
                companies[company] = set()
            companies[company].add(speaker)

        content_hashes.add(filename_hash)
        print(f"Added new url: {url}")

    # Save updated hashes and metadata
    save_metadata_sets(content_hashes, speakers,
                       companies, sentiments, subjects)
    assistant.save()

    print("Processing complete. Check individual URL outputs for any errors.")


def save_metadata_sets(content_hashes, speakers, companies, sentiments, subjects):
    metadata = {
        'content_hashes': list(content_hashes),
        'speakers': list(speakers),
        'companies': {company: list(speakers)
                      for company, speakers in companies.items()},
        'sentiments': list(sentiments),
        'subjects': list(subjects)
    }
    with open(DB_METADATA_FILE, 'w') as f:
        json.dump(metadata, f, indent=2)


def db_load_metadata_sets():
    content_hashes = set()
    speakers = set()
    companies = {}
    sentiments = set()
    subjects = set()

    if os.path.exists(DB_METADATA_FILE):
        with open(DB_METADATA_FILE, 'r') as f:
            metadata = json.load(f)

        content_hashes = set(metadata.get('content_hashes', []))
        speakers = set(metadata.get('speakers', []))
        companies = {company: set(speakers)
                     for company, speakers in metadata.get('companies', {}).items()}
        sentiments = set(metadata.get('sentiments', []))
        subjects = set(metadata.get('subjects', SUBJECTS))

    return content_hashes, speakers, companies, sentiments, subjects


if __name__ == "__main__":
    main()
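

# Shape of cache/db_metadata.json as written by save_metadata_sets(). Values are
# illustrative placeholders, not real data:
#
#   {
#     "content_hashes": ["<md5 of url>", ...],
#     "speakers": ["Jane Doe", ...],
#     "companies": {"ExampleCorp": ["Jane Doe"]},
#     "sentiments": [],
#     "subjects": [" 5G ", " AI ", ...]
#   }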