# Source: Hugging Face Spaces - Qdrant / webinar-vibe-coding-rag (status: Running)
# File size: 24,033 bytes


import os
import uuid
from typing import List, Dict, Any, Optional
import re
from datetime import datetime
from sentence_transformers import SentenceTransformer
from qdrant_client.http import models
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import WebshareProxyConfig
import yt_dlp
from app.models.video import VideoSegment, Video, SearchResult
from app.services.qdrant_service import qdrant_client

# Initialize the sentence transformer model.
# This runs once, when the module is imported; get_embeddings() and
# ensure_collection_exists() both use this shared instance. Model files are
# cached under /tmp.
model = SentenceTransformer(
    "sentence-transformers/static-retrieval-mrl-en-v1", cache_folder="/tmp"
)

# Collection names
# Transcript segments and their embeddings (the collection that is searched).
COLLECTION_NAME = "video_segments"
# One point per processed video; the payload is the dumped Video model.
PROCESSED_VIDEOS_COLLECTION = "processed_videos"


def _fetch_youtube_metadata(video_id: str, video: Optional[Video] = None) -> Video:
    """Fill in a Video's title, description and channel using yt-dlp.

    Args:
        video_id: YouTube video ID to look up.
        video: Existing Video to update in place; a new one is created for
            ``video_id`` when this is missing or falsy.

    Returns:
        The (possibly newly created) Video. Lookup failures are never
        raised: they are logged, and a video that still has no title gets
        the placeholder ``"Video <video_id>"``.
    """
    import logging

    video = video or Video(video_id=video_id)

    # Metadata-only run: nothing is downloaded and yt-dlp prints nothing.
    extractor_options = {
        "skip_download": True,
        "quiet": True,
        "no_warnings": True,
        "extract_flat": True,
        "format": "best",
    }
    # (Video attribute, key in the yt-dlp info dict), applied in this order.
    field_sources = (
        ("title", "title"),
        ("description", "description"),
        ("channel", "uploader"),
    )
    watch_url = f"https://www.youtube.com/watch?v={video_id}"

    try:
        logging.info(f"Fetching metadata for video {video_id} from YouTube")

        with yt_dlp.YoutubeDL(extractor_options) as ydl:
            info = ydl.extract_info(watch_url, download=False)

            # Only overwrite a field when YouTube returned a non-empty value.
            for attribute, info_key in field_sources:
                value = info.get(info_key)
                if value:
                    setattr(video, attribute, value)

        logging.info(
            f"Successfully retrieved video metadata: title='{video.title}', channel='{video.channel}'"
        )
    except Exception as meta_error:
        logging.warning(f"Could not fetch metadata from YouTube: {str(meta_error)}")
        if not video.title:
            video.title = f"Video {video_id}"

    return video


# Ensure collections exist
def ensure_collection_exists():
    """Create the Qdrant collections this module relies on if they are missing.

    Both collections use cosine distance and the embedding model's vector
    size. The processed-videos collection additionally gets payload indexes
    on ``video_id`` (keyword) and ``created_at`` (integer, range-enabled).

    Raises:
        Exception: Any error from the Qdrant client or the model is logged
            together with its traceback and then re-raised.
    """
    import logging
    import traceback

    def _create_cosine_collection(name):
        """Create collection ``name`` sized for the model; return that size."""
        dimension = model.get_sentence_embedding_dimension()
        qdrant_client.create_collection(
            collection_name=name,
            vectors_config=models.VectorParams(
                size=dimension,
                distance=models.Distance.COSINE,
            ),
        )
        return dimension

    try:
        logging.info("Checking Qdrant collections")
        collection_names = [
            existing.name for existing in qdrant_client.get_collections().collections
        ]
        logging.info(f"Existing collections: {collection_names}")

        # Transcript segments collection.
        segments_missing = COLLECTION_NAME not in collection_names
        if segments_missing:
            logging.info(f"Creating collection: {COLLECTION_NAME}")
            vector_size = _create_cosine_collection(COLLECTION_NAME)
            logging.info(
                f"Successfully created {COLLECTION_NAME} collection with vector size {vector_size}"
            )

        # Processed videos collection, plus the payload indexes used for
        # lookup by video_id and ordering by created_at.
        videos_missing = PROCESSED_VIDEOS_COLLECTION not in collection_names
        if videos_missing:
            logging.info(f"Creating collection: {PROCESSED_VIDEOS_COLLECTION}")
            vector_size = _create_cosine_collection(PROCESSED_VIDEOS_COLLECTION)
            qdrant_client.create_payload_index(
                collection_name=PROCESSED_VIDEOS_COLLECTION,
                field_name="video_id",
                field_schema=models.PayloadSchemaType.KEYWORD,
            )
            qdrant_client.create_payload_index(
                collection_name=PROCESSED_VIDEOS_COLLECTION,
                field_name="created_at",
                field_schema=models.IntegerIndexParams(
                    type=models.IntegerIndexType.INTEGER,
                    range=True,
                ),
            )
            logging.info(
                f"Successfully created {PROCESSED_VIDEOS_COLLECTION} collection with vector size {vector_size}"
            )
    except Exception as e:
        logging.error(f"Error ensuring collections exist: {str(e)}")
        logging.error(traceback.format_exc())
        raise


def get_embeddings(text: str) -> List[float]:
    """Encode ``text`` with the module-level model and return it as a plain list."""
    embedding = model.encode(text)
    return embedding.tolist()


def extract_video_id(youtube_url: str) -> str:
    """Extract the YouTube video ID from a URL or a bare video ID.

    Recognised forms (scheme and subdomain are irrelevant because the
    patterns are searched anywhere in the input):

    * ``youtube.com/watch?v=ID`` - ``v`` may appear anywhere in the query
      string, e.g. ``watch?feature=share&v=ID``
    * ``youtu.be/ID``
    * ``youtube.com/embed/ID`` and ``youtube-nocookie.com/embed/ID``
    * ``youtube.com/v/ID``
    * ``youtube.com/shorts/ID`` and ``youtube.com/live/ID``
    * a bare ID made only of letters, digits, ``_`` and ``-``

    Args:
        youtube_url: URL or bare video ID; surrounding whitespace is ignored.

    Returns:
        The extracted video ID.

    Raises:
        ValueError: If no video ID can be found in the input.
    """
    import logging

    logging.info(f"Extracting video ID from URL: {youtube_url}")

    # Pasted input often carries stray spaces or a trailing newline; without
    # this a bare ID followed by "\n" would be returned with the newline.
    youtube_url = youtube_url.strip()

    # Tried in order; the first pattern that matches wins.
    patterns = [
        # "(?:[^#\s]*?&)?" lets other query parameters precede "v=".
        r"(?:youtube\.com/watch\?(?:[^#\s]*?&)?v=|youtu\.be/)([\w-]+)",
        r"(?:youtube(?:-nocookie)?\.com/embed/)([\w-]+)",
        r"(?:youtube\.com/v/)([\w-]+)",
        r"(?:youtube\.com/(?:shorts|live)/)([\w-]+)",
    ]

    for pattern in patterns:
        match = re.search(pattern, youtube_url)
        if match:
            video_id = match.group(1)
            logging.info(f"Extracted video ID: {video_id}")
            return video_id

    # If no pattern matches, assume the input might be a direct video ID.
    # fullmatch (unlike match + "$") does not tolerate a trailing newline.
    if re.fullmatch(r"[\w-]+", youtube_url):
        logging.info(f"Using direct video ID: {youtube_url}")
        return youtube_url

    logging.error(f"Failed to extract video ID from URL: {youtube_url}")
    raise ValueError(f"Could not extract video ID from URL: {youtube_url}")


def get_video_transcript(video_id: str) -> List[Dict[str, Any]]:
    """
    Get transcript for a YouTube video in any available language.
    Will try to get transcripts in this priority:
    1. English transcript (if available) - the exact code "en" first, then
       regional variants such as "en-US" or "en-GB"
    2. Any available transcript translated to English (if translatable)
    3. Any available transcript in its original language

    A failure at one step is logged and the next candidate is tried.

    If both WEBSHARE_USERNAME and WEBSHARE_PASSWORD are set in the
    environment, requests go through a Webshare proxy.

    Args:
        video_id: YouTube video ID.

    Returns:
        Whatever ``fetch()`` returns for the chosen transcript.
        NOTE(review): the concrete type depends on the installed
        youtube-transcript-api version; process_video() accepts both dict
        items and objects with text/start/duration attributes.

    Raises:
        ValueError: If the transcript list cannot be retrieved or no
            transcript can be fetched. The original error is chained as
            ``__cause__``.
    """
    import logging
    import traceback

    try:
        # Try to get available transcript languages
        webshare_username = os.environ.get("WEBSHARE_USERNAME")
        webshare_password = os.environ.get("WEBSHARE_PASSWORD")
        if webshare_username and webshare_password:
            yt_transcript_api = YouTubeTranscriptApi(
                proxy_config=WebshareProxyConfig(
                    proxy_username=webshare_username,
                    proxy_password=webshare_password,
                )
            )
        else:
            yt_transcript_api = YouTubeTranscriptApi()
        transcript_list = yt_transcript_api.list(video_id)

        # Categorize available transcripts. Regional English variants count
        # as English so they are fetched directly instead of being sent
        # through an English-to-English translation.
        english_transcripts = []
        other_transcripts = []
        for transcript_item in transcript_list:
            language_code = transcript_item.language_code
            if language_code == "en" or language_code.startswith("en-"):
                english_transcripts.append(transcript_item)
            else:
                other_transcripts.append(transcript_item)

        # Stable sort: plain "en" ahead of regional variants, otherwise the
        # order reported by the API is kept.
        english_transcripts.sort(key=lambda item: item.language_code != "en")

        # 1. Try English first if available; fall through to the next
        #    English track when one of them cannot be fetched.
        for transcript_item in english_transcripts:
            try:
                logging.info("Found English transcript, using it directly")
                return transcript_item.fetch()
            except Exception as e:
                logging.warning(f"Failed to fetch English transcript: {str(e)}")

        # 2. Try translatable transcripts
        translatable_transcripts = [t for t in other_transcripts if t.is_translatable]
        for transcript_item in translatable_transcripts:
            try:
                logging.info(
                    f"Trying to translate {transcript_item.language_code} transcript to English"
                )
                translated = transcript_item.translate("en").fetch()
                logging.info(
                    f"Successfully translated {transcript_item.language_code} transcript to English"
                )
                return translated
            except Exception as e:
                logging.warning(
                    f"Failed to translate {transcript_item.language_code} transcript: {str(e)}"
                )

        # 3. Try any transcript in original language
        for transcript_item in other_transcripts:
            try:
                logging.info(
                    f"Using non-translated {transcript_item.language_code} transcript"
                )
                return transcript_item.fetch()
            except Exception as e:
                logging.warning(
                    f"Failed to fetch {transcript_item.language_code} transcript: {str(e)}"
                )

        # If we get here, no transcripts worked
        available_langs = [
            t.language_code for t in english_transcripts + other_transcripts
        ]
        raise ValueError(
            f"No usable transcripts found for video {video_id}. Available languages: {available_langs}"
        )

    except Exception as e:
        logging.error(f"Transcript API error for video {video_id}: {str(e)}")
        logging.error(traceback.format_exc())
        # Chain the cause so callers can still inspect the underlying error.
        raise ValueError(
            f"Could not get transcript for video {video_id}: {str(e)}"
        ) from e


def store_processed_video(video: Video) -> bool:
    """Store a processed video in Qdrant.

    The point ID is a UUID derived from ``video.video_id``, so storing the
    same video again replaces its existing point instead of adding a second
    one (``upsert`` overwrites a point that has the same ID).

    Args:
        video: Video whose dumped model becomes the point payload.

    Returns:
        True when the upsert succeeded, False when any error occurred (the
        error is logged, never raised).
    """
    import logging

    try:
        # Get a simple embedding for the video ID; the collection requires a
        # vector, but videos are looked up by payload, not by similarity.
        vector = get_embeddings(f"video_{video.video_id}")

        # Prepare payload
        payload = video.model_dump()

        # Deterministic ID: the same video_id always maps to the same point.
        point_id = uuid.uuid5(uuid.NAMESPACE_URL, f"video/{video.video_id}").hex

        # Store in Qdrant
        qdrant_client.upsert(
            collection_name=PROCESSED_VIDEOS_COLLECTION,
            points=[
                models.PointStruct(
                    id=point_id,
                    vector=vector,
                    payload=payload,
                ),
            ],
        )
        return True
    except Exception as e:
        # Use logging like the rest of the module rather than print().
        logging.error(f"Error storing processed video: {e}")
        return False


def get_processed_videos(limit: int = 10) -> List[Video]:
    """Get recently processed videos ordered by creation time.

    Args:
        limit: Maximum number of videos to return.

    Returns:
        Up to ``limit`` videos, most recent first. Videos without a
        ``created_at`` value sort last. An empty list is returned when the
        query fails (the error is logged, never raised).
    """
    import logging

    try:
        # Scroll through the processed videos collection, newest first.
        scroll_result = qdrant_client.scroll(
            collection_name=PROCESSED_VIDEOS_COLLECTION,
            limit=limit,
            with_payload=True,
            order_by=models.OrderBy(key="created_at", direction=models.Direction.DESC),
        )

        # Convert each payload back into a Video.
        videos = [Video(**point.payload) for point in scroll_result[0]]

        # Sort by created_at timestamp (most recent first). The key is a
        # (has_value, value) pair so a missing timestamp is never compared
        # with a real one; the previous `created_at or ""` fallback raised
        # TypeError against integer timestamps and emptied the whole result.
        videos.sort(
            key=lambda video: (video.created_at is not None, video.created_at),
            reverse=True,
        )

        return videos[:limit]
    except Exception as e:
        # Use logging like the rest of the module rather than print().
        logging.error(f"Error getting processed videos: {e}")
        return []


def _normalize_transcript(transcript: Any) -> List[Dict[str, Any]]:
    """Convert raw transcript items into ``{"text", "start", "duration"}`` dicts."""
    import logging

    required = ("text", "start", "duration")
    normalized = []
    for position, item in enumerate(transcript):
        if isinstance(item, dict) and all(key in item for key in required):
            # Original dictionary format
            normalized.append({key: item[key] for key in required})
        elif all(hasattr(item, key) for key in required):
            # Object with attributes
            normalized.append({key: getattr(item, key) for key in required})
        else:
            # Unknown format, try to extract what we can
            logging.warning(
                f"Encountered unknown transcript item format: {type(item)}"
            )
            try:
                normalized.append(
                    {
                        "text": str(item),
                        # No timing available: approximate 5 seconds per item
                        # from the item's position. enumerate() replaces the
                        # old transcript.index(item) lookup, which was O(n)
                        # per item and returned the first occurrence for
                        # duplicated items.
                        "start": float(position * 5),
                        "duration": 5.0,
                    }
                )
            except Exception as e:
                logging.error(f"Failed to normalize transcript item: {str(e)}")
    return normalized


def _build_segments(
    video_id: str,
    entries: List[Dict[str, Any]],
    window_seconds: float = 30.0,
    overlap_seconds: float = 10.0,
) -> "List[VideoSegment]":
    """Group normalized entries into ~window_seconds segments that overlap by ~overlap_seconds."""
    stride_seconds = window_seconds - overlap_seconds
    total = len(entries)
    segments = []

    # A while loop is required here: reassigning the variable of a
    # `for ... in range(...)` loop does not skip iterations, which is why the
    # previous implementation emitted one segment per transcript entry.
    index = 0
    while index < total:
        start_time = entries[index]["start"]
        end_time = start_time
        texts = []

        # Extend the window until it spans at least window_seconds.
        cursor = index
        while cursor < total and end_time - start_time < window_seconds:
            texts.append(entries[cursor]["text"])
            end_time = entries[cursor]["start"] + entries[cursor]["duration"]
            cursor += 1

        if texts:  # Only create segment if we have text
            segments.append(
                VideoSegment(
                    text=" ".join(texts),
                    start=start_time,
                    end=end_time,
                    segment_id=f"{video_id}_{index}",
                    video_id=video_id,
                )
            )

        # Next window starts at the first entry at least stride_seconds
        # after this window's start. Never jump past `cursor`, so an entry
        # that was not part of this window cannot be skipped.
        next_index = index + 1
        if (
            next_index < total
            and entries[next_index]["start"] < end_time - overlap_seconds
        ):
            while (
                next_index < cursor
                and entries[next_index]["start"] < start_time + stride_seconds
            ):
                next_index += 1
        index = next_index

    return segments


def process_video(youtube_url: str) -> Video:
    """Process a YouTube video to extract and store transcript segments.

    Steps:
    1. Extract the video ID; return the stored Video right away if it is
       already marked as processed.
    2. Fetch metadata (best effort) and the transcript. When no real title
       is available, the first 30 characters of the transcript are used.
    3. Split the transcript into roughly 30-second segments that overlap by
       roughly 10 seconds, and store each one in Qdrant.
    4. Mark the video as processed and store it.

    Args:
        youtube_url: YouTube URL or bare video ID.

    Returns:
        The processed Video. It is returned even when persisting the video
        record itself fails (that failure is only logged).

    Raises:
        Exception: Errors from ID extraction, transcript retrieval,
            segmentation or collection setup are logged with a traceback and
            re-raised unchanged.
    """
    import logging
    import time
    import traceback

    logging.info(f"Processing video URL: {youtube_url}")
    transcript = None
    video_id = None

    # Extract video ID and get transcript
    try:
        # Extract video ID
        video_id = extract_video_id(youtube_url)
        logging.info(f"Successfully extracted video ID: {video_id}")

        # Check if video has already been processed
        existing_video = get_video_by_id(video_id)
        if existing_video and existing_video.processed:
            logging.info(
                f"Video {video_id} has already been processed. Skipping processing."
            )
            return existing_video

        # Create basic video object with current timestamp.
        # time.time() is the real Unix epoch; datetime.utcnow().timestamp()
        # interprets the naive UTC value as local time and is therefore off
        # by the host's UTC offset.
        current_time = int(time.time())
        video = Video(video_id=video_id, created_at=current_time)

        # Get video metadata from YouTube using the helper function
        try:
            video = _fetch_youtube_metadata(video_id, video)
        except Exception as meta_error:
            logging.warning(
                f"Error fetching YouTube metadata during processing: {str(meta_error)}"
            )
            # Continue with processing even if metadata fetch fails

        # Get transcript
        logging.info(f"Fetching transcript for video ID: {video_id}")
        transcript = get_video_transcript(video_id)
        logging.info(
            f"Successfully retrieved transcript with {len(transcript)} entries"
        )

        # If we couldn't get metadata and have a transcript, try to extract a title from transcript
        if (
            (not video.title or video.title == f"Video {video_id}")
            and transcript
            and len(transcript) > 0
        ):
            # Handle different transcript formats
            try:
                first_item = transcript[0]
                # Check if transcript is a list of dictionaries (original format)
                if isinstance(first_item, dict) and "text" in first_item:
                    video.title = f"{first_item['text'][:30]}..."
                # Check if transcript is a list of objects with text attribute
                elif hasattr(first_item, "text"):
                    video.title = f"{first_item.text[:30]}..."
                # If it's another format, just use the string representation of first item
                else:
                    first_item_str = str(first_item)[:30]
                    video.title = f"{first_item_str}..."
                logging.info(f"Set video title from transcript: {video.title}")
            except Exception as title_error:
                logging.warning(
                    f"Could not set title from transcript: {str(title_error)}"
                )
    except Exception as e:
        logging.error(f"Error in initial video processing: {str(e)}")
        logging.error(traceback.format_exc())
        raise

    # Process transcript into segments
    try:
        # Process transcript into overlapping 30-second segments with 10-second overlap
        logging.info(f"Processing {len(transcript)} transcript entries into segments")
        normalized_transcript = _normalize_transcript(transcript)
        segments = _build_segments(video_id, normalized_transcript)
        logging.info(f"Created {len(segments)} segments from transcript")

        # Store segments in Qdrant
        logging.info("Ensuring Qdrant collections exist")
        ensure_collection_exists()

        # Store each segment. store_segment() reports failure through its
        # return value, so count successes instead of discarding them.
        logging.info(f"Storing {len(segments)} segments in Qdrant")
        stored_count = sum(1 for segment in segments if store_segment(segment))
        if stored_count != len(segments):
            logging.warning(
                f"Only {stored_count} of {len(segments)} segments were stored in Qdrant"
            )
    except Exception as e:
        logging.error(f"Error processing transcript segments: {str(e)}")
        logging.error(traceback.format_exc())
        raise

    # Mark video as processed and store it
    try:
        logging.info(f"Marking video {video_id} as processed")
        video.processed = True

        # Store the processed video in Qdrant
        logging.info("Storing processed video in Qdrant")
        store_result = store_processed_video(video)
        if store_result:
            logging.info(f"Successfully stored processed video: {video_id}")
        else:
            logging.warning(f"Failed to store processed video in Qdrant: {video_id}")

        return video
    except Exception as e:
        logging.error(f"Error storing processed video: {str(e)}")
        logging.error(traceback.format_exc())
        raise


def store_segment(segment: VideoSegment) -> bool:
    """Store a video segment in Qdrant.

    The point ID is a UUID derived from the segment's ``video_id`` and
    ``segment_id``, so storing the same segment again (for example when a
    video is processed a second time) replaces the existing point instead of
    adding a duplicate that would show up twice in search results.
    NOTE(review): this relies on ``segment_id`` being unique within a video,
    as process_video() generates it - confirm for any other callers.

    Args:
        segment: Segment to embed and store; its dumped model is the payload.

    Returns:
        True when the upsert succeeded, False when any error occurred (the
        error and traceback are logged, never raised).
    """
    import logging

    try:
        # Get embeddings
        logging.debug(f"Getting embeddings for segment {segment.segment_id}")
        vector = get_embeddings(segment.text)

        # Prepare payload
        payload = segment.model_dump()

        # Deterministic ID: the same segment always maps to the same point.
        point_id = uuid.uuid5(
            uuid.NAMESPACE_URL, f"{segment.video_id}/{segment.segment_id}"
        ).hex
        logging.debug(
            f"Storing segment {segment.segment_id} in Qdrant with point ID {point_id}"
        )
        qdrant_client.upsert(
            collection_name=COLLECTION_NAME,
            points=[
                models.PointStruct(
                    id=point_id,
                    vector=vector,
                    payload=payload,
                ),
            ],
        )
        return True
    except Exception as e:
        import traceback

        logging.error(f"Error storing segment {segment.segment_id}: {str(e)}")
        logging.error(traceback.format_exc())
        return False


def search_video_segments(
    query: str, video_id: Optional[str] = None, limit: int = 5
) -> List[SearchResult]:
    """Run a vector search over stored segments for ``query``.

    Args:
        query: Free-text query; it is embedded with get_embeddings().
        video_id: When given (and non-empty), only segments whose payload
            ``video_id`` matches are searched.
        limit: Maximum number of hits requested from Qdrant.

    Returns:
        One SearchResult per hit, in the order Qdrant returned them, each
        carrying the hit's score and its payload rebuilt as a VideoSegment.
    """
    embedded_query = get_embeddings(query)

    # Restrict to a single video only when an ID was supplied.
    video_filter = (
        models.Filter(
            must=[
                models.FieldCondition(
                    key="video_id",
                    match=models.MatchValue(value=video_id),
                ),
            ],
        )
        if video_id
        else None
    )

    hits = qdrant_client.search(
        collection_name=COLLECTION_NAME,
        query_vector=embedded_query,
        limit=limit,
        query_filter=video_filter,
    )

    return [
        SearchResult(score=hit.score, segment=VideoSegment(**hit.payload))
        for hit in hits
    ]


def get_all_segments(video_id: str) -> List[VideoSegment]:
    """Return the stored segments of one video, sorted by start time.

    Args:
        video_id: Value the ``video_id`` payload field must match.

    Returns:
        The matching segments in ascending ``start`` order. A single scroll
        request with ``limit=10000`` is issued, so at most that many
        segments are returned.
    """
    only_this_video = models.Filter(
        must=[
            models.FieldCondition(
                key="video_id",
                match=models.MatchValue(value=video_id),
            ),
        ],
    )

    # Scroll (no query vector) just to list the points; element 0 of the
    # result holds the records.
    records = qdrant_client.scroll(
        collection_name=COLLECTION_NAME,
        scroll_filter=only_this_video,
        limit=10000,  # Adjust based on expected maximum segments
    )[0]

    return sorted(
        (VideoSegment(**record.payload) for record in records),
        key=lambda item: item.start,
    )


def get_video_by_id(video_id: str) -> Optional[Video]:
    """Look up a video by ``video_id``, falling back to YouTube metadata.

    Args:
        video_id: YouTube video ID to look up.

    Returns:
        * The stored Video when one exists; if it has no title, or only the
          ``"Video <video_id>"`` placeholder, its metadata is refreshed
          from YouTube first.
        * A new Video populated from YouTube when nothing is stored.
        * A bare Video with the placeholder title when any error occurs
          (the error is logged, never raised).
    """
    import logging

    placeholder_title = f"Video {video_id}"

    try:
        by_video_id = models.Filter(
            must=[
                models.FieldCondition(
                    key="video_id",
                    match=models.MatchValue(value=video_id),
                ),
            ],
        )

        # One record is enough; element 0 of the scroll result is the list
        # of matching points.
        matches = qdrant_client.scroll(
            collection_name=PROCESSED_VIDEOS_COLLECTION,
            scroll_filter=by_video_id,
            limit=1,
            with_payload=True,
        )[0]

        if not matches:
            # Nothing stored yet: build a Video from YouTube metadata only.
            logging.info(
                f"Video {video_id} not found in database, fetching from YouTube"
            )
            return _fetch_youtube_metadata(video_id, Video(video_id=video_id))

        stored = Video(**matches[0].payload)
        if stored.title and stored.title != placeholder_title:
            return stored

        # Stored without a usable title: try to fill it in from YouTube.
        return _fetch_youtube_metadata(video_id, stored)

    except Exception as e:
        logging.error(f"Error getting video by ID {video_id}: {str(e)}")
        # Return a basic video object with just the ID
        return Video(video_id=video_id, title=f"Video {video_id}")