import os
from typing import Any, Dict, List, Optional

import streamlit as st
from embedchain import App


def timestamp_to_seconds(timestamp: str) -> int:
    """Convert a timestamp in the format 'hh:mm:ss' or 'mm:ss' to total seconds.

    Args:
        timestamp: Colon-separated timestamp with two or three integer fields.

    Returns:
        The number of seconds the timestamp represents.

    Raises:
        ValueError: If the timestamp does not have exactly two or three
            fields, or if a field is not an integer.
    """
    parts = timestamp.split(':')
    if len(parts) not in (2, 3):
        raise ValueError(f"Invalid timestamp format: {timestamp}")

    # Fold left to right: every additional field multiplies what came before
    # by 60, which yields h * 3600 + m * 60 + s (or m * 60 + s).
    seconds = 0
    for part in parts:
        seconds = seconds * 60 + int(part)
    return seconds


class AIAssistant:
    """Wrapper around an embedchain ``App`` used to search video transcripts."""

    def __init__(self):
        self.app = self._create_app()

    def _get_api_key(self, name: str) -> str:
        """Look up a credential in the environment, then in Streamlit secrets.

        Args:
            name: Name of the environment variable / secrets entry.

        Returns:
            The non-empty credential value.

        Raises:
            ValueError: If the credential is set in neither place.
        """
        api_key = os.environ.get(name)
        if not api_key:
            try:
                api_key = st.secrets.get(name)
            except FileNotFoundError:
                # Streamlit raises a file-not-found error when no secrets.toml
                # exists at all; treat that the same as "key not set" so the
                # caller gets the actionable ValueError below.
                # NOTE(review): relies on Streamlit's secrets error deriving
                # from FileNotFoundError - confirm for the pinned version.
                api_key = None
        if not api_key:
            raise ValueError(
                f"{name} is not set. Please set it in your environment or Streamlit secrets.")
        return api_key

    def _create_config(self) -> Dict[str, Any]:
        """Build the embedchain configuration (app, LLM and embedder sections)."""
        # Resolve the token once; both the LLM and the embedder use it.
        hf_token = self._get_api_key('HF_TOKEN')

        # The prompt text is reproduced exactly as it appeared in the original
        # literal, including its whitespace.
        # NOTE(review): the quoted fallback sentence has no closing quote;
        # left untouched here because changing it alters what the model sees.
        prompt = (
            "You are an AI assistant that answers questions based solely on "
            "the information provided in your knowledge base. "
            "Question: $query Context: $context "
            "If the information to answer a question is not available in your "
            "knowledge base, respond with "
            "'I don't have enough information to answer that question. \n"
        )

        return {
            'app': {
                'config': {
                    'name': 'ttv-ec',
                },
            },
            'llm': {
                'provider': 'huggingface',
                'config': {
                    'model': 'mistralai/Mistral-7B-Instruct-v0.2',
                    'top_p': 0.5,
                    'stream': False,
                    'prompt': prompt,
                    'api_key': hf_token,
                },
            },
            'embedder': {
                'provider': 'huggingface',
                'config': {
                    'model': 'sentence-transformers/all-mpnet-base-v2',
                    'api_key': hf_token,
                },
            },
        }

    def _create_app(self) -> App:
        """Instantiate the embedchain app from the generated configuration."""
        config = self._create_config()
        return App.from_config(config=config)

    def save(self) -> None:
        """Do nothing; kept as a no-op placeholder."""
        return

    def add_to_knowledge_base(self, data: str, data_type: str,
                              metadata: Optional[Dict[str, Any]] = None) -> None:
        """Add a piece of data to the underlying embedchain app.

        Args:
            data: The content (or source reference) passed to ``App.add``.
            data_type: Forwarded to ``App.add`` as ``data_type``.
            metadata: Optional metadata stored alongside the data.
        """
        self.app.add(data, data_type=data_type, metadata=metadata)

    @staticmethod
    def _extract_hits(search_results: Any) -> List[Dict[str, Any]]:
        """Return the list of hits from either a bare list or a ``{'results': [...]}`` dict."""
        if isinstance(search_results, dict):
            return list(search_results.get('results') or [])
        return list(search_results or [])

    @staticmethod
    def _youtube_link(metadata: Dict[str, Any]) -> Optional[str]:
        """Build a YouTube link for a hit, or return None when no video id is present."""
        youtube_id = metadata.get('youtube_id')
        if not youtube_id:
            return None
        url = f"https://youtu.be/{youtube_id}"
        try:
            offset = timestamp_to_seconds(str(metadata['timestamp']))
        except (KeyError, ValueError):
            # Missing or malformed timestamp: link to the video start instead
            # of failing the whole query.
            return url
        return f"{url}?t={offset}"

    @classmethod
    def _format_answer(cls, hits: List[Dict[str, Any]]) -> str:
        """Render the hits as a numbered, markdown-style list of transcript excerpts."""
        chunks = ["Here are the most relevant transcript excerpts:\n\n"]
        for i, hit in enumerate(hits, 1):
            metadata = hit.get('metadata') or {}
            speaker_info = (
                f"Speaker: {metadata.get('speaker', 'Unknown')}, "
                f"Company: {metadata.get('company', 'Unknown')}, "
                f"Timestamp: {metadata.get('timestamp', 'Unknown')}"
            )
            link = cls._youtube_link(metadata)
            if link is not None:
                chunks.append(f"{i}. [Speaker Info: {speaker_info}]({link}) \n")
            else:
                # No video id available, so emit the header without a link.
                chunks.append(f"{i}. Speaker Info: {speaker_info} \n")
            chunks.append(f"{metadata.get('title', 'Unknown')} \n")
            chunks.append(f"\"{hit.get('context', '')}\"\n\n")
        return "".join(chunks)

    def query(self, question: str, num_results: int = 30,
              filters: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Search the knowledge base for transcript excerpts relevant to a question.

        Args:
            question: The search text.
            num_results: Forwarded to ``App.search`` as ``num_documents``.
            filters: Forwarded to ``App.search`` as ``where``.

        Returns:
            A dict with:
              * ``'results'`` - the value returned by ``App.search``, unchanged.
              * ``'answer'`` - a formatted, human-readable listing of the hits
                (previously built but discarded).
        """
        search_results = self.app.search(
            question, num_documents=num_results, where=filters)

        # App.search is expected to return a list of {'context', 'metadata'}
        # dicts; a {'results': [...]} wrapper is tolerated as well.
        hits = self._extract_hits(search_results)
        answer = self._format_answer(hits)

        return {'results': search_results, 'answer': answer}


# Usage example
def get_ai_assistant() -> AIAssistant:
    """Create and return a new ``AIAssistant``."""
    return AIAssistant()