import os
import argparse
import logging
from datetime import datetime
from typing import Any, Dict, List

from dotenv import load_dotenv
from rich.console import Console
from rich.markdown import Markdown
from pinecone import Pinecone
from langchain_pinecone import Pinecone as LangchainPinecone

# Import our custom LLM manager
from llm_manager import LLMManager

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
console = Console()

# Load environment variables
load_dotenv()


class F1AI:
    def __init__(self, index_name: str = "f12", llm_provider: str = "openrouter"):
        """
        Initialize the F1-AI RAG application.

        Args:
            index_name (str): Name of the Pinecone index to use.
            llm_provider (str): LLM provider; "openrouter" is used by default.
        """
        self.index_name = index_name

        # Initialize the LLM via the manager
        self.llm_manager = LLMManager(provider=llm_provider)
        self.llm = self.llm_manager.get_llm()

        # Load the Pinecone API key
        pinecone_api_key = os.getenv("PINECONE_API_KEY")
        if not pinecone_api_key:
            raise ValueError("❌ Pinecone API key missing! Set PINECONE_API_KEY in environment variables.")

        # Initialize the class-based Pinecone client
        self.pc = Pinecone(api_key=pinecone_api_key)

        # Verify the index exists before connecting
        existing_indexes = [idx['name'] for idx in self.pc.list_indexes()]
        if index_name not in existing_indexes:
            raise ValueError(f"❌ Pinecone index '{index_name}' does not exist! Please create it first.")

        # Connect to the Pinecone index
        index = self.pc.Index(index_name)

        # Wrap the pre-configured index in a LangChain vector store.
        # Note: the embedding model is only used to embed new queries and documents;
        # vectors already stored in the index are used as-is.
        self.vectordb = LangchainPinecone(
            index=index,
            text_key="text",
            embedding=self.llm_manager.get_embeddings()
        )
        print(f"✅ Successfully connected to Pinecone index: {index_name}")

    async def scrape(self, url: str, max_chunks: int = 100) -> List[Dict[str, Any]]:
        """Scrape content from a URL and split it into chunks, with improved error handling."""
        from playwright.async_api import async_playwright, TimeoutError
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        from bs4 import BeautifulSoup

        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                page = await browser.new_page()
                console.log(f"[blue]Loading {url}...[/blue]")
                try:
                    await page.goto(url, timeout=30000)
                    # Get the rendered HTML content
                    html_content = await page.content()
                    soup = BeautifulSoup(html_content, 'html.parser')
                    # Remove unwanted elements
                    for element in soup.find_all(['script', 'style', 'nav', 'footer']):
                        element.decompose()
                    text = soup.get_text(separator=' ', strip=True)
                except TimeoutError:
                    logger.error(f"Timeout while loading {url}")
                    return []
                finally:
                    await browser.close()

            console.log(f"[green]Processing text ({len(text)} characters)...[/green]")

            # Normalize whitespace
            text = ' '.join(text.split())

            # Split the text on semantic boundaries
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=512,
                chunk_overlap=50,
                separators=["\n\n", "\n", ".", "!", "?", ",", " "],
                length_function=len
            )
            docs = splitter.create_documents([text])

            # Limit the number of chunks
            limited_docs = docs[:max_chunks]
            console.log(f"[yellow]Using {len(limited_docs)} chunks out of {len(docs)} total chunks[/yellow]")

            # Attach metadata to each chunk
            timestamp = datetime.now().isoformat()
            return [{
                "page_content": doc.page_content,
                "metadata": {
                    "source": url,
                    "chunk_index": i,
                    "total_chunks": len(limited_docs),
                    "timestamp": timestamp
                }
            } for i, doc in enumerate(limited_docs)]
        except Exception as e:
            logger.error(f"Error scraping {url}: {str(e)}")
            return []

    async def ingest(self, urls: List[str], max_chunks_per_url: int = 100) -> None:
        """Ingest data from URLs into the vector database."""
        from tqdm import tqdm

        # Scrape and process each URL with a progress bar
        all_docs = []
        for url in tqdm(urls, desc="Scraping URLs"):
            chunks = await self.scrape(url, max_chunks=max_chunks_per_url)
            all_docs.extend(chunks)

        total_docs = len(all_docs)
        print(f"\nCreating vector database with {total_docs} documents...")

        texts = [doc["page_content"] for doc in all_docs]
        metadatas = [doc["metadata"] for doc in all_docs]

        print("Starting embedding generation and uploading to Pinecone (this might take several minutes)...")

        # Add the documents to the existing vector store
        self.vectordb.add_texts(texts=texts, metadatas=metadatas)
        print("✅ Documents successfully uploaded to Pinecone!")

    async def ask_question(self, question: str) -> Dict[str, Any]:
        """Ask a question and get a response using RAG."""
        if not self.vectordb:
            return {"answer": "Error: Vector database not initialized. Please ingest data first.",
                    "sources": []}

        try:
            # Retrieve relevant documents with a similarity search
            retriever = self.vectordb.as_retriever(
                search_type="similarity",
                search_kwargs={"k": 5}
            )
            docs = retriever.get_relevant_documents(question)

            if not docs:
                return {
                    "answer": ("I couldn't find any relevant information in my knowledge base. "
                               "Please try a different question or ingest more relevant data."),
                    "sources": []
                }

            # Format the context from the retrieved documents
            context = "\n\n".join(f"Document {i+1}: {doc.page_content}"
                                  for i, doc in enumerate(docs))

            # Build the prompt for the LLM
            prompt = f"""
You are an expert Formula 1 knowledge assistant. Using the provided context, answer the question comprehensively and naturally.

Guidelines:
1. Provide detailed, well-structured responses that flow naturally
2. Use source citations [1], [2], etc. to support key facts
3. If information is uncertain or missing from the context, acknowledge it
4. Organize complex answers with clear paragraphs
5. Add relevant examples or explanations when helpful
6. Don't fill the output with only citations

Context: {context}

Question: {question}

Provide a comprehensive answer with appropriate citations:
"""

            # Get a response from the LLM
            if hasattr(self.llm, "__call__"):
                # Direct inference-client wrapped function
                response_text = self.llm(prompt)
                logger.info(f"Raw LLM response type: {type(response_text)}")
                if not response_text or response_text.strip() == "":
                    logger.error("Empty response from LLM")
                    response_text = ("I apologize, but I couldn't generate a response. "
                                     "This might be due to an issue with the language model.")
            else:
                # LangChain LLM
                response_text = self.llm.invoke(prompt)

            # Format sources with attribution
            sources = []
            for i, doc in enumerate(docs, 1):
                source = {
                    "index": i,
                    "url": doc.metadata["source"],
                    "chunk_index": doc.metadata.get("chunk_index", 0),
                    "timestamp": doc.metadata.get("timestamp", ""),
                    "excerpt": (doc.page_content[:200] + "..."
                                if len(doc.page_content) > 200 else doc.page_content)
                }
                sources.append(source)

            # Assemble the structured response
            formatted_response = {
                "answer": response_text,
                "sources": sources,
                "metadata": {
                    "total_sources": len(sources),
                    "query_timestamp": datetime.now().isoformat(),
                    "response_format_version": "2.0"
                }
            }
            return formatted_response
        except Exception as e:
            logger.error(f"Error processing question: {str(e)}")
            return {
                "answer": f"I apologize, but I encountered an error while processing your question: {str(e)}",
                "sources": []
            }


async def main():
    """Main entry point for the command-line interface."""
    parser = argparse.ArgumentParser(description="F1-AI: RAG Application for Formula 1 information")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")

    # Ingest command
    ingest_parser = subparsers.add_parser("ingest", help="Ingest data from URLs")
    ingest_parser.add_argument("--urls", nargs="+", required=True, help="URLs to scrape")
    ingest_parser.add_argument("--max-chunks", type=int, default=100, help="Maximum chunks per URL")

    # Ask command
    ask_parser = subparsers.add_parser("ask", help="Ask a question")
    ask_parser.add_argument("question", help="Question to ask")

    # Provider argument (top-level, so it must be given before the subcommand)
    parser.add_argument("--provider", choices=["ollama", "openrouter"], default="openrouter",
                        help="Provider for LLM (default: openrouter)")

    args = parser.parse_args()
    f1_ai = F1AI(llm_provider=args.provider)

    if args.command == "ingest":
        await f1_ai.ingest(args.urls, max_chunks_per_url=args.max_chunks)
    elif args.command == "ask":
        response = await f1_ai.ask_question(args.question)

        console.print("\n[bold green]Answer:[/bold green]")
        # Render the answer as Markdown for nicer terminal output
        console.print(Markdown(response['answer']))

        console.print("\n[bold yellow]Sources:[/bold yellow]")
        for source in response['sources']:
            console.print(f"[{source['index']}] {source['url']}")
            console.print(f"[dim]Excerpt:[/dim] {source['excerpt']}\n")

        # Print query metadata (absent on error responses)
        metadata = response.get('metadata')
        if metadata:
            console.print("\n[bold blue]Query Info:[/bold blue]")
            console.print(f"Total sources: {metadata['total_sources']}")
            console.print(f"Query time: {metadata['query_timestamp']}")
            console.print(f"Response version: {metadata['response_format_version']}")
    else:
        parser.print_help()


if __name__ == "__main__":
    import asyncio
    asyncio.run(main())
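
# Example invocations (illustrative; assumes this file is saved as f1_ai.py and
# that the required keys, e.g. PINECONE_API_KEY, are available via .env or the
# environment):
#
#   python f1_ai.py ingest --urls https://en.wikipedia.org/wiki/Formula_One --max-chunks 50
#   python f1_ai.py --provider ollama ask "Who won the 2023 drivers' championship?"
#
# Because --provider is defined on the top-level parser, it must appear before
# the subcommand name.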