import os import argparse import logging from datetime import datetime from dotenv import load_dotenv from typing import List, Dict, Any, Optional, Tuple from rich.console import Console from rich.markdown import Markdown from pinecone import Pinecone from langchain_pinecone import Pinecone as LangchainPinecone # Import our custom LLM Manager from llm_manager import LLMManager # Configure logging logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) console = Console() # Load environment variables load_dotenv() class F1AI: def __init__(self, index_name: str = "f12", llm_provider: str = "huggingface"): """ Initialize the F1-AI RAG application. Args: index_name (str): Name of the Pinecone index to use llm_provider (str): Provider for LLM and embeddings. Options: "ollama", "huggingface", "huggingface-openai" """ self.index_name = index_name # Initialize LLM and embeddings via manager self.llm_manager = LLMManager(provider=llm_provider) self.llm = self.llm_manager.get_llm() self.embeddings = self.llm_manager.get_embeddings() # Load Pinecone API Key pinecone_api_key = os.getenv("PINECONE_API_KEY") if not pinecone_api_key: raise ValueError("❌ Pinecone API key missing! Set PINECONE_API_KEY in environment variables.") # Modify this part in f1_ai.py # Initialize Pinecone with v2 client try: self.pc = Pinecone(api_key=pinecone_api_key) # Check existing indexes existing_indexes = [idx['name'] for idx in self.pc.list_indexes()] if index_name not in existing_indexes: console.log(f"🚀 Creating Pinecone index: {index_name}") # Update the dimension to match your embedding model self.pc.create_index( name=index_name, dimension=384, # Match embedding dimensions of the model metric="cosine" ) # Connect to Pinecone index index = self.pc.Index(index_name) self.vectordb = LangchainPinecone.from_existing_index( index_name=index_name, text_key="text", embedding=self.embeddings ) print(f"✅ Successfully connected to Pinecone index: {index_name}") except Exception as e: import traceback print(f"⚠️ Error connecting to Pinecone: {str(e)}") print(traceback.format_exc()) # Set vectordb to None, the application will handle this gracefully self.vectordb = None async def scrape(self, url: str, max_chunks: int = 100) -> List[Dict[str, Any]]: """Scrape content from a URL and split into chunks with improved error handling.""" from playwright.async_api import async_playwright, TimeoutError from langchain.text_splitter import RecursiveCharacterTextSplitter from bs4 import BeautifulSoup try: async with async_playwright() as p: browser = await p.chromium.launch() page = await browser.new_page() console.log(f"[blue]Loading {url}...[/blue]") try: await page.goto(url, timeout=30000) # Get HTML content html_content = await page.content() soup = BeautifulSoup(html_content, 'html.parser') # Remove unwanted elements for element in soup.find_all(['script', 'style', 'nav', 'footer']): element.decompose() text = soup.get_text(separator=' ', strip=True) except TimeoutError: logger.error(f"Timeout while loading {url}") return [] finally: await browser.close() console.log(f"[green]Processing text ({len(text)} characters)...[/green]") # Enhanced text cleaning text = ' '.join(text.split()) # Normalize whitespace # Improved text splitting with semantic boundaries splitter = RecursiveCharacterTextSplitter( chunk_size=512, chunk_overlap=50, separators=["\n\n", "\n", ".", "!", "?", ",", " "], length_function=len ) docs = splitter.create_documents([text]) # Limit the number of chunks limited_docs = docs[:max_chunks] console.log(f"[yellow]Using {len(limited_docs)} chunks out of {len(docs)} total chunks[/yellow]") # Enhanced metadata timestamp = datetime.now().isoformat() return [{ "page_content": doc.page_content, "metadata": { "source": url, "chunk_index": i, "total_chunks": len(limited_docs), "timestamp": timestamp } } for i, doc in enumerate(limited_docs)] except Exception as e: logger.error(f"Error scraping {url}: {str(e)}") return [] async def ingest(self, urls: List[str], max_chunks_per_url: int = 100) -> None: """Ingest data from URLs into the vector database.""" from langchain_community.vectorstores import Pinecone as LangchainPinecone from tqdm import tqdm # Create empty list to store documents all_docs = [] # Scrape and process each URL with progress bar for url in tqdm(urls, desc="Scraping URLs"): chunks = await self.scrape(url, max_chunks=max_chunks_per_url) all_docs.extend(chunks) # Create or update vector database total_docs = len(all_docs) print(f"\nCreating vector database with {total_docs} documents...") texts = [doc["page_content"] for doc in all_docs] metadatas = [doc["metadata"] for doc in all_docs] print("Starting embedding generation and uploading to Pinecone (this might take several minutes)...") self.vectordb = LangchainPinecone.from_texts( texts=texts, embedding=self.embeddings, index_name=self.index_name, metadatas=metadatas, text_key="text" ) print("✅ Documents successfully uploaded to Pinecone!") async def ask_question(self, question: str) -> Dict[str, Any]: """Ask a question and get a response using RAG.""" if not self.vectordb: return {"answer": "Error: Vector database not initialized. Please ingest data first.", "sources": []} try: # Retrieve relevant documents with similarity search retriever = self.vectordb.as_retriever( search_type="similarity", search_kwargs={"k": 5} ) # Get relevant documents docs = retriever.get_relevant_documents(question) if not docs: return { "answer": "I couldn't find any relevant information in my knowledge base. Please try a different question or ingest more relevant data.", "sources": [] } # Format context from documents context = "\n\n".join([f"Document {i+1}: {doc.page_content}" for i, doc in enumerate(docs)]) # Create prompt for the LLM prompt = f""" Answer the question based on the provided context. Include relevant citations using [1], [2], etc. If you're unsure or if the context doesn't contain the information, acknowledge the uncertainty. Context: {context} Question: {question} Answer with citations: """ # Get response from LLM response_text = "" if hasattr(self.llm, "__call__"): # Direct inference client wrapped function response_text = self.llm(prompt) # Debug response logger.info(f"Raw LLM response type: {type(response_text)}") if not response_text or response_text.strip() == "": logger.error("Empty response from LLM") response_text = "I apologize, but I couldn't generate a response. This might be due to an issue with the language model." else: # LangChain LLM response_text = self.llm.invoke(prompt) # Format sources sources = [{ "url": doc.metadata["source"], "chunk_index": doc.metadata.get("chunk_index", 0), "timestamp": doc.metadata.get("timestamp", "") } for doc in docs] # Format response formatted_response = { "answer": response_text, "sources": sources } return formatted_response except Exception as e: logger.error(f"Error processing question: {str(e)}") return { "answer": f"I apologize, but I encountered an error while processing your question: {str(e)}", "sources": [] } async def main(): """Main function to run the application.""" import asyncio parser = argparse.ArgumentParser(description="F1-AI: RAG Application for Formula 1 information") subparsers = parser.add_subparsers(dest="command", help="Command to run") # Ingest command ingest_parser = subparsers.add_parser("ingest", help="Ingest data from URLs") ingest_parser.add_argument("--urls", nargs="+", required=True, help="URLs to scrape") ingest_parser.add_argument("--max-chunks", type=int, default=100, help="Maximum chunks per URL") # Ask command ask_parser = subparsers.add_parser("ask", help="Ask a question") ask_parser.add_argument("question", help="Question to ask") # Added provider argument with the new option parser.add_argument("--provider", choices=["ollama", "huggingface", "huggingface-openai"], default="huggingface", help="Provider for LLM and embeddings (default: huggingface)") args = parser.parse_args() f1_ai = F1AI(llm_provider=args.provider) if args.command == "ingest": await f1_ai.ingest(args.urls, max_chunks_per_url=args.max_chunks) elif args.command == "ask": response = await f1_ai.ask_question(args.question) console.print("\n[bold green]Answer:[/bold green]") # Format as markdown to make it prettier console.print(Markdown(response['answer'])) console.print("\n[bold yellow]Sources:[/bold yellow]") for i, source in enumerate(response['sources']): console.print(f"[{i+1}] {source['url']}") else: parser.print_help() if __name__ == "__main__": import asyncio asyncio.run(main())