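"""F1-AI: a retrieval-augmented generation (RAG) CLI for Formula 1 content.

Scrapes web pages with Playwright, splits the text into chunks, stores them
in an existing Pinecone index, and answers questions with an LLM selected
through the local LLMManager module.

Requires: playwright, beautifulsoup4, langchain, langchain-pinecone,
pinecone, python-dotenv, rich, and tqdm.
"""
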
import os
import argparse
import logging
from datetime import datetime
from dotenv import load_dotenv
from typing import List, Dict, Any
from rich.console import Console
from rich.markdown import Markdown
from pinecone import Pinecone
from langchain_pinecone import Pinecone as LangchainPinecone

# Import our custom LLM Manager
from llm_manager import LLMManager

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
console = Console()

# Load environment variables
load_dotenv()
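
# Expected environment variables (typically supplied via a local .env file):
#   PINECONE_API_KEY - required by the Pinecone client used below.
# LLMManager may read additional provider-specific keys (for example an
# OpenRouter or Ollama setting); see llm_manager.py.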

class F1AI:
    def __init__(self, index_name: str = "f12", llm_provider: str = "openrouter"):
        """
        Initialize the F1-AI RAG application.
        
        Args:
            index_name (str): Name of the Pinecone index to use
            llm_provider (str): Provider for the LLM ("openrouter" by default).
        """
        self.index_name = index_name
        
        # Initialize LLM via manager
        self.llm_manager = LLMManager(provider=llm_provider)
        self.llm = self.llm_manager.get_llm()
        
        # Load Pinecone API Key
        pinecone_api_key = os.getenv("PINECONE_API_KEY")
        if not pinecone_api_key:
            raise ValueError("❌ Pinecone API key missing! Set PINECONE_API_KEY in environment variables.")

        # Initialize Pinecone with v2 client
        self.pc = Pinecone(api_key=pinecone_api_key)

        # Check existing indexes
        existing_indexes = [idx['name'] for idx in self.pc.list_indexes()]

        if index_name not in existing_indexes:
            raise ValueError(f"❌ Pinecone index '{index_name}' does not exist! Please create it first.")

        # Connect to Pinecone index
        index = self.pc.Index(index_name)
        
        # Wrap the existing, pre-configured Pinecone index in a LangChain
        # vector store. The embedding model from LLMManager must match the
        # model used to populate the index, or similarity scores will be
        # meaningless.
        self.vectordb = LangchainPinecone(
            index=index,
            text_key="text",
            embedding=self.llm_manager.get_embeddings()  # embeds queries and newly ingested documents
        )

        print(f"βœ… Successfully connected to Pinecone index: {index_name}")


    async def scrape(self, url: str, max_chunks: int = 100) -> List[Dict[str, Any]]:
        """Scrape content from a URL and split into chunks with improved error handling."""
        from playwright.async_api import async_playwright, TimeoutError
        from langchain.text_splitter import RecursiveCharacterTextSplitter
        from bs4 import BeautifulSoup
        
        try:
            async with async_playwright() as p:
                browser = await p.chromium.launch()
                page = await browser.new_page()
                console.log(f"[blue]Loading {url}...[/blue]")
                
                try:
                    await page.goto(url, timeout=30000)
                    # Get HTML content
                    html_content = await page.content()
                    soup = BeautifulSoup(html_content, 'html.parser')
                    
                    # Remove unwanted elements
                    for element in soup.find_all(['script', 'style', 'nav', 'footer']):
                        element.decompose()
                    
                    text = soup.get_text(separator=' ', strip=True)
                except TimeoutError:
                    logger.error(f"Timeout while loading {url}")
                    return []
                finally:
                    await browser.close()
            
            console.log(f"[green]Processing text ({len(text)} characters)...[/green]")
            
            # Collapse runs of whitespace into single spaces
            text = ' '.join(text.split())
            
            # Split preferentially on paragraph, sentence, and word boundaries
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=512,
                chunk_overlap=50,
                separators=["\n\n", "\n", ".", "!", "?", ",", " "],
                length_function=len
            )
            
            docs = splitter.create_documents([text])
            
            # Limit the number of chunks
            limited_docs = docs[:max_chunks]
            console.log(f"[yellow]Using {len(limited_docs)} chunks out of {len(docs)} total chunks[/yellow]")
            
            # Attach provenance metadata to each chunk
            timestamp = datetime.now().isoformat()
            return [{
                "page_content": doc.page_content,
                "metadata": {
                    "source": url,
                    "chunk_index": i,
                    "total_chunks": len(limited_docs),
                    "timestamp": timestamp
                }
            } for i, doc in enumerate(limited_docs)]
            
        except Exception as e:
            logger.error(f"Error scraping {url}: {str(e)}")
            return []

    async def ingest(self, urls: List[str], max_chunks_per_url: int = 100) -> None:
        """Ingest data from URLs into the vector database."""
        from tqdm import tqdm
        
        # Create empty list to store documents
        all_docs = []
        
        # Scrape and process each URL with progress bar
        for url in tqdm(urls, desc="Scraping URLs"):
            chunks = await self.scrape(url, max_chunks=max_chunks_per_url)
            all_docs.extend(chunks)
            
        # Create or update vector database
        total_docs = len(all_docs)
        if not all_docs:
            print("No documents were scraped; nothing to upload.")
            return
        print(f"\nCreating vector database with {total_docs} documents...")
        texts = [doc["page_content"] for doc in all_docs]
        metadatas = [doc["metadata"] for doc in all_docs]
        
        print("Starting embedding generation and uploading to Pinecone (this might take several minutes)...")
        # Use the existing vectordb to add documents
        self.vectordb.add_texts(
            texts=texts,
            metadatas=metadatas
        )
        
        print("βœ… Documents successfully uploaded to Pinecone!")
    
    async def ask_question(self, question: str) -> Dict[str, Any]:
        """Ask a question and get a response using RAG."""
        if not self.vectordb:
            return {"answer": "Error: Vector database not initialized. Please ingest data first.", "sources": []}
        
        try:
            # Retrieve relevant documents with similarity search
            retriever = self.vectordb.as_retriever(
                search_type="similarity",
                search_kwargs={"k": 5}
            )
            
            # Get relevant documents (invoke() supersedes the deprecated
            # get_relevant_documents() in recent LangChain releases)
            docs = retriever.invoke(question)
            
            if not docs:
                return {
                    "answer": "I couldn't find any relevant information in my knowledge base. Please try a different question or ingest more relevant data.",
                    "sources": []
                }
            
            # Format context from documents
            context = "\n\n".join([f"Document {i+1}: {doc.page_content}" for i, doc in enumerate(docs)])
            
            # Build the RAG prompt: grounding context plus answering guidelines
            prompt = f"""
            You are an expert Formula 1 knowledge assistant. Using the provided context, answer the question comprehensively and naturally.
            
            Guidelines:
            1. Provide detailed, well-structured responses that flow naturally
            2. Use source citations [1], [2], etc. to support key facts
            3. If information is uncertain or missing from context, acknowledge it
            4. Organize complex answers with clear paragraphs
            5. Add relevant examples or explanations when helpful
            6. Don't fill the output with only citations
            
            Context:
            {context}
            
            Question: {question}
            
            Provide a comprehensive answer with appropriate citations:
            """
            
            # Get response from LLM. LangChain LLMs are themselves callable,
            # so probing for __call__ would never reach the invoke() branch;
            # check for the Runnable interface instead.
            response_text = ""
            if hasattr(self.llm, "invoke"):  # LangChain LLM / Runnable
                result = self.llm.invoke(prompt)
                # Chat models return a message object; plain LLMs return a string
                response_text = getattr(result, "content", result)
            else:  # Direct inference client wrapped as a plain function
                response_text = self.llm(prompt)
                logger.info(f"Raw LLM response type: {type(response_text)}")
            if not response_text or not str(response_text).strip():
                logger.error("Empty response from LLM")
                response_text = "I apologize, but I couldn't generate a response. This might be due to an issue with the language model."
            
            # Collect source attributions for the retrieved documents
            sources = []
            for i, doc in enumerate(docs, 1):
                source = {
                    "index": i,
                    "url": doc.metadata["source"],
                    "chunk_index": doc.metadata.get("chunk_index", 0),
                    "timestamp": doc.metadata.get("timestamp", ""),
                    "excerpt": doc.page_content[:200] + "..." if len(doc.page_content) > 200 else doc.page_content
                }
                sources.append(source)
            
            # Assemble the structured response
            formatted_response = {
                "answer": response_text,
                "sources": sources,
                "metadata": {
                    "total_sources": len(sources),
                    "query_timestamp": datetime.now().isoformat(),
                    "response_format_version": "2.0"
                }
            }
            
            return formatted_response
            
        except Exception as e:
            logger.error(f"Error processing question: {str(e)}")
            return {
                "answer": f"I apologize, but I encountered an error while processing your question: {str(e)}",
                "sources": []
            }

async def main():
    """Main function to run the application."""
    
    parser = argparse.ArgumentParser(description="F1-AI: RAG Application for Formula 1 information")
    subparsers = parser.add_subparsers(dest="command", help="Command to run")
    
    # Ingest command
    ingest_parser = subparsers.add_parser("ingest", help="Ingest data from URLs")
    ingest_parser.add_argument("--urls", nargs="+", required=True, help="URLs to scrape")
    ingest_parser.add_argument("--max-chunks", type=int, default=100, help="Maximum chunks per URL")
    
    # Ask command
    ask_parser = subparsers.add_parser("ask", help="Ask a question")
    ask_parser.add_argument("question", help="Question to ask")
    
    # Provider argument
    parser.add_argument("--provider", choices=["ollama", "openrouter"], default="openrouter",
                        help="Provider for LLM (default: openrouter)")
    
    args = parser.parse_args()
    
    f1_ai = F1AI(llm_provider=args.provider)
    
    if args.command == "ingest":
        await f1_ai.ingest(args.urls, max_chunks_per_url=args.max_chunks)
    elif args.command == "ask":
        response = await f1_ai.ask_question(args.question)
        console.print("\n[bold green]Answer:[/bold green]")
        # Format as markdown to make it prettier
        console.print(Markdown(response['answer']))
        
        console.print("\n[bold yellow]Sources:[/bold yellow]")
        for source in response['sources']:
            console.print(f"[{source['index']}] {source['url']}")
            console.print(f"[dim]Excerpt:[/dim] {source['excerpt']}\n")
        
        # Print metadata
        console.print("\n[bold blue]Query Info:[/bold blue]")
        console.print(f"Total sources: {response['metadata']['total_sources']}")
        console.print(f"Query time: {response['metadata']['query_timestamp']}")
        console.print(f"Response version: {response['metadata']['response_format_version']}")

    else:
        parser.print_help()

if __name__ == "__main__":
    import asyncio
    asyncio.run(main())