AdityaAdaki committed
Commit 79f7264 · Parent: 34a1969

refactor(llm_manager): switch from HuggingFace to OpenRouter for LLM


The LLM provider has been changed from HuggingFace to OpenRouter to improve reliability and reduce API complexity. This change also removes redundant code related to HuggingFace's InferenceClient and simplifies the LLM interface. The default provider is now set to OpenRouter, and the embeddings are handled separately using Ollama. This refactor improves maintainability and aligns with the new architecture.
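For orientation, a minimal usage sketch of the refactored setup (not part of the commit; the prompt is a placeholder, and it assumes OPENROUTER_API_KEY and PINECONE_API_KEY are set, a local Ollama server is available for embeddings, and the Pinecone index already exists):

from f1_ai import F1AI

# F1AI wires LLMManager (OpenRouter LLM + Ollama embeddings) to an existing Pinecone index.
app = F1AI(index_name="f12", llm_provider="openrouter")

# app.llm is the plain callable returned by LLMManager.get_llm().
print(app.llm("Summarise the most recent Formula 1 season in two sentences."))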

Files changed (2)
  1. f1_ai.py +25 -42
  2. llm_manager.py +113 -160
f1_ai.py CHANGED
@@ -21,60 +21,46 @@ console = Console()
 load_dotenv()
 
 class F1AI:
-    def __init__(self, index_name: str = "f12", llm_provider: str = "huggingface"):
+    def __init__(self, index_name: str = "f12", llm_provider: str = "openrouter"):
         """
         Initialize the F1-AI RAG application.
 
         Args:
             index_name (str): Name of the Pinecone index to use
-            llm_provider (str): Provider for LLM and embeddings.
-                                Options: "ollama", "huggingface", "huggingface-openai"
+            llm_provider (str): Provider for LLM. "openrouter" is used by default.
         """
         self.index_name = index_name
 
-        # Initialize LLM and embeddings via manager
+        # Initialize LLM via manager
         self.llm_manager = LLMManager(provider=llm_provider)
         self.llm = self.llm_manager.get_llm()
-        self.embeddings = self.llm_manager.get_embeddings()
 
         # Load Pinecone API Key
         pinecone_api_key = os.getenv("PINECONE_API_KEY")
         if not pinecone_api_key:
             raise ValueError("❌ Pinecone API key missing! Set PINECONE_API_KEY in environment variables.")
 
-        # Modify this part in f1_ai.py
-
         # Initialize Pinecone with v2 client
-        try:
-            self.pc = Pinecone(api_key=pinecone_api_key)
+        self.pc = Pinecone(api_key=pinecone_api_key)
 
-            # Check existing indexes
-            existing_indexes = [idx['name'] for idx in self.pc.list_indexes()]
+        # Check existing indexes
+        existing_indexes = [idx['name'] for idx in self.pc.list_indexes()]
 
-            if index_name not in existing_indexes:
-                console.log(f"🚀 Creating Pinecone index: {index_name}")
-                # Update the dimension to match your embedding model
-                self.pc.create_index(
-                    name=index_name,
-                    dimension=384,  # Match embedding dimensions of the model
-                    metric="cosine"
-                )
+        if index_name not in existing_indexes:
+            raise ValueError(f" Pinecone index '{index_name}' does not exist! Please create it first.")
 
-            # Connect to Pinecone index
-            index = self.pc.Index(index_name)
-            self.vectordb = LangchainPinecone.from_existing_index(
-                index_name=index_name,
-                text_key="text",
-                embedding=self.embeddings
-            )
+        # Connect to Pinecone index
+        index = self.pc.Index(index_name)
+
+        # Use the existing pre-configured Pinecone index
+        # Note: We're using the embeddings that Pinecone already has configured
+        self.vectordb = LangchainPinecone(
+            index=index,
+            text_key="text",
+            embedding=self.llm_manager.get_embeddings()  # This will only be used for new queries
+        )
 
-            print(f"✅ Successfully connected to Pinecone index: {index_name}")
-        except Exception as e:
-            import traceback
-            print(f"⚠️ Error connecting to Pinecone: {str(e)}")
-            print(traceback.format_exc())
-            # Set vectordb to None, the application will handle this gracefully
-            self.vectordb = None
+        print(f"✅ Successfully connected to Pinecone index: {index_name}")
 
 
     async def scrape(self, url: str, max_chunks: int = 100) -> List[Dict[str, Any]]:
@@ -143,7 +129,6 @@ class F1AI:
 
     async def ingest(self, urls: List[str], max_chunks_per_url: int = 100) -> None:
         """Ingest data from URLs into the vector database."""
-        from langchain_community.vectorstores import Pinecone as LangchainPinecone
         from tqdm import tqdm
 
         # Create empty list to store documents
@@ -161,12 +146,10 @@ class F1AI:
         metadatas = [doc["metadata"] for doc in all_docs]
 
         print("Starting embedding generation and uploading to Pinecone (this might take several minutes)...")
-        self.vectordb = LangchainPinecone.from_texts(
+        # Use the existing vectordb to add documents
+        self.vectordb.add_texts(
             texts=texts,
-            embedding=self.embeddings,
-            index_name=self.index_name,
-            metadatas=metadatas,
-            text_key="text"
+            metadatas=metadatas
         )
 
         print("✅ Documents successfully uploaded to Pinecone!")
@@ -258,9 +241,9 @@ async def main():
     ask_parser = subparsers.add_parser("ask", help="Ask a question")
     ask_parser.add_argument("question", help="Question to ask")
 
-    # Added provider argument with the new option
-    parser.add_argument("--provider", choices=["ollama", "huggingface", "huggingface-openai"], default="huggingface",
-                        help="Provider for LLM and embeddings (default: huggingface)")
+    # Provider argument
+    parser.add_argument("--provider", choices=["ollama", "openrouter"], default="openrouter",
+                        help="Provider for LLM (default: openrouter)")
 
     args = parser.parse_args()
 
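A hedged sketch of how the new ingestion path shown above would be exercised (the URL and chunk count are placeholder values; the constructor now raises if the index is missing instead of creating it):

import asyncio
from f1_ai import F1AI

async def run():
    # Connects to the pre-existing Pinecone index, then appends documents
    # to it via vectordb.add_texts() instead of Pinecone.from_texts().
    app = F1AI(index_name="f12", llm_provider="openrouter")
    await app.ingest(["https://en.wikipedia.org/wiki/Formula_One"], max_chunks_per_url=50)

asyncio.run(run())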
llm_manager.py CHANGED
@@ -1,9 +1,9 @@
 import os
+import json
+import requests
 from typing import List, Dict, Any
-from huggingface_hub import InferenceClient
-from langchain_ollama import OllamaEmbeddings, OllamaLLM
+from langchain_ollama import OllamaEmbeddings
 from dotenv import load_dotenv
-import numpy as np
 import logging
 
 # Configure logging
@@ -15,181 +15,134 @@ load_dotenv()
 
 class LLMManager:
     """
-    Manager class for handling different LLM and embedding models.
-    Uses HuggingFace's InferenceClient directly for HuggingFace models.
+    Manager class for handling Ollama embeddings and OpenRouter LLM.
     """
-    def __init__(self, provider: str = "huggingface"):
+    def __init__(self, provider: str = "openrouter"):
         """
         Initialize the LLM Manager.
 
         Args:
-            provider (str): The provider for LLM and embeddings.
-                            Options: "ollama", "huggingface", "huggingface-openai"
+            provider (str): "ollama" for embeddings, OpenRouter is used for LLM regardless
         """
         self.provider = provider
-        self.llm_client = None
-        self.embedding_client = None
 
-        # Initialize models based on the provider
-        if provider == "ollama":
-            self._init_ollama()
-        elif provider == "huggingface" or provider == "huggingface-openai":
-            self._init_huggingface()
-        else:
-            raise ValueError(f"Unsupported provider: {provider}. Choose 'ollama', 'huggingface', or 'huggingface-openai'")
-
-    def _init_ollama(self):
-        """Initialize Ollama models."""
-        self.llm = OllamaLLM(model="phi4-mini:3.8b")
-        self.embeddings = OllamaEmbeddings(model="mxbai-embed-large:latest")
-
-    def _init_huggingface(self):
-        """Initialize HuggingFace models using InferenceClient directly."""
-        # Get API key from environment
-        api_key = os.getenv("HUGGINGFACE_API_KEY")
-        if not api_key:
-            raise ValueError("HuggingFace API key not found. Set HUGGINGFACE_API_KEY in environment variables.")
-
-        llm_endpoint = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-        embedding_endpoint = "sentence-transformers/all-MiniLM-L6-v2"
-
-        # Initialize InferenceClient for LLM
-        self.llm_client = InferenceClient(
-            model=llm_endpoint,
-            token=api_key
-        )
+        # Initialize Ollama embeddings
+        self.embeddings = OllamaEmbeddings(model="tazarov/all-minilm-l6-v2-f32:latest")
 
-        # Initialize InferenceClient for embeddings
-        self.embedding_client = InferenceClient(
-            model=embedding_endpoint,
-            token=api_key
-        )
+        # Initialize OpenRouter client
+        self.openrouter_api_key = os.getenv("OPENROUTER_API_KEY")
+        if not self.openrouter_api_key:
+            raise ValueError("OpenRouter API key not found. Set OPENROUTER_API_KEY in environment variables.")
 
-        # Store generation parameters
-        self.generation_kwargs = {
-            "temperature": 0.7,
-            "max_new_tokens": 512,  # Reduced to avoid potential token limit issues
-            "repetition_penalty": 1.1,
-            "do_sample": True,
-            "top_k": 50,
-            "top_p": 0.9,
-            "return_full_text": False  # Only return the generated text, not the prompt
+        # Set up OpenRouter API details
+        self.openrouter_url = "https://openrouter.ai/api/v1/chat/completions"
+        self.openrouter_model = "mistralai/mistral-7b-instruct:free"
+        self.openrouter_headers = {
+            "Authorization": f"Bearer {self.openrouter_api_key}",
+            "Content-Type": "application/json",
+            "HTTP-Referer": "https://f1-ai.app",  # Replace with your app's URL
+            "X-Title": "F1-AI Application"  # Replace with your app's name
         }
 
     # LLM methods for compatibility with LangChain
     def get_llm(self):
         """
-        Return a callable object that mimics LangChain LLM interface.
-        For huggingface providers, this returns a function that calls the InferenceClient.
-        """
-        if self.provider == "ollama":
-            return self.llm
-        else:
-            # Return a function that wraps the InferenceClient for LLM
-            def llm_function(prompt, **kwargs):
-                params = {**self.generation_kwargs, **kwargs}
-                try:
-                    logger.info(f"Sending prompt to HuggingFace (length: {len(prompt)})")
-                    response = self.llm_client.text_generation(
-                        prompt,
-                        details=True,  # Get detailed response
-                        **params
-                    )
-                    # Extract generated text from response
-                    if isinstance(response, dict) and 'generated_text' in response:
-                        response = response['generated_text']
-                    logger.info(f"Received response from HuggingFace (length: {len(response) if response else 0})")
-
-                    # Ensure we get a valid string response
-                    if not response or not isinstance(response, str) or response.strip() == "":
-                        logger.warning("Empty or invalid response from HuggingFace, using fallback")
-                        return "I couldn't generate a proper response based on the available information."
-
-                    return response
-                except Exception as e:
-                    logger.error(f"Error during LLM inference: {str(e)}")
-                    return f"Error generating response: {str(e)}"
-
-            # Add async capability
-            async def allm_function(prompt, **kwargs):
-                params = {**self.generation_kwargs, **kwargs}
-                try:
-                    response = await self.llm_client.text_generation(
-                        prompt,
-                        **params,
-                        stream=False
-                    )
-
-                    # Ensure we get a valid string response
-                    if not response or not isinstance(response, str) or response.strip() == "":
-                        logger.warning("Empty or invalid response from HuggingFace async, using fallback")
-                        return "I couldn't generate a proper response based on the available information."
-
-                    return response
-                except Exception as e:
-                    logger.error(f"Error during async LLM inference: {str(e)}")
-                    return f"Error generating response: {str(e)}"
-
-            llm_function.ainvoke = allm_function
-            return llm_function
-
-    # Embeddings methods for compatibility with LangChain
-    def get_embeddings(self):
-        """
-        Return a callable object that mimics LangChain Embeddings interface.
-        For huggingface providers, this returns an object with embed_documents and embed_query methods.
+        Return a callable function that serves as the LLM interface.
         """
-        if self.provider == "ollama":
-            return self.embeddings
-        else:
-            # Create a wrapper object that has the expected methods
-            class EmbeddingsWrapper:
-                def __init__(self, client):
-                    self.client = client
+        def llm_function(prompt, **kwargs):
+            try:
+                logger.info(f"Sending prompt to OpenRouter (length: {len(prompt)})")
 
-                def embed_documents(self, texts: List[str]) -> List[List[float]]:
-                    """Embed multiple documents."""
-                    embeddings = []
-                    # Process in batches to avoid overwhelming the API
-                    batch_size = 8
-
-                    for i in range(0, len(texts), batch_size):
-                        batch = texts[i:i+batch_size]
-                        try:
-                            batch_embeddings = self.client.feature_extraction(batch)
-                            # Convert to standard Python list format
-                            batch_results = [list(map(float, embedding)) for embedding in batch_embeddings]
-                            embeddings.extend(batch_results)
-                        except Exception as e:
-                            logger.error(f"Error embedding batch {i}: {str(e)}")
-                            # Return zero vectors as fallback
-                            for _ in range(len(batch)):
-                                embeddings.append([0.0] * 384)  # Use correct dimension
-
-                    return embeddings
+                # Format the messages for OpenRouter API
+                messages = [{"role": "user", "content": prompt}]
 
-                def embed_query(self, text: str) -> List[float]:
-                    """Embed a single query."""
-                    try:
-                        embedding = self.client.feature_extraction(text)
-                        if isinstance(embedding, list) and len(embedding) > 0:
-                            # If it returns a batch (list of embeddings) for a single input
-                            return list(map(float, embedding[0]))
-                        # If it returns a single embedding
-                        return list(map(float, embedding))
-                    except Exception as e:
-                        logger.error(f"Error embedding query: {str(e)}")
-                        # Return zero vector as fallback
-                        return [0.0] * 384  # Use correct dimension
+                # Set up request payload
+                payload = {
+                    "model": self.openrouter_model,
+                    "messages": messages,
+                    "temperature": kwargs.get("temperature", 0.7),
+                    "max_tokens": kwargs.get("max_tokens", 1024),
+                    "top_p": kwargs.get("top_p", 0.9),
+                    "stream": False
+                }
 
-                # Make the class callable to fix the TypeError
-                def __call__(self, texts):
-                    """Make the object callable for compatibility with LangChain."""
-                    if isinstance(texts, str):
-                        return self.embed_query(texts)
-                    elif isinstance(texts, list):
-                        return self.embed_documents(texts)
+                # Send request to OpenRouter
+                response = requests.post(
+                    self.openrouter_url,
+                    headers=self.openrouter_headers,
+                    json=payload,
+                    timeout=60
+                )
+
+                # Process the response
+                if response.status_code == 200:
+                    response_json = response.json()
+                    if "choices" in response_json and len(response_json["choices"]) > 0:
+                        generated_text = response_json["choices"][0]["message"]["content"]
+                        logger.info(f"Received response from OpenRouter (length: {len(generated_text)})")
+                        return generated_text
                     else:
-                        raise ValueError(f"Unsupported input type: {type(texts)}")
+                        logger.warning("Unexpected response format from OpenRouter")
+                        return "I couldn't generate a proper response based on the available information."
+                else:
+                    logger.error(f"Error from OpenRouter API: {response.status_code} - {response.text}")
+                    return f"Error from LLM API: {response.status_code}"
+
+            except Exception as e:
+                logger.error(f"Error during LLM inference: {str(e)}")
+                return f"Error generating response: {str(e)}"
+
+        # Add async capability
+        async def allm_function(prompt, **kwargs):
+            import aiohttp
 
-            return EmbeddingsWrapper(self.embedding_client)
+            try:
+                # Format the messages for OpenRouter API
+                messages = [{"role": "user", "content": prompt}]
+
+                # Set up request payload
+                payload = {
+                    "model": self.openrouter_model,
+                    "messages": messages,
+                    "temperature": kwargs.get("temperature", 0.7),
+                    "max_tokens": kwargs.get("max_tokens", 1024),
+                    "top_p": kwargs.get("top_p", 0.9),
+                    "stream": False
+                }
+
+                async with aiohttp.ClientSession() as session:
+                    async with session.post(
+                        self.openrouter_url,
+                        headers=self.openrouter_headers,
+                        json=payload,
+                        timeout=aiohttp.ClientTimeout(total=60)
+                    ) as response:
+                        if response.status == 200:
+                            response_json = await response.json()
+                            if "choices" in response_json and len(response_json["choices"]) > 0:
+                                generated_text = response_json["choices"][0]["message"]["content"]
+                                return generated_text
+                            else:
+                                logger.warning("Unexpected response format from OpenRouter")
+                                return "I couldn't generate a proper response based on the available information."
+                        else:
+                            error_text = await response.text()
+                            logger.error(f"Error from OpenRouter API: {response.status} - {error_text}")
+                            return f"Error from LLM API: {response.status}"
+
+            except Exception as e:
+                logger.error(f"Error during async LLM inference: {str(e)}")
+                return f"Error generating response: {str(e)}"
+
+        # Add async method to the function
+        llm_function.ainvoke = allm_function
+
+        # Add invoke method for compatibility
+        llm_function.invoke = llm_function
+
+        return llm_function
+
+    # Embeddings methods for compatibility with LangChain
+    def get_embeddings(self):
+        """Return the embeddings instance."""
+        return self.embeddings
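A short, hedged sketch of how the new manager interface is meant to be called (prompts are placeholders; assumes OPENROUTER_API_KEY is set and an Ollama server with the tazarov/all-minilm-l6-v2-f32 model is running locally):

import asyncio
from llm_manager import LLMManager

manager = LLMManager()            # OpenRouter-backed LLM, Ollama-backed embeddings
llm = manager.get_llm()           # plain function with .invoke and .ainvoke attached

print(llm("Name three current Formula 1 constructors."))                       # sync path (requests)
print(asyncio.run(llm.ainvoke("Name three current Formula 1 constructors.")))  # async path (aiohttp)

embeddings = manager.get_embeddings()                     # langchain_ollama OllamaEmbeddings instance
print(len(embeddings.embed_query("Monaco Grand Prix")))   # embedding dimension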