Commit 640b1c8 by TalatMasood (root commit, 0 parents)

initial commit

.vscode/launch.json ADDED
@@ -0,0 +1,30 @@
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python: FastAPI",
            "type": "python",
            "request": "launch",
            "module": "uvicorn",
            "args": [
                "src.main:app",
                "--reload"
            ],
            "jinja": true,
            "justMyCode": true,
            "env": {
                "PYTHONPATH": "${workspaceFolder}"
            }
        },
        {
            "name": "Python: Test",
            "type": "python",
            "request": "launch",
            "module": "pytest",
            "args": [
                "tests"
            ],
            "console": "integratedTerminal"
        }
    ]
}
.vscode/settings.json ADDED
@@ -0,0 +1,12 @@
{
    "python.pythonPath": "${workspaceFolder}/venv/bin/python",
    "python.linting.enabled": true,
    "python.linting.pylintEnabled": true,
    "python.formatting.provider": "black",
    "editor.formatOnSave": true,
    "python.testing.pytestArgs": [
        "tests"
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true
}
DockerComposeConfiguration ADDED
@@ -0,0 +1,33 @@
version: '3.8'

services:
  app:
    build: .
    ports:
      - "8000:8000"
    env_file:
      - .env
    volumes:
      - ./:/app
    depends_on:
      - ollama

  ollama:
    image: ollama/ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama-data:/root/.ollama

  chroma:
    image: chromadb/chroma
    ports:
      # Host port 8001 avoids clashing with the app service, which already binds 8000
      - "8001:8000"
    volumes:
      - chroma-data:/chroma
    environment:
      - PERSIST_DIRECTORY=/chroma

volumes:
  ollama-data:
  chroma-data:
Dockerfile ADDED
@@ -0,0 +1,25 @@
# Use an official Python runtime as a parent image
FROM python:3.9-slim

# Set the working directory in the container
WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy the current directory contents into the container at /app
COPY . /app

# Install any needed packages specified in requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Make port 8000 available to the world outside this container
EXPOSE 8000

# Define environment variable
ENV NAME=RAGChatbot

# Run the application
CMD ["uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
Readme.md ADDED
@@ -0,0 +1,80 @@
# RAG Chatbot Application

## Project Overview
A modular Retrieval-Augmented Generation (RAG) chatbot application built with FastAPI, supporting multiple LLM providers and embedding models.

## Project Structure
- `config/`: Configuration management
- `src/`: Main application source code
- `tests/`: Unit and integration tests
- `data/`: Document storage and ingestion

## Prerequisites
- Python 3.9+
- pip
- (Optional) Virtual environment

## Installation

1. Clone the repository
```bash
git clone https://your-repo-url.git
cd rag-chatbot
```

2. Create a virtual environment
```bash
python -m venv venv
source venv/bin/activate  # On Windows use `venv\Scripts\activate`
```

3. Install dependencies
```bash
pip install -r requirements.txt
```

4. Set up environment variables
```bash
cp .env.example .env
# Edit .env with your credentials
```

## Configuration

### Environment Variables
- `OPENAI_API_KEY`: OpenAI API key
- `OLLAMA_BASE_URL`: Ollama server URL
- `EMBEDDING_MODEL`: Hugging Face embedding model
- `CHROMA_PATH`: Vector store persistence path
- `DEBUG`: Enable debug mode

## Running the Application

### Development Server
```bash
uvicorn src.main:app --reload
```

### Production Deployment
```bash
gunicorn -w 4 -k uvicorn.workers.UvicornWorker src.main:app
```

## Testing
```bash
pytest tests/
```
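
The `tests/` package referenced here and in `.vscode/launch.json` is not part of this commit. As a hedged sketch of a first test (the file name `tests/test_health.py` is hypothetical, and FastAPI's `TestClient` additionally needs the `httpx` package, which is not pinned in `requirements.txt`):

```python
# tests/test_health.py (hypothetical example, not part of this commit)
from fastapi.testclient import TestClient

from src.main import app

client = TestClient(app)

def test_health_check():
    # /health is defined in src/main.py and should always report "healthy"
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "healthy"}
```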

## Features
- Multiple LLM Provider Support
- Retrieval-Augmented Generation
- Document Ingestion
- Flexible Configuration
- FastAPI Backend

## Contributing
1. Fork the repository
2. Create your feature branch
3. Commit your changes
4. Push to the branch
5. Create a Pull Request
config/__init__.py ADDED
File without changes
config/config.py ADDED
@@ -0,0 +1,29 @@
# config/config.py
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

class Settings:
    # OpenAI Configuration
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY', '')
    OPENAI_MODEL = os.getenv('OPENAI_MODEL', 'gpt-3.5-turbo')

    # Ollama Configuration
    OLLAMA_BASE_URL = os.getenv('OLLAMA_BASE_URL', 'http://localhost:11434')
    OLLAMA_MODEL = os.getenv('OLLAMA_MODEL', 'llama2')

    # Anthropic Configuration
    ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY', '')

    # Embedding Configuration
    EMBEDDING_MODEL = os.getenv('EMBEDDING_MODEL', 'all-MiniLM-L6-v2')

    # Vector Store Configuration
    CHROMA_PATH = os.getenv('CHROMA_PATH', './chroma_db')

    # Application Configuration
    DEBUG = os.getenv('DEBUG', 'False') == 'True'

settings = Settings()
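
A brief usage sketch (not part of this commit): other modules import the shared `settings` instance, and `load_dotenv()` has already populated the environment by the time it is constructed. Note that `DEBUG` is only truthy when the variable is set to the exact string `True`.

```python
# Hypothetical usage of the shared settings object (values shown are the defaults above)
from config.config import settings

print(settings.OLLAMA_BASE_URL)   # http://localhost:11434 unless overridden in .env
print(settings.EMBEDDING_MODEL)   # all-MiniLM-L6-v2
print(settings.DEBUG)             # False unless .env contains DEBUG=True
```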
requirements.txt ADDED
@@ -0,0 +1,23 @@
# Requirements for RAG Chatbot
fastapi==0.109.0
uvicorn==0.24.0
pydantic==2.6.1
python-dotenv==1.0.0

# LLM Providers
openai==1.12.0
anthropic==0.18.0
ollama==0.1.6

# Embedding and Vector Store
sentence-transformers==2.3.1
chromadb==0.4.22
huggingface_hub==0.20.3

# Optional: Additional dependencies
numpy==1.26.3
torch==2.1.2

PyPDF2==3.0.1
python-docx==1.0.1
requests==2.31.0
src/__init__.py ADDED
File without changes
src/agents/__init__.py ADDED
File without changes
src/agents/rag_agent.py ADDED
@@ -0,0 +1,106 @@
# src/agents/rag_agent.py
from dataclasses import dataclass
from typing import List, Optional

from ..llms.base_llm import BaseLLM
from ..embeddings.base_embedding import BaseEmbedding
from ..vectorstores.base_vectorstore import BaseVectorStore
from ..utils.text_splitter import split_text

@dataclass
class RAGResponse:
    response: str
    context_docs: Optional[List[str]] = None

class RAGAgent:
    def __init__(
        self,
        llm: BaseLLM,
        embedding: BaseEmbedding,
        vector_store: BaseVectorStore
    ):
        self.llm = llm
        self.embedding = embedding
        self.vector_store = vector_store

    def retrieve_context(
        self,
        query: str,
        top_k: int = 3
    ) -> List[str]:
        """
        Retrieve relevant context documents for a given query

        Args:
            query (str): Input query to find context for
            top_k (int): Number of top context documents to retrieve

        Returns:
            List[str]: List of retrieved context documents
        """
        # Embed the query
        query_embedding = self.embedding.embed_query(query)

        # Retrieve similar documents
        context_docs = self.vector_store.similarity_search(
            query_embedding,
            top_k=top_k
        )

        return context_docs

    def generate_response(
        self,
        query: str,
        context_docs: Optional[List[str]] = None
    ) -> RAGResponse:
        """
        Generate a response using the RAG approach

        Args:
            query (str): User input query
            context_docs (Optional[List[str]]): Optional pre-provided context documents

        Returns:
            RAGResponse: Response with generated text and context
        """
        # If no context provided, retrieve from the vector store
        if not context_docs:
            context_docs = self.retrieve_context(query)

        # Construct augmented prompt with context
        augmented_prompt = self._construct_prompt(query, context_docs)

        # Generate response using the LLM
        response = self.llm.generate(augmented_prompt)

        return RAGResponse(
            response=response,
            context_docs=context_docs
        )

    def _construct_prompt(
        self,
        query: str,
        context_docs: List[str]
    ) -> str:
        """
        Construct a prompt with retrieved context

        Args:
            query (str): Original user query
            context_docs (List[str]): Retrieved context documents

        Returns:
            str: Augmented prompt for the LLM
        """
        context_str = "\n\n".join(context_docs)

        return f"""
Context Information:
{context_str}

User Query: {query}

Based on the context, please provide a comprehensive and accurate response.
"""
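
A minimal wiring sketch for this agent, assuming a locally running Ollama server and documents already indexed in Chroma; the model names, paths, and query below are illustrative, not part of this commit.

```python
# Hypothetical end-to-end use of RAGAgent with the providers defined in this repository
from src.llms.ollama_llm import OllamaLanguageModel
from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
from src.vectorstores.chroma_vectorstore import ChromaVectorStore
from src.agents.rag_agent import RAGAgent

embedding = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")
vector_store = ChromaVectorStore(
    embedding_function=embedding.embed_documents,
    persist_directory="./chroma_db",
)
llm = OllamaLanguageModel(base_url="http://localhost:11434", model="llama2")

agent = RAGAgent(llm=llm, embedding=embedding, vector_store=vector_store)

# retrieve_context() is called internally when no context_docs are passed
result = agent.generate_response("What does the indexed documentation say about setup?")
print(result.response)
print(result.context_docs)
```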
src/embeddings/__init__.py ADDED
File without changes
src/embeddings/base_embedding.py ADDED
@@ -0,0 +1,30 @@
# src/embeddings/base_embedding.py
from abc import ABC, abstractmethod
from typing import List, Union

class BaseEmbedding(ABC):
    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embed a list of documents

        Args:
            texts (List[str]): List of texts to embed

        Returns:
            List[List[float]]: List of embeddings
        """
        pass

    @abstractmethod
    def embed_query(self, text: str) -> List[float]:
        """
        Embed a single query

        Args:
            text (str): Text to embed

        Returns:
            List[float]: Embedding vector
        """
        pass
src/embeddings/huggingface_embedding.py ADDED
@@ -0,0 +1,39 @@
# src/embeddings/huggingface_embedding.py
from typing import List
from sentence_transformers import SentenceTransformer

from .base_embedding import BaseEmbedding

class HuggingFaceEmbedding(BaseEmbedding):
    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """
        Initialize HuggingFace embedding model

        Args:
            model_name (str): Name of the embedding model
        """
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embed a list of documents

        Args:
            texts (List[str]): List of texts to embed

        Returns:
            List[List[float]]: List of embeddings
        """
        return self.model.encode(texts).tolist()

    def embed_query(self, text: str) -> List[float]:
        """
        Embed a single query

        Args:
            text (str): Text to embed

        Returns:
            List[float]: Embedding vector
        """
        return self.model.encode(text).tolist()
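
A quick shape check, for illustration only: the default `all-MiniLM-L6-v2` model produces 384-dimensional vectors, so `embed_documents` returns one 384-float list per input text and `embed_query` returns a single vector.

```python
# Illustrative sanity check of the embedding output shapes
from src.embeddings.huggingface_embedding import HuggingFaceEmbedding

embedding = HuggingFaceEmbedding()  # defaults to all-MiniLM-L6-v2

doc_vectors = embedding.embed_documents(["Ollama runs models locally.", "Chroma stores embeddings."])
query_vector = embedding.embed_query("Where are embeddings stored?")

print(len(doc_vectors), len(doc_vectors[0]))  # 2 384
print(len(query_vector))                      # 384
```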
src/llms/__init__.py ADDED
File without changes
src/llms/base_llm.py ADDED
@@ -0,0 +1,51 @@
# src/llms/base_llm.py
from abc import ABC, abstractmethod
from typing import List, Optional, Dict, Any

class BaseLLM(ABC):
    @abstractmethod
    def generate(
        self,
        prompt: str,
        max_tokens: Optional[int] = None,
        temperature: float = 0.7,
        **kwargs
    ) -> str:
        """
        Generate a response based on the given prompt

        Args:
            prompt (str): Input prompt for the model
            max_tokens (Optional[int]): Maximum number of tokens to generate
            temperature (float): Sampling temperature for randomness

        Returns:
            str: Generated response
        """
        pass

    @abstractmethod
    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize the input text

        Args:
            text (str): Input text to tokenize

        Returns:
            List[str]: List of tokens
        """
        pass

    @abstractmethod
    def count_tokens(self, text: str) -> int:
        """
        Count tokens in the input text

        Args:
            text (str): Input text to count tokens

        Returns:
            int: Number of tokens
        """
        pass
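
Any new provider only has to implement these three abstract methods. A hypothetical echo stub, handy for unit tests that should not hit a real provider:

```python
# Hypothetical test double implementing the BaseLLM interface (not part of this commit)
from typing import List, Optional

from src.llms.base_llm import BaseLLM

class EchoLLM(BaseLLM):
    def generate(
        self,
        prompt: str,
        max_tokens: Optional[int] = None,
        temperature: float = 0.7,
        **kwargs
    ) -> str:
        # Returning the prompt unchanged lets tests assert on prompt construction
        return prompt

    def tokenize(self, text: str) -> List[str]:
        return text.split()

    def count_tokens(self, text: str) -> int:
        return len(self.tokenize(text))
```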
src/llms/ollama_llm.py ADDED
@@ -0,0 +1,80 @@
# src/llms/ollama_llm.py
import requests
from typing import Optional, List

from .base_llm import BaseLLM

class OllamaLanguageModel(BaseLLM):
    def __init__(
        self,
        base_url: str = 'http://localhost:11434',
        model: str = 'llama2'
    ):
        """
        Initialize Ollama Language Model

        Args:
            base_url (str): Base URL for Ollama API
            model (str): Name of the Ollama model to use
        """
        self.base_url = base_url
        self.model = model

    def generate(
        self,
        prompt: str,
        max_tokens: Optional[int] = 150,
        temperature: float = 0.7,
        **kwargs
    ) -> str:
        """
        Generate response using the Ollama API

        Args:
            prompt (str): Input prompt
            max_tokens (Optional[int]): Maximum tokens to generate
            temperature (float): Sampling temperature

        Returns:
            str: Generated response
        """
        response = requests.post(
            f"{self.base_url}/api/generate",
            json={
                "model": self.model,
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": temperature,
                    "num_predict": max_tokens
                }
            }
        )

        response.raise_for_status()
        return response.json().get('response', '').strip()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize text

        Args:
            text (str): Input text to tokenize

        Returns:
            List[str]: List of tokens
        """
        # Simple whitespace tokenization
        return text.split()

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in the text

        Args:
            text (str): Input text to count tokens

        Returns:
            int: Number of tokens
        """
        return len(self.tokenize(text))
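
A short smoke-test sketch against a local Ollama instance; it assumes the `llama2` model has already been pulled (`ollama pull llama2`), and the prompt is illustrative.

```python
# Illustrative smoke test for OllamaLanguageModel
from src.llms.ollama_llm import OllamaLanguageModel

llm = OllamaLanguageModel(base_url="http://localhost:11434", model="llama2")

answer = llm.generate("Reply with a single word: ping", max_tokens=10, temperature=0.0)
print(answer)
print(llm.count_tokens(answer))
```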
src/llms/openai_llm.py ADDED
@@ -0,0 +1,76 @@
# src/llms/openai_llm.py
from typing import Optional, List

from openai import OpenAI

from .base_llm import BaseLLM

class OpenAILanguageModel(BaseLLM):
    def __init__(
        self,
        api_key: str,
        model: str = 'gpt-3.5-turbo'
    ):
        """
        Initialize OpenAI Language Model

        Args:
            api_key (str): OpenAI API key
            model (str): Name of the OpenAI model to use
        """
        # openai>=1.0 (as pinned in requirements.txt) uses a client object;
        # the module-level openai.ChatCompletion API was removed in that release.
        self.client = OpenAI(api_key=api_key)
        self.model = model

    def generate(
        self,
        prompt: str,
        max_tokens: Optional[int] = 150,
        temperature: float = 0.7,
        **kwargs
    ) -> str:
        """
        Generate response using the OpenAI API

        Args:
            prompt (str): Input prompt
            max_tokens (Optional[int]): Maximum tokens to generate
            temperature (float): Sampling temperature

        Returns:
            str: Generated response
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            temperature=temperature,
            **kwargs
        )

        return response.choices[0].message.content.strip()

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize text

        Args:
            text (str): Input text to tokenize

        Returns:
            List[str]: List of tokens
        """
        # Note: This is a placeholder. Exact OpenAI tokenization would require
        # a dedicated tokenizer such as tiktoken.
        return text.split()

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in the text

        Args:
            text (str): Input text to count tokens

        Returns:
            int: Number of tokens
        """
        # Approximate token counting
        return len(self.tokenize(text))
src/main.py ADDED
@@ -0,0 +1,66 @@
# src/main.py
from fastapi import FastAPI, Depends, HTTPException
from pydantic import BaseModel
from typing import List, Optional

from .agents.rag_agent import RAGAgent
from .llms.openai_llm import OpenAILanguageModel
from .llms.ollama_llm import OllamaLanguageModel
from .embeddings.huggingface_embedding import HuggingFaceEmbedding
from .vectorstores.chroma_vectorstore import ChromaVectorStore
from config.config import settings

app = FastAPI(title="RAG Chatbot API")

class ChatRequest(BaseModel):
    query: str
    context_docs: Optional[List[str]] = None
    llm_provider: str = 'openai'

class ChatResponse(BaseModel):
    response: str
    context: Optional[List[str]] = None

@app.post("/chat", response_model=ChatResponse)
async def chat_endpoint(request: ChatRequest):
    try:
        # Select LLM based on provider
        if request.llm_provider == 'openai':
            llm = OpenAILanguageModel(api_key=settings.OPENAI_API_KEY)
        elif request.llm_provider == 'ollama':
            llm = OllamaLanguageModel(base_url=settings.OLLAMA_BASE_URL)
        else:
            raise HTTPException(status_code=400, detail="Unsupported LLM provider")

        # Initialize embedding and vector store
        embedding = HuggingFaceEmbedding(model_name=settings.EMBEDDING_MODEL)
        vector_store = ChromaVectorStore(
            embedding_function=embedding.embed_documents,
            persist_directory=settings.CHROMA_PATH
        )

        # Create RAG agent
        rag_agent = RAGAgent(
            llm=llm,
            embedding=embedding,
            vector_store=vector_store
        )

        # Process query
        response = rag_agent.generate_response(
            query=request.query,
            context_docs=request.context_docs
        )

        return ChatResponse(
            response=response.response,
            context=response.context_docs
        )

    except HTTPException:
        # Re-raise client errors (e.g. unsupported provider) instead of masking them as 500s
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

# Optional: Health check endpoint
@app.get("/health")
async def health_check():
    return {"status": "healthy"}
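
With the development server running (`uvicorn src.main:app --reload`), the `/chat` endpoint can be exercised with a small client script; the host, query, and provider below are illustrative.

```python
# Illustrative client for the /chat endpoint defined above
import requests

payload = {
    "query": "Summarise the indexed documents in two sentences.",
    "llm_provider": "ollama",   # or "openai"
    # "context_docs": [...]     # optionally pass context directly and skip retrieval
}

resp = requests.post("http://localhost:8000/chat", json=payload)
resp.raise_for_status()

data = resp.json()  # matches the ChatResponse model
print(data["response"])
print(data["context"])
```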
src/utils/__init__.py ADDED
File without changes
src/utils/document_loader.py ADDED
@@ -0,0 +1,91 @@
# src/utils/document_loader.py
import os
from typing import List, Union
import PyPDF2
import docx

def load_document(file_path: str) -> str:
    """
    Load text from various document types

    Args:
        file_path (str): Path to the document file

    Returns:
        str: Extracted text from the document

    Raises:
        ValueError: If file type is not supported
    """
    # Get file extension
    _, ext = os.path.splitext(file_path)
    ext = ext.lower()

    # Load based on file type
    if ext == '.txt':
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    elif ext == '.pdf':
        return load_pdf(file_path)

    elif ext == '.docx':
        return load_docx(file_path)

    else:
        raise ValueError(f"Unsupported file type: {ext}")

def load_pdf(file_path: str) -> str:
    """
    Extract text from a PDF file

    Args:
        file_path (str): Path to PDF file

    Returns:
        str: Extracted text
    """
    text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            text += page.extract_text()
    return text

def load_docx(file_path: str) -> str:
    """
    Extract text from a DOCX file

    Args:
        file_path (str): Path to DOCX file

    Returns:
        str: Extracted text
    """
    doc = docx.Document(file_path)
    return '\n'.join([paragraph.text for paragraph in doc.paragraphs])

def load_documents_from_directory(
    directory: str,
    extensions: List[str] = ['.txt', '.pdf', '.docx']
) -> List[str]:
    """
    Load all documents from a directory

    Args:
        directory (str): Path to the directory
        extensions (List[str]): List of file extensions to load

    Returns:
        List[str]: List of document texts
    """
    documents = []
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if os.path.isfile(file_path) and any(filename.lower().endswith(ext) for ext in extensions):
            try:
                documents.append(load_document(file_path))
            except Exception as e:
                print(f"Error loading {filename}: {e}")

    return documents
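
This commit does not include an ingestion script for the `data/` directory mentioned in the README. A hypothetical sketch tying the loader together with the splitter, embeddings, and vector store (paths and parameters are assumptions):

```python
# Hypothetical ingestion sketch combining the utilities in this repository
from src.utils.document_loader import load_documents_from_directory
from src.utils.text_splitter import split_text, clean_text
from src.embeddings.huggingface_embedding import HuggingFaceEmbedding
from src.vectorstores.chroma_vectorstore import ChromaVectorStore

embedding = HuggingFaceEmbedding()
store = ChromaVectorStore(
    embedding_function=embedding.embed_documents,
    persist_directory="./chroma_db",
)

# Load every supported file under data/, clean it, and chunk it
documents = load_documents_from_directory("data")
chunks = []
for doc in documents:
    chunks.extend(split_text(clean_text(doc), chunk_size=500, overlap=50))

# add_documents computes embeddings via embedding_function when none are passed
store.add_documents(chunks)
print(f"Indexed {len(chunks)} chunks")
```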
src/utils/logger.py ADDED
@@ -0,0 +1,83 @@
# src/utils/logger.py
import logging
import sys
from typing import Optional

def setup_logger(
    name: str = "rag_chatbot",
    log_level: str = "INFO",
    log_file: Optional[str] = None
) -> logging.Logger:
    """
    Set up a comprehensive logger for the application

    Args:
        name (str): Name of the logger
        log_level (str): Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file (Optional[str]): Path to log file (optional)

    Returns:
        logging.Logger: Configured logger instance
    """
    # Create logger
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, log_level.upper()))

    # Create formatters
    console_formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    file_formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s'
    )

    # Console Handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setFormatter(console_formatter)
    logger.addHandler(console_handler)

    # File Handler (if log_file is provided)
    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(file_formatter)
        logger.addHandler(file_handler)

    return logger

# Global logger instance
logger = setup_logger()

class AppException(Exception):
    """
    Custom base exception for the application
    """
    def __init__(self, message: str, error_code: Optional[str] = None):
        """
        Initialize custom exception

        Args:
            message (str): Error message
            error_code (Optional[str]): Optional error code
        """
        self.message = message
        self.error_code = error_code
        super().__init__(self.message)

        # Log the exception
        logger.error(f"AppException: {message}")

class ConfigurationError(AppException):
    """Exception raised for configuration-related errors"""
    pass

class LLMProviderError(AppException):
    """Exception raised for LLM provider-related errors"""
    pass

class EmbeddingError(AppException):
    """Exception raised for embedding-related errors"""
    pass

class VectorStoreError(AppException):
    """Exception raised for vector store-related errors"""
    pass
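
A brief usage sketch (the module name and log file path are assumptions): callers can reuse the global `logger` or configure their own, and the custom exceptions log themselves when raised because `AppException.__init__` calls `logger.error`.

```python
# Illustrative use of setup_logger and the exception hierarchy
from src.utils.logger import setup_logger, LLMProviderError

log = setup_logger(name="ingestion", log_level="DEBUG", log_file="ingestion.log")
log.info("Starting ingestion run")

try:
    raise LLMProviderError("Ollama server unreachable", error_code="OLLAMA_DOWN")
except LLMProviderError as exc:
    # The exception already logged itself on construction; decide how to recover here
    log.warning(f"Falling back to another provider after: {exc.message} ({exc.error_code})")
```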
src/utils/text_splitter.py ADDED
@@ -0,0 +1,52 @@
# src/utils/text_splitter.py
from typing import List

def split_text(
    text: str,
    chunk_size: int = 500,
    overlap: int = 50
) -> List[str]:
    """
    Split a long text into smaller chunks

    Args:
        text (str): Input text to split
        chunk_size (int): Maximum size of each text chunk
        overlap (int): Number of characters to overlap between chunks

    Returns:
        List[str]: List of text chunks
    """
    chunks = []
    start = 0

    while start < len(text):
        # Extract chunk
        chunk = text[start:start + chunk_size]
        chunks.append(chunk)

        # Move start position with overlap
        start += chunk_size - overlap

    return chunks

def clean_text(text: str) -> str:
    """
    Clean and preprocess text

    Args:
        text (str): Input text to clean

    Returns:
        str: Cleaned text
    """
    # Remove extra whitespace
    text = ' '.join(text.split())

    # Add more cleaning steps as needed
    # For example:
    # - Remove special characters
    # - Convert to lowercase
    # - Remove HTML tags

    return text
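
A small illustration of the chunking behaviour: with `chunk_size=10` and `overlap=3`, each new chunk starts 7 characters after the previous one, so consecutive chunks share 3 characters.

```python
# Illustrative demonstration of split_text on a short string
from src.utils.text_splitter import split_text

chunks = split_text("abcdefghijklmnopqrstuvwxyz", chunk_size=10, overlap=3)
print(chunks)
# ['abcdefghij', 'hijklmnopq', 'opqrstuvwx', 'vwxyz']
```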
src/vectorstores/__init__.py ADDED
File without changes
src/vectorstores/base_vectorstore.py ADDED
@@ -0,0 +1,37 @@
# src/vectorstores/base_vectorstore.py
from abc import ABC, abstractmethod
from typing import List, Callable, Any

class BaseVectorStore(ABC):
    @abstractmethod
    def add_documents(
        self,
        documents: List[str],
        embeddings: List[List[float]]
    ) -> None:
        """
        Add documents to the vector store

        Args:
            documents (List[str]): List of document texts
            embeddings (List[List[float]]): Corresponding embeddings
        """
        pass

    @abstractmethod
    def similarity_search(
        self,
        query_embedding: List[float],
        top_k: int = 3
    ) -> List[str]:
        """
        Perform similarity search

        Args:
            query_embedding (List[float]): Embedding of the query
            top_k (int): Number of top similar documents to retrieve

        Returns:
            List[str]: List of most similar documents
        """
        pass
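
Like `BaseLLM`, this interface makes it easy to swap in a lightweight store for tests. A hypothetical in-memory implementation that ranks documents by dot product (a real store would normalise vectors for cosine similarity):

```python
# Hypothetical in-memory BaseVectorStore implementation (not part of this commit)
from typing import List

from src.vectorstores.base_vectorstore import BaseVectorStore

class InMemoryVectorStore(BaseVectorStore):
    def __init__(self):
        self._documents: List[str] = []
        self._embeddings: List[List[float]] = []

    def add_documents(self, documents: List[str], embeddings: List[List[float]]) -> None:
        self._documents.extend(documents)
        self._embeddings.extend(embeddings)

    def similarity_search(self, query_embedding: List[float], top_k: int = 3) -> List[str]:
        # Score every stored vector against the query and keep the best top_k
        def score(vec: List[float]) -> float:
            return sum(q * v for q, v in zip(query_embedding, vec))

        ranked = sorted(
            range(len(self._documents)),
            key=lambda i: score(self._embeddings[i]),
            reverse=True,
        )
        return [self._documents[i] for i in ranked[:top_k]]
```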
src/vectorstores/chroma_vectorstore.py ADDED
@@ -0,0 +1,68 @@
# src/vectorstores/chroma_vectorstore.py
import chromadb
from typing import List, Callable, Any

from .base_vectorstore import BaseVectorStore

class ChromaVectorStore(BaseVectorStore):
    def __init__(
        self,
        embedding_function: Callable[[List[str]], List[List[float]]],
        persist_directory: str = './chroma_db'
    ):
        """
        Initialize Chroma Vector Store

        Args:
            embedding_function (Callable): Function to generate embeddings
            persist_directory (str): Directory to persist the vector store
        """
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.collection = self.client.get_or_create_collection(name="documents")
        self.embedding_function = embedding_function

    def add_documents(
        self,
        documents: List[str],
        embeddings: List[List[float]] = None
    ) -> None:
        """
        Add documents to the vector store

        Args:
            documents (List[str]): List of document texts
            embeddings (List[List[float]], optional): Pre-computed embeddings
        """
        if not embeddings:
            embeddings = self.embedding_function(documents)

        # Generate unique IDs
        ids = [f"doc_{i}" for i in range(len(documents))]

        self.collection.add(
            documents=documents,
            embeddings=embeddings,
            ids=ids
        )

    def similarity_search(
        self,
        query_embedding: List[float],
        top_k: int = 3
    ) -> List[str]:
        """
        Perform similarity search

        Args:
            query_embedding (List[float]): Embedding of the query
            top_k (int): Number of top similar documents to retrieve

        Returns:
            List[str]: List of most similar documents
        """
        results = self.collection.query(
            query_embeddings=[query_embedding],
            n_results=top_k
        )

        return results.get('documents', [[]])[0]