{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Note: you may need to restart the kernel to use updated packages.\n" ] } ], "source": [ "pip install langchain-community tiktoken langchainhub langchain langchain-huggingface sentence_transformers langchain-ollama ollama docling easyocr FlagEmbedding chonkie pinecone --quiet" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "curl -fsSL https://ollama.com/install.sh | sh\n", "sleep 1\n", "ollama pull nomic-embed-text\n", "ollama pull mistral:7b" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "from typing import List, Union\n", "import logging\n", "from dataclasses import dataclass\n", "\n", "from langchain_core.documents import Document as LCDocument\n", "from langchain_core.document_loaders import BaseLoader\n", "from docling.document_converter import DocumentConverter, PdfFormatOption\n", "from docling.datamodel.base_models import InputFormat, ConversionStatus\n", "from docling.datamodel.pipeline_options import (\n", " PdfPipelineOptions,\n", " EasyOcrOptions\n", ")\n", "\n", "logging.basicConfig(level=logging.INFO)\n", "_log = logging.getLogger(__name__)\n", "\n", "@dataclass\n", "class ProcessingResult:\n", " \"\"\"Store results of document processing\"\"\"\n", " success_count: int = 0\n", " failure_count: int = 0\n", " partial_success_count: int = 0\n", " failed_files: List[str] = None\n", "\n", " def __post_init__(self):\n", " if self.failed_files is None:\n", " self.failed_files = []\n", "\n", "class MultiFormatDocumentLoader(BaseLoader):\n", " \"\"\"Loader for multiple document formats that converts to LangChain documents\"\"\"\n", " \n", " def __init__(\n", " self,\n", " file_paths: Union[str, List[str]],\n", " enable_ocr: bool = True,\n", " enable_tables: bool = True\n", " ):\n", " self._file_paths = [file_paths] if isinstance(file_paths, str) else file_paths\n", " self._enable_ocr = enable_ocr\n", " self._enable_tables = enable_tables\n", " self._converter = self._setup_converter()\n", " \n", " def _setup_converter(self):\n", " \"\"\"Set up the document converter with appropriate options\"\"\"\n", " # Configure pipeline options\n", " pipeline_options = PdfPipelineOptions(do_ocr=False, do_table_structure=False, ocr_options=EasyOcrOptions(\n", " force_full_page_ocr=True\n", " ))\n", " if self._enable_ocr:\n", " pipeline_options.do_ocr = True\n", " if self._enable_tables:\n", " pipeline_options.do_table_structure = True\n", " pipeline_options.table_structure_options.do_cell_matching = True\n", "\n", " # Create converter with supported formats\n", " return DocumentConverter(\n", " allowed_formats=[\n", " InputFormat.PDF,\n", " InputFormat.IMAGE,\n", " InputFormat.DOCX,\n", " InputFormat.HTML,\n", " InputFormat.PPTX,\n", " InputFormat.ASCIIDOC,\n", " InputFormat.MD,\n", " ],\n", " format_options={\n", " InputFormat.PDF: PdfFormatOption(\n", " pipeline_options=pipeline_options,\n", " )}\n", " )\n", "\n", " def lazy_load(self):\n", " \"\"\"Convert documents and yield LangChain documents\"\"\"\n", " results = ProcessingResult()\n", " \n", " for file_path in self._file_paths:\n", " try:\n", " path = Path(file_path)\n", " if not path.exists():\n", " _log.warning(f\"File not found: {file_path}\")\n", " results.failure_count += 1\n", " results.failed_files.append(file_path)\n", " continue\n", "\n", " conversion_result = self._converter.convert(path)\n", " \n", " if conversion_result.status == ConversionStatus.SUCCESS:\n", " results.success_count += 1\n", " text = conversion_result.document.export_to_markdown()\n", " metadata = {\n", " 'source': str(path),\n", " 'file_type': path.suffix,\n", " }\n", " yield LCDocument(\n", " page_content=text,\n", " metadata=metadata\n", " )\n", " elif conversion_result.status == ConversionStatus.PARTIAL_SUCCESS:\n", " results.partial_success_count += 1\n", " _log.warning(f\"Partial conversion for {file_path}\")\n", " text = conversion_result.document.export_to_markdown()\n", " metadata = {\n", " 'source': str(path),\n", " 'file_type': path.suffix,\n", " 'conversion_status': 'partial'\n", " }\n", " yield LCDocument(\n", " page_content=text,\n", " metadata=metadata\n", " )\n", " else:\n", " results.failure_count += 1\n", " results.failed_files.append(file_path)\n", " _log.error(f\"Failed to convert {file_path}\")\n", " \n", " except Exception as e:\n", " _log.error(f\"Error processing {file_path}: {str(e)}\")\n", " results.failure_count += 1\n", " results.failed_files.append(file_path)\n", "\n", " # Log final results\n", " total = results.success_count + results.partial_success_count + results.failure_count\n", " _log.info(\n", " f\"Processed {total} documents:\\n\"\n", " f\"- Successfully converted: {results.success_count}\\n\"\n", " f\"- Partially converted: {results.partial_success_count}\\n\"\n", " f\"- Failed: {results.failure_count}\"\n", " )\n", " if results.failed_files:\n", " _log.info(\"Failed files:\")\n", " for file in results.failed_files:\n", " _log.info(f\"- {file}\")\n", " \n", " \n", "# if __name__ == '__main__':\n", "# # Load documents from a list of file paths\n", "# loader = MultiFormatDocumentLoader(\n", "# file_paths=[\n", "# # './data/2404.19756v1.pdf',\n", "# # './data/OD429347375590223100.pdf',\n", "# '/teamspace/studios/this_studio/TabularRAG/data/FeesPaymentReceipt_7thsem.pdf',\n", "# # './data/UNIT 2 GENDER BASED VIOLENCE.pptx'\n", "# ],\n", "# enable_ocr=False,\n", "# enable_tables=True\n", "# )\n", "# for doc in loader.lazy_load():\n", "# print(doc.page_content)\n", "# print(doc.metadata)\n", "# # save document in .md file \n", "# with open('/teamspace/studios/this_studio/TabularRAG/data/output.md', 'w') as f:\n", "# f.write(doc.page_content)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from typing import List, Tuple, Union\n", "import re\n", "from dataclasses import dataclass\n", "from chonkie.chunker import RecursiveChunker\n", "from chonkie.types import RecursiveChunk\n", "from chonkie import RecursiveRules\n", "\n", "@dataclass\n", "class TableChunk:\n", " \"\"\"Represents a table chunk from the markdown document.\"\"\"\n", " text: str\n", " start_index: int\n", " end_index: int\n", " token_count: int\n", "\n", "class TableRecursiveChunker(RecursiveChunker):\n", " \"\"\"A recursive chunker that preserves markdown tables while chunking text.\n", " \n", " This chunker extends the base RecursiveChunker to handle markdown tables as special cases,\n", " keeping them intact rather than splitting them according to the recursive rules.\n", " \"\"\"\n", "\n", " def _extract_tables(self, text: str) -> Tuple[List[TableChunk], List[Tuple[int, int, str]]]:\n", " \"\"\"\n", " Extract markdown tables from text and return table chunks and remaining text segments.\n", " \n", " Args:\n", " text: The input text containing markdown content\n", " \n", " Returns:\n", " Tuple containing:\n", " - List of TableChunk objects for tables\n", " - List of (start_index, end_index, text) tuples for non-table segments\n", " \"\"\"\n", " # Regular expression for markdown tables (matches header, separator, and content rows)\n", " table_pattern = r'(\\|[^\\n]+\\|\\n\\|[-:\\|\\s]+\\|\\n(?:\\|[^\\n]+\\|\\n)+)'\n", " \n", " table_chunks = []\n", " non_table_segments = []\n", " last_end = 0\n", " \n", " for match in re.finditer(table_pattern, text):\n", " start, end = match.span()\n", " \n", " # Add non-table text before this table\n", " if start > last_end:\n", " non_table_segments.append((last_end, start, text[last_end:start]))\n", " \n", " # Create table chunk\n", " table_text = match.group()\n", " token_count = self._count_tokens(table_text)\n", " table_chunks.append(TableChunk(\n", " text=table_text,\n", " start_index=start,\n", " end_index=end,\n", " token_count=token_count\n", " ))\n", " \n", " last_end = end\n", " \n", " # Add remaining text after last table\n", " if last_end < len(text):\n", " non_table_segments.append((last_end, len(text), text[last_end:]))\n", " \n", " return table_chunks, non_table_segments\n", "\n", " def chunk(self, text: str) -> Tuple[List[RecursiveChunk], List[TableChunk]]:\n", " \"\"\"\n", " Chunk the text while preserving tables.\n", " \n", " This method overrides the base chunk method to handle tables separately from\n", " regular text content.\n", " \n", " Args:\n", " text: The input text to chunk\n", " \n", " Returns:\n", " Tuple containing:\n", " - List of RecursiveChunk objects for non-table text\n", " - List of TableChunk objects for tables\n", " \"\"\"\n", " # First extract tables\n", " table_chunks, non_table_segments = self._extract_tables(text)\n", " \n", " # Chunk each non-table segment using the parent class's recursive chunking\n", " text_chunks = []\n", " for start, end, segment in non_table_segments:\n", " if segment.strip(): # Only process non-empty segments\n", " # Use the parent class's recursive chunking logic\n", " chunks = super()._recursive_chunk(segment, level=0, full_text=text)\n", " text_chunks.extend(chunks)\n", " \n", " return text_chunks, table_chunks\n", "\n", " def chunk_batch(self, texts: List[str]) -> List[Tuple[List[RecursiveChunk], List[TableChunk]]]:\n", " \"\"\"\n", " Chunk multiple texts while preserving tables in each.\n", " \n", " Args:\n", " texts: List of texts to chunk\n", " \n", " Returns:\n", " List of tuples, each containing:\n", " - List of RecursiveChunk objects for non-table text\n", " - List of TableChunk objects for tables\n", " \"\"\"\n", " return [self.chunk(text) for text in texts]\n", "\n", " def __call__(self, texts: Union[str, List[str]]) -> Union[\n", " Tuple[List[RecursiveChunk], List[TableChunk]],\n", " List[Tuple[List[RecursiveChunk], List[TableChunk]]]\n", " ]:\n", " \"\"\"Make the chunker callable for convenience.\"\"\"\n", " if isinstance(texts, str):\n", " return self.chunk(texts)\n", " return self.chunk_batch(texts)\n", " \n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "from typing import List\n", "from langchain_ollama import OllamaEmbeddings\n", "\n", "class EmbeddingModel:\n", " def __init__(self, model_name: str = \"llama3.2\"):\n", " \"\"\"\n", " Initialize embedding model with LangChain OllamaEmbeddings\n", " \n", " Args:\n", " model_name (str): Name of the model to use\n", " \"\"\"\n", " self.model_name = model_name\n", " self.embeddings = OllamaEmbeddings(\n", " model=model_name\n", " )\n", "\n", " def embed(self, text: str) -> List[float]:\n", " \"\"\"\n", " Generate embeddings for a single text input\n", " \n", " Args:\n", " text (str): Input text to embed\n", " \n", " Returns:\n", " List[float]: Embedding vector\n", " \"\"\"\n", " try:\n", " # Use embed_query for single text embedding\n", " return self.embeddings.embed_query(text)\n", " except Exception as e:\n", " print(f\"Error generating embedding: {e}\")\n", " return []\n", "\n", " def embed_batch(self, texts: List[str]) -> List[List[float]]:\n", " \"\"\"\n", " Generate embeddings for multiple texts\n", " \n", " Args:\n", " texts (List[str]): List of input texts to embed\n", " \n", " Returns:\n", " List[List[float]]: List of embedding vectors\n", " \"\"\"\n", " try:\n", " # Use embed_documents for batch embedding\n", " return self.embeddings.embed_documents(texts)\n", " except Exception as e:\n", " print(f\"Error generating batch embeddings: {e}\")\n", " return []\n", " \n", "# if __name__ == \"__main__\":\n", "# # Initialize the embedding model\n", "# embedding_model = EmbeddingModel(model_name=\"llama3.2\")\n", "\n", "# # Generate embedding for a single text\n", "# single_text = \"The meaning of life is 42\"\n", "# vector = embedding_model.embed(single_text)\n", "# print(vector[:3]) # Print first 3 dimensions\n", "\n", "# # Generate embeddings for multiple texts\n", "# texts = [\"Document 1...\", \"Document 2...\"]\n", "# vectors = embedding_model.embed_batch(texts)\n", "# print(len(vectors)) # Number of vectors\n", "# print(vectors[0][:3]) # First 3 dimensions of first vector" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from typing import List, Dict, Optional\n", "from langchain_ollama import ChatOllama\n", "from langchain_core.messages import HumanMessage, AIMessage\n", "from langchain_core.prompts import ChatPromptTemplate\n", "\n", "class LLMChat:\n", " def __init__(self, model_name: str = \"llama3.2\", temperature: float = 0):\n", " \"\"\"\n", " Initialize LLMChat with LangChain ChatOllama\n", " \n", " Args:\n", " model_name (str): Name of the model to use\n", " temperature (float): Temperature parameter for response generation\n", " \"\"\"\n", " self.model_name = model_name\n", " self.llm = ChatOllama(\n", " model=model_name,\n", " temperature=temperature\n", " )\n", " self.history: List[Dict[str, str]] = []\n", "\n", " def chat_once(self, message: str):\n", " \"\"\"\n", " Single chat interaction without maintaining history\n", " \n", " Args:\n", " message (str): User input message\n", " \n", " Returns:\n", " str: Model's response\n", " \"\"\"\n", " try:\n", " # Create a simple prompt template for single messages\n", " prompt = ChatPromptTemplate.from_messages([\n", " (\"human\", \"{input}\")\n", " ])\n", " \n", " # Create and invoke the chain\n", " chain = prompt | self.llm\n", " response = chain.invoke({\"input\": message})\n", " \n", " return response.content\n", " except Exception as e:\n", " print(f\"Error in chat: {e}\")\n", " return \"\"\n", "\n", " def chat_with_history(self, message: str):\n", " \"\"\"\n", " Chat interaction maintaining conversation history\n", " \n", " Args:\n", " message (str): User input message\n", " \n", " Returns:\n", " str: Model's response\n", " \"\"\"\n", " try:\n", " # Add user message to history\n", " self.history.append({'role': 'human', 'content': message})\n", " \n", " # Convert history to LangChain message format\n", " messages = [\n", " HumanMessage(content=msg['content']) if msg['role'] == 'human'\n", " else AIMessage(content=msg['content'])\n", " for msg in self.history\n", " ]\n", " \n", " # Get response using chat method\n", " response = self.llm.invoke(messages)\n", " assistant_message = response.content\n", " \n", " # Add assistant response to history\n", " self.history.append({'role': 'assistant', 'content': assistant_message})\n", " \n", " return assistant_message\n", " except Exception as e:\n", " print(f\"Error in chat with history: {e}\")\n", " return \"\"\n", "\n", " def chat_with_template(self, template_messages: List[Dict[str, str]], \n", " input_variables: Dict[str, str]):\n", " \"\"\"\n", " Chat using a custom template\n", " \n", " Args:\n", " template_messages (List[Dict[str, str]]): List of template messages\n", " input_variables (Dict[str, str]): Variables to fill in the template\n", " \n", " Returns:\n", " str: Model's response\n", " \"\"\"\n", " try:\n", " # Create prompt template from messages\n", " prompt = ChatPromptTemplate.from_messages([\n", " (msg['role'], msg['content'])\n", " for msg in template_messages\n", " ])\n", " \n", " # Create and invoke the chain\n", " chain = prompt | self.llm\n", " response = chain.invoke(input_variables)\n", " \n", " return response.content\n", " except Exception as e:\n", " print(f\"Error in template chat: {e}\")\n", " return \"\"\n", "\n", " def clear_history(self):\n", " \"\"\"Clear the conversation history\"\"\"\n", " self.history = []\n", "\n", " def get_history(self) -> List[Dict[str, str]]:\n", " \"\"\"Return the current conversation history\"\"\"\n", " return self.history\n", " \n", "# if __name__ == \"__main__\":\n", "# # Initialize the chat\n", "# chat = LLMChat(model_name=\"llama3.1\", temperature=0)\n", "\n", "# # Example of using a template for translation\n", "# template_messages = [\n", "# {\n", "# \"role\": \"system\",\n", "# \"content\": \"You are a helpful assistant that translates {input_language} to {output_language}.\"\n", "# },\n", "# {\n", "# \"role\": \"human\",\n", "# \"content\": \"{input}\"\n", "# }\n", "# ]\n", "\n", "# input_vars = {\n", "# \"input_language\": \"English\",\n", "# \"output_language\": \"German\",\n", "# \"input\": \"I love programming.\"\n", "# }\n", "\n", "# response = chat.chat_with_template(template_messages, input_vars)\n", "# # Simple chat without history\n", "# response = chat.chat_once(\"Hello!\")\n", "\n", "# # Chat with history\n", "# response = chat.chat_with_history(\"How are you?\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "from typing import List, Dict, Any\n", "from tqdm import tqdm\n", "import time\n", "\n", "# from src.embedding import EmbeddingModel\n", "# from src.llm import LLMChat\n", "\n", "class TableProcessor:\n", " def __init__(self, llm_model: LLMChat, embedding_model: EmbeddingModel, batch_size: int = 8):\n", " \"\"\"\n", " Initialize the TableProcessor with pre-initialized models.\n", " \n", " Args:\n", " llm_model (LLMChat): Initialized LLM model\n", " embedding_model (EmbeddingModel): Initialized embedding model\n", " batch_size (int): Batch size for processing embeddings\n", " \"\"\"\n", " self.llm = llm_model\n", " self.embedder = embedding_model\n", " self.batch_size = batch_size\n", " \n", " def get_table_description(self, markdown_table: str) -> str:\n", " \"\"\"\n", " Generate description for a single markdown table using Ollama chat.\n", " \n", " Args:\n", " markdown_table (str): Input markdown table\n", " \n", " Returns:\n", " str: Generated description of the table\n", " \"\"\"\n", " system_prompt = \"\"\"You are an AI language model. Your task is to examine the provided table, taking into account both its rows and columns, and produce a concise summary of up to 200 words. Emphasize key patterns, trends, and notable data points that provide meaningful insights into the content of the table.\"\"\"\n", " \n", " try:\n", " # Use chat_once to avoid maintaining history between tables\n", " full_prompt = f\"{system_prompt}\\n\\nTable:\\n{markdown_table}\"\n", " return self.llm.chat_once(full_prompt)\n", " except Exception as e:\n", " print(f\"Error generating table description: {e}\")\n", " return \"\"\n", " \n", " def process_tables(self, markdown_tables) -> List[Dict[str, Any]]:\n", " \"\"\"\n", " Process a list of markdown tables: generate descriptions and embeddings.\n", " \n", " Args:\n", " markdown_tables (List[str]): List of markdown tables to process\n", " \n", " Returns:\n", " List[Dict[str, Any]]: List of dictionaries containing processed information\n", " \"\"\"\n", " results = []\n", " descriptions = []\n", " \n", " # Generate descriptions for all tables\n", " with tqdm(total=len(markdown_tables), desc=\"Generating table descriptions\") as pbar:\n", " for i, table in enumerate(markdown_tables):\n", " description = self.get_table_description(table.text)\n", " print(f\"\\nTable {i+1}:\")\n", " print(f\"Description: {description}\")\n", " print(\"-\" * 50)\n", " descriptions.append(description)\n", " pbar.update(1)\n", " time.sleep(1) # Rate limiting\n", " \n", " # Generate embeddings in batches\n", " embeddings = []\n", " total_batches = (len(descriptions) + self.batch_size - 1) // self.batch_size\n", " \n", " with tqdm(total=total_batches, desc=\"Generating embeddings\") as pbar:\n", " for i in range(0, len(descriptions), self.batch_size):\n", " batch = descriptions[i:i + self.batch_size]\n", " if len(batch) == 1:\n", " batch_embeddings = [self.embedder.embed(batch[0])]\n", " else:\n", " batch_embeddings = self.embedder.embed_batch(batch)\n", " embeddings.extend(batch_embeddings)\n", " pbar.update(1)\n", " \n", " # Combine results with progress bar\n", " with tqdm(total=len(markdown_tables), desc=\"Combining results\") as pbar:\n", " for table, description, embedding in zip(markdown_tables, descriptions, embeddings):\n", " results.append({\n", " \"embedding\": embedding,\n", " \"text\": table,\n", " \"table_description\": description,\n", " \"type\": \"table_chunk\"\n", " })\n", " pbar.update(1)\n", " \n", " return results\n", "\n", " def __call__(self, markdown_tables) -> List[Dict[str, Any]]:\n", " \"\"\"\n", " Make the class callable for easier use.\n", " \n", " Args:\n", " markdown_tables (List[str]): List of markdown tables to process\n", " \n", " Returns:\n", " List[Dict[str, Any]]: Processed results\n", " \"\"\"\n", " return self.process_tables(markdown_tables)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "from typing import List, Dict, Any, Optional\n", "import pandas as pd\n", "import time\n", "from tqdm import tqdm\n", "import logging\n", "from pinecone import Pinecone, ServerlessSpec\n", "from dataclasses import dataclass\n", "from enum import Enum\n", "# from src.table_aware_chunker import TableRecursiveChunker\n", "# from src.processor import TableProcessor\n", "# from src.llm import LLMChat\n", "# from src.embedding import EmbeddingModel\n", "from chonkie import RecursiveRules\n", "# from src.loader import MultiFormatDocumentLoader\n", "from dotenv import load_dotenv\n", "import os\n", "\n", "load_dotenv()\n", "# API Keys\n", "PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')\n", "\n", "logging.basicConfig(\n", " level=logging.INFO,\n", " format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'\n", ")\n", "logger = logging.getLogger('table_aware_rag')\n", "\n", "class ChunkType(Enum):\n", " TEXT = \"text_chunk\"\n", " TABLE = \"table_chunk\"\n", "\n", "@dataclass\n", "class ProcessedChunk:\n", " text: str # This will be the embedable text (table description for tables)\n", " chunk_type: ChunkType\n", " token_count: int\n", " markdown_table: Optional[str] = None # Store original markdown table format\n", " start_index: Optional[int] = None\n", " end_index: Optional[int] = None\n", "\n", "def process_documents(\n", " file_paths: List[str],\n", " chunker: TableRecursiveChunker,\n", " processor: TableProcessor,\n", " output_path: str = './output.md'\n", ") -> List[ProcessedChunk]:\n", " \"\"\"\n", " Process documents into text and table chunks\n", " \"\"\"\n", " # Load documents\n", " loader = MultiFormatDocumentLoader(\n", " file_paths=file_paths,\n", " enable_ocr=False,\n", " enable_tables=True\n", " )\n", " \n", " # Save to markdown and read content\n", " with open(output_path, 'w') as f:\n", " for doc in loader.lazy_load():\n", " f.write(doc.page_content)\n", " \n", " with open(output_path, 'r') as file:\n", " text = file.read()\n", " \n", " # Get text and table chunks\n", " text_chunks, table_chunks = chunker.chunk(text)\n", " \n", " # Process chunks\n", " processed_chunks = []\n", " \n", " # Process text chunks\n", " for chunk in text_chunks:\n", " processed_chunks.append(\n", " ProcessedChunk(\n", " text=chunk.text,\n", " chunk_type=ChunkType.TEXT,\n", " token_count=chunk.token_count,\n", " start_index=chunk.start_index,\n", " end_index=chunk.end_index\n", " )\n", " )\n", " \n", " # Process table chunks\n", " table_results = processor(table_chunks)\n", " for table in table_results:\n", " # Convert table chunk to string representation if needed\n", " table_str = str(table[\"text\"].text)\n", " \n", " processed_chunks.append(\n", " ProcessedChunk(\n", " text=table[\"table_description\"], # Use description for embedding\n", " chunk_type=ChunkType.TABLE,\n", " token_count=len(table[\"table_description\"].split()),\n", " markdown_table=table_str # Store string version of table\n", " )\n", " )\n", " \n", " return processed_chunks\n", "\n", "class PineconeRetriever:\n", " def __init__(\n", " self,\n", " pinecone_client: Pinecone,\n", " index_name: str,\n", " namespace: str,\n", " embedding_model: Any,\n", " llm_model: Any\n", " ):\n", " \"\"\"\n", " Initialize retriever with configurable models\n", " \"\"\"\n", " self.pinecone = pinecone_client\n", " self.index = self.pinecone.Index(index_name)\n", " self.namespace = namespace\n", " self.embedding_model = embedding_model\n", " self.llm_model = llm_model\n", " \n", " def _prepare_query(self, question: str) -> List[float]:\n", " \"\"\"Generate embedding for query\"\"\"\n", " return self.embedding_model.embed(question)\n", " \n", " def invoke(\n", " self,\n", " question: str,\n", " top_k: int = 5,\n", " chunk_type_filter: Optional[ChunkType] = None\n", " ) -> List[Dict[str, Any]]:\n", " \"\"\"\n", " Retrieve similar documents with optional filtering by chunk type\n", " \"\"\"\n", " query_embedding = self._prepare_query(question)\n", " \n", " # Prepare filter if chunk type specified\n", " filter_dict = None\n", " if chunk_type_filter:\n", " filter_dict = {\"chunk_type\": chunk_type_filter.value}\n", " \n", " results = self.index.query(\n", " namespace=self.namespace,\n", " vector=query_embedding,\n", " top_k=top_k,\n", " include_values=False,\n", " include_metadata=True,\n", " filter=filter_dict\n", " )\n", " \n", " retrieved_docs = []\n", " for match in results.matches:\n", " doc = {\n", " \"score\": match.score,\n", " \"chunk_type\": match.metadata[\"chunk_type\"]\n", " }\n", " \n", " # Handle different chunk types\n", " if match.metadata[\"chunk_type\"] == ChunkType.TABLE.value:\n", " doc[\"table_description\"] = match.metadata[\"text\"] # The embedded description\n", " doc[\"markdown_table\"] = match.metadata[\"markdown_table\"] # Original table format\n", " else:\n", " doc[\"page_content\"] = match.metadata[\"text\"]\n", " \n", " retrieved_docs.append(doc)\n", " \n", " return retrieved_docs\n", "\n", "def ingest_data(\n", " processed_chunks: List[ProcessedChunk],\n", " embedding_model: Any,\n", " pinecone_client: Pinecone,\n", " index_name: str = \"vector-index\",\n", " namespace: str = \"rag\",\n", " batch_size: int = 100\n", "):\n", " \"\"\"\n", " Ingest processed chunks into Pinecone\n", " \"\"\"\n", " # Create or get index\n", " if not pinecone_client.has_index(index_name):\n", " pinecone_client.create_index(\n", " name=index_name,\n", " dimension=768,\n", " metric=\"cosine\",\n", " spec=ServerlessSpec(\n", " cloud='aws',\n", " region='us-east-1'\n", " )\n", " )\n", " \n", " while not pinecone_client.describe_index(index_name).status['ready']:\n", " time.sleep(1)\n", " \n", " index = pinecone_client.Index(index_name)\n", " \n", " # Process in batches\n", " for i in tqdm(range(0, len(processed_chunks), batch_size)):\n", " batch = processed_chunks[i:i+batch_size]\n", " \n", " # Generate embeddings for the text content\n", " texts = [chunk.text for chunk in batch]\n", " embeddings = embedding_model.embed_batch(texts)\n", " \n", " # Prepare records\n", " records = []\n", " for idx, chunk in enumerate(batch):\n", " metadata = {\n", " \"text\": chunk.text, # This is the description for tables\n", " \"chunk_type\": chunk.chunk_type.value,\n", " \"token_count\": chunk.token_count\n", " }\n", " \n", " # Add markdown table to metadata if it's a table chunk\n", " if chunk.markdown_table is not None:\n", " # Ensure the table is in string format\n", " metadata[\"markdown_table\"] = str(chunk.markdown_table)\n", " \n", " records.append({\n", " \"id\": f\"chunk_{i + idx}\",\n", " \"values\": embeddings[idx],\n", " \"metadata\": metadata\n", " })\n", " \n", " # Upsert to Pinecone\n", " try:\n", " index.upsert(vectors=records, namespace=namespace)\n", " except Exception as e:\n", " logger.error(f\"Error during upsert: {str(e)}\")\n", " logger.error(f\"Problematic record metadata: {records[0]['metadata']}\")\n", " raise\n", " \n", " time.sleep(0.5) # Rate limiting\n", "\n", "\n", "# def main():\n", "# # Initialize components\n", "# pc = Pinecone(api_key=PINECONE_API_KEY)\n", " \n", "# chunker = TableRecursiveChunker(\n", "# tokenizer=\"gpt2\",\n", "# chunk_size=512,\n", "# rules=RecursiveRules(),\n", "# min_characters_per_chunk=12\n", "# )\n", " \n", "# llm = LLMChat(\"qwen2.5:0.5b\")\n", "# embedder = EmbeddingModel(\"nomic-embed-text\")\n", " \n", "# processor = TableProcessor(\n", "# llm_model=llm,\n", "# embedding_model=embedder,\n", "# batch_size=8\n", "# )\n", " \n", "# try:\n", "# # Process documents\n", "# processed_chunks = process_documents(\n", "# file_paths=['/teamspace/studios/this_studio/TabularRAG/data/FeesPaymentReceipt_7thsem.pdf'],\n", "# chunker=chunker,\n", "# processor=processor\n", "# )\n", " \n", "# # Ingest data\n", "# ingest_data(\n", "# processed_chunks=processed_chunks,\n", "# embedding_model=embedder,\n", "# pinecone_client=pc\n", "# )\n", " \n", "# # Test retrieval\n", "# retriever = PineconeRetriever(\n", "# pinecone_client=pc,\n", "# index_name=\"vector-index\",\n", "# namespace=\"rag\",\n", "# embedding_model=embedder,\n", "# llm_model=llm\n", "# )\n", " \n", " # # Test text-only retrieval\n", " # text_results = retriever.invoke(\n", " # question=\"What is paid fees amount?\",\n", " # top_k=3,\n", " # chunk_type_filter=ChunkType.TEXT\n", " # )\n", " # print(\"Text results:\")\n", " # for result in text_results:\n", " # print(result)\n", " # Test table-only retrieval\n", " # table_results = retriever.invoke(\n", " # question=\"What is paid fees amount?\",\n", " # top_k=3,\n", " # chunk_type_filter=ChunkType.TABLE\n", " # )\n", " # print(\"Table results:\")\n", " # for result in table_results:\n", " # print(result)\n", " \n", " # results = retriever.invoke(\n", " # question=\"What is paid fees amount?\",\n", " # top_k=3\n", " # )\n", " \n", " # for i, result in enumerate(results, 1):\n", " # print(f\"\\nResult {i}:\")\n", " # if result[\"chunk_type\"] == ChunkType.TABLE.value:\n", " # print(f\"Table Description: {result['table_description']}\")\n", " # print(\"Table Format:\")\n", " # print(result['markdown_table'])\n", " # else:\n", " # print(f\"Content: {result['page_content']}\")\n", " # print(f\"Score: {result['score']}\")\n", " \n", " # except Exception as e:\n", " # logger.error(f\"Error in pipeline: {str(e)}\")\n", "\n", "# if __name__ == \"__main__\":\n", "# main()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import tempfile\n", "import os\n", "from typing import List, Dict\n", "from pinecone import Pinecone\n", "# from src.table_aware_chunker import TableRecursiveChunker\n", "# from src.processor import TableProcessor\n", "# from src.llm import LLMChat\n", "# from src.embedding import EmbeddingModel\n", "from chonkie import RecursiveRules\n", "# from src.vectordb import ChunkType, process_documents, ingest_data, PineconeRetriever\n", "\n", "class TableRAGSystem:\n", " def __init__(self, pinecone_api_key: str):\n", " \"\"\"Initialize the Table RAG system with necessary components.\"\"\"\n", " self.pc = Pinecone(api_key=pinecone_api_key)\n", " \n", " # Initialize LLM\n", " self.llm = LLMChat(\n", " model_name=\"mistral:7b\",\n", " temperature=0.3\n", " )\n", " \n", " # Initialize Embeddings\n", " self.embedder = EmbeddingModel(\"nomic-embed-text\")\n", " \n", " # Initialize Chunker\n", " self.chunker = TableRecursiveChunker(\n", " tokenizer=\"gpt2\",\n", " chunk_size=512,\n", " rules=RecursiveRules(),\n", " min_characters_per_chunk=12\n", " )\n", " \n", " # Initialize Processor\n", " self.processor = TableProcessor(\n", " llm_model=self.llm,\n", " embedding_model=self.embedder,\n", " batch_size=8\n", " )\n", " \n", " self.retriever = None\n", " \n", " def process_documents(self, file_paths: List[str]) -> bool:\n", " \"\"\"Process documents and initialize the retriever.\"\"\"\n", " try:\n", " # Process documents\n", " print(\"Processing documents...\")\n", " processed_chunks = process_documents(\n", " file_paths=file_paths,\n", " chunker=self.chunker,\n", " processor=self.processor,\n", " output_path='./output.md'\n", " )\n", " \n", " # Ingest data\n", " print(\"Ingesting data to vector database...\")\n", " ingest_data(\n", " processed_chunks=processed_chunks,\n", " embedding_model=self.embedder,\n", " pinecone_client=self.pc\n", " )\n", " \n", " # Setup retriever\n", " print(\"Setting up retriever...\")\n", " self.retriever = PineconeRetriever(\n", " pinecone_client=self.pc,\n", " index_name=\"vector-index\",\n", " namespace=\"rag\",\n", " embedding_model=self.embedder,\n", " llm_model=self.llm\n", " )\n", " \n", " print(\"Processing complete!\")\n", " return True\n", "\n", " except Exception as e:\n", " print(f\"Error processing documents: {str(e)}\")\n", " return False\n", "\n", " def format_context(self, results: List[Dict]) -> str:\n", " \"\"\"Format retrieved results into context string.\"\"\"\n", " context_parts = []\n", " \n", " for result in results:\n", " if result.get(\"chunk_type\") == ChunkType.TABLE.value:\n", " table_text = f\"Table: {result['markdown_table']}\"\n", " if result.get(\"table_description\"):\n", " table_text += f\"\\nDescription: {result['table_description']}\"\n", " context_parts.append(table_text)\n", " else:\n", " context_parts.append(result.get(\"page_content\", \"\"))\n", " \n", " return \"\\n\\n\".join(context_parts)\n", "\n", " def query(self, question: str) -> Dict:\n", " \"\"\"Query the system with a question.\"\"\"\n", " if not self.retriever:\n", " raise ValueError(\"Documents must be processed before querying\")\n", " \n", " # Retrieve relevant content\n", " results = self.retriever.invoke(\n", " question=question,\n", " top_k=3\n", " )\n", " \n", " # Format context and get response from LLM\n", " context = self.format_context(results)\n", " \n", " # RAG Template\n", " rag_template = [\n", " {\n", " \"role\": \"system\",\n", " \"content\": \"\"\"You are a knowledgeable assistant specialized in analyzing documents and tables. \n", " Your responses should be:\n", " - Accurate and based on the provided context\n", " - Concise (three sentences maximum)\n", " - Professional yet conversational\n", " - Include specific references to tables when relevant\n", " \n", " If you cannot find an answer in the context, acknowledge this clearly.\"\"\"\n", " },\n", " {\n", " \"role\": \"human\",\n", " \"content\": \"Context: {context}\\n\\nQuestion: {question}\"\n", " }\n", " ]\n", " \n", " input_vars = {\n", " \"question\": question,\n", " \"context\": context\n", " }\n", "\n", " response = self.llm.chat_with_template(rag_template, input_vars)\n", " \n", " return {\n", " \"response\": response,\n", " \"context\": context,\n", " \"retrieved_results\": results\n", " }\n", "\n", " def clear_index(self, index_name: str = \"vector-index\"):\n", " \"\"\"Clear the Pinecone index.\"\"\"\n", " try:\n", " self.pc.delete_index(index_name)\n", " self.retriever = None\n", " print(\"Database cleared successfully!\")\n", " except Exception as e:\n", " print(f\"Error clearing database: {str(e)}\")" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])\n", "INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference\n", "INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone\n", "INFO:docling.document_converter:Going to convert document batch...\n", "INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Processing documents...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'\n", "INFO:docling.pipeline.base_pipeline:Processing document FeesPaymentReceipt_7thsem.pdf\n", "INFO:docling.document_converter:Finished converting document FeesPaymentReceipt_7thsem.pdf in 6.28 sec.\n", "INFO:__main__:Processed 1 documents:\n", "- Successfully converted: 1\n", "- Partially converted: 0\n", "- Failed: 0\n", "Generating table descriptions: 0%| | 0/1 [00:00\n", "\n", "## THE NEOTIA UNIVERSITY\n", "\n", "Diamond Harbour Road, Sarisha Hat, Sarisha, West Bengal - 743368, India\n", "\n", "Payment Receipt\n", "\n", "Student Details\n", "\n", "Receipt Date\n", "\n", "03/07/2024\n", "\n", "Name\n", "\n", ":\n", "\n", "ANINDYA MITRA\n", "\n", "UID No.\n", "\n", "Course\n", "\n", ":\n", "\n", "Contact No.\n", "\n", "Installment\n", "\n", ":\n", "\n", "Payment Type :\n", "\n", ":\n", "\n", "TNU2021053100042\n", "\n", "Bachelor of Technology in Computer Science & Engineering with\n", "\n", "8240716218\n", "\n", "Semester Fee-7\n", "\n", "Online Payment\n", "\n", "\n", "\n", "Table: | Heads | Amount |\n", "|----------------------------------------------------------|----------------------------------------------------------|\n", "| Outstanding(Tuition Fees & Others) | Outstanding(Tuition Fees & Others) |\n", "| Outstanding(Fooding) | Outstanding(Fooding) |\n", "| Tuition Fee | 22500 |\n", "| Other Charges | |\n", "| Lodging including facilities(for one semester) e P A I D | Lodging including facilities(for one semester) e P A I D |\n", "| Excess | Excess |\n", "| Late Fine 22500 Total | Late Fine 22500 Total |\n", "\n", "Description: The table provides a breakdown of various costs associated with educational expenses, including tuition fees, lodging, fooding, and other charges. The most significant cost is the tuition fee at $22,500. It's interesting to note that there are two categories labeled as \"Outstanding\" for both tuition fees & others, and fooding, suggesting that these costs have not been fully paid.\n", "\n", " The lodging including facilities for one semester is also a substantial cost, although the amount is not specified in this table. The presence of an \"Excess\" and \"Late Fine 22500 Total\" categories implies that there may be additional fees for late payments or exceeding certain limits.\n", "\n", " Overall, the data suggests that the total educational costs can be quite high, with a significant portion of these costs being outstanding, potentially indicating a need for financial planning and budgeting strategies to manage these expenses effectively.\n", "\n", "\n", "For THE NEOTIA UNIVERSITY\n", "\n", ":\n", "\n", ":\n", "\n", "\n", "\n", "## THE NEOTIA UNIVERSITY\n", "\n", "Diamond Harbour Road, Sarisha Hat, Sarisha, West Bengal - 743368, India\n", "\n", "Payment Receipt\n", "\n", "Semester Fee-7\n", "\n", ":\n", "\n", "Student Details\n", "\n", "8240716218\n", "\n", ": UID No.\n", "\n", "TNU2021053100042\n", "\n", "Name\n", "\n", ":\n", "\n", "ANINDYA MITRA\n", "\n", "Contact No.\n", "\n", "Installment\n", "\n", ":\n", "\n", "Receipt Date\n", "\n", "03/07/2024\n", "\n", ":\n", "\n", "Bachelor of Technology in Computer Science & Engineering with\n", "\n", "Course\n", "\n", ":\n", "\n", "Online Payment\n", "\n", "Payment Type :\n", "\n", ": 418511050700\n", "\n", "Bank Ref. No. e P A I D\n", "\n", ":\n", "\n", "Transaction Ref. No.\n", "\n", "Bank Merchant ID\n", "\n", "005693\n", "\n", "Transaction ID\n", "\n", ":\n", "\n", ":\n", "\n", "Service Charges : NA\n", "\n", "22500\n", "\n", "Online Payment Total\n", "\n", "On-line Payment Details\n", "\n", "For THE NEOTIA UNIVERSITY\n" ] } ], "source": [ "print(\"Answer:\", result[\"response\"])\n", "print(\"\\nRelevant Context:\", result[\"context\"])" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Database cleared successfully!\n" ] } ], "source": [ "\n", "# Clear the database when done\n", "rag_system.clear_index()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 2 }