In [1]:
# Install the third-party packages this notebook imports (LangChain stack, Ollama client,
# Docling + EasyOCR, Chonkie, Pinecone).
# NOTE(review): versions are unpinned, and `tqdm`, `pandas` and `dotenv` are imported by later
# cells but not listed here — presumably they arrive transitively; confirm for a fresh environment.
pip install langchain-community  tiktoken  langchainhub langchain  langchain-huggingface sentence_transformers langchain-ollama ollama docling easyocr FlagEmbedding chonkie pinecone --quiet

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Install the Ollama runtime, then download the two models used further down:
# `nomic-embed-text` (embeddings) and `mistral:7b` (chat).
# NOTE(review): these are shell commands; from a Python kernel the cell presumably needs a
# `%%bash` cell magic (or `!` prefixes), and `ollama pull` presumably needs the Ollama server
# to be running — confirm.
curl -fsSL https://ollama.com/install.sh | sh
sleep 1
ollama pull nomic-embed-text
ollama pull mistral:7b

In [1]:
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Union

from docling.datamodel.base_models import InputFormat, ConversionStatus
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    EasyOcrOptions
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document as LCDocument

# Configure root logging at INFO so the loader's warnings and end-of-run summary are shown,
# and create the module-level logger used by MultiFormatDocumentLoader.
logging.basicConfig(level=logging.INFO)
_log = logging.getLogger(__name__)

@dataclass
class ProcessingResult:
    """Running tally of document-conversion outcomes.

    Attributes:
        success_count: Number of files converted completely.
        failure_count: Number of files that were missing, failed to convert,
            or raised an error.
        partial_success_count: Number of files converted only partially.
        failed_files: The inputs counted in ``failure_count``.
    """
    success_count: int = 0
    failure_count: int = 0
    partial_success_count: int = 0
    # default_factory gives every instance its own list and keeps the
    # annotation truthful (the old default was None despite List[str]).
    failed_files: List[str] = field(default_factory=list)

    def __post_init__(self):
        # Backward compatibility: callers may still pass failed_files=None.
        if self.failed_files is None:
            self.failed_files = []

class MultiFormatDocumentLoader(BaseLoader):
    """Loader for multiple document formats that converts to LangChain documents.

    Every input file is run through a Docling ``DocumentConverter`` and
    exported to markdown. One LangChain document is yielded for each file
    that converts fully or partially; missing files, failed conversions and
    errors are logged and skipped.
    """

    def __init__(
        self,
        file_paths: Union[str, Path, List[Union[str, Path]]],
        enable_ocr: bool = True,
        enable_tables: bool = True
    ):
        """
        Args:
            file_paths: A single path or a list of paths (``str`` or ``Path``).
            enable_ocr: Enable OCR in the PDF pipeline.
            enable_tables: Enable table-structure recognition (with cell
                matching) in the PDF pipeline.
        """
        # A lone path is wrapped in a list. Path objects are handled too:
        # previously only ``str`` was wrapped, so a bare Path was stored
        # as-is and broke iteration in lazy_load.
        if isinstance(file_paths, (str, Path)):
            self._file_paths = [file_paths]
        else:
            self._file_paths = list(file_paths)
        self._enable_ocr = enable_ocr
        self._enable_tables = enable_tables
        self._converter = self._setup_converter()

    def _setup_converter(self):
        """Set up the document converter with appropriate options"""
        # Configure pipeline options
        pipeline_options = PdfPipelineOptions(
            do_ocr=bool(self._enable_ocr),
            do_table_structure=bool(self._enable_tables),
            ocr_options=EasyOcrOptions(force_full_page_ocr=True),
        )
        if self._enable_tables:
            pipeline_options.table_structure_options.do_cell_matching = True

        # Create converter with supported formats; only PDFs get the custom
        # pipeline options, the other formats use the converter's defaults.
        return DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                )
            }
        )

    def _convert_one(self, file_path, results):
        """Convert one file and update ``results``.

        Returns:
            The LangChain document, or ``None`` when the file is missing or
            the conversion status is neither success nor partial success.
        """
        path = Path(file_path)
        if not path.exists():
            _log.warning(f"File not found: {file_path}")
            results.failure_count += 1
            results.failed_files.append(file_path)
            return None

        conversion_result = self._converter.convert(path)
        status = conversion_result.status
        if status not in (ConversionStatus.SUCCESS, ConversionStatus.PARTIAL_SUCCESS):
            results.failure_count += 1
            results.failed_files.append(file_path)
            _log.error(f"Failed to convert {file_path}")
            return None

        is_partial = status == ConversionStatus.PARTIAL_SUCCESS
        metadata = {
            'source': str(path),
            'file_type': path.suffix,
        }
        if is_partial:
            _log.warning(f"Partial conversion for {file_path}")
            metadata['conversion_status'] = 'partial'

        # Export BEFORE counting: if the export raises, the caller records a
        # failure, and the file must not also be counted as a success.
        text = conversion_result.document.export_to_markdown()
        document = LCDocument(page_content=text, metadata=metadata)

        if is_partial:
            results.partial_success_count += 1
        else:
            results.success_count += 1
        return document

    @staticmethod
    def _log_summary(results):
        """Log the per-outcome counts and the list of failed inputs."""
        total = results.success_count + results.partial_success_count + results.failure_count
        _log.info(
            f"Processed {total} documents:\n"
            f"- Successfully converted: {results.success_count}\n"
            f"- Partially converted: {results.partial_success_count}\n"
            f"- Failed: {results.failure_count}"
        )
        if results.failed_files:
            _log.info("Failed files:")
            for file in results.failed_files:
                _log.info(f"- {file}")

    def lazy_load(self):
        """Convert documents and yield LangChain documents.

        The summary is logged only once the generator has been consumed to
        the end.
        """
        results = ProcessingResult()

        for file_path in self._file_paths:
            document = None
            try:
                document = self._convert_one(file_path, results)
            except Exception as e:
                _log.error(f"Error processing {file_path}: {str(e)}")
                results.failure_count += 1
                results.failed_files.append(file_path)

            # Yield outside the try block so an exception thrown into the
            # generator by the consumer is not recorded as a conversion error.
            if document is not None:
                yield document

        # Log final results
        self._log_summary(results)
                
                
# if __name__ == '__main__':
#     # Load documents from a list of file paths
#     loader = MultiFormatDocumentLoader(
#         file_paths=[
#             # './data/2404.19756v1.pdf',
#             # './data/OD429347375590223100.pdf',
#             '/teamspace/studios/this_studio/TabularRAG/data/FeesPaymentReceipt_7thsem.pdf',
#             # './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
#         ],
#         enable_ocr=False,
#         enable_tables=True
#     )
#     for doc in loader.lazy_load():
#         print(doc.page_content)
#         print(doc.metadata)
#         # save document in .md file 
#         with open('/teamspace/studios/this_studio/TabularRAG/data/output.md', 'w') as f:
#             f.write(doc.page_content)

In [2]:
from typing import List, Tuple, Union
import re
from dataclasses import dataclass
from chonkie.chunker import RecursiveChunker
from chonkie.types import RecursiveChunk
from chonkie import RecursiveRules

@dataclass
class TableChunk:
    """Represents a table chunk from the markdown document.

    Instances are built by TableRecursiveChunker._extract_tables, one per
    regex match of a markdown table.
    """
    text: str  # Raw markdown of the matched table
    start_index: int  # Offset in the chunked text where the match starts
    end_index: int  # Offset where the match ends (exclusive, from match.span())
    token_count: int  # Token count of `text` as returned by the chunker's _count_tokens

class TableRecursiveChunker(RecursiveChunker):
    """A recursive chunker that preserves markdown tables while chunking text.

    Markdown tables are cut out of the input first and returned whole as
    ``TableChunk`` objects; only the text between tables goes through the
    parent class's recursive chunking.

    Note:
        ``chunk`` returns a ``(text_chunks, table_chunks)`` tuple rather than
        a flat list of chunks.
    """

    def _extract_tables(self, text: str) -> Tuple[List[TableChunk], List[Tuple[int, int, str]]]:
        """
        Extract markdown tables from text and return table chunks and remaining text segments.

        Args:
            text: The input text containing markdown content

        Returns:
            Tuple containing:
            - List of TableChunk objects for tables
            - List of (start_index, end_index, text) tuples for non-table segments
        """
        # Header row, separator row, then one or more body rows. A body row
        # may end with a newline OR with the end of the text (\Z). Before this
        # fix, a table that closed the document without a trailing newline
        # lost its last row, or was missed entirely if it had a single row.
        table_pattern = r'(\|[^\n]+\|\n\|[-:\|\s]+\|\n(?:\|[^\n]+\|(?:\n|\Z))+)'

        table_chunks = []
        non_table_segments = []
        last_end = 0

        for match in re.finditer(table_pattern, text):
            start, end = match.span()

            # Add non-table text before this table
            if start > last_end:
                non_table_segments.append((last_end, start, text[last_end:start]))

            # Create table chunk
            table_text = match.group()
            table_chunks.append(TableChunk(
                text=table_text,
                start_index=start,
                end_index=end,
                token_count=self._count_tokens(table_text)
            ))

            last_end = end

        # Add remaining text after last table
        if last_end < len(text):
            non_table_segments.append((last_end, len(text), text[last_end:]))

        return table_chunks, non_table_segments

    def chunk(self, text: str) -> Tuple[List[RecursiveChunk], List[TableChunk]]:
        """
        Chunk the text while preserving tables.

        This method overrides the base chunk method to handle tables separately from
        regular text content.

        Args:
            text: The input text to chunk

        Returns:
            Tuple containing:
            - List of RecursiveChunk objects for non-table text
            - List of TableChunk objects for tables
        """
        # First extract tables
        table_chunks, non_table_segments = self._extract_tables(text)

        # Chunk each non-table segment using the parent class's recursive chunking
        text_chunks = []
        for _, _, segment in non_table_segments:
            if not segment.strip():
                # Whitespace-only gaps between tables carry no content
                continue
            # NOTE(review): _recursive_chunk and its `full_text` keyword are
            # private chonkie API; confirm against the installed version.
            text_chunks.extend(
                super()._recursive_chunk(segment, level=0, full_text=text)
            )

        return text_chunks, table_chunks

    def chunk_batch(self, texts: List[str]) -> List[Tuple[List[RecursiveChunk], List[TableChunk]]]:
        """
        Chunk multiple texts while preserving tables in each.

        Args:
            texts: List of texts to chunk

        Returns:
            List of tuples, each containing:
            - List of RecursiveChunk objects for non-table text
            - List of TableChunk objects for tables
        """
        return [self.chunk(text) for text in texts]

    def __call__(self, texts: Union[str, List[str]]) -> Union[
        Tuple[List[RecursiveChunk], List[TableChunk]],
        List[Tuple[List[RecursiveChunk], List[TableChunk]]]
    ]:
        """Make the chunker callable: a single string is chunked directly, a list is chunked per item."""
        if isinstance(texts, str):
            return self.chunk(texts)
        return self.chunk_batch(texts)
    


In [3]:
from typing import List
from langchain_ollama import OllamaEmbeddings

class EmbeddingModel:
    """Best-effort wrapper around LangChain's OllamaEmbeddings.

    Both embedding methods catch every exception, print the error and return
    an empty list instead of raising.
    """

    def __init__(self, model_name: str = "llama3.2"):
        """
        Create the underlying OllamaEmbeddings client.

        Args:
            model_name (str): Name of the Ollama model used for embedding
        """
        self.model_name = model_name
        self.embeddings = OllamaEmbeddings(model=model_name)

    def embed(self, text: str) -> List[float]:
        """
        Embed one piece of text.

        Args:
            text (str): Text to embed

        Returns:
            List[float]: The embedding vector, or an empty list on error
        """
        try:
            # embed_query is the single-text entry point
            vector = self.embeddings.embed_query(text)
        except Exception as exc:
            print(f"Error generating embedding: {exc}")
            vector = []
        return vector

    def embed_batch(self, texts: List[str]) -> List[List[float]]:
        """
        Embed several texts in one call.

        Args:
            texts (List[str]): Texts to embed

        Returns:
            List[List[float]]: One vector per text, or an empty list on error
        """
        try:
            # embed_documents is the batch entry point
            vectors = self.embeddings.embed_documents(texts)
        except Exception as exc:
            print(f"Error generating batch embeddings: {exc}")
            vectors = []
        return vectors
        
# if __name__ == "__main__":
#         # Initialize the embedding model
#     embedding_model = EmbeddingModel(model_name="llama3.2")

#     # Generate embedding for a single text
#     single_text = "The meaning of life is 42"
#     vector = embedding_model.embed(single_text)
#     print(vector[:3])  # Print first 3 dimensions

#     # Generate embeddings for multiple texts
#     texts = ["Document 1...", "Document 2..."]
#     vectors = embedding_model.embed_batch(texts)
#     print(len(vectors))  # Number of vectors
#     print(vectors[0][:3])  # First 3 dimensions of first vector

In [4]:
from typing import List, Dict, Optional
from langchain_ollama import ChatOllama
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate

class LLMChat:
    """Convenience wrapper around LangChain's ChatOllama.

    Offers one-shot chat, chat with an in-memory conversation history, and
    chat driven by a caller-supplied prompt template. Every chat method is
    best-effort: on any exception it prints the error and returns ``""``.
    """

    def __init__(self, model_name: str = "llama3.2", temperature: float = 0):
        """
        Initialize LLMChat with LangChain ChatOllama

        Args:
            model_name (str): Name of the model to use
            temperature (float): Temperature parameter for response generation
        """
        self.model_name = model_name
        self.llm = ChatOllama(
            model=model_name,
            temperature=temperature
        )
        self.history: List[Dict[str, str]] = []

    def chat_once(self, message: str):
        """
        Single chat interaction without maintaining history

        Args:
            message (str): User input message

        Returns:
            str: Model's response, or "" on error
        """
        try:
            # The message is passed as a template variable, so braces inside
            # it are not interpreted as placeholders.
            prompt = ChatPromptTemplate.from_messages([
                ("human", "{input}")
            ])
            chain = prompt | self.llm
            return chain.invoke({"input": message}).content
        except Exception as e:
            print(f"Error in chat: {e}")
            return ""

    def chat_with_history(self, message: str):
        """
        Chat interaction maintaining conversation history

        The user message and the reply are added to the history only after
        the model call succeeds. Previously the user message was appended
        first, so a failed call left a dangling human turn that was re-sent
        with every later request.

        Args:
            message (str): User input message

        Returns:
            str: Model's response, or "" on error (history is left unchanged)
        """
        user_turn = {'role': 'human', 'content': message}
        try:
            # Convert history plus the new turn to LangChain message objects;
            # any role other than 'human' is treated as an assistant turn.
            messages = [
                HumanMessage(content=msg['content']) if msg['role'] == 'human'
                else AIMessage(content=msg['content'])
                for msg in self.history + [user_turn]
            ]
            assistant_message = self.llm.invoke(messages).content
        except Exception as e:
            print(f"Error in chat with history: {e}")
            return ""

        # Commit both turns together so the history always alternates.
        self.history.append(user_turn)
        self.history.append({'role': 'assistant', 'content': assistant_message})
        return assistant_message

    def chat_with_template(self, template_messages: List[Dict[str, str]],
                         input_variables: Dict[str, str]):
        """
        Chat using a custom template

        Args:
            template_messages (List[Dict[str, str]]): List of template messages,
                each with 'role' and 'content' keys
            input_variables (Dict[str, str]): Variables to fill in the template

        Returns:
            str: Model's response, or "" on error
        """
        try:
            # Create prompt template from (role, content) pairs
            prompt = ChatPromptTemplate.from_messages([
                (msg['role'], msg['content'])
                for msg in template_messages
            ])
            chain = prompt | self.llm
            return chain.invoke(input_variables).content
        except Exception as e:
            print(f"Error in template chat: {e}")
            return ""

    def clear_history(self):
        """Clear the conversation history"""
        self.history = []

    def get_history(self) -> List[Dict[str, str]]:
        """Return the current conversation history (the live list, not a copy)"""
        return self.history
    
# if __name__ == "__main__":
#     # Initialize the chat
#     chat = LLMChat(model_name="llama3.1", temperature=0)

#     # Example of using a template for translation
#     template_messages = [
#         {
#             "role": "system",
#             "content": "You are a helpful assistant that translates {input_language} to {output_language}."
#         },
#         {
#             "role": "human",
#             "content": "{input}"
#         }
#     ]

#     input_vars = {
#         "input_language": "English",
#         "output_language": "German",
#         "input": "I love programming."
#     }

#     response = chat.chat_with_template(template_messages, input_vars)
#     # Simple chat without history
#     response = chat.chat_once("Hello!")

#     # Chat with history
#     response = chat.chat_with_history("How are you?")

In [5]:
from typing import List, Dict, Any
from tqdm import tqdm
import time

# from src.embedding import EmbeddingModel
# from src.llm import LLMChat

class TableProcessor:
    """Describe markdown tables with an LLM and embed the descriptions."""

    def __init__(self, llm_model: LLMChat, embedding_model: EmbeddingModel, batch_size: int = 8):
        """
        Initialize the TableProcessor with pre-initialized models.

        Args:
            llm_model (LLMChat): Initialized LLM model
            embedding_model (EmbeddingModel): Initialized embedding model
            batch_size (int): Batch size for processing embeddings (must be >= 1)

        Raises:
            ValueError: If batch_size is smaller than 1
        """
        if batch_size < 1:
            # A zero or negative step would break or silently skip the
            # batching loop in _embed_descriptions.
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")
        self.llm = llm_model
        self.embedder = embedding_model
        self.batch_size = batch_size

    def get_table_description(self, markdown_table: str) -> str:
        """
        Generate description for a single markdown table using Ollama chat.

        Args:
            markdown_table (str): Input markdown table

        Returns:
            str: Generated description of the table ("" on error)
        """
        system_prompt = """You are an AI language model. Your task is to examine the provided table, taking into account both its rows and columns, and produce a concise summary of up to 200 words. Emphasize key patterns, trends, and notable data points that provide meaningful insights into the content of the table."""

        try:
            # Use chat_once to avoid maintaining history between tables
            full_prompt = f"{system_prompt}\n\nTable:\n{markdown_table}"
            return self.llm.chat_once(full_prompt)
        except Exception as e:
            print(f"Error generating table description: {e}")
            return ""

    @staticmethod
    def _table_text(table) -> str:
        """Return the markdown of a table given as a plain string or as an object with ``.text``."""
        return table if isinstance(table, str) else table.text

    def _embed_descriptions(self, descriptions: List[str]) -> List[Any]:
        """Embed descriptions in batches, returning exactly one entry per description."""
        embeddings = []
        total_batches = (len(descriptions) + self.batch_size - 1) // self.batch_size

        with tqdm(total=total_batches, desc="Generating embeddings") as pbar:
            for i in range(0, len(descriptions), self.batch_size):
                batch = descriptions[i:i + self.batch_size]
                if len(batch) == 1:
                    batch_embeddings = [self.embedder.embed(batch[0])]
                else:
                    batch_embeddings = self.embedder.embed_batch(batch)

                if len(batch_embeddings) != len(batch):
                    # embed_batch returns [] on failure. Pad with one empty
                    # embedding per description (the same sentinel embed()
                    # uses) so later embeddings stay aligned with their tables.
                    print(
                        f"Error: expected {len(batch)} embeddings for batch starting at "
                        f"{i}, got {len(batch_embeddings)}; using empty embeddings"
                    )
                    batch_embeddings = [[] for _ in batch]

                embeddings.extend(batch_embeddings)
                pbar.update(1)

        return embeddings

    def process_tables(self, markdown_tables) -> List[Dict[str, Any]]:
        """
        Process a list of markdown tables: generate descriptions and embeddings.

        Args:
            markdown_tables: Tables to process; each item is either a markdown
                string or an object exposing the markdown as ``.text``
                (e.g. a TableChunk)

        Returns:
            List[Dict[str, Any]]: One dict per table with keys "embedding",
            "text" (the input item, unchanged), "table_description" and "type".
            "embedding" is an empty list when embedding failed.
        """
        descriptions = []

        # Generate descriptions for all tables
        with tqdm(total=len(markdown_tables), desc="Generating table descriptions") as pbar:
            for i, table in enumerate(markdown_tables):
                description = self.get_table_description(self._table_text(table))
                print(f"\nTable {i+1}:")
                print(f"Description: {description}")
                print("-" * 50)
                descriptions.append(description)
                pbar.update(1)
                time.sleep(1)  # Rate limiting

        # Generate embeddings in batches
        embeddings = self._embed_descriptions(descriptions)

        # Combine results with progress bar
        results = []
        with tqdm(total=len(markdown_tables), desc="Combining results") as pbar:
            for table, description, embedding in zip(markdown_tables, descriptions, embeddings):
                results.append({
                    "embedding": embedding,
                    "text": table,
                    "table_description": description,
                    "type": "table_chunk"
                })
                pbar.update(1)

        return results

    def __call__(self, markdown_tables) -> List[Dict[str, Any]]:
        """
        Make the class callable for easier use.

        Args:
            markdown_tables: Tables to process (see process_tables)

        Returns:
            List[Dict[str, Any]]: Processed results
        """
        return self.process_tables(markdown_tables)

In [7]:
from typing import List, Dict, Any, Optional
import pandas as pd
import time
from tqdm import tqdm
import logging
from pinecone import Pinecone, ServerlessSpec
from dataclasses import dataclass
from enum import Enum
# from src.table_aware_chunker import TableRecursiveChunker
# from src.processor import TableProcessor
# from src.llm import LLMChat
# from src.embedding import EmbeddingModel
from chonkie import RecursiveRules
# from src.loader import MultiFormatDocumentLoader
from dotenv import load_dotenv
import os

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()
# API key for Pinecone, read from the environment; os.getenv returns None when it is unset.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# NOTE(review): logging.basicConfig was already called in the loader cell; by default a second
# call does nothing once the root logger has handlers, so this format presumably only takes
# effect when this cell runs first — confirm.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Logger used by ingest_data to report upsert failures.
logger = logging.getLogger('table_aware_rag')

class ChunkType(Enum):
    """Kind of chunk; the string value is what gets stored in the `chunk_type` metadata field."""
    TEXT = "text_chunk"  # Regular text produced by the recursive chunker
    TABLE = "table_chunk"  # A markdown table, represented by its generated description

@dataclass
class ProcessedChunk:
    """A chunk ready for embedding and ingestion, as produced by process_documents."""
    text: str  # This will be the embeddable text (table description for tables)
    chunk_type: ChunkType  # Whether the chunk is plain text or a table
    token_count: int  # Chunker token count for text; whitespace word count of the description for tables
    markdown_table: Optional[str] = None  # Store original markdown table format (tables only)
    start_index: Optional[int] = None  # Copied from the text chunk; left as None for table chunks
    end_index: Optional[int] = None  # Copied from the text chunk; left as None for table chunks

def process_documents(
    file_paths: List[str],
    chunker: TableRecursiveChunker,
    processor: TableProcessor,
    output_path: str = './output.md'
) -> List[ProcessedChunk]:
    """
    Process documents into text and table chunks

    The files are converted to markdown, joined into one text, written to
    ``output_path`` for inspection, and split by ``chunker`` into text chunks
    and table chunks. Tables are then described by ``processor``.

    Args:
        file_paths: Paths of the documents to load
        chunker: Table-aware chunker returning (text_chunks, table_chunks)
        processor: Callable producing a description for each table chunk
        output_path: Where the combined markdown is written

    Returns:
        All text chunks first, followed by all table chunks. For a table
        chunk, ``text`` is the generated description and ``markdown_table``
        holds the original table.
    """
    # Load documents
    loader = MultiFormatDocumentLoader(
        file_paths=file_paths,
        enable_ocr=False,
        enable_tables=True
    )

    # Separate documents with a blank line: written back to back, a table at
    # the end of one file could fuse with the first line of the next.
    text = "\n\n".join(doc.page_content for doc in loader.lazy_load())

    # Explicit UTF-8 so non-ASCII content does not depend on the platform's
    # default encoding. The text is used from memory instead of being read
    # back from the file.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)

    # Get text and table chunks
    text_chunks, table_chunks = chunker.chunk(text)

    # Process text chunks
    processed_chunks = [
        ProcessedChunk(
            text=chunk.text,
            chunk_type=ChunkType.TEXT,
            token_count=chunk.token_count,
            start_index=chunk.start_index,
            end_index=chunk.end_index
        )
        for chunk in text_chunks
    ]

    # Process table chunks
    for table in processor(table_chunks):
        description = table["table_description"]
        processed_chunks.append(
            ProcessedChunk(
                text=description,  # Use description for embedding
                chunk_type=ChunkType.TABLE,
                # Whitespace word count of the description, not a tokenizer count
                token_count=len(description.split()),
                markdown_table=str(table["text"].text)  # Store string version of table
            )
        )

    return processed_chunks

class PineconeRetriever:
    """Similarity search over one namespace of a Pinecone index."""

    def __init__(
        self,
        pinecone_client: Pinecone,
        index_name: str,
        namespace: str,
        embedding_model: Any,
        llm_model: Any
    ):
        """
        Initialize retriever with configurable models

        Args:
            pinecone_client: Client used to open the index
            index_name: Name of the index to query
            namespace: Namespace searched by invoke
            embedding_model: Object whose ``embed(text)`` returns the query vector
            llm_model: Stored on the instance; not used by the retriever itself
        """
        self.pinecone = pinecone_client
        self.index = self.pinecone.Index(index_name)
        self.namespace = namespace
        self.embedding_model = embedding_model
        self.llm_model = llm_model

    def _prepare_query(self, question: str) -> List[float]:
        """Generate embedding for query"""
        return self.embedding_model.embed(question)

    def invoke(
        self,
        question: str,
        top_k: int = 5,
        chunk_type_filter: Optional[ChunkType] = None
    ) -> List[Dict[str, Any]]:
        """
        Retrieve similar documents with optional filtering by chunk type

        Args:
            question: Query text
            top_k: Number of matches to request
            chunk_type_filter: Restrict matches to this chunk type when given

        Returns:
            One dict per match with "score" and "chunk_type"; table matches add
            "table_description" and "markdown_table", text matches add
            "page_content".

        Raises:
            RuntimeError: If the embedding model returned an empty vector
        """
        query_embedding = self._prepare_query(question)
        if not query_embedding:
            # The embedder used in this notebook reports failures by returning
            # an empty list; fail here with a clear message instead of sending
            # an empty vector to Pinecone.
            raise RuntimeError("Could not generate an embedding for the query")

        # Prepare filter if chunk type specified
        filter_dict = None
        if chunk_type_filter:
            filter_dict = {"chunk_type": chunk_type_filter.value}

        results = self.index.query(
            namespace=self.namespace,
            vector=query_embedding,
            top_k=top_k,
            include_values=False,
            include_metadata=True,
            filter=filter_dict
        )

        retrieved_docs = []
        for match in results.matches:
            metadata = match.metadata
            doc = {
                "score": match.score,
                "chunk_type": metadata["chunk_type"]
            }

            # Handle different chunk types
            if metadata["chunk_type"] == ChunkType.TABLE.value:
                doc["table_description"] = metadata["text"]  # The embedded description
                doc["markdown_table"] = metadata["markdown_table"]  # Original table format
            else:
                doc["page_content"] = metadata["text"]

            retrieved_docs.append(doc)

        return retrieved_docs

def ingest_data(
    processed_chunks: List[ProcessedChunk],
    embedding_model: Any,
    pinecone_client: Pinecone,
    index_name: str = "vector-index",
    namespace: str = "rag",
    batch_size: int = 100,
    dimension: int = 768
):
    """
    Ingest processed chunks into Pinecone

    Args:
        processed_chunks: Chunks to embed and upsert
        embedding_model: Object whose ``embed_batch(texts)`` returns one vector per text
        pinecone_client: Pinecone client
        index_name: Index to write to; created (serverless, cosine) if missing
        namespace: Namespace the records are upserted into
        batch_size: Number of chunks embedded and upserted per request
        dimension: Vector dimension used when the index has to be created;
            it must match the embedding model's output size

    Raises:
        RuntimeError: If the embedding model does not return exactly one
            vector per chunk in a batch
        Exception: Any upsert error is logged and re-raised
    """
    # Create or get index
    if not pinecone_client.has_index(index_name):
        pinecone_client.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(
                cloud='aws',
                region='us-east-1'
            )
        )

        # Block until the new index reports it is ready
        while not pinecone_client.describe_index(index_name).status['ready']:
            time.sleep(1)

    index = pinecone_client.Index(index_name)

    # Process in batches
    for i in tqdm(range(0, len(processed_chunks), batch_size)):
        batch = processed_chunks[i:i+batch_size]

        # Generate embeddings for the text content
        texts = [chunk.text for chunk in batch]
        embeddings = embedding_model.embed_batch(texts)
        if len(embeddings) != len(batch):
            # The embedder used in this notebook returns [] on failure; without
            # this check the loop below died with a bare IndexError.
            message = (
                f"Embedding failed for batch starting at chunk {i}: "
                f"expected {len(batch)} vectors, got {len(embeddings)}"
            )
            logger.error(message)
            raise RuntimeError(message)

        # Prepare records
        records = []
        for idx, (chunk, vector) in enumerate(zip(batch, embeddings)):
            metadata = {
                "text": chunk.text,  # This is the description for tables
                "chunk_type": chunk.chunk_type.value,
                "token_count": chunk.token_count
            }

            # Add markdown table to metadata if it's a table chunk
            if chunk.markdown_table is not None:
                # Ensure the table is in string format
                metadata["markdown_table"] = str(chunk.markdown_table)

            records.append({
                # NOTE(review): ids are positional, so a later ingestion run
                # reuses the same ids — presumably overwriting earlier records;
                # confirm this is intended.
                "id": f"chunk_{i + idx}",
                "values": vector,
                "metadata": metadata
            })

        # Upsert to Pinecone
        try:
            index.upsert(vectors=records, namespace=namespace)
        except Exception as e:
            logger.error(f"Error during upsert: {str(e)}")
            logger.error(f"Problematic record metadata: {records[0]['metadata']}")
            raise

        time.sleep(0.5)  # Rate limiting


# def main():
#     # Initialize components
#     pc = Pinecone(api_key=PINECONE_API_KEY)
    
#     chunker = TableRecursiveChunker(
#         tokenizer="gpt2",
#         chunk_size=512,
#         rules=RecursiveRules(),
#         min_characters_per_chunk=12
#     )
    
#     llm = LLMChat("qwen2.5:0.5b")
#     embedder = EmbeddingModel("nomic-embed-text")
    
#     processor = TableProcessor(
#         llm_model=llm,
#         embedding_model=embedder,
#         batch_size=8
#     )
    
#     try:
#         # Process documents
#         processed_chunks = process_documents(
#             file_paths=['/teamspace/studios/this_studio/TabularRAG/data/FeesPaymentReceipt_7thsem.pdf'],
#             chunker=chunker,
#             processor=processor
#         )
        
#         # Ingest data
#         ingest_data(
#             processed_chunks=processed_chunks,
#             embedding_model=embedder,
#             pinecone_client=pc
#         )
        
#         # Test retrieval
#         retriever = PineconeRetriever(
#             pinecone_client=pc,
#             index_name="vector-index",
#             namespace="rag",
#             embedding_model=embedder,
#             llm_model=llm
#         )
        
        # # Test text-only retrieval
        # text_results = retriever.invoke(
        #     question="What is paid fees amount?",
        #     top_k=3,
        #     chunk_type_filter=ChunkType.TEXT
        # )
        # print("Text results:")
        # for result in text_results:
        #     print(result)
        # Test table-only retrieval
        # table_results = retriever.invoke(
        #     question="What is paid fees amount?",
        #     top_k=3,
        #     chunk_type_filter=ChunkType.TABLE
        # )
        # print("Table results:")
        # for result in table_results:
        #     print(result)
        
    #     results = retriever.invoke(
    #         question="What is paid fees amount?",
    #         top_k=3
    #     )
        
    #     for i, result in enumerate(results, 1):
    #         print(f"\nResult {i}:")
    #         if result["chunk_type"] == ChunkType.TABLE.value:
    #             print(f"Table Description: {result['table_description']}")
    #             print("Table Format:")
    #             print(result['markdown_table'])
    #         else:
    #             print(f"Content: {result['page_content']}")
    #         print(f"Score: {result['score']}")
            
    # except Exception as e:
    #     logger.error(f"Error in pipeline: {str(e)}")

# if __name__ == "__main__":
#     main()

In [8]:
from pathlib import Path
import tempfile
import os
from typing import List, Dict
from pinecone import Pinecone
# from src.table_aware_chunker import TableRecursiveChunker
# from src.processor import TableProcessor
# from src.llm import LLMChat
# from src.embedding import EmbeddingModel
from chonkie import RecursiveRules
# from src.vectordb import ChunkType, process_documents, ingest_data, PineconeRetriever

class TableRAGSystem:
    """End-to-end, table-aware RAG pipeline backed by a Pinecone index.

    The constructor wires together the chat LLM, the embedding model, the
    table-aware chunker and the table processor. ``process_documents`` ingests
    files into Pinecone and builds the retriever; ``query`` retrieves chunks,
    formats them as context and asks the LLM; ``clear_index`` deletes the index.

    Attributes:
        pc: Pinecone client created from the supplied API key.
        llm: ``LLMChat`` instance used for table descriptions and answers.
        embedder: ``EmbeddingModel`` instance used for ingestion and retrieval.
        chunker: ``TableRecursiveChunker`` used to split documents.
        processor: ``TableProcessor`` that combines the LLM and embedder.
        retriever: ``None`` until ``process_documents`` succeeds, and again
            after ``clear_index``.
    """

    # Index and namespace the retriever reads from. ``ingest_data`` is called
    # without these values, so they presumably have to match its own defaults
    # -- TODO confirm against ``ingest_data``.
    INDEX_NAME = "vector-index"
    NAMESPACE = "rag"

    def __init__(self, pinecone_api_key: str):
        """Initialize the Table RAG system with necessary components.

        Args:
            pinecone_api_key: API key used to create the Pinecone client.
        """
        self.pc = Pinecone(api_key=pinecone_api_key)

        # Initialize LLM
        self.llm = LLMChat(
            model_name="mistral:7b",
            temperature=0.3
        )

        # Initialize Embeddings
        self.embedder = EmbeddingModel("nomic-embed-text")

        # Initialize Chunker
        self.chunker = TableRecursiveChunker(
            tokenizer="gpt2",
            chunk_size=512,
            rules=RecursiveRules(),
            min_characters_per_chunk=12
        )

        # Initialize Processor
        self.processor = TableProcessor(
            llm_model=self.llm,
            embedding_model=self.embedder,
            batch_size=8
        )

        # Built by process_documents(); query() refuses to run while this is None.
        self.retriever = None

    def process_documents(self, file_paths: List[str]) -> bool:
        """Process documents, ingest them into Pinecone and build the retriever.

        Args:
            file_paths: Paths of the documents to convert, chunk and ingest.

        Returns:
            ``True`` when every step succeeded; ``False`` when any step raised
            (the error is printed rather than propagated, so callers must
            check the return value before querying).
        """
        try:
            # Process documents
            print("Processing documents...")
            processed_chunks = process_documents(
                file_paths=file_paths,
                chunker=self.chunker,
                processor=self.processor,
                output_path='./output.md'
            )

            # Ingest data
            print("Ingesting data to vector database...")
            ingest_data(
                processed_chunks=processed_chunks,
                embedding_model=self.embedder,
                pinecone_client=self.pc
            )

            # Setup retriever
            print("Setting up retriever...")
            self.retriever = PineconeRetriever(
                pinecone_client=self.pc,
                index_name=self.INDEX_NAME,
                namespace=self.NAMESPACE,
                embedding_model=self.embedder,
                llm_model=self.llm
            )

            print("Processing complete!")
            return True

        except Exception as e:
            # Deliberate best-effort: report the failure and signal it via the
            # boolean return instead of aborting the caller.
            print(f"Error processing documents: {e}")
            return False

    def format_context(self, results: List[Dict]) -> str:
        """Format retrieved results into a single context string.

        Table chunks are rendered as ``Table: <markdown>`` with an optional
        ``Description:`` line; every other chunk contributes its
        ``page_content``. Parts are separated by a blank line.

        Args:
            results: Retrieved chunk dictionaries; ``None`` or an empty list
                yields an empty string.

        Returns:
            The concatenated context text.
        """
        context_parts = []

        for result in results or []:
            if result.get("chunk_type") == ChunkType.TABLE.value:
                # .get() keeps a table chunk without markdown from raising
                # KeyError and discarding the rest of the context.
                table_text = f"Table: {result.get('markdown_table', '')}"
                description = result.get("table_description")
                if description:
                    table_text += f"\nDescription: {description}"
                context_parts.append(table_text)
            else:
                context_parts.append(result.get("page_content", ""))

        return "\n\n".join(context_parts)

    def query(self, question: str, top_k: int = 3) -> Dict:
        """Answer a question using retrieved chunks as context.

        NOTE(review): a query issued immediately after ingestion can come back
        with an empty context; presumably the freshly upserted vectors are not
        yet visible to queries -- TODO confirm and consider waiting for the
        index before the first query.

        Args:
            question: Natural-language question to answer.
            top_k: Number of chunks to retrieve (defaults to 3).

        Returns:
            A dict with ``response`` (LLM output), ``context`` (the formatted
            context string) and ``retrieved_results`` (the raw retriever
            results).

        Raises:
            ValueError: If no documents have been processed yet, or ``top_k``
                is smaller than 1.
        """
        if self.retriever is None:
            raise ValueError("Documents must be processed before querying")
        if top_k < 1:
            raise ValueError("top_k must be a positive integer")

        # Retrieve relevant content
        results = self.retriever.invoke(
            question=question,
            top_k=top_k
        )

        # Format context and get response from LLM
        context = self.format_context(results)

        # RAG Template
        rag_template = [
            {
                "role": "system",
                "content": """You are a knowledgeable assistant specialized in analyzing documents and tables. 
                            Your responses should be:
                            - Accurate and based on the provided context
                            - Concise (three sentences maximum)
                            - Professional yet conversational
                            - Include specific references to tables when relevant
                            
                        If you cannot find an answer in the context, acknowledge this clearly."""
            },
            {
                "role": "human",
                "content": "Context: {context}\n\nQuestion: {question}"
            }
        ]

        input_vars = {
            "question": question,
            "context": context
        }

        response = self.llm.chat_with_template(rag_template, input_vars)

        return {
            "response": response,
            "context": context,
            "retrieved_results": results
        }

    def clear_index(self, index_name: str = INDEX_NAME):
        """Delete the Pinecone index and drop the retriever.

        Args:
            index_name: Name of the index to delete (defaults to the index the
                retriever is built on).

        Errors are printed rather than raised; the retriever is only reset
        when the deletion succeeded.
        """
        try:
            self.pc.delete_index(index_name)
            self.retriever = None
            print("Database cleared successfully!")
        except Exception as e:
            print(f"Error clearing database: {e}")

In [10]:
# Initialize the system.
# The API key is read from the environment so that no secret is stored in the
# notebook. The key that used to be hard-coded in this cell has been exposed
# and should be revoked/rotated in the Pinecone console.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell"
    )
rag_system = TableRAGSystem(pinecone_api_key)

# Process documents
# NOTE(review): this absolute path is specific to the original environment;
# adjust it when running elsewhere.
file_paths = [
    "/teamspace/studios/this_studio/TabularRAG/data/FeesPaymentReceipt_7thsem.pdf"
]
# process_documents() reports failure through its return value instead of
# raising, so stop here rather than hitting a confusing error in query().
if not rag_system.process_documents(file_paths):
    raise RuntimeError("Document processing failed; see the error printed above")

# Query the system
question = "what is the paid amount?"
result = rag_system.query(question)

# Access different parts of the response
print("Answer:", result["response"])
print("\nRelevant Context:", result["context"])


INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:docling.document_converter:Going to convert document batch...
INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'


Processing documents...


INFO:docling.utils.accelerator_utils:Accelerator device: 'cpu'
INFO:docling.pipeline.base_pipeline:Processing document FeesPaymentReceipt_7thsem.pdf
INFO:docling.document_converter:Finished converting document FeesPaymentReceipt_7thsem.pdf in 6.28 sec.
INFO:__main__:Processed 1 documents:
- Successfully converted: 1
- Partially converted: 0
- Failed: 0
Generating table descriptions:   0%|          | 0/1 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Generating table descriptions: 100%|██████████| 1/1 [01:36<00:00, 96.89s/it]


Table 1:
Description:  The table provides a breakdown of various costs associated with educational expenses, including tuition fees, lodging, fooding, and other charges. The most significant cost is the tuition fee at $22,500. It's interesting to note that there are two categories labeled as "Outstanding" for both tuition fees & others, and fooding, suggesting that these costs have not been fully paid.

   The lodging including facilities for one semester is also a substantial cost, although the amount is not specified in this table. The presence of an "Excess" and "Late Fine 22500 Total" categories implies that there may be additional fees for late payments or exceeding certain limits.

   Overall, the data suggests that the total educational costs can be quite high, with a significant portion of these costs being outstanding, potentially indicating a need for financial planning and budgeting strategies to manage these expenses effectively.
-------------------------------------------

Generating table descriptions: 100%|██████████| 1/1 [01:37<00:00, 97.89s/it]
Generating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
Generating embeddings: 100%|██████████| 1/1 [00:02<00:00,  2.13s/it]
Combining results: 100%|██████████| 1/1 [00:00<00:00, 24105.20it/s]


Ingesting data to vector database...


INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
  0%|          | 0/1 [00:00<?, ?it/s]INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
100%|██████████| 1/1 [00:02<00:00,  2.26s/it]
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Setting up retriever...
Processing complete!


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Answer:  Based on the provided context, I am unable to determine the exact paid amount as no numerical values related to payment are present in the given data. Please provide more specific details or numbers for a precise answer.

Relevant Context: 


In [11]:
# Ask the same question again against the already-initialised pipeline.
# NOTE(review): the first query (previous cell) came back with an empty
# "Relevant Context"; presumably the freshly ingested vectors were not yet
# queryable at that point -- TODO confirm.
question = "what is the paid amount?"
result = rag_system.query(question)

INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


In [12]:
# Show the LLM's answer, followed by the formatted context string that was
# passed to the LLM together with the question.
print("Answer:", result["response"])
print("\nRelevant Context:", result["context"])

Answer:  The paid amount for this receipt is $22,500. This can be found in the table under the "Online Payment Total" category.

Relevant Context: <!-- image -->

## THE NEOTIA UNIVERSITY

Diamond Harbour Road, Sarisha Hat, Sarisha, West Bengal - 743368, India

Payment Receipt

Student Details

Receipt Date

03/07/2024

Name

:

ANINDYA MITRA

UID No.

Course

:

Contact No.

Installment

:

Payment Type :

:

TNU2021053100042

Bachelor of Technology in Computer Science & Engineering with

8240716218

Semester Fee-7

Online Payment



Table: | Heads                                                    | Amount                                                   |
|----------------------------------------------------------|----------------------------------------------------------|
| Outstanding(Tuition Fees & Others)                       | Outstanding(Tuition Fees & Others)                       |
| Outstanding(Fooding)                                     | Outstanding(Fooding)           

In [13]:

# Clear the database when done.
# This deletes the whole "vector-index" Pinecone index and resets the
# retriever, so documents must be processed again before the next query.
rag_system.clear_index()

Database cleared successfully!
