File size: 4,188 Bytes
fe52a97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from typing import List, Dict, Any
from tqdm import tqdm
import time

from src.embedding import EmbeddingModel
from src.llm import LLMChat

class TableProcessor:
    """Describe markdown tables with an LLM and embed the descriptions.

    For every input table the processor asks the LLM for a short summary,
    embeds the summaries in batches, and returns one record per table that
    bundles the embedding, the original table item and its description.
    """

    def __init__(self, llm_model: "LLMChat", embedding_model: "EmbeddingModel",
                 batch_size: int = 8, request_delay: float = 1.0):
        """
        Initialize the TableProcessor with pre-initialized models.

        Args:
            llm_model (LLMChat): Initialized LLM model; must provide
                ``chat_once(prompt)``.
            embedding_model (EmbeddingModel): Initialized embedding model;
                must provide ``embed(text)`` and ``embed_batch(texts)``.
            batch_size (int): Number of descriptions embedded per call.
                Must be at least 1.
            request_delay (float): Seconds to pause between consecutive
                LLM requests (rate limiting). Use 0 to disable.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # A zero step would make range() fail later with an unrelated-looking
        # error, and a negative one would silently yield no embeddings.
        if batch_size < 1:
            raise ValueError(
                f"batch_size must be a positive integer, got {batch_size}"
            )
        self.llm = llm_model
        self.embedder = embedding_model
        self.batch_size = batch_size
        self.request_delay = request_delay

    @staticmethod
    def _table_text(table) -> str:
        """Return the markdown text of *table* (a string or an object with ``.text``)."""
        if isinstance(table, str):
            return table
        return table.text

    def get_table_description(self, markdown_table: str) -> str:
        """
        Generate a description for a single markdown table via the LLM.

        Args:
            markdown_table (str): Input markdown table

        Returns:
            str: Generated description of the table, or an empty string if
                the LLM call raised an exception (the error is printed).
        """
        system_prompt = """You are an AI language model. Your task is to examine the provided table, taking into account both its rows and columns, and produce a concise summary of up to 200 words. Emphasize key patterns, trends, and notable data points that provide meaningful insights into the content of the table."""

        # Use chat_once to avoid maintaining history between tables
        full_prompt = f"{system_prompt}\n\nTable:\n{markdown_table}"
        try:
            return self.llm.chat_once(full_prompt)
        except Exception as e:
            # Best effort: one failing table must not abort the whole run.
            print(f"Error generating table description: {e}")
            return ""

    def _describe_tables(self, tables: List[Any]) -> List[str]:
        """Generate one description per table, pausing between LLM requests."""
        descriptions = []
        last_index = len(tables) - 1
        with tqdm(total=len(tables), desc="Generating table descriptions") as pbar:
            for i, table in enumerate(tables):
                description = self.get_table_description(self._table_text(table))
                print(f"\nTable {i+1}:")
                print(f"Description: {description}")
                print("-" * 50)
                descriptions.append(description)
                pbar.update(1)
                # Rate limiting: only needed between requests, not after the
                # final one.
                if self.request_delay > 0 and i < last_index:
                    time.sleep(self.request_delay)
        return descriptions

    def _embed_descriptions(self, descriptions: List[str]) -> List[Any]:
        """Embed the descriptions in chunks of ``self.batch_size``."""
        embeddings = []
        total_batches = (len(descriptions) + self.batch_size - 1) // self.batch_size

        with tqdm(total=total_batches, desc="Generating embeddings") as pbar:
            for start in range(0, len(descriptions), self.batch_size):
                batch = descriptions[start:start + self.batch_size]
                # A lone item goes through the single-text API, as in the
                # original implementation.
                if len(batch) == 1:
                    batch_embeddings = [self.embedder.embed(batch[0])]
                else:
                    batch_embeddings = self.embedder.embed_batch(batch)
                embeddings.extend(batch_embeddings)
                pbar.update(1)

        # zip() would silently drop or misalign records on a count mismatch.
        if len(embeddings) != len(descriptions):
            raise ValueError(
                f"Embedding model returned {len(embeddings)} embeddings "
                f"for {len(descriptions)} descriptions"
            )
        return embeddings

    def process_tables(self, markdown_tables) -> List[Dict[str, Any]]:
        """
        Process markdown tables: generate descriptions and embeddings.

        Args:
            markdown_tables: Iterable of tables to process. Each item is
                either a markdown string or an object whose ``text``
                attribute holds the markdown.

        Returns:
            List[Dict[str, Any]]: One dictionary per table with the keys
                ``"embedding"``, ``"text"`` (the table item exactly as it
                was passed in), ``"table_description"`` and ``"type"``
                (always ``"table_chunk"``).

        Raises:
            ValueError: If the embedding model returns a different number
                of embeddings than there are descriptions.
        """
        # Materialize once: the input is measured and traversed twice.
        tables = list(markdown_tables)
        if not tables:
            return []

        descriptions = self._describe_tables(tables)
        embeddings = self._embed_descriptions(descriptions)

        # Combine results with progress bar
        results = []
        with tqdm(total=len(tables), desc="Combining results") as pbar:
            for table, description, embedding in zip(tables, descriptions, embeddings):
                results.append({
                    "embedding": embedding,
                    "text": table,
                    "table_description": description,
                    "type": "table_chunk",
                })
                pbar.update(1)

        return results

    def __call__(self, markdown_tables) -> List[Dict[str, Any]]:
        """
        Make the class callable for easier use.

        Args:
            markdown_tables: Iterable of tables to process (see
                ``process_tables``).

        Returns:
            List[Dict[str, Any]]: Processed results
        """
        return self.process_tables(markdown_tables)