import os
from typing import List

import google.generativeai as genai
import pandas as pd
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from PyPDF2 import PdfReader


class CustomGoogleEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that delegates to ``google.generativeai``.

    Embedding is best-effort: if the API call for a text fails, the error is
    printed and a zero vector of length ``dimension`` is returned in its
    place, so one bad input does not abort a whole batch.
    """

    # Texts longer than this many characters are truncated before embedding.
    MAX_INPUT_CHARS = 2048

    def __init__(self, model='models/embedding-001', dimension=768):
        """Store the embedding model name and fallback vector size.

        Args:
            model: Name passed as ``model`` to ``genai.embed_content``.
            dimension: Length of the zero vector returned when an embedding
                call fails. It should equal the output size of ``model``;
                768 is presumably the size for the default model -- confirm
                before switching models, since mixed vector lengths would
                not fit in a single index.
        """
        self.client = genai
        self.model = model
        self.dimension = dimension

    def _embed(self, text: str, task_type: str, error_label: str) -> List[float]:
        """Embed one text, falling back to a zero vector on any API error."""
        text = text[:self.MAX_INPUT_CHARS]
        try:
            return self.client.embed_content(
                model=self.model,
                content=text,
                task_type=task_type,
            )['embedding']
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going.
            print(f"{error_label}: {e}")
            return [0.0] * self.dimension

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per input text, in the same order.

        Texts are embedded one at a time with ``task_type="retrieval_document"``.
        """
        return [
            self._embed(text, "retrieval_document", "Embedding error")
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a query, using ``task_type="retrieval_query"``."""
        return self._embed(text, "retrieval_query", "Query embedding error")


class RAGProcessor:
    """Retrieval-augmented question answering over financial (P&L) PDFs.

    Workflow: ``process_documents`` turns PDFs into a FAISS vector store, then
    ``generate_response`` retrieves the most similar chunks for a question and
    asks the generative model to answer from them.
    """

    # Lower-case substrings that mark a line as a likely financial entry.
    # Matching is by substring, so e.g. 'tax' also matches 'syntax'.
    FINANCIAL_KEYWORDS = (
        'revenue', 'profit', 'loss', 'expenses', 'income',
        'cost', 'margin', 'ebitda', 'tax',
    )

    def __init__(self):
        """Configure the Google client and build the splitter and models.

        The API key is read from the ``GOOGLE_API_KEY`` environment variable.
        """
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
        self.embeddings = CustomGoogleEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ".", ",", " ", ""],
        )
        self.model = genai.GenerativeModel('gemini-pro')

    @classmethod
    def _tag_financial_lines(cls, text: str) -> str:
        """Prefix every line that contains a financial keyword with ``FINANCIAL_ENTRY: ``."""
        tagged = []
        for line in text.split('\n'):
            lowered = line.lower()
            if any(keyword in lowered for keyword in cls.FINANCIAL_KEYWORDS):
                tagged.append(f"FINANCIAL_ENTRY: {line}\n")
            else:
                tagged.append(line + "\n")
        return "".join(tagged)

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text from a PDF, tagging lines that look like financial entries.

        Args:
            pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source.

        Returns:
            The text of all pages, each page followed by a blank line, with
            keyword-bearing lines prefixed by ``FINANCIAL_ENTRY: ``. Returns
            an empty string if the PDF cannot be read; the error is printed.
        """
        try:
            pdf_reader = PdfReader(pdf_file)
            # ``or ""`` guards against a page whose extraction yields None,
            # which would otherwise raise and discard the whole document.
            text = "".join(
                (page.extract_text() or "") + "\n\n"
                for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return ""
        return self._tag_financial_lines(text)

    def process_documents(self, pdf_files: List[str]) -> FAISS:
        """Build a FAISS vector store from the text of several PDFs.

        Args:
            pdf_files: PDF sources, each passed to ``extract_text_from_pdf``.

        Returns:
            A FAISS store over the chunks produced by the text splitter.

        Raises:
            ValueError: If no text chunks could be produced from the inputs.
            Exception: Any error raised while building the store is printed
                and re-raised unchanged.
        """
        combined_text = "".join(
            self.extract_text_from_pdf(pdf) for pdf in pdf_files
        )
        text_chunks = self.text_splitter.split_text(combined_text)
        if not text_chunks:
            # Fail with a clear message rather than an opaque error from the
            # vector store when there is nothing to index.
            raise ValueError(
                "No text could be extracted from the provided PDF files"
            )

        try:
            return FAISS.from_texts(text_chunks, embedding=self.embeddings)
        except Exception as e:
            print(f"Error creating vector store: {e}")
            raise

    def generate_response(self, question: str, vector_store: FAISS) -> str:
        """Answer a question from the four most similar chunks in the store.

        Args:
            question: The user's question.
            vector_store: Store returned by ``process_documents``.

        Returns:
            The model's answer text, or an ``"Error generating response: ..."``
            string if generation fails.
        """
        docs = vector_store.similarity_search(question, k=4)
        context = "\n".join(doc.page_content for doc in docs)

        prompt = (
            "You are a financial analyst assistant. Using the following "
            "financial data context, answer the question accurately and "
            "professionally. Include specific numbers and calculations when "
            "relevant.\n"
            "\n"
            "Context:\n"
            f"{context}\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "If the context doesn't contain enough information to answer "
            "accurately, please state that clearly. Focus on P&L related "
            "information and financial metrics.\n"
            "\n"
            "When providing financial figures, please format them clearly "
            "with appropriate units (e.g., \"$1,234,567\" or \"1.2M\" for "
            "millions).\n"
        )

        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating response: {e}"