# Finance_Bot

# Build error

# File size: 4,919 Bytes

# 65cdc34

from typing import List
import google.generativeai as genai
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
import pandas as pd
import os

class CustomGoogleEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by ``google.generativeai``.

    Each text is truncated to ``max_chars`` characters before being passed to
    ``embed_content``. Embedding is best-effort: if a request raises, the
    error is printed and a zero vector of length ``dimension`` is returned in
    its place, so callers always get exactly one vector per input text.

    Args:
        model: Model name forwarded to ``embed_content``.
        max_chars: Maximum number of leading characters of each text that is
            embedded; anything beyond is dropped.
        dimension: Length of the zero vector substituted for a failed
            embedding. NOTE(review): 768 presumably matches the output size
            of the default model - confirm before using a different model.
    """

    def __init__(self, model='models/embedding-001', max_chars=2048, dimension=768):
        self.client = genai
        self.model = model
        self.max_chars = max_chars
        self.dimension = dimension

    def _embed(self, text: str, task_type: str, error_label: str) -> List[float]:
        """Embed one text for ``task_type``; return a zero vector on failure."""
        # Slicing already returns the whole string when it is short enough,
        # so no explicit length check is needed. Done outside the ``try`` so
        # a non-string input still surfaces as an error to the caller.
        truncated = text[:self.max_chars]
        try:
            return self.client.embed_content(
                model=self.model,
                content=truncated,
                task_type=task_type,
            )['embedding']
        except Exception as e:
            # Deliberate best-effort: report and keep the output aligned with
            # the input instead of aborting the whole batch.
            print(f"{error_label}: {e}")
            return [0.0] * self.dimension

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per text, in input order.

        Texts whose embedding request fails are represented by a zero vector.
        """
        return [
            self._embed(text, "retrieval_document", "Embedding error")
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a search query (zero vector on failure)."""
        return self._embed(text, "retrieval_query", "Query embedding error")

class RAGProcessor:
    """Retrieval-augmented question answering over PDF financial statements.

    Workflow: ``process_documents`` extracts and tags text from PDFs, splits
    it into overlapping chunks and indexes them in a FAISS vector store;
    ``generate_response`` retrieves the chunks most similar to a question and
    asks the generative model to answer from that context.
    """

    # Lower-case substrings that mark a line as a likely financial entry.
    _FINANCIAL_KEYWORDS = (
        'revenue', 'profit', 'loss', 'expenses', 'income',
        'cost', 'margin', 'ebitda', 'tax',
    )

    def __init__(self):
        """Set up the embedder, text splitter and generative model.

        The API key is read from the ``GOOGLE_API_KEY`` environment variable
        and handed to ``genai.configure``.
        """
        self.embeddings = CustomGoogleEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ".", ",", " ", ""]
        )
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
        self.model = genai.GenerativeModel('gemini-pro')

    @classmethod
    def _tag_financial_lines(cls, text: str) -> str:
        """Prefix lines containing a financial keyword with ``FINANCIAL_ENTRY: ``.

        Matching is a case-insensitive substring test. Every input line is
        emitted followed by a newline.
        """
        tagged = []
        for line in text.split('\n'):
            lowered = line.lower()  # lower-case once per line, not per keyword
            if any(keyword in lowered for keyword in cls._FINANCIAL_KEYWORDS):
                tagged.append(f"FINANCIAL_ENTRY: {line}\n")
            else:
                tagged.append(line + "\n")
        return "".join(tagged)

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text from a PDF and tag likely P&L lines.

        Args:
            pdf_file: Anything ``PdfReader`` accepts (the caller's annotation
                suggests file paths).

        Returns:
            The text of all pages, each page followed by a blank line, with
            keyword-bearing lines prefixed by ``FINANCIAL_ENTRY: ``. An empty
            string is returned (and the error printed) if the PDF cannot be
            read.
        """
        try:
            pdf_reader = PdfReader(pdf_file)
            # Defensive ``or ""``: should a page yield no text (None), it
            # contributes nothing instead of raising and discarding the
            # whole document.
            text = "".join(
                (page.extract_text() or "") + "\n\n"
                for page in pdf_reader.pages
            )
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return ""

        return self._tag_financial_lines(text)

    def process_documents(self, pdf_files: List[str]) -> FAISS:
        """Build a FAISS vector store from the text of several PDFs.

        Args:
            pdf_files: PDFs to read; unreadable ones contribute no text.

        Returns:
            A FAISS store holding the embedded text chunks.

        Raises:
            ValueError: If no text chunk could be produced from the inputs.
            Exception: Any error raised while building the store is printed
                and re-raised.
        """
        combined_text = "".join(
            self.extract_text_from_pdf(pdf) for pdf in pdf_files
        )
        text_chunks = self.text_splitter.split_text(combined_text)

        try:
            if not text_chunks:
                # Fail with an explicit message rather than letting the
                # vector store choke on an empty input list.
                raise ValueError(
                    "No text could be extracted from the provided PDF files"
                )
            return FAISS.from_texts(text_chunks, embedding=self.embeddings)
        except Exception as e:
            print(f"Error creating vector store: {e}")
            raise

    def generate_response(self, question: str, vector_store: FAISS, k: int = 4) -> str:
        """Answer ``question`` from the most similar chunks in ``vector_store``.

        Args:
            question: The user's question.
            vector_store: Store to search for supporting context.
            k: Number of chunks to retrieve as context.

        Returns:
            The model's answer text, or a string starting with
            ``Error generating response:`` if generation fails.
        """
        docs = vector_store.similarity_search(question, k=k)
        context = "\n".join(doc.page_content for doc in docs)

        prompt = f"""

        You are a financial analyst assistant. Using the following financial data context, 

        answer the question accurately and professionally. Include specific numbers and 

        calculations when relevant.

        

        Context: {context}

        

        Question: {question}

        

        If the context doesn't contain enough information to answer accurately, 

        please state that clearly. Focus on P&L related information and financial metrics.

        When providing financial figures, please format them clearly with appropriate units 

        (e.g., "$1,234,567" or "1.2M" for millions).

        """

        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating response: {e}"