# Finance_Bot / rag.py
# Vikrant26's picture
# Upload 6 files
# 65cdc34 verified
from typing import List
import google.generativeai as genai
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
import pandas as pd
import os
class CustomGoogleEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by ``google.generativeai``.

    Every text is truncated to ``max_chars`` characters and sent to
    ``genai.embed_content``. Embedding is best-effort: when a request fails,
    the error is printed and a zero vector of length ``dimension`` is used
    instead, so callers always get one vector per input text.
    """

    def __init__(self, model='models/embedding-001', max_chars=2048, dimension=768):
        """Store the embedding configuration.

        Args:
            model: Name of the embedding model passed to ``embed_content``.
            max_chars: Maximum number of characters of each text that is sent
                to the API; longer texts are truncated.
            dimension: Length of the zero vector substituted when a request
                fails. It should match the model's output size so fallback
                vectors line up with real ones (768 was the value previously
                hard-coded here).
        """
        self.client = genai
        self.model = model
        self.max_chars = max_chars
        self.dimension = dimension

    def _embed(self, text: str, task_type: str, error_label: str) -> List[float]:
        """Embed one text, falling back to a zero vector if the request fails."""
        # Truncate outside the ``try`` so a non-string input still raises
        # instead of being silently turned into a zero vector.
        text = text[:self.max_chars]
        try:
            return self.client.embed_content(
                model=self.model,
                content=text,
                task_type=task_type,
            )['embedding']
        except Exception as e:
            # Deliberate best-effort: one failed request must not abort a
            # whole batch. NOTE(review): the zero vector is indistinguishable
            # from a real embedding once stored; watch the printed errors.
            print(f"{error_label}: {e}")
            return [0.0] * self.dimension

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each document text for retrieval.

        Args:
            texts: Document texts to embed.

        Returns:
            One embedding per input text, in input order. Texts whose request
            failed are represented by a zero vector.
        """
        return [
            self._embed(text, "retrieval_document", "Embedding error")
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """Embed a search query.

        Args:
            text: The query text.

        Returns:
            The query embedding, or a zero vector if the request failed.
        """
        return self._embed(text, "retrieval_query", "Query embedding error")
class RAGProcessor:
    """Retrieval-augmented question answering over the text of PDF files.

    Text is extracted from PDFs, split into overlapping chunks, embedded into
    a FAISS vector store, and the chunks most similar to a question are handed
    to a Google generative model as context for the answer.
    """

    # Lower-case substrings that mark a line as a likely financial entry.
    _FINANCIAL_KEYWORDS = (
        'revenue', 'profit', 'loss', 'expenses', 'income',
        'cost', 'margin', 'ebitda', 'tax',
    )

    def __init__(self, model_name='gemini-pro', chunk_size=1000, chunk_overlap=200):
        """Set up the embedder, text splitter and generative model.

        Also configures the ``google.generativeai`` module with the API key
        read from the ``GOOGLE_API_KEY`` environment variable (``None`` is
        passed through if the variable is unset).

        Args:
            model_name: Name of the generative model used to answer questions.
            chunk_size: Chunk size passed to the text splitter.
            chunk_overlap: Overlap between consecutive chunks.
        """
        self.embeddings = CustomGoogleEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", ".", ",", " ", ""],
        )
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
        self.model = genai.GenerativeModel(model_name)

    @classmethod
    def _tag_financial_lines(cls, text: str) -> str:
        """Prefix lines that mention a financial keyword with ``FINANCIAL_ENTRY: ``.

        Every input line is emitted followed by a newline, tagged or not.
        """
        tagged = []
        for line in text.split('\n'):
            lowered = line.lower()  # lower-case once per line, not per keyword
            if any(keyword in lowered for keyword in cls._FINANCIAL_KEYWORDS):
                tagged.append(f"FINANCIAL_ENTRY: {line}\n")
            else:
                tagged.append(line + "\n")
        return "".join(tagged)

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text from a PDF and tag lines that look like financial entries.

        Args:
            pdf_file: Whatever ``PdfReader`` accepts as its source.

        Returns:
            The text of all pages (each page followed by a blank line), with
            keyword-matching lines prefixed by ``FINANCIAL_ENTRY: ``. Returns
            ``""`` if reading or extraction fails; the error is printed.
        """
        try:
            pdf_reader = PdfReader(pdf_file)
            # ``or ""`` keeps a page that yields no text from raising a
            # TypeError, which would otherwise discard the whole document.
            text = "".join(
                (page.extract_text() or "") + "\n\n"
                for page in pdf_reader.pages
            )
            return self._tag_financial_lines(text)
        except Exception as e:
            # Best-effort: an unreadable PDF contributes no text rather than
            # aborting the processing of the other files.
            print(f"Error extracting text from PDF: {e}")
            return ""

    def process_documents(self, pdf_files: List[str]) -> FAISS:
        """Build a FAISS vector store from the text of several PDFs.

        Args:
            pdf_files: PDF sources, each passed to ``extract_text_from_pdf``.

        Returns:
            A FAISS vector store holding the embedded text chunks.

        Raises:
            ValueError: If no text chunks could be produced from the inputs.
            Exception: Any error raised while building the vector store is
                printed and re-raised unchanged.
        """
        combined_text = "".join(
            self.extract_text_from_pdf(pdf) for pdf in pdf_files
        )
        text_chunks = self.text_splitter.split_text(combined_text)
        if not text_chunks:
            # An index cannot be built from zero texts; fail with a clear
            # message instead of an obscure error from inside the build.
            raise ValueError(
                "No text could be extracted from the provided PDF files"
            )
        try:
            return FAISS.from_texts(text_chunks, embedding=self.embeddings)
        except Exception as e:
            print(f"Error creating vector store: {e}")
            raise

    def generate_response(self, question: str, vector_store: FAISS, k: int = 4) -> str:
        """Answer a question using chunks retrieved from the vector store.

        Args:
            question: The user's question.
            vector_store: Store to search for relevant chunks.
            k: Number of chunks to retrieve as context.

        Returns:
            The model's answer text, or an ``"Error generating response: ..."``
            message if generation fails. Errors raised by the similarity
            search itself are not caught here.
        """
        docs = vector_store.similarity_search(question, k=k)
        context = "\n".join(doc.page_content for doc in docs)
        # The prompt body is kept flush-left on purpose: any indentation
        # inside the triple-quoted string would become part of the prompt.
        prompt = f"""
You are a financial analyst assistant. Using the following financial data context,
answer the question accurately and professionally. Include specific numbers and
calculations when relevant.
Context: {context}
Question: {question}
If the context doesn't contain enough information to answer accurately,
please state that clearly. Focus on P&L related information and financial metrics.
When providing financial figures, please format them clearly with appropriate units
(e.g., "$1,234,567" or "1.2M" for millions).
"""
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating response: {e}"