# Hugging Face Spaces file view (Space status: Running)
# File size: 4,919 bytes
# Revision: 65cdc34
from typing import List
import google.generativeai as genai
from langchain.embeddings.base import Embeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
import pandas as pd
import os
class CustomGoogleEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter backed by ``google.generativeai``.

    Every text is truncated to ``max_chars`` characters and passed to
    ``genai.embed_content``. If that call raises, the error is printed and a
    zero vector of length ``dimension`` is returned in its place, so a single
    failed request does not abort a whole batch.

    Args:
        model: Embedding model name forwarded to ``embed_content``.
        max_chars: Maximum number of characters of each text that is sent.
        dimension: Length of the zero vector used as the failure fallback.
            NOTE(review): 768 is presumably the output size of the default
            model -- confirm and adjust when passing a different ``model``.
    """

    def __init__(self, model='models/embedding-001', max_chars=2048, dimension=768):
        self.client = genai
        self.model = model
        self.max_chars = max_chars
        self.dimension = dimension

    def _embed(self, text: str, task_type: str, error_label: str) -> List[float]:
        """Embed one text; on failure print ``error_label`` and return zeros."""
        # Truncation stays outside the ``try`` so that a non-sliceable input
        # still fails loudly instead of being turned into a zero vector.
        snippet = text[:self.max_chars]
        try:
            return self.client.embed_content(
                model=self.model,
                content=snippet,
                task_type=task_type,
            )['embedding']
        except Exception as e:
            # Deliberate best-effort: report the problem and keep going.
            # Callers should be aware that an all-zero vector carries no
            # semantic information.
            print(f"{error_label}: {e}")
            return [0.0] * self.dimension

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per entry of ``texts``, in the same order.

        Texts are embedded one request at a time with the
        ``retrieval_document`` task type.
        """
        return [
            self._embed(text, "retrieval_document", "Embedding error")
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a search query (``retrieval_query`` task)."""
        return self._embed(text, "retrieval_query", "Query embedding error")
class RAGProcessor:
    """Retrieval-augmented question answering over financial PDF documents.

    The processor extracts text from PDFs, tags lines that look like
    financial entries, splits the text into overlapping chunks, indexes the
    chunks in a FAISS vector store and answers questions with a Gemini model
    using the most similar chunks as context.
    """

    # Lower-case substrings that mark a line as a potential financial entry.
    # Matching is by substring, so e.g. 'tax' also matches 'syntax'.
    FINANCIAL_KEYWORDS = (
        'revenue', 'profit', 'loss', 'expenses', 'income',
        'cost', 'margin', 'ebitda', 'tax',
    )

    def __init__(self):
        """Create the embedder, text splitter and generative model.

        The API key is read from the ``GOOGLE_API_KEY`` environment variable.
        """
        self.embeddings = CustomGoogleEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ".", ",", " ", ""],
        )
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
        self.model = genai.GenerativeModel('gemini-pro')

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text from a PDF and tag likely financial lines.

        Args:
            pdf_file: Whatever ``PdfReader`` accepts as its source.

        Returns:
            The text of all pages, one output line per input line, where
            every line containing one of ``FINANCIAL_KEYWORDS``
            (case-insensitive) is prefixed with ``"FINANCIAL_ENTRY: "``.
            An empty string is returned if extraction fails; the error is
            printed rather than raised.
        """
        try:
            pdf_reader = PdfReader(pdf_file)
            # Defensive ``or ""``: a page without a text result must not
            # discard the text of the whole document.
            text = "".join(
                (page.extract_text() or "") + "\n\n"
                for page in pdf_reader.pages
            )

            tagged_lines = []
            for line in text.split('\n'):
                lowered = line.lower()  # computed once per line
                if any(keyword in lowered for keyword in self.FINANCIAL_KEYWORDS):
                    tagged_lines.append(f"FINANCIAL_ENTRY: {line}\n")
                else:
                    tagged_lines.append(line + "\n")
            return "".join(tagged_lines)
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return ""

    def process_documents(self, pdf_files: List[str]) -> FAISS:
        """Build a FAISS vector store from several PDF documents.

        Args:
            pdf_files: PDF sources, each passed to ``extract_text_from_pdf``.

        Returns:
            A FAISS store holding the embedded chunks of all documents.

        Raises:
            Exception: Any error raised while building the store is printed
                and re-raised unchanged.
        """
        combined_text = "".join(
            self.extract_text_from_pdf(pdf) for pdf in pdf_files
        )
        text_chunks = self.text_splitter.split_text(combined_text)
        try:
            return FAISS.from_texts(text_chunks, embedding=self.embeddings)
        except Exception as e:
            print(f"Error creating vector store: {e}")
            raise

    def generate_response(self, question: str, vector_store: FAISS, k: int = 4) -> str:
        """Answer ``question`` using chunks retrieved from ``vector_store``.

        Args:
            question: The user's question.
            vector_store: Store to search for relevant context.
            k: Number of chunks to retrieve as context (default 4).

        Returns:
            The model's answer text, or a string starting with
            ``"Error generating response: "`` if generation fails.
        """
        docs = vector_store.similarity_search(question, k=k)
        context = "\n".join([doc.page_content for doc in docs])
        # The literal is kept flush-left so that no source indentation leaks
        # into the prompt text sent to the model.
        prompt = f"""
You are a financial analyst assistant. Using the following financial data context,
answer the question accurately and professionally. Include specific numbers and
calculations when relevant.
Context: {context}
Question: {question}
If the context doesn't contain enough information to answer accurately,
please state that clearly. Focus on P&L related information and financial metrics.
When providing financial figures, please format them clearly with appropriate units
(e.g., "$1,234,567" or "1.2M" for millions).
"""
        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating response: {e}"