Vikrant26 committed on
Commit
65cdc34
·
verified ·
1 Parent(s): 8ddef96

Upload 6 files

Browse files
Files changed (6) hide show
  1. .env +6 -0
  2. .gitignore +2 -0
  3. PL_image-removebg-preview.png +0 -0
  4. app.py +108 -0
  5. rag.py +123 -0
  6. requirements.txt +11 -0
.env ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ GOOGLE_API_KEY="<your-google-api-key>"
2
+ PINECONE_API_KEY="<your-pinecone-api-key>"
3
+ PINECONE_ENV="us-west1-gcp-free"
4
+ # Optional: ChromaDB Settings
5
+ CHROMA_DB_IMPL=duckdb+parquet
6
+ PERSIST_DIRECTORY=db
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ myenv
2
+ .env
PL_image-removebg-preview.png ADDED
app.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from rag import RAGProcessor
3
+ import os
4
+ from dotenv import load_dotenv
5
+ import tempfile
6
+
7
# Pull key/value pairs from a local .env file into the process environment.
load_dotenv()

# Without a Google API key nothing downstream can work, so report the problem
# in the UI and halt this script run instead of failing later.
if os.getenv('GOOGLE_API_KEY') in (None, ''):
    st.error("Please set the GOOGLE_API_KEY in your .env file.")
    st.stop()
14
+
15
def initialize_session_state():
    """Populate ``st.session_state`` with the entries the app relies on.

    Existing entries are left untouched; only missing keys are created:

    * ``rag_processor`` -- a new ``RAGProcessor`` instance.
    * ``vector_store`` -- ``None`` until documents have been processed.
    """
    state = st.session_state
    # Factories (not values) so RAGProcessor is only built when it is missing.
    default_factories = (
        ("rag_processor", RAGProcessor),
        ("vector_store", lambda: None),
    )
    for key, factory in default_factories:
        if key not in state:
            state[key] = factory()
21
+
22
def save_uploaded_files(uploaded_files):
    """Save uploaded files to a fresh temporary directory and return their paths.

    Args:
        uploaded_files: Iterable of upload objects exposing a ``name``
            attribute and a ``getbuffer()`` method returning the file bytes.

    Returns:
        A list of file paths, in upload order. If anything goes wrong the
        error is reported with ``st.error``, the partially written temporary
        directory is removed, and an empty list is returned.

    Note:
        On success the temporary directory is left in place so the caller can
        read the files; nothing in this function deletes it afterwards.
    """
    import shutil

    temp_dir = None
    try:
        temp_dir = tempfile.mkdtemp()
        file_paths = []

        for index, uploaded_file in enumerate(uploaded_files):
            # The name is supplied by the client: keep only its last path
            # component so something like "../x.pdf" cannot escape temp_dir.
            safe_name = os.path.basename(uploaded_file.name.replace("\\", "/"))
            if safe_name in ("", ".", ".."):
                safe_name = f"upload_{index}.pdf"

            file_path = os.path.join(temp_dir, safe_name)
            # Two uploads may share a name; avoid silently overwriting the first.
            if os.path.exists(file_path):
                file_path = os.path.join(temp_dir, f"{index}_{safe_name}")

            with open(file_path, "wb") as f:
                f.write(uploaded_file.getbuffer())
            file_paths.append(file_path)

        return file_paths
    except Exception as e:
        # Do not leave half-written uploads behind when the save fails.
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        st.error(f"Error saving uploaded files: {e}")
        return []
38
+
39
def _render_header():
    """Draw the centred page title."""
    st.markdown("<div class='main-header'>", unsafe_allow_html=True)
    st.markdown(
        "<h1 style='text-align: center;'>💰 Finance Buddy</h1>",
        unsafe_allow_html=True
    )
    st.markdown("</div>", unsafe_allow_html=True)


def _render_sidebar():
    """Draw the sidebar: logo, PDF uploader and the document-processing action."""
    with st.sidebar:
        st.image("PL_image-removebg-preview.png", use_column_width=True)
        st.title("📄 Document Analysis")
        uploaded_files = st.file_uploader(
            "Upload P&L Documents (PDF)",
            accept_multiple_files=True,
            type=['pdf']
        )

        if uploaded_files and st.button("Process Documents", key="process_docs"):
            with st.spinner("Processing documents..."):
                try:
                    # Persist the uploads to disk, then index them.
                    file_paths = save_uploaded_files(uploaded_files)
                    if file_paths:
                        st.session_state.vector_store = (
                            st.session_state.rag_processor.process_documents(file_paths)
                        )
                        st.success("✅ Documents processed successfully!")
                except Exception as e:
                    st.error(f"Error processing documents: {e}")


def _render_query_section():
    """Draw the question box and, when possible, the generated answer."""
    st.markdown("""
    💡 **Ask questions about your P&L statements and financial data.**
    """)

    query = st.text_input("🔍 Ask your question:", key="query")
    if not query:
        return

    # Compare against None explicitly: the store is an arbitrary object and
    # its truthiness is not a reliable "has been built" signal.
    if st.session_state.vector_store is None:
        st.warning("Please upload and process documents first!")
        return

    with st.spinner("Analyzing..."):
        try:
            response = st.session_state.rag_processor.generate_response(
                query,
                st.session_state.vector_store
            )
            st.markdown("### 📋 Response:")
            st.markdown(f">{response}")
        except Exception as e:
            st.error(f"Error generating response: {e}")


def _render_footer():
    """Draw the horizontal rule and the attribution line."""
    st.markdown("---")
    st.markdown(
        "<p style='text-align: center;'>💼 Built with Streamlit & Google Generative AI</p>",
        unsafe_allow_html=True
    )


def main():
    """Entry point of the Finance Buddy Streamlit page.

    Configures the page, makes sure the session state entries exist, then
    renders the header, the sidebar (upload + processing), the question/answer
    area and the footer, in that order.
    """
    st.set_page_config(
        page_title="Finance Buddy",
        page_icon="💰",
        layout="wide"
    )

    initialize_session_state()

    _render_header()
    _render_sidebar()
    _render_query_section()
    _render_footer()


if __name__ == "__main__":
    main()
rag.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+ import google.generativeai as genai
3
+ from langchain.embeddings.base import Embeddings
4
+ from langchain_community.vectorstores import FAISS
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+ from PyPDF2 import PdfReader
7
+ import pandas as pd
8
+ import os
9
+
10
class CustomGoogleEmbeddings(Embeddings):
    """Custom Embedding Class for Google Generative AI.

    Adapts ``genai.embed_content`` to the ``embed_documents`` /
    ``embed_query`` interface. Embedding is best-effort: if a call fails the
    error is printed and a zero vector is returned in its place, so one bad
    text does not abort a whole batch.

    Args:
        model: Name of the embedding model passed to ``embed_content``.
        max_chars: Texts longer than this many characters are truncated
            before being sent.
        dimension: Length of the zero vector used as a fallback when a call
            fails. Presumably this should equal the model's output size;
            verify when changing ``model``.
    """

    def __init__(self, model='models/embedding-001', max_chars=2048, dimension=768):
        self.client = genai
        self.model = model
        self.max_chars = max_chars
        self.dimension = dimension

    def _embed(self, text: str, task_type: str, error_label: str) -> List[float]:
        """Embed one text, returning a zero vector (and printing) on failure."""
        try:
            result = self.client.embed_content(
                model=self.model,
                # Slicing is a no-op for texts already within the limit.
                content=text[:self.max_chars],
                task_type=task_type
            )
            return result['embedding']
        except Exception as e:
            print(f"{error_label}: {e}")
            return [0.0] * self.dimension

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding per input text, in the same order."""
        return [
            self._embed(text, "retrieval_document", "Embedding error")
            for text in texts
        ]

    def embed_query(self, text: str) -> List[float]:
        """Return the embedding of a single search query."""
        return self._embed(text, "retrieval_query", "Query embedding error")
43
+
44
class RAGProcessor:
    """Retrieval-augmented question answering over P&L PDF documents.

    Extracts text from PDFs, splits it into overlapping chunks, indexes the
    chunks in a FAISS vector store and answers questions by sending the most
    similar chunks to a Google Generative AI model as context.

    Args:
        model_name: Name of the generative model used to answer questions.
    """

    # Lower-case substrings that mark a line as a likely financial entry.
    FINANCIAL_KEYWORDS = (
        'revenue', 'profit', 'loss', 'expenses', 'income',
        'cost', 'margin', 'ebitda', 'tax',
    )

    def __init__(self, model_name='gemini-pro'):
        genai.configure(api_key=os.getenv('GOOGLE_API_KEY'))
        self.embeddings = CustomGoogleEmbeddings()
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000,
            chunk_overlap=200,
            separators=["\n\n", "\n", ".", ",", " ", ""]
        )
        self.model = genai.GenerativeModel(model_name)

    @classmethod
    def _tag_financial_lines(cls, text: str) -> str:
        """Prefix every line containing a financial keyword with ``FINANCIAL_ENTRY: ``.

        Matching is a case-insensitive substring test, so a keyword embedded
        in a longer word also matches. Each input line is emitted followed by
        a newline.
        """
        tagged = []
        for line in text.split('\n'):
            lowered = line.lower()
            if any(keyword in lowered for keyword in cls.FINANCIAL_KEYWORDS):
                tagged.append(f"FINANCIAL_ENTRY: {line}\n")
            else:
                tagged.append(line + "\n")
        return "".join(tagged)

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Extract text from PDF with focus on structured content.

        Args:
            pdf_file: Anything ``PdfReader`` accepts (the app passes a path).

        Returns:
            The text of all pages with likely financial lines tagged, or an
            empty string if the PDF could not be read (the error is printed).
        """
        try:
            pdf_reader = PdfReader(pdf_file)
            # Guard against pages for which extract_text() yields nothing, so
            # one unreadable page does not discard the entire document.
            text = "".join(
                (page.extract_text() or "") + "\n\n"
                for page in pdf_reader.pages
            )
            return self._tag_financial_lines(text)
        except Exception as e:
            print(f"Error extracting text from PDF: {e}")
            return ""

    def process_documents(self, pdf_files: List[str]) -> "FAISS":
        """Process multiple PDF documents and create vector store.

        Args:
            pdf_files: Paths of the PDF files to index.

        Returns:
            A FAISS vector store built from the chunked text of all files.

        Raises:
            ValueError: If no text could be extracted from any of the files.
            Exception: Any error raised while building the vector store is
                printed and re-raised.
        """
        combined_text = "".join(
            self.extract_text_from_pdf(pdf) for pdf in pdf_files
        )

        # Fail early with a clear message instead of handing an empty chunk
        # list to the vector store.
        if not combined_text.strip():
            raise ValueError(
                "No extractable text found in the provided PDF documents."
            )

        text_chunks = self.text_splitter.split_text(combined_text)

        try:
            return FAISS.from_texts(text_chunks, embedding=self.embeddings)
        except Exception as e:
            print(f"Error creating vector store: {e}")
            raise

    def generate_response(self, question: str, vector_store: "FAISS", k: int = 4) -> str:
        """Generate response using RAG approach.

        Args:
            question: The user's question.
            vector_store: Store returned by ``process_documents``.
            k: Number of most similar chunks to use as context.

        Returns:
            The model's answer text. If generation fails, a string starting
            with ``"Error generating response: "`` is returned instead of
            raising.
        """
        # Retrieve relevant context
        docs = vector_store.similarity_search(question, k=k)
        context = "\n".join(doc.page_content for doc in docs)

        prompt = f"""
        You are a financial analyst assistant. Using the following financial data context,
        answer the question accurately and professionally. Include specific numbers and
        calculations when relevant.

        Context: {context}

        Question: {question}

        If the context doesn't contain enough information to answer accurately,
        please state that clearly. Focus on P&L related information and financial metrics.
        When providing financial figures, please format them clearly with appropriate units
        (e.g., "$1,234,567" or "1.2M" for millions).
        """

        try:
            response = self.model.generate_content(prompt)
            return response.text
        except Exception as e:
            return f"Error generating response: {e}"
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ python-dotenv
3
+ google-generativeai
4
+ langchain
5
+ langchain-community
6
+ faiss-cpu
7
+ PyPDF2
8
+ tabula-py
9
+ pandas
10
+ numpy
11
+ python-multipart