alfa95 committed on
Commit
78dcd6a
·
0 Parent(s):

Initial commit

Browse files
Files changed (2) hide show
  1. app.py +181 -0
  2. requirments.txt +10 -0
app.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import faiss
4
+ import numpy as np
5
+ import requests
6
+ import pdfplumber
7
+ import spacy
8
+ from sentence_transformers import SentenceTransformer, CrossEncoder
9
+ from rank_bm25 import BM25Okapi
10
+ import gradio as gr
11
+
12
+ # βœ… Load Models
13
# Load the spaCy pipeline, downloading it only when it is not installed yet,
# so a restart with the model already present does not hit the network.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

# Sentence-embedding model used to encode chunks for the FAISS index.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# NOTE(review): the cross-encoder is loaded but not referenced by the code
# visible in this file; kept so the module-level name stays available.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
17
+
18
+ # βœ… Load API Key from Hugging Face Secrets
19
# Read the Gemini API key from the environment (e.g. a Hugging Face Space
# secret). Surrounding whitespace is stripped so a secret pasted with a
# trailing newline still works, and a whitespace-only value counts as missing.
GEMINI_API_KEY = (os.getenv("GEMINI_API_KEY") or "").strip()

# Fail fast at import time: answer generation cannot work without the key.
if not GEMINI_API_KEY:
    raise ValueError("🚨 Please set the Google API Key in Hugging Face Secrets!")

# REST endpoint used for answer generation.
GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
25
+
26
+ # βœ… Financial Keywords for Filtering
27
# Finance-related terms intended for filtering.
FINANCIAL_KEYWORDS = [
    "revenue",
    "profit",
    "loss",
    "balance sheet",
    "cash flow",
    "earnings",
    "expenses",
    "investment",
    "financial",
    "liability",
    "assets",
    "equity",
    "debt",
    "capital",
    "tax",
    "dividends",
    "reserves",
    "net income",
    "operating income",
]

# Module-level retrieval state for BM25 and FAISS; empty until a PDF is indexed.
bm25 = None
chunk_texts = []
faiss_index = None
36
+
37
+
38
+ # πŸ”Ή 1. Extract and Clean Text from PDF
39
def extract_text_from_pdf(pdf_path):
    """Return the cleaned text of every page in the PDF at *pdf_path*.

    Pages for which pdfplumber yields no text are skipped. Each remaining
    page's text is followed by a newline, and the concatenation is passed
    through ``clean_text`` before being returned.
    """
    with pdfplumber.open(pdf_path) as document:
        page_texts = [page.extract_text() for page in document.pages]

    raw_text = "".join(f"{content}\n" for content in page_texts if content)
    return clean_text(raw_text)
47
+
48
+
49
+ # πŸ”Ή 2. Clean Extracted Text
50
def clean_text(text):
    """Strip boilerplate from extracted PDF text and collapse blank lines.

    Removes, in order:
      * http/https URLs;
      * whole lines that start with a ``dd/dd/dddd`` date stamp;
      * the "this data can be easily copy pasted ..." notice, from the phrase
        to the end of its line (case-insensitive);
      * "moneycontrol.com ..." attributions, from the name to the end of the
        line (case-insensitive, literal dot);
      * runs of blank or whitespace-only lines, collapsed to one newline.

    Returns the result with leading and trailing whitespace stripped.
    """
    line_flags = re.MULTILINE | re.IGNORECASE

    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"^\d{2}/\d{2}/\d{4}.*$", "", text, flags=re.MULTILINE)
    # A trailing lazy ``.*?`` matches the empty string, which would leave the
    # rest of the boilerplate line behind; match through end of line instead.
    text = re.sub(r"this data can be easily copy pasted.*$", "", text, flags=line_flags)
    # The dot is escaped so only the literal domain name is matched.
    text = re.sub(r"moneycontrol\.com.*$", "", text, flags=line_flags)
    text = re.sub(r"(\n\s*)+", "\n", text)
    return text.strip()
57
+
58
+
59
+ # πŸ”Ή 3. Chunking Extracted Text
60
def chunk_text(text, max_tokens=64):
    """Split *text* into chunks of whole sentences.

    Sentences come from the module-level spaCy pipeline ``nlp``. They are
    packed greedily: a sentence joins the current chunk unless that would push
    the chunk's whitespace-token count above *max_tokens*, in which case the
    current chunk is closed and a new one is started. Sentences are never
    split, so a single sentence longer than *max_tokens* becomes a chunk of
    its own.

    Returns a list of chunk strings, each made of sentences joined by a space.
    """
    chunks, current_chunk = [], []
    token_count = 0

    for sent in nlp(text).sents:
        sentence = sent.text
        sentence_len = len(sentence.split())
        # Close the chunk only if it holds something; otherwise an oversized
        # sentence arriving on an empty chunk would emit an empty chunk.
        if current_chunk and token_count + sentence_len > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk, token_count = [], 0
        current_chunk.append(sentence)
        token_count += sentence_len

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks
80
+
81
+
82
+ # πŸ”Ή 4. Store Chunks in FAISS & BM25
83
def store_in_faiss(chunks):
    """Build the FAISS and BM25 indexes for *chunks* and publish them globally.

    Blank chunks are dropped first. The remaining chunks are embedded with
    ``embed_model`` and added to a flat L2 FAISS index, and a BM25 index is
    built over their whitespace tokens. The module globals ``faiss_index``,
    ``chunk_texts`` and ``bm25`` are replaced only after both indexes were
    built, so a failure leaves the previous state untouched.

    Returns the new FAISS index.
    Raises ValueError if no non-blank chunk is supplied.
    """
    global bm25, chunk_texts, faiss_index

    usable_chunks = [chunk for chunk in (chunks or []) if chunk and chunk.strip()]
    if not usable_chunks:
        raise ValueError("Cannot build a search index: no non-empty text chunks were provided.")

    embeddings = embed_model.encode(usable_chunks, convert_to_numpy=True)
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    new_index = faiss.IndexFlatL2(embeddings.shape[1])
    new_index.add(embeddings)
    new_bm25 = BM25Okapi([chunk.split() for chunk in usable_chunks])

    # Swap in the new state together so the indexes and texts stay aligned.
    faiss_index, chunk_texts, bm25 = new_index, usable_chunks, new_bm25
    return faiss_index
94
+
95
+
96
+ # πŸ”Ή 5. Retrieve Chunks using BM25
97
def retrieve_bm25(query, top_k=2):
    """Return up to *top_k* indexed chunks ranked best-first by BM25 score.

    The query is split on whitespace and scored against the module-level
    ``bm25`` index; chunk texts are read from ``chunk_texts``.

    A non-positive *top_k* yields an empty list. (Slicing with ``[-0:]``
    would otherwise select every chunk.)
    """
    if top_k <= 0:
        return []

    scores = bm25.get_scores(query.split())
    # argsort is ascending, so reverse it and keep the leading top_k entries.
    best_first = np.argsort(scores)[::-1][:top_k]
    return [chunk_texts[i] for i in best_first]
103
+
104
+
105
+ # πŸ”Ή 6. Generate Response Using Google Gemini
106
def refine_with_gemini(query, retrieved_text):
    """Ask Gemini to extract the details relevant to *query* from *retrieved_text*.

    Returns the generated answer, or a user-facing message when there is no
    context text, the request fails, or the API reports an error. Network and
    JSON-decoding failures are reported as a message rather than raised.
    """
    if not retrieved_text.strip():
        return "❌ No relevant financial data found for your query."

    prompt = (
        "You are an expert financial analyst. Based on the provided data, "
        f"extract only the relevant financial details related to the query: '{query}' "
        f"and present them in a clear format.\n\nData:\n{retrieved_text}"
    )
    payload = {"contents": [{"parts": [{"text": prompt}]}]}
    # The key travels in a header rather than the query string, so it does not
    # appear in the URL that request exceptions include in their messages.
    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": GEMINI_API_KEY,
    }

    try:
        # The timeout keeps the UI from hanging if the API never answers.
        response = requests.post(GEMINI_API_URL, json=payload, headers=headers, timeout=60)
        response_json = response.json()
    except (requests.RequestException, ValueError) as e:
        print("🚨 Exception in Gemini API Call:", str(e))
        return "⚠️ Gemini API Exception: Unable to fetch response."

    if response.status_code != 200:
        print("🚨 Gemini API Error Response:", response_json)
        error = response_json.get("error") if isinstance(response_json, dict) else None
        message = error.get("message", "Unknown error") if isinstance(error, dict) else "Unknown error"
        return f"⚠️ Gemini API Error: {message}"

    print("✅ Gemini API Response:", response_json)
    candidates = response_json.get("candidates") if isinstance(response_json, dict) else None
    try:
        return candidates[0]["content"]["parts"][0]["text"]
    except (TypeError, IndexError, KeyError):
        # Covers a missing or empty candidate list and a candidate without text.
        return "⚠️ Error generating response."
135
+
136
+
137
+ # πŸ”Ή 7. Final Retrieval Function
138
def retrieve_and_generate_secure(query):
    """Answer *query* from the indexed PDF: BM25 retrieval, then Gemini.

    Returns a user-facing message when the query is blank, when no PDF has
    been indexed yet, or when retrieval yields nothing; otherwise returns
    whatever ``refine_with_gemini`` produces for the retrieved chunks.
    """
    # A blank query cannot be scored meaningfully; stop before any retrieval
    # or API call is made.
    if not query or not query.strip():
        return "❌ Please enter a financial question."

    print("🔍 Query Received:", query)
    if bm25 is None or not chunk_texts:
        return "❌ No PDF data loaded. Please upload a PDF first."

    bm25_results = retrieve_bm25(query)
    if not bm25_results:
        return "❌ No relevant financial data found for your query."

    return refine_with_gemini(query, "\n".join(bm25_results))
148
+
149
+
150
+ # πŸ”Ή 8. Load PDF and Process Data
151
def process_uploaded_pdf(pdf_file):
    """Extract, chunk and index an uploaded PDF; return a status message.

    *pdf_file* may be a filesystem path (what ``gr.File(type="filepath")``
    passes) or an object exposing the path as ``.name``. ``None`` means
    nothing was uploaded.
    """
    global faiss_index

    if pdf_file is None:
        return "❌ No PDF uploaded. Please upload a PDF first."

    # A plain path has no ``.name`` attribute, so accept both input forms.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = pdf_file.name

    text = extract_text_from_pdf(pdf_path)
    chunks = [chunk for chunk in chunk_text(text) if chunk.strip()]
    if not chunks:
        # E.g. a scanned PDF without a text layer: there is nothing to index.
        return "❌ No extractable text found in the PDF."

    faiss_index = store_in_faiss(chunks)
    return "✅ PDF Processed Successfully! Now you can ask financial questions."
157
+
158
+
159
+ # πŸ”Ή 9. Build Gradio UI
160
# Gradio UI: upload and index a PDF, then ask questions against it.
with gr.Blocks() as app:
    gr.Markdown("# 📊 Financial RAG Model")
    gr.Markdown("Upload a company financial report PDF and ask relevant financial questions.")

    with gr.Row():
        pdf_input = gr.File(label="📂 Upload Financial PDF", type="filepath")
        process_button = gr.Button("📜 Process PDF")

    status_output = gr.Textbox(label="Processing Status", interactive=False)

    with gr.Row():
        query_input = gr.Textbox(label="❓ Ask a financial question")
        answer_output = gr.Textbox(label="💬 Answer", interactive=False)

    query_button = gr.Button("🔍 Get Answer")

    # Events: wire each button to its handler and output box.
    process_button.click(process_uploaded_pdf, inputs=pdf_input, outputs=status_output)
    query_button.click(retrieve_and_generate_secure, inputs=query_input, outputs=answer_output)

# Launch the UI.
app.launch()
requirments.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ faiss-cpu
3
+ numpy
4
+ scipy
5
+ sentence-transformers
6
+ cross-encoder
7
+ spacy
8
+ pdfplumber
9
+ rank-bm25
10
+ requests