ajoy0071998 committed on
Commit
3b27a94
·
verified ·
1 Parent(s): c40f2ee

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +239 -0
app.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import re
import subprocess
import time
import urllib.error
import urllib.request

import fitz
import nltk
import numpy as np
import spacy
import streamlit as st
from nltk.tokenize import sent_tokenize, word_tokenize
from sentence_transformers import SentenceTransformer, util
from summa import keywords
13
+
14
@st.cache_resource
def _load_nlp_resources():
    """Download NLTK data and load the spaCy and SBERT models once.

    Streamlit re-executes this script on every user interaction; caching the
    loader keeps the models from being reloaded on each rerun.

    Returns:
        A ``(spacy_pipeline, sentence_transformer)`` tuple.
    """
    # Download required NLTK data
    nltk.download('punkt_tab', quiet=True)
    nltk.download('stopwords', quiet=True)
    return spacy.load("en_core_web_sm"), SentenceTransformer("paraphrase-MiniLM-L6-v2")


# Shared model handles used by the text-processing and search helpers.
nlp, sbert_model = _load_nlp_resources()

# Punctuation that clean_text() replaces with a space.
CHARS_TO_REMOVE = "(){},;-'\":‘’“”"
23
+
24
+ # Text processing functions (unchanged from your code)
25
def clean_text(text):
    """Replace every character in CHARS_TO_REMOVE with a space and squeeze whitespace.

    Returns the text with all whitespace runs collapsed to a single space and
    with leading/trailing whitespace removed.
    """
    blank_out = str.maketrans({symbol: " " for symbol in CHARS_TO_REMOVE})
    spaced = text.translate(blank_out)
    collapsed = re.sub(r'\s+', ' ', spaced)
    return collapsed.strip()
29
+
30
def extract_text_from_pdf(pdf_file):
    """Return the plain text of a PDF, or "" when it cannot be read.

    Args:
        pdf_file: A binary file-like object whose ``read()`` returns the PDF bytes.

    Returns:
        The text of all pages joined by newlines and stripped. On any failure
        the error is shown through ``st.error`` and an empty string is returned.
    """
    try:
        # The context manager closes the document even if a page fails to
        # extract, so the parsed PDF is not kept alive after this call.
        with fitz.open(stream=pdf_file.read(), filetype="pdf") as doc:
            text = "\n".join(page.get_text("text") for page in doc)
    except Exception as e:  # UI boundary: report the problem instead of crashing the app
        st.error(f"Error reading PDF: {e}")
        return ""
    return text.strip()
38
+
39
def custom_sent_tokenize(text):
    """Split text into sentences at whitespace that follows a period.

    The lookbehinds keep a period attached when it directly follows a
    stand-alone capital letter (e.g. "J." or "U.S."), so initials do not end a
    sentence. Only "." is treated as a terminator. Empty fragments are dropped
    and each sentence is stripped.
    """
    boundary = r'(?<!\b[A-Z])(?<!\b[A-Z]\.)(?<!\b[A-Z]\.[A-Z])(?<=\.)\s+'
    pieces = []
    for fragment in re.split(boundary, text):
        trimmed = fragment.strip()
        if trimmed:
            pieces.append(trimmed)
    return pieces
43
+
44
def chunk_text(text, chunk_size=15, max_words=150, overlap=10):
    """Split text into chunks of whole sentences with a word overlap between them.

    Args:
        text: The text to split.
        chunk_size: Maximum number of sentences per chunk (must be >= 1).
        max_words: Word budget per chunk, measured with ``word_tokenize`` before
            the overlap is prepended. A single sentence longer than the budget
            is still emitted on its own rather than dropped.
        overlap: Number of trailing words of the previous chunk that are
            prepended to the next one; ``0`` disables the overlap.

    Returns:
        A list of chunk strings, in document order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    sentences = custom_sent_tokenize(text)
    chunks = []
    start = 0
    while start < len(sentences):
        window = sentences[start:start + chunk_size]
        # Drop trailing sentences until the chunk fits the word budget, but
        # always keep at least one sentence so the loop makes progress.
        while len(window) > 1 and len(word_tokenize(" ".join(window))) > max_words:
            window.pop()
        body = " ".join(window)
        # Guard overlap > 0: a slice of [-0:] would copy the whole previous chunk.
        if chunks and overlap > 0:
            carried = word_tokenize(chunks[-1])[-overlap:]
            body = " ".join(carried) + " " + body
        chunks.append(body)
        # Advance by what was actually consumed so trimmed sentences open the
        # next chunk instead of being silently discarded.
        start += len(window)
    return chunks
61
+
62
+ # Keyword extraction functions (unchanged)
63
def normalize_text(text):
    """Return text lower-cased with surrounding whitespace removed."""
    lowered = text.lower()
    return lowered.strip()
65
+
66
def lemmatize_keywords(keywords_list):
    """Return the set of lemmas of the alphabetic tokens in the given keywords.

    The keywords are joined with spaces and run through the spaCy pipeline as
    a single text; tokens that are not purely alphabetic are ignored.
    """
    joined = " ".join(keywords_list)
    lemmas = set()
    for token in nlp(joined):
        if token.is_alpha:
            lemmas.add(token.lemma_)
    return lemmas
69
+
70
def extract_keywords(chunk):
    """Return the lemmatized keyword set for a piece of text.

    Three sources are merged before lemmatization: spaCy named entities, the
    phrases returned by summa's ``keywords.keywords``, and every alphabetic
    token. All of them are normalized with ``normalize_text`` first.
    """
    parsed = nlp(chunk)
    entity_terms = set(map(normalize_text, (entity.text for entity in parsed.ents)))
    ranked_phrases = keywords.keywords(chunk, scores=False).split("\n")
    ranked_terms = set(map(normalize_text, ranked_phrases))
    token_terms = set(
        map(normalize_text, (token.text for token in parsed if token.is_alpha))
    )
    combined = entity_terms | ranked_terms | token_terms
    return lemmatize_keywords(combined)
77
+
78
+ # Embedding generation (unchanged)
79
def get_chunk_embeddings(chunks):
    """Encode every chunk with the SBERT model.

    Args:
        chunks: Iterable of chunk strings.

    Returns:
        A list with one embedding tensor per chunk, in input order; an empty
        list when there are no chunks.
    """
    chunks = list(chunks)
    if not chunks:
        # Nothing to encode; avoid calling the model with an empty batch.
        return []
    # One batched call lets the model process the chunks together instead of
    # running a separate forward pass per chunk.
    embeddings = sbert_model.encode(chunks, convert_to_tensor=True)
    # Iterating the 2-D result yields one row tensor per chunk, keeping the
    # list-of-tensors shape that callers index into.
    return list(embeddings)
81
+
82
+ # Levenshtein distance and keyword correction (unchanged)
83
def levenshtein_distance(s1, s2):
    """Return the edit distance (insert / delete / substitute) between s1 and s2.

    Uses the two-row dynamic-programming formulation, iterating over the
    longer input in the outer loop.
    """
    # Arrange the inputs so the inner row follows the shorter one.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1
    if len(shorter) == 0:
        return len(longer)

    prev = range(len(shorter) + 1)
    for row, ch_long in enumerate(longer, start=1):
        curr = [row]
        for col, ch_short in enumerate(shorter, start=1):
            insert_cost = prev[col] + 1
            delete_cost = curr[col - 1] + 1
            replace_cost = prev[col - 1] + (ch_long != ch_short)
            curr.append(min(insert_cost, delete_cost, replace_cost))
        prev = curr
    return prev[-1]
98
+
99
def correct_keywords(query_keywords, stored_keywords, threshold=2):
    """Map each query keyword onto the stored vocabulary where possible.

    A keyword already present in ``stored_keywords`` is kept. Otherwise the
    stored keyword with the smallest Levenshtein distance replaces it, but
    only if that distance is at most ``threshold``; if not, the original
    keyword is kept unchanged.

    Returns:
        The set of resulting keywords.
    """
    resolved = set()
    for term in query_keywords:
        if term in stored_keywords:
            resolved.add(term)
            continue
        # min() returns the first minimal pair, matching a strict "<" scan;
        # the default covers an empty vocabulary.
        distance, candidate = min(
            ((levenshtein_distance(term, known), known) for known in stored_keywords),
            key=lambda pair: pair[0],
            default=(float('inf'), term),
        )
        resolved.add(candidate if distance <= threshold else term)
    return resolved
117
+
118
+ # Bit Vector-based search and retrieval (adapted for multiple PDFs)
119
def process_pdf(pdf_file):
    """Build the search index for one PDF.

    The PDF text is extracted, cleaned and chunked; each chunk gets a keyword
    set and an embedding, and every keyword gets a boolean bitmap marking the
    chunks that contain it.

    Returns:
        ``(chunks, chunk_embeddings, keyword_bitmaps, chunk_keywords, all_keywords)``.
    """
    raw_text = extract_text_from_pdf(pdf_file)
    chunks = chunk_text(clean_text(raw_text))
    total = len(chunks)

    chunk_keywords = [extract_keywords(piece) for piece in chunks]

    keyword_bitmaps = {}
    for position, terms in enumerate(chunk_keywords):
        for term in terms:
            bitmap = keyword_bitmaps.get(term)
            if bitmap is None:
                bitmap = keyword_bitmaps[term] = np.zeros(total, dtype=bool)
            bitmap[position] = 1

    chunk_embeddings = get_chunk_embeddings(chunks)
    all_keywords = set().union(*chunk_keywords)
    return chunks, chunk_embeddings, keyword_bitmaps, chunk_keywords, all_keywords
138
+
139
def search_relevant_chunks(query, chunks, chunk_embeddings, keyword_bitmaps, chunk_keywords, all_keywords, top_k=5):
    """Return up to ``top_k`` chunks relevant to the query.

    Chunks sharing at least one (spell-corrected) query keyword come first,
    ordered by the number of shared keywords. If fewer than ``top_k`` chunks
    match, the remaining slots are filled with the unmatched chunks whose
    embeddings are most similar to the query embedding.

    Args:
        query: The user's question.
        chunks: Chunk strings of one document.
        chunk_embeddings: One embedding per chunk, indexable by chunk position.
        keyword_bitmaps: Mapping keyword -> boolean array over chunk positions.
        chunk_keywords: One keyword set per chunk.
        all_keywords: Vocabulary used to correct misspelled query keywords.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings, best match first.
    """
    if top_k <= 0:
        return []

    query_keywords = extract_keywords(query)
    corrected_query_keywords = correct_keywords(query_keywords, all_keywords)

    # OR together the bitmaps of every query keyword known to the index.
    matched_bitmap = np.zeros(len(chunks), dtype=bool)
    for keyword in corrected_query_keywords:
        bitmap = keyword_bitmaps.get(keyword)
        if bitmap is not None:
            matched_bitmap |= bitmap

    # flatnonzero yields ascending positions; sorted() is stable, so chunks
    # with equal scores keep document order and the ranking is deterministic.
    matched_indices = [int(idx) for idx in np.flatnonzero(matched_bitmap)]
    overlap_scores = {
        idx: len(corrected_query_keywords & chunk_keywords[idx]) for idx in matched_indices
    }
    ranked = sorted(matched_indices, key=overlap_scores.get, reverse=True)

    if len(ranked) >= top_k:
        return [chunks[idx] for idx in ranked[:top_k]]

    matched_set = set(ranked)
    unmatched = [idx for idx in range(len(chunks)) if idx not in matched_set]
    keyword_hits = [chunks[idx] for idx in ranked]
    if not unmatched:
        return keyword_hits

    # Only the unmatched chunks compete for the remaining slots, so only
    # their similarities are computed.
    remaining_slots = top_k - len(ranked)
    query_embedding = sbert_model.encode(query, convert_to_tensor=True)
    similarity = {
        idx: util.pytorch_cos_sim(query_embedding, chunk_embeddings[idx]).item()
        for idx in unmatched
    }
    fallback = sorted(unmatched, key=similarity.get, reverse=True)[:remaining_slots]
    return keyword_hits + [chunks[idx] for idx in fallback]
162
+
163
+ # Mistral API query (unchanged)
164
def query_mistral(prompt, MISTRAL_API_KEY):
    """Send a single-turn chat request to the Mistral API and return the reply text.

    The request is made in-process over HTTPS, so the API key never appears
    in a child process's command line, and a timeout bounds how long the app
    can hang on the call.

    Args:
        prompt: The user message to send.
        MISTRAL_API_KEY: Bearer token for the Mistral API.

    Returns:
        The content of the first choice on success. On failure a
        human-readable string starting with "Error" is returned instead of
        raising, so the caller can display it directly.
    """
    payload = {"model": "mistral-large-latest", "messages": [{"role": "user", "content": prompt}]}
    request = urllib.request.Request(
        "https://api.mistral.ai/v1/chat/completions",
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {MISTRAL_API_KEY}",
            # Explicit agent string rather than urllib's default one.
            "User-Agent": "pdf-query-system/1.0",
        },
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=60) as reply:
            raw_body = reply.read()
    except urllib.error.HTTPError as err:
        # Surface the HTTP status and the server's explanation instead of
        # reporting a misleading parse failure.
        detail = err.read().decode("utf-8", errors="replace")
        return f"Error: HTTP {err.code} {detail}"
    except OSError as err:  # URLError, timeouts and socket errors
        return f"Error: {err}"

    try:
        response_json = json.loads(raw_body.decode("utf-8"))
        return response_json['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError covers both JSONDecodeError and a non-UTF-8 body.
        return "Error parsing the LLM response."
181
+
182
+ # Streamlit app
183
st.title("PDF Query System")
st.write("Upload PDFs and ask questions about their content.")

# File uploader for multiple PDFs
uploaded_files = st.file_uploader("Upload PDF files", type="pdf", accept_multiple_files=True)

# Processed PDFs are kept in session state so they survive Streamlit reruns.
if 'processed_pdfs' not in st.session_state:
    st.session_state.processed_pdfs = {}

# Index every uploaded PDF that has not been processed yet (keyed by file name).
for pdf_file in uploaded_files or []:
    if pdf_file.name in st.session_state.processed_pdfs:
        continue
    with st.spinner(f"Processing {pdf_file.name}..."):
        start_time = time.time()
        chunks, chunk_embeddings, keyword_bitmaps, chunk_keywords, all_keywords = process_pdf(pdf_file)
        st.session_state.processed_pdfs[pdf_file.name] = {
            "chunks": chunks,
            "chunk_embeddings": chunk_embeddings,
            "keyword_bitmaps": keyword_bitmaps,
            "chunk_keywords": chunk_keywords,
            "all_keywords": all_keywords,
        }
        end_time = time.time()
        st.success(f"Processed {pdf_file.name} in {end_time - start_time:.4f} seconds")

# Query input
query = st.text_input("Enter your query:")

# The API key is read from the environment so the secret is not stored in the
# source file. NOTE(review): the key previously committed here should be revoked.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")

# Create the button exactly once per run: two st.button("Search") calls with
# the same label and no key would register duplicate widgets.
search_clicked = st.button("Search")

if search_clicked:
    if not st.session_state.processed_pdfs:
        st.warning("Please upload at least one PDF before searching.")
    elif not query:
        st.warning("Please enter a query before searching.")
    elif not MISTRAL_API_KEY:
        st.error("The MISTRAL_API_KEY environment variable is not set.")
    else:
        with st.spinner("Searching..."):
            all_relevant_chunks = []
            for pdf_name, data in st.session_state.processed_pdfs.items():
                start_search = time.time()
                relevant_chunks = search_relevant_chunks(
                    query, data["chunks"], data["chunk_embeddings"],
                    data["keyword_bitmaps"], data["chunk_keywords"], data["all_keywords"]
                )
                end_search = time.time()
                all_relevant_chunks.extend(relevant_chunks)
                st.write(f"Search time for {pdf_name}: {end_search - start_search:.4f} seconds")

            context = "\n".join(all_relevant_chunks)
            start_response_time = time.time()
            llm_prompt = f"Only Based on the following context, answer the query:\n{context}\n\nQuery: {query}"
            response = query_mistral(llm_prompt, MISTRAL_API_KEY)
            end_response_time = time.time()

            st.subheader("Response:")
            st.write(response)
            st.write(f"Response time: {end_response_time - start_response_time:.4f} seconds")