Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -10,18 +10,28 @@ from summa import keywords
|
|
10 |
from nltk.tokenize import sent_tokenize, word_tokenize
|
11 |
from sentence_transformers import SentenceTransformer, util
|
12 |
import time
|
|
|
13 |
|
14 |
# Download required NLTK data
|
15 |
nltk.download('punkt_tab', quiet=True)
|
16 |
nltk.download('stopwords', quiet=True)
|
17 |
|
18 |
-
# Load
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
sbert_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
21 |
|
22 |
CHARS_TO_REMOVE = "(){},;-'\":ββββ"
|
23 |
|
24 |
-
# Text processing functions (unchanged
|
25 |
def clean_text(text):
|
26 |
text = "".join(char if char not in CHARS_TO_REMOVE else " " for char in text)
|
27 |
text = re.sub(r'\s+', ' ', text).strip()
|
@@ -115,7 +125,7 @@ def correct_keywords(query_keywords, stored_keywords, threshold=2):
|
|
115 |
corrected_keywords.add(qk)
|
116 |
return corrected_keywords
|
117 |
|
118 |
-
# Bit Vector-based search and retrieval (
|
119 |
def process_pdf(pdf_file):
|
120 |
text = extract_text_from_pdf(pdf_file)
|
121 |
text = clean_text(text)
|
@@ -210,8 +220,8 @@ if uploaded_files:
|
|
210 |
# Query input
|
211 |
query = st.text_input("Enter your query:")
|
212 |
|
213 |
-
# Mistral API key
|
214 |
-
MISTRAL_API_KEY = "S3vzsvK7rP5in24joHgL55dVCjqYSi1F"
|
215 |
|
216 |
if st.button("Search") and query and st.session_state.processed_pdfs:
|
217 |
with st.spinner("Searching..."):
|
|
|
10 |
from nltk.tokenize import sent_tokenize, word_tokenize
|
11 |
from sentence_transformers import SentenceTransformer, util
|
12 |
import time
|
13 |
+
import os
|
14 |
|
15 |
# Download required NLTK data
|
16 |
nltk.download('punkt_tab', quiet=True)
|
17 |
nltk.download('stopwords', quiet=True)
|
18 |
|
19 |
+
# Load the spaCy model, downloading it first if it is not installed
|
20 |
+
def load_spacy_model():
    """Return the ``en_core_web_sm`` spaCy pipeline, installing it if needed.

    The model is loaded with ``spacy.load``. If that raises ``OSError``
    (treated here as "model not installed"), a Streamlit warning is shown,
    the model is fetched with ``<python> -m spacy download`` and the load
    is attempted once more.

    Raises:
        subprocess.CalledProcessError: If the download command exits with a
            non-zero status.
        OSError: If the model still cannot be loaded after the download.
    """
    # Imported locally so this function does not rely on module-level
    # imports that are outside the visible part of the file.
    import subprocess
    import sys

    model_name = "en_core_web_sm"
    try:
        return spacy.load(model_name)
    except OSError:
        st.warning(f"Model '{model_name}' not found. Downloading now...")
        # Use the interpreter that is running this app: a bare "python" may
        # resolve to a different environment (or not exist on PATH), which
        # would install the model somewhere this process cannot see.
        subprocess.run(
            [sys.executable, "-m", "spacy", "download", model_name],
            check=True,
        )
        return spacy.load(model_name)
|
28 |
+
|
29 |
+
nlp = load_spacy_model()
|
30 |
sbert_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
|
31 |
|
32 |
# Characters that clean_text replaces with a single space.
# NOTE(review): the trailing "ββββ" looks like mis-decoded typographic
# punctuation (e.g. curly quotes or dashes) — confirm against the original
# file's encoding before relying on it.
CHARS_TO_REMOVE = "(){},;-'\":ββββ"
|
33 |
|
34 |
+
# Text processing functions (unchanged)
|
35 |
def clean_text(text):
|
36 |
text = "".join(char if char not in CHARS_TO_REMOVE else " " for char in text)
|
37 |
text = re.sub(r'\s+', ' ', text).strip()
|
|
|
125 |
corrected_keywords.add(qk)
|
126 |
return corrected_keywords
|
127 |
|
128 |
+
# Bit Vector-based search and retrieval (unchanged)
|
129 |
def process_pdf(pdf_file):
|
130 |
text = extract_text_from_pdf(pdf_file)
|
131 |
text = clean_text(text)
|
|
|
220 |
# Query input
|
221 |
query = st.text_input("Enter your query:")
|
222 |
|
223 |
+
# Mistral API key from environment variable (recommended for security)
|
224 |
+
# Read the key from the environment only. A literal fallback would ship the
# credential with the source, defeating the purpose of using an environment
# variable. When the variable is unset the key is an empty string, so it
# must be configured wherever the app runs.
# NOTE: the key that was previously committed here should be treated as
# compromised and rotated.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "")
|
225 |
|
226 |
if st.button("Search") and query and st.session_state.processed_pdfs:
|
227 |
with st.spinner("Searching..."):
|