Spaces:

ajoy0071998
/

PDF_Query_System

Running

App Files Files Community

ajoy0071998 committed on Mar 19

Commit

33889d7

verified ·

1 Parent(s): 08219dd

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -6

app.py CHANGED Viewed

@@ -10,18 +10,28 @@ from summa import keywords
 from nltk.tokenize import sent_tokenize, word_tokenize
 from sentence_transformers import SentenceTransformer, util
 import time
 # Download required NLTK data
 nltk.download('punkt_tab', quiet=True)
 nltk.download('stopwords', quiet=True)
-# Load models
-nlp = spacy.load("en_core_web_sm")
 sbert_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
 CHARS_TO_REMOVE = "(){},;-'\":‘’“”"
-# Text processing functions (unchanged from your code)
 def clean_text(text):
     text = "".join(char if char not in CHARS_TO_REMOVE else " " for char in text)
     text = re.sub(r'\s+', ' ', text).strip()
@@ -115,7 +125,7 @@ def correct_keywords(query_keywords, stored_keywords, threshold=2):
                 corrected_keywords.add(qk)
     return corrected_keywords
-# Bit Vector-based search and retrieval (adapted for multiple PDFs)
 def process_pdf(pdf_file):
     text = extract_text_from_pdf(pdf_file)
     text = clean_text(text)
@@ -210,8 +220,8 @@ if uploaded_files:
 # Query input
 query = st.text_input("Enter your query:")
-# Mistral API key (you may want to secure this differently in production)
-MISTRAL_API_KEY = "S3vzsvK7rP5in24joHgL55dVCjqYSi1F"
 if st.button("Search") and query and st.session_state.processed_pdfs:
     with st.spinner("Searching..."):

 from nltk.tokenize import sent_tokenize, word_tokenize
 from sentence_transformers import SentenceTransformer, util
 import time
+import os
 # Download required NLTK data
 nltk.download('punkt_tab', quiet=True)
 nltk.download('stopwords', quiet=True)
+# Load SpaCy model with a check to download if missing
+def load_spacy_model():
+    model_name = "en_core_web_sm"
+    try:
+        return spacy.load(model_name)
+    except OSError:
+        st.warning(f"Model '{model_name}' not found. Downloading now...")
+        subprocess.run(["python", "-m", "spacy", "download", model_name], check=True)
+        return spacy.load(model_name)
+nlp = load_spacy_model()
 sbert_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
 CHARS_TO_REMOVE = "(){},;-'\":‘’“”"
+# Text processing functions (unchanged)
 def clean_text(text):
     text = "".join(char if char not in CHARS_TO_REMOVE else " " for char in text)
     text = re.sub(r'\s+', ' ', text).strip()
                 corrected_keywords.add(qk)
     return corrected_keywords
+# Bit Vector-based search and retrieval (unchanged)
 def process_pdf(pdf_file):
     text = extract_text_from_pdf(pdf_file)
     text = clean_text(text)
 # Query input
 query = st.text_input("Enter your query:")
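+# Mistral API key: read from the MISTRAL_API_KEY environment variable.
+# NOTE(review): the hard-coded fallback below still commits the secret to the repository; remove the fallback and rotate the key.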
+MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY", "S3vzsvK7rP5in24joHgL55dVCjqYSi1F")  # Fallback for local testing
 if st.button("Search") and query and st.session_state.processed_pdfs:
     with st.spinner("Searching..."):