Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed on Apr 19

Commit

24d9947

verified ·

1 Parent(s): b2bfd05

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -102

app.py CHANGED Viewed

@@ -1,156 +1,152 @@
 import os
-import fitz
-from docx import Document
-from sentence_transformers import SentenceTransformer
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
-import faiss
-import numpy as np
 import pickle
 import gradio as gr
 from typing import List
-from langchain_community.llms import HuggingFaceEndpoint
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
-# Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
     text = ""
     try:
         doc = fitz.open(pdf_path)
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
             text += page.get_text()
     except Exception as e:
-        print(f"Error extracting text from PDF: {e}")
     return text
-# Function to extract text from a Word document
 def extract_text_from_docx(docx_path):
     text = ""
     try:
         doc = Document(docx_path)
         text = "\n".join([para.text for para in doc.paragraphs])
     except Exception as e:
-        print(f"Error extracting text from DOCX: {e}")
     return text
-# Initialize the embedding model
-embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-# Hugging Face API token
-api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
-if not api_token:
-    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
-# Initialize RAG models from Hugging Face
-generator_model_name = "facebook/bart-base"
-retriever_model_name = "facebook/bart-base"
-generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
-generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
-retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
-retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
-# Initialize the HuggingFace LLM
-llm = HuggingFaceEndpoint(
-    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
-    model_kwargs={"api_key": api_token}
-)
-# Initialize the HuggingFace embeddings
-embedding = HuggingFaceEmbeddings()
-# FAISS index and storage paths
-index_path = "faiss_index.pkl"
-document_texts_path = "document_texts.pkl"
-document_texts = []
-# Load or create FAISS index using cosine similarity (Inner Product + Normalized vectors)
-if os.path.exists(index_path) and os.path.exists(document_texts_path):
-    try:
-        with open(index_path, "rb") as f:
-            index = pickle.load(f)
-            print("Loaded FAISS index from faiss_index.pkl")
-        with open(document_texts_path, "rb") as f:
-            document_texts = pickle.load(f)
-            print("Loaded document texts from document_texts.pkl")
-    except Exception as e:
-        print(f"Error loading FAISS index or document texts: {e}")
-else:
-    index = faiss.IndexFlatIP(embedding_model.get_sentence_embedding_dimension())
-    with open(index_path, "wb") as f:
-        pickle.dump(index, f)
-        print("Created new FAISS index and saved to faiss_index.pkl")
 def upload_files(files):
     global index, document_texts
     try:
         for file in files:
             file_path = file.name
-            file.save(file_path)  # Saving file in Hugging Face space
             if file_path.endswith('.pdf'):
                 text = extract_text_from_pdf(file_path)
             elif file_path.endswith('.docx'):
                 text = extract_text_from_docx(file_path)
             else:
-                return "Unsupported file format"
-            print(f"Extracted text: {text[:100]}...")
-            sentences = text.split("\n")
-            embeddings = embedding_model.encode(sentences, normalize_embeddings=True)  # Cosine similarity step
-            print(f"Embeddings shape: {embeddings.shape}")
             index.add(np.array(embeddings))
             document_texts.extend(sentences)
-        # Save updated index and texts
         with open(index_path, "wb") as f:
             pickle.dump(index, f)
-            print("Saved updated FAISS index to faiss_index.pkl")
         with open(document_texts_path, "wb") as f:
             pickle.dump(document_texts, f)
-            print("Saved updated document texts to document_texts.pkl")
-        return "Files processed successfully"
     except Exception as e:
-        print(f"Error processing files: {e}")
-        return f"Error processing files: {e}"
-def query_text(text):
     try:
-        print(f"Query text: {text}")
-        query_embedding = embedding_model.encode([text], normalize_embeddings=True)  # Cosine similarity step
-        print(f"Query embedding shape: {query_embedding.shape}")
-        D, I = index.search(np.array(query_embedding), k=5)
-        print(f"Distances: {D}, Indices: {I}")
-        top_documents = []
         for idx in I[0]:
-            if idx != -1 and idx < len(document_texts):
-                top_documents.append(document_texts[idx])
-            else:
-                print(f"Invalid index found: {idx}")
-        return "\n\n".join(top_documents)
     except Exception as e:
-        print(f"Error querying text: {e}")
-        return f"Error querying text: {e}"
-# Gradio Interface
 with gr.Blocks() as demo:
-    gr.Markdown("## Document Upload and Query System with Cosine Similarity")
-    with gr.Tab("Upload Files"):
-        upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
-        upload_button = gr.Button("Upload")
         upload_output = gr.Textbox()
-        upload_button.click(fn=upload_files, inputs=upload, outputs=upload_output)
-    with gr.Tab("Query"):
-        query = gr.Textbox(label="Enter your query")
-        query_button = gr.Button("Search")
-        query_output = gr.Textbox()
-        query_button.click(fn=query_text, inputs=query, outputs=query_output)
 demo.launch()

 import os
+import sys
 import pickle
+import numpy as np
 import gradio as gr
 from typing import List
+import fitz  # PyMuPDF
+from docx import Document
+from transformers import AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM
+import faiss
+# =============================================
+# FIX FOR HUGGINGFACE HUB IMPORT ISSUE
+# =============================================
+try:
+    from huggingface_hub import cached_download
+except ImportError:
+    from huggingface_hub.utils import cached_download
+    import huggingface_hub
+    sys.modules['huggingface_hub'].cached_download = cached_download
+# Now we can safely import sentence-transformers
+from sentence_transformers import SentenceTransformer
# =============================================
# INITIALIZE MODELS
# =============================================
# Prefer SentenceTransformer; if it cannot be loaded, fall back to the raw
# transformers checkpoint of the same model and pool token states manually.
try:
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
except Exception as e:
    print(f"Failed to load SentenceTransformer, falling back to direct transformers: {e}")
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    embedding_model = AutoModel.from_pretrained(model_name)

    def get_embeddings(texts):
        """Embed *texts* with the fallback transformers model.

        Token states are mean-pooled using the attention mask, so padding
        tokens do not dilute the average, and each vector is L2-normalised.
        Normalisation matters because the FAISS index is an inner-product
        index: only unit vectors make its scores cosine similarities, which
        is also what the SentenceTransformer path produces via
        ``normalize_embeddings=True``.

        Args:
            texts: list of strings to embed.

        Returns:
            A float32 NumPy array with one unit-length row per input text.
        """
        import torch  # local import: only the fallback path needs it directly

        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            hidden = embedding_model(**inputs).last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        # clamp guards against division by zero for an all-padding row.
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
        vectors = pooled.numpy().astype("float32")
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / np.maximum(norms, 1e-12)
# Initialize FAISS index
index_path = "faiss_index.pkl"
document_texts_path = "document_texts.pkl"
EMBEDDING_DIM = 384  # 384 is dim for all-MiniLM-L6-v2
document_texts = []
index = None
if os.path.exists(index_path) and os.path.exists(document_texts_path):
    try:
        # NOTE(security): pickle.load can execute arbitrary code. These files
        # must only ever be ones this app wrote itself, never user uploads.
        with open(index_path, "rb") as f:
            index = pickle.load(f)
        with open(document_texts_path, "rb") as f:
            document_texts = pickle.load(f)
        # Row i of the index must correspond to document_texts[i]; the two
        # files are written separately, so an interrupted save can leave them
        # out of step. Treat that as a failed load.
        if index.ntotal != len(document_texts):
            raise ValueError(
                f"index has {index.ntotal} vectors but "
                f"{len(document_texts)} texts were stored"
            )
    except Exception as e:
        print(f"Error loading FAISS index: {e}")
        index = None
if index is None:
    # Start from a clean, mutually consistent state.
    document_texts = []
    index = faiss.IndexFlatIP(EMBEDDING_DIM)
# =============================================
# DOCUMENT PROCESSING FUNCTIONS
# =============================================
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at *pdf_path*, concatenated.

    Best-effort: if opening or reading fails, the error is printed and
    whatever text was collected before the failure (possibly "") is returned.
    """
    page_texts = []
    try:
        # The context manager closes the document handle even if a page fails.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                page_texts.append(page.get_text())
    except Exception as e:
        print(f"PDF error: {e}")
    return "".join(page_texts)


def extract_text_from_docx(docx_path):
    """Return the paragraphs of the DOCX at *docx_path*, joined by newlines.

    Best-effort: on any failure the error is printed and "" is returned.
    """
    try:
        doc = Document(docx_path)
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        print(f"DOCX error: {e}")
        return ""
# =============================================
# CORE FUNCTIONALITY
# =============================================
def upload_files(files):
    """Extract, embed and index the text of the uploaded PDF/DOCX files.

    Each supported file is split into non-blank lines; every line is embedded
    and appended to the FAISS index and to ``document_texts`` (same order, so
    index row ids keep matching list positions). The index and the texts are
    pickled to disk when at least one line was added.

    Args:
        files: iterable of uploaded files, each either a path string or an
            object exposing the path as ``.name``; may be ``None`` or empty.

    Returns:
        A status message: the number of supported files processed and the
        total number of lines added, or an ``"Error: ..."`` string on failure.
    """
    if not files:
        return "No files provided"
    try:
        processed = 0
        added = 0
        for file in files:
            # Accept both plain path strings and objects carrying `.name`.
            file_path = getattr(file, "name", file)
            lowered = file_path.lower()  # match ".PDF" / ".Docx" as well
            if lowered.endswith('.pdf'):
                text = extract_text_from_pdf(file_path)
            elif lowered.endswith('.docx'):
                text = extract_text_from_docx(file_path)
            else:
                continue
            processed += 1
            sentences = [s for s in text.split("\n") if s.strip()]
            if not sentences:
                # Nothing extractable; skip encoding and adding an empty batch.
                continue
            if hasattr(embedding_model, 'encode'):
                embeddings = embedding_model.encode(sentences, normalize_embeddings=True)
            else:
                embeddings = get_embeddings(sentences)
            # FAISS expects float32 input.
            index.add(np.asarray(embeddings, dtype="float32"))
            document_texts.extend(sentences)
            added += len(sentences)
        if added:
            # Save updated index
            with open(index_path, "wb") as f:
                pickle.dump(index, f)
            with open(document_texts_path, "wb") as f:
                pickle.dump(document_texts, f)
        return f"Processed {processed} files, added {added} sentences"
    except Exception as e:
        return f"Error: {str(e)}"
def query_text(query, top_k=3):
    """Return the indexed text lines most similar to *query*.

    Args:
        query: free-text search string.
        top_k: maximum number of matches to return (defaults to 3).

    Returns:
        The matching lines joined by a ``---`` separator, "No matches found"
        when the query is blank or nothing matches, or a ``"Query error: ..."``
        string if embedding or searching fails.
    """
    # A blank query cannot match anything meaningful; skip the model call.
    if not query or not query.strip():
        return "No matches found"
    try:
        if hasattr(embedding_model, 'encode'):
            query_embedding = embedding_model.encode([query], normalize_embeddings=True)
        else:
            query_embedding = get_embeddings([query])
        # FAISS expects float32 input; distances are not needed here.
        _, neighbor_ids = index.search(np.asarray(query_embedding, dtype="float32"), k=top_k)
        # FAISS pads with -1 when fewer than top_k vectors exist; the range
        # check drops those and any id without a stored text.
        results = [
            document_texts[idx]
            for idx in neighbor_ids[0]
            if 0 <= idx < len(document_texts)
        ]
        return "\n\n---\n\n".join(results) if results else "No matches found"
    except Exception as e:
        return f"Query error: {str(e)}"
# =============================================
# GRADIO INTERFACE
# =============================================
# Two tabs: one to upload and index documents, one to search them.
with gr.Blocks() as demo:
    gr.Markdown("## Document Search with Semantic Similarity")
    with gr.Tab("Upload Documents"):
        file_input = gr.File(file_count="multiple", file_types=[".pdf", ".docx"])
        upload_btn = gr.Button("Process Files")
        upload_output = gr.Textbox()
    with gr.Tab("Search"):
        query_input = gr.Textbox(label="Enter your query")
        search_btn = gr.Button("Search")
        results_output = gr.Textbox()
    # Wire the buttons to the core functions defined in this file.
    upload_btn.click(upload_files, inputs=file_input, outputs=upload_output)
    search_btn.click(query_text, inputs=query_input, outputs=results_output)

# Start the server only when executed as a script, so importing this module
# (e.g. for testing) does not launch a blocking web server.
if __name__ == "__main__":
    demo.launch()