Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed on Jun 4, 2024

Commit

3a0b46d

verified ·

1 Parent(s): 85a9507

Update app.py

Browse files

Files changed (1) hide show

app.py +55 -32

app.py CHANGED Viewed

@@ -10,8 +10,6 @@ from typing import List
 from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
-from nltk.tokenize import sent_tokenize  # Import for sentence segmentation
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 # Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
@@ -25,9 +23,8 @@ def extract_text_from_pdf(pdf_path):
         print(f"Error extracting text from PDF: {e}")
     return text
-# Function to extract text from a Word document
 def extract_text_from_docx(docx_path):
-    """Extracts text from a Word document."""
     text = ""
     try:
         doc = Document(docx_path)
@@ -36,32 +33,29 @@ def extract_text_from_docx(docx_path):
         print(f"Error extracting text from DOCX: {e}")
     return text
-# Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-# Hugging Face API token
 api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 if not api_token:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
-# Define RAG models
-generator_model_name = "facebook/bart-base"
-retriever_model_name = "facebook/bart-base"
-generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
-generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
-retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
-retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
 # Load or create FAISS index
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
 if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
         with open(index_path, "rb") as f:
@@ -79,16 +73,11 @@ else:
         pickle.dump(index, f)
         print("Created new FAISS index and saved to faiss_index.pkl")
-def preprocess_text(text):
-    sentences = sent_tokenize(text)
-    return sentences
 def upload_files(files):
     global index, document_texts
     try:
-        for file_path in files:
             if file_path.endswith('.pdf'):
                 text = extract_text_from_pdf(file_path)
             elif file_path.endswith('.docx'):
@@ -96,18 +85,52 @@ def upload_files(files):
             else:
                 return "Unsupported file format"
-            # Preprocess text (call the new function)
-            sentences = preprocess_text(text)
-            # Encode sentences and add to FAISS index
             embeddings = embedding_model.encode(sentences)
             index.add(np.array(embeddings))
-        # Save the updated index and documents
         return "Files processed successfully"
     except Exception as e:
         print(f"Error processing files: {e}")

 from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 # Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
         print(f"Error extracting text from PDF: {e}")
     return text
+# Function to extract text from a Word document
 def extract_text_from_docx(docx_path):
     text = ""
     try:
         doc = Document(docx_path)
         print(f"Error extracting text from DOCX: {e}")
     return text
+# Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+# Hugging Face API token
 api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 if not api_token:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
+# Initialize the HuggingFace LLM
+llm = HuggingFaceEndpoint(
+    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
+    model_kwargs={"api_key": api_token}
+)
+# Initialize the HuggingFace embeddings
+embedding = HuggingFaceEmbeddings()
 # Load or create FAISS index
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
 if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
         with open(index_path, "rb") as f:
         pickle.dump(index, f)
         print("Created new FAISS index and saved to faiss_index.pkl")
 def upload_files(files):
     global index, document_texts
     try:
+        for file in files:
+            file_path = file.name  # Get the file path from the NamedString object
             if file_path.endswith('.pdf'):
                 text = extract_text_from_pdf(file_path)
             elif file_path.endswith('.docx'):
             else:
                 return "Unsupported file format"
+            # Process the text and update FAISS index
+            sentences = text.split("\n")
             embeddings = embedding_model.encode(sentences)
             index.add(np.array(embeddings))
+            document_texts.append(text)
+        # Save the updated index and documents
+        with open(index_path, "wb") as f:
+            pickle.dump(index, f)
+            print("Saved updated FAISS index to faiss_index.pkl")
+        with open(document_texts_path, "wb") as f:
+            pickle.dump(document_texts, f)
+            print("Saved updated document texts to document_texts.pkl")
         return "Files processed successfully"
     except Exception as e:
         print(f"Error processing files: {e}")
+        return f"Error processing files: {e}"
+def query_text(text):
+    try:
+        # Encode the query text
+        query_embedding = embedding_model.encode([text])
+        # Search the FAISS index
+        D, I = index.search(np.array(query_embedding), k=5)
+        top_documents = []
+        for idx in I[0]:
+            if idx != -1 and idx < len(document_texts):  # Ensure that a valid index is found
+                top_documents.append(document_texts[idx])
+            else:
+                print(f"Invalid index found: {idx}")
+        return top_documents
+    except Exception as e:
+        print(f"Error querying text: {e}")
+        return f"Error querying text: {e}"
+# Create Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("## Document Upload and Query System")
+    with gr.Tab("Upload Files"):
+        upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
+        upload_button = gr.Button("Upload")
+        upload_output = gr