Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel committed on Jun 7, 2024

Commit

834c71a

verified ·

1 Parent(s): 0b59402

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -107

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import os
-import io
 import PyPDF2
-import gradio as gr
 from docx import Document
 from sentence_transformers import SentenceTransformer
 from langchain_community.vectorstores import FAISS
@@ -9,16 +10,32 @@ from langchain_community.embeddings import HuggingFaceEmbeddings
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from nltk.tokenize import sent_tokenize
 import torch
 import pickle
 import nltk
-import faiss
-import numpy as np
-# Ensure NLTK resources are downloaded
-try:
-    nltk.data.find('tokenizers/punkt')
-except LookupError:
-    nltk.download('punkt')
 # Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -40,133 +57,69 @@ retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
 hf_embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
 # Load or create FAISS index
-index_path = "faiss_index.index"
 if os.path.exists(index_path):
-    faiss_index = faiss.read_index(index_path)
-    print("Loaded FAISS index from faiss_index.index")
 else:
-    # Create a new FAISS index
-    d = embedding_model.get_sentence_embedding_dimension()  # Dimension of the embeddings
-    faiss_index = faiss.IndexFlatL2(d)  # Using IndexFlatL2 for simplicity
-state = {
-    "conversation": [],
-    "sentences": []
-}
-def extract_text_from_pdf(file):
-    text = ""
-    try:
-        pdf_data = file.read()
-        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
-        pdf_pages = pdf_reader.pages
-        text = "\n\n".join(page.extract_text() for page in pdf_pages)
-    except Exception as e:
-        raise RuntimeError(f"Error extracting text from PDF: {e}")
-    return text
-def extract_text_from_docx(file):
-    text = ""
-    try:
-        doc = Document(file)
-        text = "\n".join([para.text for para in doc.paragraphs])
-    except Exception as e:
-        raise RuntimeError(f"Error extracting text from DOCX: {e}")
-    return text
 def preprocess_text(text):
     sentences = sent_tokenize(text)
     return sentences
 def upload_files(files):
-    global state, faiss_index
     try:
         for file in files:
             if file.name.endswith('.pdf'):
-                text = extract_text_from_pdf(file)
             elif file.name.endswith('.docx'):
-                text = extract_text_from_docx(file)
             else:
-                return {"error": f"Unsupported file format: {file.name}"}
             sentences = preprocess_text(text)
             embeddings = embedding_model.encode(sentences)
-            faiss_index.add(np.array(embeddings).astype(np.float32))  # Add embeddings
-            state["sentences"].extend(sentences)
         # Save the updated index
-        faiss.write_index(faiss_index, index_path)
         return {"message": "Files processed successfully"}
     except Exception as e:
         print(f"Error processing files: {e}")
-        return {"error": str(e)}
-def process_and_query(question):
-    global state, faiss_index
-    if not question:
-        return {"error": "No question provided"}
-    try:
         question_embedding = embedding_model.encode([question])
-        # Perform FAISS search
-        D, I = faiss_index.search(np.array(question_embedding).astype(np.float32), k=5)
-        retrieved_results = [state["sentences"][i] for i in I[0] if i != -1]  # Ensure valid indices
-        # Generate response based on retrieved results
-        context = " ".join(retrieved_results)
-        # Enhanced prompt template
         prompt_template = """
         Answer the question as detailed as possible from the provided context,
         make sure to provide all the details, if the answer is not in
         provided context just say, "answer is not available in the context",
         don't provide the wrong answer
-        Context:
-        {context}
-        Question:
-        {question}
-        Answer:
-        --------------------------------------------------
-        Prompt Suggestions:
-        1. Summarize the primary theme of the context.
-        2. Elaborate on the crucial concepts highlighted in the context.
-        3. Pinpoint any supporting details or examples pertinent to the question.
-        4. Examine any recurring themes or patterns relevant to the question within the context.
-        5. Contrast differing viewpoints or elements mentioned in the context.
-        6. Explore the potential implications or outcomes of the information provided.
-        7. Assess the trustworthiness and validity of the information given.
-        8. Propose recommendations or advice based on the presented information.
-        9. Forecast likely future events or results stemming from the context.
-        10. Expand on the context or background information pertinent to the question.
-        11. Define any specialized terms or technical language used within the context.
-        12. Analyze any visual representations like charts or graphs in the context.
-        13. Highlight any restrictions or important considerations when responding to the question.
-        14. Examine any presuppositions or biases evident within the context.
-        15. Present alternate interpretations or viewpoints regarding the information provided.
-        16. Reflect on any moral or ethical issues raised by the context.
-        17. Investigate any cause-and-effect relationships identified in the context.
-        18. Uncover any questions or areas requiring further exploration.
-        19. Resolve any vague or conflicting information in the context.
-        20. Cite case studies or examples that demonstrate the concepts discussed in the context.
-        --------------------------------------------------
-        Context:
-        {context}
-        Question:
-        {question}
         Answer:
         """
-        combined_input = prompt_template.format(context=context, question=question)
-        inputs = generator_tokenizer(combined_input, return_tensors="pt", max_length=512, truncation=True)
         with torch.no_grad():
             generator_outputs = generator.generate(**inputs)
             generated_text = generator_tokenizer.decode(generator_outputs[0], skip_special_tokens=True)
@@ -176,9 +129,7 @@ def process_and_query(question):
         return {"message": generated_text, "conversation": state["conversation"]}
-    except Exception as e:
-        print(f"Error processing query: {e}")
-        return {"error": str(e)}
 # Create Gradio interface
 with gr.Blocks() as demo:
@@ -188,12 +139,13 @@ with gr.Blocks() as demo:
         upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
         upload_button = gr.Button("Upload")
         upload_output = gr.Textbox()
-        upload_button.click(fn=upload_files, inputs=[upload], outputs=upload_output)
     with gr.Tab("Query"):
         query = gr.Textbox(label="Enter your query")
         query_button = gr.Button("Search")
         query_output = gr.Textbox()
-        query_button.click(fn=process_and_query, inputs=[query], outputs=query_output)
 demo.launch()

 import os
+import faiss
+import numpy as np
 import PyPDF2
+import io
 from docx import Document
 from sentence_transformers import SentenceTransformer
 from langchain_community.vectorstores import FAISS
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from nltk.tokenize import sent_tokenize
 import torch
+import gradio as gr
 import pickle
 import nltk
+nltk.download('punkt')
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF supplied as raw bytes.

    Args:
        pdf_file: The complete PDF document as a bytes-like object.

    Returns:
        The page texts joined by newlines. Extraction is best-effort: if the
        reader fails, the error is printed and whatever was collected up to
        that point is returned (an empty string if nothing was read).
    """
    page_texts = []
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_file))
        for page in pdf_reader.pages:
            # `or ""` keeps a page that yields no text (e.g. an image-only
            # scan) from aborting the remaining pages.
            page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
    # Separate pages explicitly so the last word of one page is not fused
    # with the first word of the next before sentence tokenization.
    return "\n".join(page_texts)
# Function to extract text from a Word document
def extract_text_from_docx(docx_file):
    """Return the paragraph text of a DOCX document supplied as raw bytes.

    Paragraph texts are joined with newlines. Any failure while opening or
    reading the document is printed and results in an empty string.
    """
    try:
        stream = io.BytesIO(docx_file)
        paragraphs = Document(stream).paragraphs
        return "\n".join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        print(f"Error extracting text from DOCX: {e}")
        return ""
 # Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 hf_embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
 # Load or create FAISS index
+index_path = "faiss_index.pkl"
 if os.path.exists(index_path):
+    with open(index_path, "rb") as f:
+        faiss_index = pickle.load(f)
+        print("Loaded FAISS index from faiss_index.pkl")
 else:
+    faiss_index = FAISS(embedding_function=hf_embeddings)
 def preprocess_text(text):
     sentences = sent_tokenize(text)
     return sentences
 def upload_files(files):
+    global faiss_index
     try:
         for file in files:
+            file_data = file.read()
             if file.name.endswith('.pdf'):
+                text = extract_text_from_pdf(file_data)
             elif file.name.endswith('.docx'):
+                text = extract_text_from_docx(file_data)
             else:
+                return {"error": "Unsupported file format"}
+            # Preprocess text
             sentences = preprocess_text(text)
+            # Encode sentences and add to FAISS index
             embeddings = embedding_model.encode(sentences)
+            for embedding in embeddings:
+                faiss_index.add(np.expand_dims(embedding, axis=0))
         # Save the updated index
+        with open(index_path, "wb") as f:
+            pickle.dump(faiss_index, f)
         return {"message": "Files processed successfully"}
     except Exception as e:
         print(f"Error processing files: {e}")
+        return {"error": str(e)}  # Provide informative error message
+def process_and_query(state, question):
+    if question:
+        # Preprocess the question
         question_embedding = embedding_model.encode([question])
+        # Search the FAISS index for similar passages
+        D, I = faiss_index.search(np.array(question_embedding), k=5)
+        retrieved_passages = [faiss_index.index_to_text(i) for i in I[0]]
+        # Use generator model to generate response based on question and retrieved passages
         prompt_template = """
         Answer the question as detailed as possible from the provided context,
         make sure to provide all the details, if the answer is not in
         provided context just say, "answer is not available in the context",
         don't provide the wrong answer
+        Context:\n{context}\n
+        Question:\n{question}\n
         Answer:
         """
+        combined_input = prompt_template.format(context=' '.join(retrieved_passages), question=question)
+        inputs = generator_tokenizer(combined_input, return_tensors="pt")
         with torch.no_grad():
             generator_outputs = generator.generate(**inputs)
             generated_text = generator_tokenizer.decode(generator_outputs[0], skip_special_tokens=True)
         return {"message": generated_text, "conversation": state["conversation"]}
+    return {"error": "No question provided"}
 # Create Gradio interface
 with gr.Blocks() as demo:
         upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
         upload_button = gr.Button("Upload")
         upload_output = gr.Textbox()
+        upload_button.click(fn=upload_files, inputs=upload, outputs=upload_output)
     with gr.Tab("Query"):
+        state = gr.State(initial_value={"conversation": []})
         query = gr.Textbox(label="Enter your query")
         query_button = gr.Button("Search")
         query_output = gr.Textbox()
+        query_button.click(fn=process_and_query, inputs=[state, query], outputs=query_output)
 demo.launch()