NaimaAqeel committed (verified)
Commit 944d263 · 1 Parent(s): cf26f9f

Update app.py

Files changed (1): app.py +123 -136
app.py CHANGED
@@ -1,22 +1,42 @@
 import os
-import io
-import PyPDF2
+import fitz
 from docx import Document
-import numpy as np
-from nltk.tokenize import sent_tokenize
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+import pickle
 import gradio as gr
-import torch
+from typing import List
+from langchain_community.llms import HuggingFaceEndpoint
+from langchain_community.vectorstores import FAISS
+from langchain_community.embeddings import HuggingFaceEmbeddings
 
-# Download NLTK punkt tokenizer if not already downloaded
-import nltk
-nltk.download('punkt')
+# Function to extract text from a PDF file
+def extract_text_from_pdf(pdf_path):
+    text = ""
+    try:
+        doc = fitz.open(pdf_path)
+        for page_num in range(len(doc)):
+            page = doc.load_page(page_num)
+            text += page.get_text()
+    except Exception as e:
+        print(f"Error extracting text from PDF: {e}")
+    return text
 
-# Initialize Sentence Transformer model for embeddings
+# Function to extract text from a Word document
+def extract_text_from_docx(docx_path):
+    text = ""
+    try:
+        doc = Document(docx_path)
+        text = "\n".join([para.text for para in doc.paragraphs])
+    except Exception as e:
+        print(f"Error extracting text from DOCX: {e}")
+    return text
+
+# Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
-# Initialize Hugging Face API token
+# Hugging Face API token
 api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 if not api_token:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
@@ -27,144 +47,111 @@ retriever_model_name = "facebook/bart-base"
 generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
 generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
 retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
-retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
-
-# Initialize FAISS index using LangChain
-from langchain_community.vectorstores import FAISS
-from langchain_community.embeddings import HuggingFaceEmbeddings
+retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
 
-# Initialize Hugging Face embeddings
-hf_embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
-
-# Dummy implementations for index, docstore, and index_to_docstore_id
-# Replace with actual implementations or configurations as per LangChain documentation
-index = None
-docstore = None
-index_to_docstore_id = None
-
-# Initialize FAISS index with required parameters
-faiss_index = FAISS(
-    embedding_function=hf_embeddings,
-    index=index,
-    docstore=docstore,
-    index_to_docstore_id=index_to_docstore_id
+# Initialize the HuggingFace LLM
+llm = HuggingFaceEndpoint(
+    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
+    model_kwargs={"api_key": api_token}
 )
 
-# Function to extract text from a PDF file
-def extract_text_from_pdf(pdf_data):
-    text = ""
-    try:
-        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
-        for page in pdf_reader.pages:
-            text += page.extract_text()
-    except Exception as e:
-        print(f"Error extracting text from PDF: {e}")
-    return text
+# Initialize the HuggingFace embeddings
+embedding = HuggingFaceEmbeddings()
 
-# Function to extract text from a Word document
-def extract_text_from_docx(docx_data):
-    text = ""
+# Load or create FAISS index
+index_path = "faiss_index.pkl"
+document_texts_path = "document_texts.pkl"
+
+document_texts = []
+
+if os.path.exists(index_path) and os.path.exists(document_texts_path):
     try:
-        doc = Document(io.BytesIO(docx_data))
-        text = "\n".join([para.text for para in doc.paragraphs])
+        with open(index_path, "rb") as f:
+            index = pickle.load(f)
+        print("Loaded FAISS index from faiss_index.pkl")
+        with open(document_texts_path, "rb") as f:
+            document_texts = pickle.load(f)
+        print("Loaded document texts from document_texts.pkl")
     except Exception as e:
-        print(f"Error extracting text from DOCX: {e}")
-    return text
-
-# Function to preprocess text into sentences
-def preprocess_text(text):
-    sentences = sent_tokenize(text)
-    return sentences
+        print(f"Error loading FAISS index or document texts: {e}")
+else:
+    # Create a new FAISS index if it doesn't exist
+    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
+    with open(index_path, "wb") as f:
+        pickle.dump(index, f)
+    print("Created new FAISS index and saved to faiss_index.pkl")
 
-# Function to handle file uploads and update FAISS index
 def upload_files(files):
-    global faiss_index
+    global index, document_texts
    try:
        for file in files:
-            file_name = file.name
-            file_content = file.read()  # Get the file content as bytes
-
-            if file_name.endswith('.pdf'):
-                text = extract_text_from_pdf(file_content)
-            elif file_name.endswith('.docx'):
-                text = extract_text_from_docx(file_content)
+            file_path = file.name  # Get the file path from the NamedString object
+            if file_path.endswith('.pdf'):
+                text = extract_text_from_pdf(file_path)
+            elif file_path.endswith('.docx'):
+                text = extract_text_from_docx(file_path)
             else:
-                return {"error": "Unsupported file format"}
+                return "Unsupported file format"
 
-            # Preprocess text
-            sentences = preprocess_text(text)
+            print(f"Extracted text: {text[:100]}...")  # Debug: Show the first 100 characters of the extracted text
 
-            # Encode sentences and add to FAISS index
+            # Process the text and update FAISS index
+            sentences = text.split("\n")
             embeddings = embedding_model.encode(sentences)
-            if faiss_index is not None:
-                for embedding in embeddings:
-                    faiss_index.add(np.expand_dims(embedding, axis=0))
-
-            # Save the updated index (if needed)
-            # Add your logic here to save the FAISS index if you're using persistence
-
-            return {"message": "Files processed successfully"}
+            print(f"Embeddings shape: {embeddings.shape}")  # Debug: Show the shape of the embeddings
+            index.add(np.array(embeddings))
+            document_texts.extend(sentences)  # Store sentences for retrieval
+
+        # Save the updated index and documents
+        with open(index_path, "wb") as f:
+            pickle.dump(index, f)
+        print("Saved updated FAISS index to faiss_index.pkl")
+        with open(document_texts_path, "wb") as f:
+            pickle.dump(document_texts, f)
+        print("Saved updated document texts to document_texts.pkl")
+
+        return "Files processed successfully"
     except Exception as e:
         print(f"Error processing files: {e}")
-        return {"error": str(e)}  # Provide informative error message
-
-# Function to process queries using RAG model
-def process_and_query(state, question):
-    if question:
-        try:
-            # Search the FAISS index for similar passages
-            question_embedding = embedding_model.encode([question])
-            D, I = faiss_index.search(np.array(question_embedding), k=5)
-            retrieved_passages = [faiss_index.index_to_text(i) for i in I[0]]
-
-            # Use generator model to generate response based on question and retrieved passages
-            prompt_template = """
-            Answer the question as detailed as possible from the provided context,
-            make sure to provide all the details, if the answer is not in
-            provided context just say, "answer is not available in the context",
-            don't provide the wrong answer
-            Context:\n{context}\n
-            Question:\n{question}\n
-            Answer:
-            """
-            combined_input = prompt_template.format(context=' '.join(retrieved_passages), question=question)
-            inputs = generator_tokenizer(combined_input, return_tensors="pt")
-            with torch.no_grad():
-                generator_outputs = generator.generate(**inputs)
-            generated_text = generator_tokenizer.decode(generator_outputs[0], skip_special_tokens=True)
-
-            # Update conversation history
-            state.append({"question": question, "answer": generated_text})
-
-            return {"message": generated_text, "conversation": state}
-        except Exception as e:
-            print(f"Error processing query: {e}")
-            return {"error": str(e)}
-    else:
-        return {"error": "No question provided"}
-
-# Define the Gradio interface
-def main():
-    upload_interface = gr.Interface(
-        fn=upload_files,
-        inputs=gr.inputs.File(label="Upload PDF or DOCX files", multiple=True),
-        outputs=gr.outputs.Textbox(label="Upload Status")
-    )
-
-    query_interface = gr.Interface(
-        fn=process_and_query,
-        inputs=[gr.inputs.Textbox(label="Conversation State"), gr.inputs.Textbox(label="Enter your query")],
-        outputs=[gr.outputs.Textbox(label="Query Response"), gr.outputs.Textbox(label="Conversation State")]
-    )
-
-    gr.Interface(
-        fn=None,
-        inputs=[
-            gr.Interface.Tab("Upload Files", upload_interface),
-            gr.Interface.Tab("Query", query_interface)
-        ],
-        outputs=gr.outputs.Textbox(label="Output", default="Output will be shown here")
-    ).launch()
-
-if __name__ == "__main__":
-    main()
+        return f"Error processing files: {e}"
+
+def query_text(text):
+    try:
+        print(f"Query text: {text}")  # Debug: Show the query text
+
+        # Encode the query text
+        query_embedding = embedding_model.encode([text])
+        print(f"Query embedding shape: {query_embedding.shape}")  # Debug: Show the shape of the query embedding
+
+        # Search the FAISS index
+        D, I = index.search(np.array(query_embedding), k=5)
+        print(f"Distances: {D}, Indices: {I}")  # Debug: Show the distances and indices of the search results
+
+        top_documents = []
+        for idx in I[0]:
+            if idx != -1 and idx < len(document_texts):  # Ensure that a valid index is found
+                top_documents.append(document_texts[idx])  # Append the actual sentences for the response
+            else:
+                print(f"Invalid index found: {idx}")
+        return top_documents
+    except Exception as e:
+        print(f"Error querying text: {e}")
+        return f"Error querying text: {e}"
+
+# Create Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("## Document Upload and Query System")
+
+    with gr.Tab("Upload Files"):
+        upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
+        upload_button = gr.Button("Upload")
+        upload_output = gr.Textbox()
+        upload_button.click(fn=upload_files, inputs=upload, outputs=upload_output)
+
+    with gr.Tab("Query"):
+        query = gr.Textbox(label="Enter your query")
+        query_button = gr.Button("Search")
+        query_output = gr.Textbox()
+        query_button.click(fn=query_text, inputs=query, outputs=query_output)
+
+    demo.launch()
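
Note on the persistence scheme introduced here: the new code serializes the faiss.IndexFlatL2 object with pickle, which only works as long as the installed faiss build keeps its SWIG-wrapped indexes picklable. faiss ships its own on-disk serializers, faiss.write_index and faiss.read_index, for exactly this purpose. Below is a minimal sketch of the same load-or-create logic on top of those calls; the faiss_index.bin filename is hypothetical (the commit uses faiss_index.pkl), and this is an alternative approach, not what the commit does.

import os
import faiss

index_path = "faiss_index.bin"  # hypothetical name; the commit pickles to faiss_index.pkl
dim = 384  # output dimension of all-MiniLM-L6-v2

if os.path.exists(index_path):
    index = faiss.read_index(index_path)  # load the index saved on a previous run
else:
    index = faiss.IndexFlatL2(dim)        # exact L2 index; no training step required
    faiss.write_index(index, index_path)  # persist the empty index for the next startup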
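For reference, a self-contained sketch of the IndexFlatL2 round trip that upload_files and query_text perform, with dummy vectors standing in for sentence embeddings (384 matches all-MiniLM-L6-v2; faiss expects float32 arrays of shape (n, d)):

import faiss
import numpy as np

dim = 384
index = faiss.IndexFlatL2(dim)

# Stand-in for embedding_model.encode(sentences): ten vectors of dimension 384.
vectors = np.random.rand(10, dim).astype("float32")
index.add(vectors)  # row i of the array is stored under id i, in insertion order

query = np.random.rand(1, dim).astype("float32")
D, I = index.search(query, 5)  # D: squared L2 distances, I: row ids; both shape (1, 5)
print(I[0])  # ids index into document_texts; faiss pads with -1 when fewer than k vectors exist

This is why query_text checks idx != -1 and bounds-checks against document_texts before using a result.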