Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

NaimaAqeel commited on Jun 6, 2024

Commit

be68f20

verified ·

1 Parent(s): 9fac60a

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -128

app.py CHANGED Viewed

@@ -2,168 +2,124 @@ import os
 import fitz
 from docx import Document
 from sentence_transformers import SentenceTransformer
-import faiss
-import numpy as np
-import pickle
-import gradio as gr
-from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
-# Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
-    text = ""
-    try:
-        doc = fitz.open(pdf_path)
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            text += page.get_text()
-    except Exception as e:
-        print(f"Error extracting text from PDF: {e}")
-    return text
-# Function to extract text from a Word document
 def extract_text_from_docx(docx_path):
-    text = ""
-    try:
-        doc = Document(docx_path)
-        text = "\n".join([para.text for para in doc.paragraphs])
-    except Exception as e:
-        print(f"Error extracting text from DOCX: {e}")
-    return text
-# Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-# Hugging Face API token
 api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 if not api_token:
-    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set or invalid")
-# Initialize the HuggingFace LLM
-llm = HuggingFaceEndpoint(
-    endpoint_url="https://api-inference.huggingface.co/models/gpt2",  # Using gpt2 model
-    model_kwargs={"api_key": api_token}
-)
-# Initialize the HuggingFace embeddings
-embedding = HuggingFaceEmbeddings()
-# Load or create FAISS index
 index_path = "faiss_index.pkl"
-document_texts_path = "document_texts.pkl"
-document_texts = []
-if os.path.exists(index_path) and os.path.exists(document_texts_path):
-    try:
-        with open(index_path, "rb") as f:
-            index = pickle.load(f)
-            print("Loaded FAISS index from faiss_index.pkl")
-        with open(document_texts_path, "rb") as f:
-            document_texts = pickle.load(f)
-            print("Loaded document texts from document_texts.pkl")
-    except Exception as e:
-        print(f"Error loading FAISS index or document texts: {e}")
 else:
     # Create a new FAISS index if it doesn't exist
-    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
     with open(index_path, "wb") as f:
-        pickle.dump(index, f)
         print("Created new FAISS index and saved to faiss_index.pkl")
 def preprocess_text(text):
-    # Add more preprocessing steps if necessary
-    return text.strip()
-def upload_files(files):
-    global index, document_texts
     try:
-        for file in files:
-            file_path = file.name  # Get the file path from the NamedString object
             if file_path.endswith('.pdf'):
                 text = extract_text_from_pdf(file_path)
             elif file_path.endswith('.docx'):
                 text = extract_text_from_docx(file_path)
             else:
-                return "Unsupported file format"
-            print(f"Extracted text: {text[:100]}...")  # Debug: Show the first 100 characters of the extracted text
-            # Process the text and update FAISS index
-            sentences = text.split("\n")
-            sentences = [preprocess_text(sentence) for sentence in sentences if sentence.strip()]
             embeddings = embedding_model.encode(sentences)
-            print(f"Embeddings shape: {embeddings.shape}")  # Debug: Show the shape of the embeddings
-            index.add(np.array(embeddings))
-            document_texts.extend(sentences)  # Store sentences for retrieval
-        # Save the updated index and documents
-        with open(index_path, "wb") as f:
-            pickle.dump(index, f)
-            print("Saved updated FAISS index to faiss_index.pkl")
-        with open(document_texts_path, "wb") as f:
-            pickle.dump(document_texts, f)
-            print("Saved updated document texts to document_texts.pkl")
-        return "Files processed successfully"
     except Exception as e:
         print(f"Error processing files: {e}")
-        return f"Error processing files: {e}"
-# Improved prompt template
-prompt_template = """
-You are a helpful assistant. Use the provided context to answer the question accurately.
-If the answer is not in the context, say "answer is not available in the context".
-Do not provide false information.
-Context:
-{context}
-Question:
-{question}
-Answer:
-"""
-def query_text(text):
-    try:
-        print(f"Query text: {text}")  # Debug: Show the query text
-        # Encode the query text
-        query_embedding = embedding_model.encode([text])
-        print(f"Query embedding shape: {query_embedding.shape}")  # Debug: Show the shape of the query embedding
-        # Search the FAISS index
-        D, I = index.search(np.array(query_embedding), k=5)
-        print(f"Distances: {D}, Indices: {I}")  # Debug: Show the distances and indices of the search results
-        top_documents = []
-  for idx in I[0]:
-    if idx != -1 and idx < len(document_texts):
-      # Get a passage around the retrieved sentence (e.g., paragraph)
-      passage_start = max(0, idx - 5)  # Adjust window size as needed
-      passage_end = min(len(document_texts), idx + 5)
-      passage = "\n".join(document_texts[passage_start:passage_end])
-      top_documents.append(passage)
-    else:
-      print(f"Invalid index found: {idx}")
-        # Remove duplicates and sort by relevance
-        top_documents = list(dict.fromkeys(top_documents))
-        # Join the top documents for the context
-        context = "\n".join(top_documents)
-        # Prepare the prompt
-        prompt = prompt_template.format(context=context, question=text)
-        # Query the LLM
-        response = llm(prompt)
-        return response
-    except Exception as e:
-        print(f"Error querying text: {e}")
-        return f"Error querying text: {e}"
 # Create Gradio interface
 with gr.Blocks() as demo:
@@ -183,6 +139,3 @@ with gr.Blocks() as demo:
 demo.launch()

 import fitz
 from docx import Document
 from sentence_transformers import SentenceTransformer
+from langchain_community.llms import HuggingFaceEndpoint  # Might need update (optional)
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from nltk.tokenize import sent_tokenize  # Import for sentence segmentation
+# Function to extract text from a PDF file (same as before)
 def extract_text_from_pdf(pdf_path):
+    # ... (implementation)
+# Function to extract text from a Word document (same as before)
 def extract_text_from_docx(docx_path):
+    # ... (implementation)
+# Initialize the embedding model (same as before)
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+# Hugging Face API token (same as before)
 api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 if not api_token:
+    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
+# Define RAG models (same as before)
+generator_model_name = "facebook/bart-base"
+retriever_model_name = "facebook/bart-base"  # Can be the same as generator
+generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
+generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
+retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
+retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
+# Load or create FAISS index (using LangChain)
 index_path = "faiss_index.pkl"
+if os.path.exists(index_path):
+    with open(index_path, "rb") as f:
+        index = FAISS.load(f)
+        print("Loaded FAISS index from faiss_index.pkl")
 else:
     # Create a new FAISS index if it doesn't exist
+    index = FAISS(embedding_dimension=embedding_model.get_sentence_embedding_dimension())
     with open(index_path, "wb") as f:
+        FAISS.save(index, f)
         print("Created new FAISS index and saved to faiss_index.pkl")
 def preprocess_text(text):
+    sentences = sent_tokenize(text)
+    return sentences
+def upload_files(state, files):
+    global index
     try:
+        for file_path in files:
             if file_path.endswith('.pdf'):
                 text = extract_text_from_pdf(file_path)
             elif file_path.endswith('.docx'):
                 text = extract_text_from_docx(file_path)
             else:
+                return {"error": "Unsupported file format"}
+            # Preprocess text (call the new function)
+            sentences = preprocess_text(text)
+            # Encode sentences and add to FAISS index
             embeddings = embedding_model.encode(sentences)
+            index.add(embeddings)
+        return {"message": "Files processed successfully"}
     except Exception as e:
         print(f"Error processing files: {e}")
+        return {"error": "Error processing files"}  # Provide informative error message
+def process_and_query(state, files, question):
+    # State management for conversation history (similar to previous example)
+    # ...
+    # Handle file upload (using upload_files function)
+    if files:
+        upload_result = upload_files(state, files)
+        if "error" in upload_result:
+            return upload_result  # Return error message from upload_files if any
+    # Handle user question and generate response using RAG models if question and state.
+    def process_and_query(state, files, question):
+    # State management for conversation history (similar to previous example)
+    # ...
+    # Handle file upload (using upload_files function)
+    if files:
+        upload_result = upload_files(state, files)
+        if "error" in upload_result:
+            return upload_result  # Return error message from upload_files if any
+    # Handle user question and generate response using RAG models
+    if question and state["processed_text"]:
+        # Preprocess the question
+        question_embedding = embedding_model.encode([question])
+        # Use retriever model to retrieve relevant passages
+        with torch.no_grad():  # Disable gradient calculation for efficiency
+            retriever_outputs = retriever(**retriever_tokenizer(question, return_tensors="pt"))
+            retriever_hidden_states = retriever_outputs.hidden_states[-1]  # Last hidden state
+        # Search the FAISS index for similar passages based on retrieved hidden states
+        distances, retrieved_ids = index.search(retriever_hidden_states.cpu().numpy(), k=5)  # Retrieve top 5 passages
+        # Get the retrieved passages from the document text
+        retrieved_passages = [state["processed_text"].split("\n")[i] for i in retrieved_ids.flatten()]
+        # Use generator model to generate response based on question and retrieved passages
+        # Combine question embedding with retrieved passages (consider weighting or attention mechanism)
+        combined_input = torch.cat([question_embedding, embedding_model.encode(retrieved_passages)], dim=0)
+        with torch.no_grad():
+            generator_outputs = generator(**generator_tokenizer(combined_input, return_tensors="pt"))
+            generated_text = generator_tokenizer.decode(generator_outputs.sequences.squeeze())
+        # Update conversation history
+        state["conversation"].append({"question": question, "answer": generated_text})
+        return state  # Return the updated state with conversation history
 # Create Gradio interface
 with gr.Blocks() as demo:
 demo.launch()