NaimaAqeel committed
Commit f812db9 · verified · 1 Parent(s): 70fd172

Update app.py
Files changed (1): app.py (+63, -93)
app.py CHANGED
@@ -10,135 +10,105 @@ from typing import List
 from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
 
-# Function to extract text from a PDF file
 def extract_text_from_pdf(pdf_path):
-    text = ""
-    try:
-        doc = fitz.open(pdf_path)
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            text += page.get_text()
-    except Exception as e:
-        print(f"Error extracting text from PDF: {e}")
-    return text
 
-# Function to extract text from a Word document
 def extract_text_from_docx(docx_path):
-    text = ""
-    try:
-        doc = Document(docx_path)
-        text = "\n".join([para.text for para in doc.paragraphs])
-    except Exception as e:
-        print(f"Error extracting text from DOCX: {e}")
-    return text
 
-# Initialize the embedding model
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
-# Hugging Face API token
 api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 if not api_token:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
 
-# Initialize the HuggingFace LLM
-llm = HuggingFaceEndpoint(
-    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
-    model_kwargs={"api_key": api_token}
-)
 
-# Initialize the HuggingFace embeddings
-embedding = HuggingFaceEmbeddings()
 
-# Load or create FAISS index
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
-
 document_texts = []
 
-if os.path.exists(index_path) and os.path.exists(document_texts_path):
-    try:
-        with open(index_path, "rb") as f:
-            index = pickle.load(f)
-        print("Loaded FAISS index from faiss_index.pkl")
-        with open(document_texts_path, "rb") as f:
-            document_texts = pickle.load(f)
-        print("Loaded document texts from document_texts.pkl")
-    except Exception as e:
-        print(f"Error loading FAISS index or document texts: {e}")
-else:
-    # Create a new FAISS index if it doesn't exist
-    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
-    with open(index_path, "wb") as f:
-        pickle.dump(index, f)
-    print("Created new FAISS index and saved to faiss_index.pkl")
 
 def upload_files(files):
     global index, document_texts
     try:
         for file_path in files:
-            if file_path.endswith('.pdf'):
-                text = extract_text_from_pdf(file_path)
-            elif file_path.endswith('.docx'):
-                text = extract_text_from_docx(file_path)
-            else:
-                return "Unsupported file format"
-
-            # Process the text and update FAISS index
-            sentences = text.split("\n")
             embeddings = embedding_model.encode(sentences)
             index.add(np.array(embeddings))
-            document_texts.append(text)
-
-        # Save the updated index and documents
-        with open(index_path, "wb") as f:
-            pickle.dump(index, f)
-        print("Saved updated FAISS index to faiss_index.pkl")
-        with open(document_texts_path, "wb") as f:
-            pickle.dump(document_texts, f)
-        print("Saved updated document texts to document_texts.pkl")
-
         return "Files processed successfully"
     except Exception as e:
         print(f"Error processing files: {e}")
         return f"Error processing files: {e}"
 
 def query_text(text):
     try:
-        # Encode the query text
-        query_embedding = embedding_model.encode([text])
-
-        # Search the FAISS index
-        D, I = index.search(np.array(query_embedding), k=5)
-
-        top_documents = []
-        for idx in I[0]:
-            if idx != -1 and idx < len(document_texts):  # Ensure that a valid index is found
-                top_documents.append(document_texts[idx])
-            else:
-                print(f"Invalid index found: {idx}")
-        return top_documents
     except Exception as e:
         print(f"Error querying text: {e}")
         return f"Error querying text: {e}"
 
-# Create Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("## Document Upload and Query System")
-
-    with gr.Tab("Upload Files"):
-        upload = gr.File(file_count="multiple", label="Upload PDF or DOCX files")
-        upload_button = gr.Button("Upload")
-        upload_output = gr.Textbox()
-        upload_button.click(fn=upload_files, inputs=upload, outputs=upload_output)
-
-    with gr.Tab("Query"):
-        query = gr.Textbox(label="Enter your query")
-        query_button = gr.Button("Search")
-        query_output = gr.Textbox()
-        query_button.click(fn=query_text, inputs=query, outputs=query_output)
-
-demo.launch()
 
 from langchain_community.llms import HuggingFaceEndpoint
 from langchain_community.vectorstores import FAISS
 from langchain_community.embeddings import HuggingFaceEmbeddings
+from nltk.tokenize import sent_tokenize  # Import for sentence segmentation
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 
+# Function to extract text from a PDF file (same as before)
 def extract_text_from_pdf(pdf_path):
+    # ...
 
+# Function to extract text from a Word document (same as before)
 def extract_text_from_docx(docx_path):
+    # ...
 
+# Initialize the embedding model (same as before)
 embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
 
+# Hugging Face API token (same as before)
 api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
 if not api_token:
     raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
 
+# Define RAG models (replace with your chosen models)
+generator_model_name = "facebook/bart-base"
+retriever_model_name = "facebook/bart-base"  # Can be the same as the generator
+
+generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
+generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)
+
+retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
+retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)
+
+# Load or create FAISS index (same as before)
 index_path = "faiss_index.pkl"
 document_texts_path = "document_texts.pkl"
 document_texts = []
+# ... (rest of the FAISS index loading logic)
+
+def preprocess_text(text):
+    # ... (text preprocessing logic)
 
 def upload_files(files):
     global index, document_texts
     try:
         for file_path in files:
+            # ... (file processing logic, same as before)
+
+            # Preprocess text (call the new function)
+            sentences = preprocess_text(text)
+
+            # Encode sentences and add to FAISS index
             embeddings = embedding_model.encode(sentences)
             index.add(np.array(embeddings))
+
+        # Save the updated index and documents (same as before)
+        # ...
         return "Files processed successfully"
     except Exception as e:
         print(f"Error processing files: {e}")
         return f"Error processing files: {e}"
 
 def query_text(text):
     try:
+        # Preprocess query text
+        query_sentences = preprocess_text(text)
+        query_embeddings = embedding_model.encode(query_sentences)
+
+        # Retrieve relevant documents using FAISS
+        D, I = index.search(np.array(query_embeddings), k=5)
+        retrieved_docs = [document_texts[idx] for idx in I[0] if idx != -1]
+
+        # Retrieval-Augmented Generation (RAG)
+        retriever_inputs = retriever_tokenizer(
+            text=retrieved_docs, return_tensors="pt", padding=True
+        )
+        retriever_outputs = retriever(**retriever_inputs)
+        retrieved_texts = retriever_tokenizer.batch_decode(retriever_outputs.logits)
+
+        # Generate response using retrieved information (as prompts/context)
+        generator_inputs = generator_tokenizer(
+            text=[text] + retrieved_texts, return_tensors="pt", padding=True
+        )
+        generator_outputs = generator(**generator_inputs)
+        response = generator_tokenizer.decode(generator_outputs.sequences[0], skip_special_tokens=True)
+
+        return response
     except Exception as e:
         print(f"Error querying text: {e}")
         return f"Error querying text: {e}"
 
+# Create Gradio interface
 with gr.Blocks() as demo:
+    # ... (rest of the Gradio interface definition)
+    query_button.click(fn=query_text, inputs=query, outputs=query_output)
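
A few notes on the committed code. preprocess_text is new in this commit (the old version chunked with text.split("\n")), but its body is elided; the added sent_tokenize import points at sentence segmentation. A minimal sketch under that assumption (the punkt download and the empty-line filtering are not part of the commit):

    import nltk
    from nltk.tokenize import sent_tokenize

    nltk.download('punkt', quiet=True)  # sent_tokenize needs the punkt data

    def preprocess_text(text):
        # Split raw document text into sentences and drop the empty
        # fragments that PDF extraction tends to produce.
        return [s.strip() for s in sent_tokenize(text) if s.strip()]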
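
In both the old and new versions, upload_files adds one vector per sentence to the FAISS index while document_texts grows by one entry per file, so the row ids returned by index.search do not line up with document_texts (the old query_text guarded against exactly this with idx < len(document_texts)). A sketch of one way to keep the id space aligned; the helper name add_to_index is hypothetical:

    import numpy as np

    def add_to_index(index, document_texts, embedding_model, sentences):
        # Keep one stored text per indexed vector so that FAISS row ids
        # map 1:1 onto entries of document_texts.
        embeddings = embedding_model.encode(sentences)
        index.add(np.array(embeddings, dtype="float32"))  # FAISS expects float32
        document_texts.extend(sentences)  # one entry per vector, not per file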
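
As committed, the generation step in query_text is unlikely to run: batch_decode expects token ids, while a plain forward pass returns float logits, and the output object returned by generator(**generator_inputs) carries logits, not a .sequences field (sequences come from generate()). A sketch of a working generation step for the same facebook/bart-base generator, folding the retrieved sentences into the prompt; the helper name, prompt format, and generation parameters are assumptions:

    def generate_answer(query, retrieved_docs):
        # Concatenate retrieved context into a single prompt for the generator.
        context = " ".join(retrieved_docs)
        inputs = generator_tokenizer(
            f"question: {query} context: {context}",
            return_tensors="pt", truncation=True, max_length=1024,
        )
        # generate() produces token ids, which decode() turns back into text.
        output_ids = generator.generate(**inputs, max_new_tokens=128, num_beams=4)
        return generator_tokenizer.decode(output_ids[0], skip_special_tokens=True)

Note that bart-base is not instruction- or QA-tuned, so this runs but answer quality is a separate question, and the extra "retriever" model arguably adds nothing here since FAISS already performs the retrieval.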
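
Put together, a corrected query path could look like the following; embedding the query as one vector (instead of per sentence, as the commit does) is an assumption, made because each FAISS hit is scored against the query as a whole:

    def query_text(text):
        try:
            query_embedding = embedding_model.encode([text])
            D, I = index.search(np.array(query_embedding, dtype="float32"), k=5)
            retrieved = [document_texts[i] for i in I[0] if 0 <= i < len(document_texts)]
            return generate_answer(text, retrieved)  # sketched above
        except Exception as e:
            print(f"Error querying text: {e}")
            return f"Error querying text: {e}"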
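
Finally, both versions persist the index with pickle, which can fail on FAISS's SWIG-backed index objects depending on the installed version. FAISS ships its own serializers; a sketch of the load-or-create block using them (the .index filename is an assumption; the commit pickles to faiss_index.pkl):

    import os
    import faiss

    INDEX_FILE = "faiss_index.index"

    if os.path.exists(INDEX_FILE):
        index = faiss.read_index(INDEX_FILE)   # FAISS's own deserializer
    else:
        index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
        faiss.write_index(index, INDEX_FILE)   # and serializer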