# Chatbot / app.py
# Page header captured together with the file (not program code):
# NaimaAqeel, commit "Update app.py" (8d35da0, verified), 4.19 kB.
import os
import io
import pickle
import PyPDF2
from docx import Document
import numpy as np
from nltk.tokenize import sent_tokenize
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from sentence_transformers import SentenceTransformer
import gradio as gr
# Fetch NLTK's 'punkt' sentence-tokenizer data, which sent_tokenize() in
# preprocess_text() relies on. This call runs every time the module is imported.
# NOTE(review): newer NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed NLTK version.
import nltk
nltk.download('punkt')
# Sentence-embedding model used by upload_files() to encode sentences.
# It is constructed once, at import time.
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
# Vector index that upload_files() adds embeddings to. Nothing in the visible
# code ever creates it (no FAISS or LangChain import appears in the imports
# above), so while it stays None nothing is added to any index.
faiss_index = None
# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_data):
    """Extract the text of every page of a PDF supplied as raw bytes.

    Args:
        pdf_data: The complete PDF file content as ``bytes``.

    Returns:
        The page texts concatenated in page order. This is a best-effort
        helper: if reading fails, the error is printed and whatever was
        collected so far is returned (``""`` when nothing could be read).
    """
    page_texts = []
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_data))
        for page in pdf_reader.pages:
            # Treat a missing result (None) as empty text so a page without
            # extractable text cannot raise TypeError and cut the rest of
            # the document short.
            page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
    # Join once instead of growing a string with += for every page.
    return "".join(page_texts)
# Function to extract text from a Word document
def extract_text_from_docx(docx_data):
    """Return the paragraph text of a Word document supplied as raw bytes.

    Paragraphs are joined with newlines. Any failure while opening or
    reading the document is printed and an empty string is returned.
    """
    try:
        document = Document(io.BytesIO(docx_data))
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    except Exception as err:
        print(f"Error extracting text from DOCX: {err}")
        return ""
# Function to preprocess text into sentences
def preprocess_text(text):
    """Split *text* into sentences with NLTK's ``sent_tokenize``.

    Returns the tokenizer's result unchanged.
    """
    return sent_tokenize(text)
# Function to handle file uploads
def _upload_name(file):
    """Return the file name or path carried by one uploaded item."""
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return getattr(file, "name", "") or ""


def _upload_bytes(file):
    """Return the raw bytes of one uploaded item without text decoding."""
    if isinstance(file, (str, os.PathLike)):
        # A string is a path to the uploaded file, not its content.
        with open(file, "rb") as handle:
            return handle.read()
    data = file.read()
    # A handle opened in text mode yields str; the extractors need bytes.
    return data.encode("utf-8") if isinstance(data, str) else data


def upload_files(files):
    """Extract, tokenize and index the text of uploaded PDF/DOCX files.

    Args:
        files: One upload or a list/tuple of uploads. Each upload is either
            a filesystem path (``str`` / ``os.PathLike``) or a file-like
            object exposing ``name`` and ``read()``.

    Returns:
        ``{"message": ...}`` when every file was handled, otherwise
        ``{"error": ...}`` describing the first problem. Processing stops at
        the first unsupported file; files handled before it stay indexed.
    """
    if not files:
        return {"error": "No files provided"}
    if not isinstance(files, (list, tuple)):
        # A single upload (e.g. one path string) must not be iterated
        # element by element.
        files = [files]

    try:
        for file in files:
            # Compare extensions case-insensitively so "REPORT.PDF" works.
            file_name = _upload_name(file).lower()
            if file_name.endswith(".pdf"):
                extract = extract_text_from_pdf
            elif file_name.endswith(".docx"):
                extract = extract_text_from_docx
            else:
                return {"error": "Unsupported file format"}

            # PDF and DOCX are binary formats: hand the extractor the
            # original bytes rather than a UTF-8 decode/encode round trip.
            text = extract(_upload_bytes(file))
            sentences = preprocess_text(text)

            # Skip the (expensive) encoding when there is nothing to index
            # or no index to receive the vectors.
            if faiss_index is None or not sentences:
                continue
            embeddings = embedding_model.encode(sentences)
            # Add the whole batch in one call as a float32 matrix.
            faiss_index.add(np.asarray(embeddings, dtype=np.float32))
        # Persisting the updated index, if needed, would go here.
        return {"message": "Files processed successfully"}
    except Exception as e:
        # UI boundary: report the failure to the caller instead of crashing.
        print(f"Error processing files: {e}")
        return {"error": str(e)}
# Function to process queries
def process_and_query(state, question):
    """Answer a user question (currently with a fixed placeholder reply).

    Args:
        state: Conversation state supplied by the caller; it is returned
            unchanged under the ``"conversation"`` key.
        question: The user's query text.

    Returns:
        ``{"message": ..., "conversation": state}`` for a non-empty question,
        or ``{"error": "No question provided"}`` when the question is
        missing, empty, or only whitespace.
    """
    # Guard clause: reject missing and blank questions up front.
    if not question or (isinstance(question, str) and not question.strip()):
        return {"error": "No question provided"}

    # Placeholder response until real retrieval is implemented. Nothing here
    # can raise, so no try/except is needed around it.
    response_message = "Placeholder response based on query processing"
    return {"message": response_message, "conversation": state}
# Define the Gradio interface
def main():
    """Build the two-tab Gradio UI (upload and query) and launch it.

    The handlers ``upload_files`` and ``process_and_query`` return result
    dicts; small adapters below turn those into the text shown in each tab's
    status box.
    """

    def _as_text(result):
        """Render a handler result dict as a single display string."""
        if "error" in result:
            return f"Error: {result['error']}"
        return result.get("message", "")

    def _on_upload(files):
        return _as_text(upload_files(files))

    def _on_query(conversation, question):
        return _as_text(process_and_query(conversation, question))

    with gr.Blocks() as demo:
        # Per-session conversation state passed to process_and_query.
        conversation_state = gr.State([])

        with gr.Tab("Upload Files"):
            file_input = gr.File(
                label="Upload PDF or DOCX files",
                file_count="multiple",
            )
            upload_button = gr.Button("Upload")
            upload_status = gr.Textbox(
                label="Upload Status",
                value="No file uploaded yet",
                lines=3,
                interactive=False,
            )
            # Events are attached with .click(); Button has no onclick kwarg.
            upload_button.click(
                _on_upload, inputs=file_input, outputs=upload_status
            )

        with gr.Tab("Query"):
            query_input = gr.Textbox(
                label="Query Input",
                placeholder="Enter your query",
            )
            search_button = gr.Button("Search")
            query_response = gr.Textbox(
                label="Query Response",
                value="No query processed yet",
                lines=3,
                interactive=False,
            )
            search_button.click(
                _on_query,
                inputs=[conversation_state, query_input],
                outputs=query_response,
            )

    demo.launch()


if __name__ == "__main__":
    main()