Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

Chatbot / app.py

NaimaAqeel

Update app.py

84f3457 verified about 1 year ago

raw

history blame

3 kB

	import os
	from dotenv import load_dotenv
	import fitz # PyMuPDF
	from docx import Document
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np
	import pickle
	from langchain_community.llms import HuggingFaceEndpoint
	from langchain_community.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings
	import gradio as gr

	# Load environment variables from .env
	load_dotenv()

	# Sentence-embedding model; its output dimension sizes the FAISS index below.
	embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

	# Hugging Face API token
	api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
	if not api_token:
	    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
	# Confirm the token was found without echoing any part of the secret to logs.
	print("API Token: loaded")

	# Initialize the HuggingFace LLM.
	# The token goes in the endpoint's dedicated `huggingfacehub_api_token`
	# field; entries in `model_kwargs` are treated as generation parameters,
	# not credentials.
	llm = HuggingFaceEndpoint(
	    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
	    huggingfacehub_api_token=api_token,
	)

	# Initialize the HuggingFace embedding (default model of HuggingFaceEmbeddings)
	embedding = HuggingFaceEmbeddings()

	# Load or create FAISS index
	index_path = "faiss_index.pkl"
	if os.path.exists(index_path):
	    # SECURITY NOTE: unpickling executes arbitrary code from the file. Only
	    # load an index file this application wrote itself; consider migrating
	    # to faiss.read_index / faiss.write_index for a data-only format.
	    with open(index_path, "rb") as f:
	        index = pickle.load(f)
	else:
	    # Create a new, empty L2 index sized to the embedding model's output
	    # dimension and persist it so later runs take the load branch.
	    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
	    with open(index_path, "wb") as f:
	        pickle.dump(index, f)


	# Function to extract text from a PDF file
	def extract_text_from_pdf(pdf_path):
	    """Return the concatenated text of every page of a PDF.

	    Args:
	        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

	    Returns:
	        A single string with the text of all pages, in page order, with no
	        separator inserted between pages. An empty document yields ``""``.
	    """
	    # The context manager closes the document (and its file handle) even if
	    # extracting a page raises.
	    with fitz.open(pdf_path) as doc:
	        # Iterating the document yields its pages in order; join once instead
	        # of growing a string page by page.
	        return "".join(page.get_text() for page in doc)


	# Function to extract text from a Word document
	def extract_text_from_docx(docx_path):
	    """Return the paragraph texts of a Word document joined by newlines.

	    Args:
	        docx_path: Path of the .docx file, passed straight to ``Document``.

	    Returns:
	        One string containing each paragraph's text separated by ``"\\n"``.
	    """
	    paragraphs = Document(docx_path).paragraphs
	    return "\n".join(paragraph.text for paragraph in paragraphs)


	def process_and_query(text):
	    """Index the lines of *text*, look up its nearest neighbours, and ask the LLM.

	    Each non-blank line of *text* is embedded and appended to the module-level
	    FAISS ``index``. The whole text is then embedded as a single query and the
	    five nearest index entries are looked up. Finally the text is sent to the
	    module-level ``llm`` as a prompt.

	    Args:
	        text: The user-supplied text. ``None``, empty, or whitespace-only input
	            is answered with an empty result without touching the index or
	            calling the LLM.

	    Returns:
	        A dict with two keys: ``"top_documents"``, a list of strings of the
	        form ``"Document <position>"`` (the position being the entry's row
	        number in the index), and ``"response"``, the text generated by the
	        LLM.
	    """
	    # Blank lines carry no content; embedding them would only add noise
	    # vectors to the index.
	    sentences = [line for line in (text or "").split("\n") if line.strip()]
	    if not sentences:
	        return {"top_documents": [], "response": ""}

	    # FAISS works on float32 matrices, so make the dtype explicit.
	    embeddings = np.asarray(embedding_model.encode(sentences), dtype=np.float32)
	    index.add(embeddings)

	    # Search the FAISS index with the embedding of the whole input.
	    query_embedding = np.asarray(embedding_model.encode([text]), dtype=np.float32)
	    _distances, neighbours = index.search(query_embedding, k=5)

	    # FAISS pads the result with -1 when fewer than k entries exist.
	    top_documents = [f"Document {idx}" for idx in neighbours[0] if idx != -1]

	    # LangChain LLMs are called through `invoke`, which returns the generated
	    # string directly. The generation limit is passed as `max_new_tokens`.
	    # NOTE(review): confirm these keyword arguments against the installed
	    # langchain_community version.
	    response = llm.invoke(text, max_new_tokens=100, temperature=0.7)

	    return {"top_documents": top_documents, "response": response}


	# Define the Gradio interface
	def _render_outputs(text):
	    """Adapt process_and_query's dict result to the two output components.

	    The interface declares two outputs, so the callback must return a pair
	    (documents, response) rather than a dict with string keys.
	    """
	    result = process_and_query(text)
	    return result["top_documents"], result["response"]


	interface = gr.Interface(
	    fn=_render_outputs,
	    inputs="textbox",
	    # NOTE(review): "list" relies on the installed Gradio version resolving
	    # that shortcut to a list/dataframe component - confirm.
	    outputs=["list", "text"],
	    title="Chatbot with Text Processing and Retrieval",
	    description="Upload a document (PDF or Word) or enter text to process. The chatbot will retrieve relevant documents and generate a response (optional).",
	)

	# Launch the Gradio interface
	interface.launch()