Spaces:

NaimaAqeel
/

Chatbot

Runtime error

App Files Files Community

Chatbot / app.py

NaimaAqeel

Update app.py

f2ca711 verified about 1 year ago

raw

history blame

3.1 kB

	import os
	from dotenv import load_dotenv
	import fitz # PyMuPDF
	from docx import Document
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np
	import pickle
	from langchain_community.llms import HuggingFaceEndpoint # Might need update
	from langchain_community.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings
	import gradio as gr

	# Load environment variables from .env so the token below can be supplied
	# through a local file as well as through the process environment.
	load_dotenv()

	# Sentence-embedding model used to encode user questions; its output
	# dimension also sizes the FAISS index created later in this module.
	embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

	# Hugging Face API token: fail fast at import time when it is missing.
	api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
	if not api_token:
	    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")
	# Confirm the token was found without echoing any of its characters:
	# even a short prefix of a secret should not end up in application logs.
	print("API Token: loaded (value hidden)")

	# Initialize the HuggingFace LLM (optional; comment out if not used).
	# The token is supplied through the dedicated `huggingfacehub_api_token`
	# field. `model_kwargs` is meant for model/generation parameters, so a
	# credential placed there is not used to authenticate the client.
	llm = HuggingFaceEndpoint(
	    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
	    huggingfacehub_api_token=api_token,
	)

	# Initialize the HuggingFace embedding wrapper with its default settings.
	embedding = HuggingFaceEmbeddings()

	# Obtain the FAISS index: build and persist an empty one when no saved
	# copy exists, otherwise unpickle the saved copy.
	index_path = "faiss_index.pkl"
	if not os.path.exists(index_path):
	    # No saved index yet: create a flat L2 index sized to the embedding
	    # model's sentence-embedding dimension, then pickle it to disk.
	    index = faiss.IndexFlatL2(
	        embedding_model.get_sentence_embedding_dimension()
	    )
	    with open(index_path, "wb") as f:
	        pickle.dump(index, f)
	else:
	    # A saved index file is present: restore it with pickle.
	    with open(index_path, "rb") as f:
	        index = pickle.load(f)


	# Function to extract text from a PDF file
	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

	    Returns:
	        One string holding the text of all pages; an empty string when
	        the document has no pages.
	    """
	    # Open through a context manager so the document handle is released
	    # even when text extraction raises part-way through.
	    with fitz.open(pdf_path) as doc:
	        # Join once instead of growing a string page by page.
	        return "".join(page.get_text() for page in doc)


	# Function to extract text from a Word document
	def extract_text_from_docx(docx_path):
	    """Return the text of a Word document, one paragraph per line.

	    Opens ``docx_path`` with python-docx's ``Document`` and joins the
	    ``text`` of each entry in its ``paragraphs`` with newline characters.
	    """
	    paragraphs = Document(docx_path).paragraphs
	    return "\n".join(paragraph.text for paragraph in paragraphs)


	def process_and_query(state, text, file=None):
	# Initialize state on first run
	if state is None:
	state = {"processed_text": None, "conversation": []}

	# Check if a file is uploaded
	if file:
	# Get the uploaded file content
	content = file.read()
	if file.filename.endswith('.pdf'):
	with open("temp.pdf", "wb") as f:
	f.write(content)
	state["processed_text"] = extract_text_from_pdf("temp.pdf")
	elif file.filename.endswith('.docx'):
	with open("temp.docx", "wb") as f:
	f.write(content)
	state["processed_text"] = extract_text_from_docx("temp.docx")
	else:
	return {"error": "Unsupported file format"}

	# Handle user question
	if state["processed_text"] and text:
	# Process the question and potentially use LLM for answering (optional)
	question_embedding = embedding_model.encode([text])
	# ... (logic to search the index and potentially use LLM for answering)
	answer = "Answer retrieved from the document based on your question." # Placeholder answer

	# Update conversation history
	state["conversation"].append({"question": text,