Spaces:

NaimaAqeel
/

Chatbot

Build error

App Files Files Community

Chatbot / app.py

NaimaAqeel

Update app.py

2c02a9e verified 11 months ago

raw

history blame

3.73 kB

	import os
	import fitz
	from docx import Document
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np
	import pickle
	from langchain_community.llms import HuggingFaceEndpoint
	from langchain_community.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings
	import gradio as gr
	from fastapi import FastAPI

	# Create the FastAPI application object. It is the object handed to
	# uvicorn.run() in the __main__ guard at the bottom of this file.
	# NOTE(review): no routes are registered on `app` in the visible source.
	app = FastAPI()

	def extract_text_from_pdf(pdf_path):
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        pdf_path: Path of the PDF file; passed straight to ``fitz.open``.

	    Returns:
	        One string made of each page's ``get_text()`` output, joined with
	        no separator. A document without pages yields an empty string.
	    """
	    # The context manager closes the document even if a page fails to
	    # parse, so the underlying file handle is never leaked.
	    with fitz.open(pdf_path) as doc:
	        # join() avoids repeated string reallocation on large documents.
	        return "".join(page.get_text() for page in doc)

	def extract_text_from_docx(docx_path):
	    """Return the text of a Word document, one paragraph per line.

	    Opens ``docx_path`` with ``Document`` and joins the ``text`` of every
	    entry in its ``paragraphs`` collection with newline characters.
	    """
	    paragraphs = Document(docx_path).paragraphs
	    return "\n".join(paragraph.text for paragraph in paragraphs)

	# Load the 'all-MiniLM-L6-v2' sentence-embedding model once at import time.
	# upload_files() and query_text() both encode text with this instance, and
	# its embedding dimension sizes a newly created FAISS index.
	embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

	# Hugging Face API token: read once from the environment and fail fast at
	# import time when it is missing or empty.
	api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')
	if not api_token:
	    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")

	# Confirm the token was found without echoing any of its characters:
	# console output can end up in logs that other people are able to read.
	print("API Token: loaded")

	# Initialize the Hugging Face LLM against the hosted gpt2 inference endpoint.
	# The token goes through the wrapper's dedicated `huggingfacehub_api_token`
	# field; `model_kwargs` is meant for model call parameters, so an "api_key"
	# entry there is not treated as a credential.
	# NOTE(review): `llm` is not referenced anywhere else in the visible source.
	llm = HuggingFaceEndpoint(
	    endpoint_url="https://api-inference.huggingface.co/models/gpt2",
	    huggingfacehub_api_token=api_token,
	)

	# Instantiate LangChain's HuggingFaceEmbeddings wrapper with its defaults.
	# NOTE(review): `embedding` is not referenced anywhere else in the visible
	# source; indexing and search use `embedding_model` instead. Confirm whether
	# this object is still needed.
	embedding = HuggingFaceEmbeddings()

	# Restore the persisted FAISS index, or create and persist an empty one.
	# NOTE(review): pickle.load can execute arbitrary code embedded in the file
	# it reads; this is only safe while faiss_index.pkl is written exclusively
	# by this application.
	index_path = "faiss_index.pkl"
	if not os.path.exists(index_path):
	    # First run: start from an empty flat L2 index whose dimension matches
	    # the sentence-embedding model, and write it out straight away.
	    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
	    with open(index_path, "wb") as f:
	        pickle.dump(index, f)
	else:
	    with open(index_path, "rb") as f:
	        index = pickle.load(f)

	def upload_files(files):
	    """Extract text from uploaded PDF/DOCX files and add it to the FAISS index.

	    Args:
	        files: The value of the Gradio ``File`` component. Each item may be
	            a path string or an object exposing the path as ``.name``.
	            ``None`` or an empty list means nothing was uploaded.

	    Returns:
	        ``"Files processed successfully"`` once every file has been indexed,
	        ``"No files were uploaded"`` when ``files`` is empty or ``None``, or
	        ``{"error": "Unsupported file format"}`` as soon as a file that is
	        neither ``.pdf`` nor ``.docx`` is met (later files are not processed).
	    """
	    if not files:
	        return "No files were uploaded"

	    for file in files:
	        # Work from the path instead of file.read(): depending on the Gradio
	        # version an item is a plain path string (no .read()) or a temp-file
	        # wrapper. Reading the upload in place also removes the shared
	        # temp.pdf / temp.docx scratch copies, which concurrent requests
	        # could overwrite and which were never cleaned up.
	        path = str(getattr(file, "name", file))
	        extension_probe = path.lower()  # accept .PDF / .DOCX as well

	        if extension_probe.endswith('.pdf'):
	            text = extract_text_from_pdf(path)
	        elif extension_probe.endswith('.docx'):
	            text = extract_text_from_docx(path)
	        else:
	            return {"error": "Unsupported file format"}

	        # Index one vector per non-blank line; blank lines would only add
	        # identical, meaningless vectors that pollute search results.
	        sentences = [line for line in text.split("\n") if line.strip()]
	        if not sentences:
	            continue

	        embeddings = embedding_model.encode(sentences)
	        # FAISS expects float32 input.
	        index.add(np.asarray(embeddings, dtype="float32"))

	        # Persist after every file so earlier uploads survive a later failure.
	        with open(index_path, "wb") as f:
	            pickle.dump(index, f)

	    return "Files processed successfully"

	def query_text(text):
	    """Return labels for the index entries nearest to a query string.

	    Encodes ``text`` with ``embedding_model``, searches the module-level
	    FAISS ``index`` with ``k=5`` and returns a list of ``"Document <row>"``
	    labels, one per returned row id. Row ids equal to -1 (no match found
	    for that slot) are skipped.
	    """
	    query_vector = np.array(embedding_model.encode([text]))
	    _distances, neighbor_ids = index.search(query_vector, k=5)
	    return [f"Document {row}" for row in neighbor_ids[0] if row != -1]

	# Gradio UI: one tab for uploading documents, one for querying the index.
	with gr.Blocks() as demo:
	    gr.Markdown("## Document Upload and Query System")

	    with gr.Tab("Upload Files"):
	        upload = gr.File(label="Upload PDF or DOCX files", file_count="multiple")
	        upload_button = gr.Button("Upload")
	        upload_output = gr.Textbox()
	        # Clicking runs upload_files on the selected files and shows its result.
	        upload_button.click(upload_files, inputs=upload, outputs=upload_output)

	    with gr.Tab("Query"):
	        query = gr.Textbox(label="Enter your query")
	        query_button = gr.Button("Search")
	        query_output = gr.Textbox()
	        # Clicking runs query_text on the entered text and shows its result.
	        query_button.click(query_text, inputs=query, outputs=query_output)

	# NOTE(review): launch() runs unconditionally at import time, before the
	# __main__ guard below. Whether uvicorn.run() is ever reached depends on
	# launch() returning; confirm the intended startup path.
	demo.launch()

	if __name__ == "__main__":
	    import uvicorn

	    # Serve the FastAPI application on all interfaces, port 8001.
	    uvicorn.run(app, port=8001, host="0.0.0.0")