Spaces:

NaimaAqeel
/

Chatbot

Sleeping

App Files Files Community

Chatbot / app.py

NaimaAqeel

Update app.py

d382509 verified 11 months ago

raw

history blame

4.44 kB



	import os
	from docx import Document
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np
	import pickle
	from langchain_community.llms import HuggingFaceEndpoint
	from langchain_community.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings

	# Initialize the embedding model
	# Sentence-transformers encoder; the rest of the visible file uses it to embed
	# both document sentences and query strings, and to size the FAISS index.
	embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

	# Initialize the HuggingFace LLM
	# Points at the hosted inference endpoint for the "gpt2" model. The token is
	# read from the HUGGINGFACEHUB_API_TOKEN environment variable; os.getenv
	# returns None when the variable is unset.
	# NOTE(review): the token is passed inside model_kwargs under "api_key" --
	# confirm that HuggingFaceEndpoint expects it there rather than through a
	# dedicated token argument.
	llm = HuggingFaceEndpoint(
	endpoint_url="https://api-inference.huggingface.co/models/gpt2",
	model_kwargs={"api_key": os.getenv('HUGGINGFACEHUB_API_TOKEN')}
	)

	# Initialize the HuggingFace embeddings
	# NOTE(review): `embedding` is not referenced anywhere else in the visible
	# source (all encoding goes through `embedding_model`) -- confirm whether it
	# is still needed.
	embedding = HuggingFaceEmbeddings()

	# Function to extract text from a Word document
	def extract_text_from_docx(docx_path):
	    """Return the text of a Word document, one paragraph per line.

	    Args:
	        docx_path: Path of the ``.docx`` file to read.

	    Returns:
	        The paragraph texts joined with newlines, or an empty string when
	        the document cannot be opened or read. The failure is reported on
	        stdout rather than raised.
	    """
	    try:
	        paragraphs = Document(docx_path).paragraphs
	        return "\n".join(paragraph.text for paragraph in paragraphs)
	    except Exception as err:
	        print(f"Error extracting text from DOCX: {err}")
	        return ""

	# Load or create FAISS index
	#
	# After this section `index` is always bound to a usable FAISS index and
	# `document_texts` to the list of sentences stored alongside it.
	index_path = "faiss_index.pkl"
	document_texts_path = "document_texts.pkl"

	document_texts = []
	index = None

	_saved_state_exists = os.path.exists(index_path) and os.path.exists(document_texts_path)

	if _saved_state_exists:
	    # NOTE: unpickling can execute arbitrary code from a crafted file. This is
	    # only acceptable because both files are written by this application;
	    # never point these paths at files obtained from an untrusted source.
	    try:
	        with open(index_path, "rb") as f:
	            _loaded_index = pickle.load(f)
	        print("Loaded FAISS index from faiss_index.pkl")
	        with open(document_texts_path, "rb") as f:
	            _loaded_texts = pickle.load(f)
	        print("Loaded document texts from document_texts.pkl")
	    except Exception as e:
	        print(f"Error loading FAISS index or document texts: {e}")
	    else:
	        # Commit both together so the index and its texts can never come from
	        # a half-finished load.
	        index, document_texts = _loaded_index, _loaded_texts

	if index is None:
	    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
	    if _saved_state_exists:
	        # Loading failed: keep the app usable with an empty index, but leave
	        # the files on disk untouched until the next successful upload.
	        print("Falling back to an empty in-memory FAISS index")
	    else:
	        # Create a new FAISS index if it doesn't exist
	        with open(index_path, "wb") as f:
	            pickle.dump(index, f)
	        print("Created new FAISS index and saved to faiss_index.pkl")

	def preprocess_text(text):
	    """Clean a single sentence before it is embedded.

	    At present the only step is trimming leading and trailing whitespace;
	    additional preprocessing can be added here.
	    """
	    cleaned = text.strip()
	    return cleaned

	def upload_files(files):
	    """Add the sentences of uploaded DOCX files to the FAISS index.

	    Each ``.docx`` file is split into non-blank lines, every line is
	    embedded with ``embedding_model`` and appended to the module-level
	    ``index``; the matching sentences are appended to ``document_texts``.
	    The index and the sentence list are then pickled to ``index_path`` and
	    ``document_texts_path``.

	    Args:
	        files: Iterable of uploaded files. Each item may be an object with a
	            ``name`` attribute holding its path, or a plain path string.
	            Files whose name does not end in ``.docx`` are skipped.

	    Returns:
	        A status message: success, a notice that nothing was uploaded, or
	        an error description. Exceptions are reported, never raised.
	    """
	    if not files:
	        # Covers both None (nothing selected in the UI) and an empty list.
	        return "No files were provided"
	    try:
	        for file in files:
	            # Accept upload objects exposing ``.name`` as well as bare paths.
	            file_path = str(getattr(file, "name", file))
	            if not file_path.lower().endswith(".docx"):
	                continue
	            text = extract_text_from_docx(file_path)

	            # Process the text and update FAISS index
	            sentences = [
	                preprocess_text(line) for line in text.split("\n") if line.strip()
	            ]
	            if not sentences:
	                # An empty or unreadable document yields no vectors; adding
	                # an empty array to the index would raise.
	                continue
	            embeddings = np.asarray(embedding_model.encode(sentences), dtype="float32")
	            index.add(embeddings)
	            document_texts.extend(sentences)  # Store sentences for retrieval

	        # Save the updated index and documents
	        with open(index_path, "wb") as f:
	            pickle.dump(index, f)
	        print("Saved updated FAISS index to faiss_index.pkl")
	        with open(document_texts_path, "wb") as f:
	            pickle.dump(document_texts, f)
	        print("Saved updated document texts to document_texts.pkl")

	        return "Files processed successfully"
	    except Exception as e:
	        print(f"Error processing files: {e}")
	        return f"Error processing files: {e}"

	def query_text(text):
	    """Answer a question using the indexed sentences as context.

	    The query is embedded with ``embedding_model``, the five nearest
	    entries are looked up in ``index``, the matching sentences from
	    ``document_texts`` are placed in a prompt, and the prompt is sent to
	    ``llm``.

	    Args:
	        text: The question to answer.

	    Returns:
	        The value returned by ``llm``, or an error message string if any
	        step fails (the error is also printed).
	    """
	    try:
	        query_vector = np.array(embedding_model.encode([text]))
	        _distances, neighbours = index.search(query_vector, k=5)

	        # -1 marks an empty result slot; positions beyond the stored texts
	        # are ignored as well.
	        stored = len(document_texts)
	        matches = [
	            document_texts[pos]
	            for pos in neighbours[0]
	            if pos != -1 and pos < stored
	        ]

	        context = "\n".join(matches)
	        prompt = f"Context:\n{context}\n\nQuestion:\n{text}\n\nAnswer:\n"
	        return llm(prompt)
	    except Exception as err:
	        print(f"Error querying text: {err}")
	        return f"Error querying text: {err}"

	# Sample Gradio integration (for illustration)
	import gradio as gr

	def main():
	    """Build and launch the Gradio UI.

	    ``gr.Interface`` wraps a single function, so uploading and querying
	    each get their own interface and are presented as two tabs.
	    """
	    description = "Upload DOCX files to build an index, then query for answers based on uploaded documents."
	    upload_tab = gr.Interface(
	        fn=upload_files,
	        inputs=gr.File(file_count="multiple"),
	        outputs="text",
	        description=description,
	    )
	    query_tab = gr.Interface(
	        fn=query_text,
	        inputs="text",
	        outputs="text",
	        description=description,
	    )
	    gr.TabbedInterface(
	        [upload_tab, query_tab],
	        ["Upload", "Query"],
	        title="Document Upload and Query System",
	    ).launch()


	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    main()