Spaces:

NaimaAqeel
/

Chatbot

Build error

App Files Files Community

Chatbot / app.py

NaimaAqeel

Update app.py

4d0c42b verified 11 months ago

raw

history blame

3.8 kB

	import os
	import fitz
	from docx import Document
	from sentence_transformers import SentenceTransformer
	import faiss
	import numpy as np
	import pickle
	import gradio as gr
	from typing import List
	from langchain_community.llms import HuggingFaceEndpoint
	from langchain_community.vectorstores import FAISS
	from langchain_community.embeddings import HuggingFaceEmbeddings
	from nltk.tokenize import sent_tokenize # Import for sentence segmentation
	from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

	# Function to extract text from a PDF file
	def extract_text_from_pdf(pdf_path):
	    """Extract the plain text of every page in a PDF file.

	    Args:
	        pdf_path: Path of the PDF to read (passed to ``fitz.open``).

	    Returns:
	        The text of all pages concatenated in page order. If an error
	        occurs it is printed, and whatever text was collected before the
	        failure is returned (an empty string if nothing was read).
	    """
	    page_texts = []
	    try:
	        # The context manager closes the document even when a page fails
	        # to load, so the file handle is not leaked.
	        with fitz.open(pdf_path) as doc:
	            for page in doc:
	                page_texts.append(page.get_text())
	    except Exception as e:
	        # Best effort: report the problem and keep what was extracted.
	        print(f"Error extracting text from PDF: {e}")
	    return "".join(page_texts)

	# Function to extract text from a Word document
	def extract_text_from_docx(docx_path):
	    """Return the text of a Word document, one paragraph per line.

	    Args:
	        docx_path: Path of the ``.docx`` file (passed to ``Document``).

	    Returns:
	        The paragraph texts joined with newlines, or an empty string if
	        the document could not be read (the error is printed).
	    """
	    try:
	        paragraphs = Document(docx_path).paragraphs
	        return "\n".join(paragraph.text for paragraph in paragraphs)
	    except Exception as err:
	        print(f"Error extracting text from DOCX: {err}")
	        return ""


	# Sentence-embedding model used to vectorize document sentences.
	embedding_model = SentenceTransformer('all-MiniLM-L6-v2')


	# The Hugging Face API token is read from the environment; abort at import
	# time when it is missing or empty.
	api_token = os.environ.get('HUGGINGFACEHUB_API_TOKEN')
	if not api_token:
	    raise ValueError("HUGGINGFACEHUB_API_TOKEN environment variable is not set")


	# Checkpoint names for the generator and the retriever. Both currently
	# point at the same checkpoint; change either name to swap models.
	generator_model_name = "facebook/bart-base"
	retriever_model_name = "facebook/bart-base"

	# Generator: seq2seq model plus its tokenizer.
	generator = AutoModelForSeq2SeqLM.from_pretrained(generator_model_name)
	generator_tokenizer = AutoTokenizer.from_pretrained(generator_model_name)

	# Retriever: loaded separately from the generator, with its own tokenizer.
	retriever = AutoModelForSeq2SeqLM.from_pretrained(retriever_model_name)
	retriever_tokenizer = AutoTokenizer.from_pretrained(retriever_model_name)


	# Load or create FAISS index
	index_path = "faiss_index.pkl"
	document_texts_path = "document_texts.pkl"
	document_texts = []
	if os.path.exists(index_path) and os.path.exists(document_texts_path):
	    try:
	        # NOTE(security): pickle.load can execute arbitrary code. These
	        # files must only ever be produced by this application.
	        with open(index_path, "rb") as f:
	            loaded_index = pickle.load(f)
	        print("Loaded FAISS index from faiss_index.pkl")
	        with open(document_texts_path, "rb") as f:
	            loaded_texts = pickle.load(f)
	        print("Loaded document texts from document_texts.pkl")
	    except Exception as e:
	        print(f"Error loading FAISS index or document texts: {e}")
	        # Always bind `index`, otherwise later code fails with NameError.
	        # The files on disk are left untouched so they can be inspected.
	        index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
	        print("Falling back to a new empty in-memory FAISS index")
	    else:
	        # Commit both objects together so the index and its texts never
	        # come from different load attempts.
	        index, document_texts = loaded_index, loaded_texts
	else:
	    # Create a new FAISS index if it doesn't exist
	    index = faiss.IndexFlatL2(embedding_model.get_sentence_embedding_dimension())
	    with open(index_path, "wb") as f:
	        pickle.dump(index, f)
	    print("Created new FAISS index and saved to faiss_index.pkl")


	def preprocess_text(text):
	    """Split *text* into sentences and return the result of ``sent_tokenize``."""
	    return sent_tokenize(text)


	def upload_files(files):
	    """Add the sentences of uploaded PDF/DOCX files to the FAISS index.

	    Args:
	        files: Iterable of uploaded files. Each item may be a path string,
	            an ``os.PathLike``, or an object exposing its path as ``.name``
	            (assumption: some UI versions pass temp-file wrappers — TODO
	            confirm against the interface wiring).

	    Returns:
	        A status message string: ``"Files processed successfully"``,
	        ``"Unsupported file format"``, ``"No files provided"``, or an
	        ``"Error processing files: ..."`` description when something fails.
	    """
	    global index, document_texts
	    if not files:
	        return "No files provided"
	    try:
	        paths = [
	            os.fspath(item)
	            if isinstance(item, (str, os.PathLike))
	            else str(getattr(item, "name", item))
	            for item in files
	        ]

	        # Validate every extension up front so an unsupported file cannot
	        # leave the index updated with only some of the uploads.
	        if not all(p.lower().endswith((".pdf", ".docx")) for p in paths):
	            return "Unsupported file format"

	        for path in paths:
	            if path.lower().endswith(".pdf"):
	                text = extract_text_from_pdf(path)
	            else:
	                text = extract_text_from_docx(path)

	            sentences = preprocess_text(text)
	            if not sentences:
	                # Nothing to embed (e.g. unreadable or image-only file);
	                # skip it instead of passing an empty batch to the index.
	                continue

	            # Encode sentences and add them to the FAISS index (float32).
	            embeddings = embedding_model.encode(sentences)
	            index.add(np.asarray(embeddings, dtype="float32"))
	            # Keep the stored texts aligned with the vectors just added.
	            # Presumably document_texts is a list; verify for loaded data.
	            document_texts.extend(sentences)

	        # Persist the updated index and texts to the same files the module
	        # reads at startup.
	        with open(index_path, "wb") as f:
	            pickle.dump(index, f)
	        with open(document_texts_path, "wb") as f:
	            pickle.dump(document_texts, f)
	        return "Files processed successfully"
	    except Exception as e:
	        print(f"Error processing files: {e}")
	        # Return the message too, so the caller gets a status, not None.
	        return f"Error processing files: {e}"