Spaces:

Kazel
/

demo

Running on Zero

demo / app.py

Kazel

pls

0af2fdf 4 months ago

11.7 kB

	import gradio as gr
	import tempfile
	import os
	import fitz # PyMuPDF
	import uuid
	import shutil
	from pymilvus import MilvusClient

	from middleware import Middleware
	from rag import Rag
	from pathlib import Path
	import subprocess
	import getpass

	# Module-level Rag helper created once at import time; the search handler
	# calls rag.get_answer_from_gemini(query, [image_path]) on it.
	rag = Rag()


	def generate_uuid(state):
	    """Return the UUID stored in *state*, creating one on first use.

	    Args:
	        state: Mutable dict holding session data. The UUID is stored under
	            the ``"user_uuid"`` key and the dict is updated in place.

	    Returns:
	        The UUID string held in ``state["user_uuid"]``.
	    """
	    # .get() treats a dict without the key like an unset UUID instead of
	    # raising KeyError.
	    if state.get("user_uuid") is None:
	        state["user_uuid"] = str(uuid.uuid4())

	    return state["user_uuid"]


	class PDFSearchApp:
	    """Callbacks behind the Gradio UI: upload/index, query, delete, settings.

	    Every public method is used as a bound method (``app.method``), either
	    as a Gradio event handler or to populate a widget when the UI is built.
	    """

	    def __init__(self):
	        # Maps a document id (sanitised file name) to True once indexed.
	        self.indexed_docs = {}
	        # Path of the most recently indexed upload; None before any upload.
	        self.current_pdf = None

	    def upload_and_convert(self, state, files, max_pages):
	        """Index each uploaded file and report the number of pages extracted.

	        Args:
	            state: Session state dict (currently unused).
	            files: Uploaded files. Each item may be a path string or an
	                object exposing the path as ``.name``; a single item is
	                accepted as well as a list.
	            max_pages: Forwarded to ``Middleware.index`` as ``max_pages``.

	        Returns:
	            A human-readable status string. Failures are reported in the
	            string rather than raised, so the UI can always display it.
	        """
	        if not files:
	            return "No file uploaded"
	        if not isinstance(files, (list, tuple)):
	            # A lone file would otherwise be iterated character by character.
	            files = [files]

	        total_pages = 0
	        skipped = []
	        try:
	            for file in files:
	                pdf_path = getattr(file, "name", file)
	                filename = os.path.basename(pdf_path)
	                name, ext = os.path.splitext(filename)

	                if ext.lower() in (".ppt", ".pptx"):
	                    # PowerPoint-to-PDF conversion (previously sketched with
	                    # Windows COM automation via comtypes) is disabled, so
	                    # the file is skipped instead of being indexed as a PDF.
	                    print("pptx not supported on spaces")
	                    skipped.append(filename)
	                    continue

	                self.current_pdf = pdf_path

	                # The sanitised file name doubles as the collection name
	                # and as the document id.
	                doc_id = name.replace(" ", "_").replace("-", "_")

	                print(f"Uploading file: {doc_id}, id: abc")
	                middleware = Middleware(doc_id, create_collection=True)

	                # NOTE(review): index() is assumed to return a sized
	                # collection of pages (the original called len() on it).
	                pages = middleware.index(pdf_path, id=doc_id, max_pages=max_pages)
	                total_pages += len(pages)

	                self.indexed_docs[doc_id] = True
	        except Exception as e:
	            return f"Error processing PDF: {str(e)}"

	        message = f"Uploaded and extracted {total_pages} pages"
	        if skipped:
	            message += f" (skipped unsupported files: {', '.join(skipped)})"
	        return message

	    def display_file_list(self):
	        """List the sub-directories of ``<cwd>/pages``.

	        Returns:
	            A sorted list of directory names on success; otherwise a string
	            describing why the directory could not be read.
	        """
	        directory_path = os.path.join(os.getcwd(), "pages")
	        try:
	            return sorted(
	                entry
	                for entry in os.listdir(directory_path)
	                if os.path.isdir(os.path.join(directory_path, entry))
	            )
	        except FileNotFoundError:
	            return f"The directory {directory_path} does not exist."
	        except PermissionError:
	            return f"Permission denied to access {directory_path}."
	        except Exception as e:
	            return str(e)

	    def search_documents(self, state, query, num_results=1):
	        """Find the best-matching page for *query* and ask the RAG model about it.

	        Args:
	            state: Session state dict (currently unused).
	            query: Free-text search query.
	            num_results: Currently unused; only the top hit is returned.

	        Returns:
	            A 3-tuple ``(path, image_path, rag_response)``. On any failure
	            the tuple is ``(message, None, "--")`` so that it still matches
	            the three output components this handler is wired to.
	        """
	        print(f"Searching for query: {query}")

	        # The "please index documents first" guard was deliberately disabled
	        # by the original author so the database can be queried directly.
	        if not query:
	            print("Please enter a search query")
	            return "Please enter a search query", None, "--"

	        try:
	            # Placeholder collection id; the original author marked it as
	            # "not used anyway".
	            middleware = Middleware("test", create_collection=False)

	            search_results = middleware.search([query])[0]
	            if len(search_results) == 0:
	                return "No matching pages found", None, "--"

	            # Per the original author's note each hit is a tuple of
	            # (score, doc_id, collection_name); doc_id is zero-based while
	            # page image files are numbered from 1.
	            top_hit = search_results[0]
	            page_num = top_hit[1] + 1
	            coll_num = top_hit[2]

	            print(f"Retrieved page number: {page_num}")

	            path = f"pages/{coll_num}/page_{page_num}"
	            img_path = f"{path}.png"

	            print(f"Retrieved image path: {img_path}")

	            rag_response = rag.get_answer_from_gemini(query, [img_path])

	            return path, img_path, rag_response

	        except Exception as e:
	            return f"Error during search: {str(e)}", None, "--"

	    def delete(self, choice):
	        """Remove a document's page directory and drop its Milvus collection.

	        Args:
	            choice: Name of the document. It is both the sub-directory of
	                ``pages`` and the collection name.

	        Returns:
	            ``"Deleted <choice>"`` on success, ``"Directory not found"`` when
	            there is nothing valid to delete.
	        """
	        # Reject empty values and anything that is not a plain directory
	        # name. An empty choice would resolve to "pages/" and wipe every
	        # document; separators or ".." would escape the pages directory.
	        if (
	            not choice
	            or not isinstance(choice, str)
	            or choice in (".", "..")
	            or os.path.basename(choice) != choice
	        ):
	            return "Directory not found"

	        path = f"pages/{choice}"
	        if not os.path.exists(path):
	            return "Directory not found"

	        # Connect before deleting anything, so a connection failure leaves
	        # the files on disk untouched.
	        # NOTE(review): confirm "localhost" is a valid Milvus URI for this
	        # deployment (a commented-out variant used http://localhost:19530).
	        client = MilvusClient(uri="localhost")
	        shutil.rmtree(path)
	        client.drop_collection(collection_name=choice)
	        return f"Deleted {choice}"

	    def list_downloaded_hf_models(self):
	        """Return the ids of models present in the local Hugging Face cache.

	        The cache location is taken from ``HF_HUB_CACHE`` if set, otherwise
	        from ``HF_HOME`` (both ``$HF_HOME/hub`` and, as the original code did,
	        ``$HF_HOME`` itself are scanned), otherwise
	        ``~/.cache/huggingface/hub``.

	        Returns:
	            A sorted list of model ids such as ``"org/model-name"``.
	        """
	        hub_cache = os.getenv("HF_HUB_CACHE")
	        hf_home = os.getenv("HF_HOME")
	        if hub_cache:
	            cache_dirs = [Path(hub_cache)]
	        elif hf_home:
	            cache_dirs = [Path(hf_home) / "hub", Path(hf_home)]
	        else:
	            cache_dirs = [Path.home() / ".cache/huggingface/hub"]

	        model_names = set()
	        for cache_dir in cache_dirs:
	            for repo_dir in cache_dir.glob("models--*"):
	                # Cache folders are named "models--<org>--<name>". Only the
	                # double-dash is a separator; single hyphens belong to the
	                # model name and must be kept.
	                repo_id = repo_dir.name.split("--", 1)[-1].replace("--", "/")
	                model_names.add(repo_id)

	        return sorted(model_names)

	    def list_downloaded_ollama_models(self):
	        """Return the names of locally available Ollama library models.

	        Looks in ``$OLLAMA_MODELS/manifests/registry.ollama.ai/library`` when
	        the ``OLLAMA_MODELS`` environment variable is set; otherwise falls
	        back to the hard-coded per-user Windows path used originally.

	        Returns:
	            A sorted list of directory names, or an empty list when the
	            directory cannot be read (the reason is printed).
	        """
	        base_path = None
	        try:
	            models_root = os.getenv("OLLAMA_MODELS")
	            if models_root:
	                base_path = os.path.join(
	                    models_root, "manifests", "registry.ollama.ai", "library"
	                )
	            else:
	                username = getpass.getuser()
	                base_path = f"C:\\Users\\{username}\\NEW_PATH\\manifests\\registry.ollama.ai\\library"

	            with os.scandir(base_path) as entries:
	                return sorted(entry.name for entry in entries if entry.is_dir())
	        except FileNotFoundError:
	            print(f"The directory {base_path} does not exist.")
	        except PermissionError:
	            print(f"Permission denied to access {base_path}.")
	        except Exception as e:
	            print(f"An error occurred: {e}")
	        # An explicit empty list (not an implicit None) keeps callers that
	        # iterate or build a dropdown from the result working.
	        return []

	    def model_settings(self, hfchoice, ollamachoice, tokensize):
	        """Publish the chosen model settings through environment variables.

	        Args:
	            hfchoice: Value stored in the ``colpali`` environment variable.
	            ollamachoice: Value stored in the ``ollama`` environment variable.
	            tokensize: Value stored in ``tokens``; numbers are written as an
	                integer string.

	        Returns:
	            The status string ``"abc"`` (placeholder kept from the original).
	        """
	        # os.environ only accepts strings. Unselected (None) values are left
	        # as they were instead of raising TypeError.
	        if hfchoice is not None:
	            os.environ["colpali"] = str(hfchoice)
	        if ollamachoice is not None:
	            os.environ["ollama"] = str(ollamachoice)
	        if tokensize is not None:
	            if isinstance(tokensize, str):
	                os.environ["tokens"] = tokensize
	            else:
	                os.environ["tokens"] = str(int(tokensize))
	        return "abc"



	def create_ui():
	    """Build the Gradio Blocks interface and wire it to a ``PDFSearchApp``.

	    The UI has four tabs: upload/indexing, querying, document deletion and
	    model settings.

	    Returns:
	        The assembled ``gr.Blocks`` object, not yet launched.
	    """
	    app = PDFSearchApp()

	    # display_file_list() yields a list on success and an error *string* on
	    # failure. Normalise once so the Textbox gets text and the Dropdown gets
	    # real choices (list() on an error string would give one choice per
	    # character).
	    listing = app.display_file_list()
	    if isinstance(listing, list):
	        document_choices = listing
	        file_list_text = ", ".join(listing)
	    else:
	        document_choices = []
	        file_list_text = listing

	    # Either listing may come back empty or None; fall back to no choices.
	    hf_models = app.list_downloaded_hf_models() or []
	    ollama_models = app.list_downloaded_ollama_models() or []

	    with gr.Blocks(css="footer{display:none !important}") as demo:
	        # Per-session state passed to the upload and search handlers.
	        state = gr.State(value={"user_uuid": None})

	        gr.Markdown("# Collar Multimodal RAG Demo")
	        gr.Markdown("Made by Collar")

	        with gr.Tab("Upload PDF"):
	            with gr.Column():
	                max_pages_input = gr.Slider(
	                    minimum=1,
	                    maximum=10000,
	                    value=20,
	                    step=10,
	                    label="Max pages to extract and index per document"
	                )
	                file_input = gr.Files(label="Upload PDFs")
	                file_list = gr.Textbox(label="Uploaded Files", interactive=False, value=file_list_text)
	                status = gr.Textbox(label="Indexing Status", interactive=False)

	        with gr.Tab("Query"):
	            with gr.Column():
	                query_input = gr.Textbox(label="Enter query")
	                # No "number of results" control: the search handler only
	                # returns the top-ranked page.
	                search_btn = gr.Button("Query")
	                llm_answer = gr.Textbox(label="RAG Response", interactive=False)
	                path = gr.Textbox(label="Link To Document Page", interactive=False)
	                images = gr.Image(label="Top page matching query")

	        with gr.Tab("Data Settings"):
	            with gr.Column():
	                # Pick an indexed document and delete it from disk and DB.
	                choice = gr.Dropdown(document_choices, label="Choice")
	                delete_button = gr.Button("Delete Document From DB")
	                status1 = gr.Textbox(label="Deletion Status", interactive=False)

	        with gr.Tab("AI Model Settings"):
	            with gr.Column():
	                hfchoice = gr.Dropdown(hf_models, label="Visual Document Retrieval (VDR) Model")
	                ollamachoice = gr.Dropdown(ollama_models, label="Secondary Visual Retrieval-Augmented Generation (RAG) Model")
	                tokensize = gr.Slider(
	                    minimum=256,
	                    maximum=4096,
	                    # The default must lie inside [minimum, maximum]; the
	                    # previous default of 20 was below the minimum.
	                    value=256,
	                    step=10,
	                    label="Max tokens per response (Reply Length)"
	                )
	                model_button = gr.Button("Update Settings")
	                status2 = gr.Textbox(label="Update Status", interactive=False)

	        # Event handlers
	        file_input.change(
	            fn=app.upload_and_convert,
	            inputs=[state, file_input, max_pages_input],
	            outputs=[status]
	        )

	        # Querying works without a prior upload in the same session.
	        search_btn.click(
	            fn=app.search_documents,
	            inputs=[state, query_input],
	            outputs=[path, images, llm_answer]
	        )

	        delete_button.click(
	            fn=app.delete,
	            inputs=[choice],
	            outputs=[status1]
	        )

	        model_button.click(
	            fn=app.model_settings,
	            inputs=[hfchoice, ollamachoice, tokensize],
	            outputs=[status2]
	        )

	    return demo

	if __name__ == "__main__":
	    # Build and serve the UI only when this file is run as a script, so
	    # importing the module does not start a server.
	    demo = create_ui()
	    demo.launch()