# Hugging Face Space: Kazel/demo (status when captured: Sleeping; file size: 11,730 bytes)

import gradio as gr
import tempfile
import os
import fitz  # PyMuPDF
import uuid
import shutil
from pymilvus import MilvusClient

from middleware import Middleware
from rag import Rag
from pathlib import Path
import subprocess
import getpass

# Shared Rag helper; PDFSearchApp.search_documents calls
# rag.get_answer_from_gemini() on it to produce the answer text.
rag = Rag()


def generate_uuid(state):
    """Return the UUID stored in ``state``, creating one on first use.

    ``state`` is a mutable mapping with a ``"user_uuid"`` entry. When that
    entry is ``None`` a new random UUID string is generated, stored back
    into ``state`` and returned; otherwise the stored value is returned
    unchanged.
    """
    existing = state["user_uuid"]
    if existing is not None:
        return existing

    # First call for this session: mint an identifier and remember it.
    fresh = str(uuid.uuid4())
    state["user_uuid"] = fresh
    return fresh


class PDFSearchApp:
    """Callbacks behind the Gradio UI: indexing, search, deletion, settings.

    Several methods below are declared without ``self``; their first
    parameter (``text`` / ``state``) receives the instance when they are
    called as bound methods. Those parameter names are kept unchanged so
    existing callers keep working.
    """

    def __init__(self):
        # Document ids indexed by this process, mapped to True.
        self.indexed_docs = {}
        # Path of the most recently uploaded file.
        self.current_pdf = None

    def upload_and_convert(self, state, files, max_pages):
        """Index every uploaded file and report how many pages were extracted.

        Args:
            state: Session state from the UI (currently unused).
            files: Uploaded file objects exposing a ``.name`` path, or
                ``None`` / an empty list when nothing was uploaded.
            max_pages: Upper bound on pages indexed per document, forwarded
                to ``Middleware.index``.

        Returns:
            A status message: the total page count over all files, or an
            error description if indexing raised.
        """
        if not files:
            return "No file uploaded"

        total_pages = 0
        try:
            for file in files:
                pdf_path = file.name
                self.current_pdf = pdf_path
                name, ext = os.path.splitext(os.path.basename(pdf_path))

                if ext.lower() in (".ppt", ".pptx"):
                    # Converting slides to PDF was done through PowerPoint
                    # COM automation (comtypes), which is not available on
                    # Spaces, so the file is passed on unconverted.
                    print("pptx not supported on spaces")

                # The collection name doubles as the document id; spaces and
                # hyphens are replaced with underscores.
                doc_id = name.replace(" ", "_").replace("-", "_")

                print(f"Uploading file: {doc_id}, id: abc")
                middleware = Middleware(doc_id, create_collection=True)

                # Assumes index() returns a sized collection of pages (the
                # status message has always been built from its len()).
                pages = middleware.index(pdf_path, id=doc_id, max_pages=max_pages)
                total_pages += len(pages)

                self.indexed_docs[doc_id] = True

            return f"Uploaded and extracted {total_pages} pages"
        except Exception as e:
            return f"Error processing PDF: {str(e)}"

    def display_file_list(text):
        """List the document folders found under ``<cwd>/pages``.

        Args:
            text: Receives the instance when called as a bound method; it is
                not used.

        Returns:
            A sorted list of sub-directory names on success, otherwise a
            string describing why the directory could not be read.
        """
        directory_path = "pages"
        try:
            directory_path = os.path.join(os.getcwd(), directory_path)
            # Only directories count: each indexed document has its own folder.
            return sorted(
                entry
                for entry in os.listdir(directory_path)
                if os.path.isdir(os.path.join(directory_path, entry))
            )
        except FileNotFoundError:
            return f"The directory {directory_path} does not exist."
        except PermissionError:
            return f"Permission denied to access {directory_path}."
        except Exception as e:
            return str(e)

    def search_documents(self, state, query, num_results=1):
        """Find the best-matching page for ``query`` and ask the RAG model about it.

        Args:
            state: Session state from the UI (currently unused).
            query: Free-text search query.
            num_results: Reserved; only the top hit is used at present.

        Returns:
            A 3-tuple ``(page path or status message, image path or None,
            answer text)``. Every path, including the error paths, returns
            three values so the result always matches the three output
            components bound to the Query button.
        """
        print(f"Searching for query: {query}")

        if not query:
            print("Please enter a search query")
            return "Please enter a search query", None, "--"

        try:
            # "test" is a placeholder collection name; the original code
            # notes it is not used when create_collection is False.
            middleware = Middleware("test", create_collection=False)

            search_results = middleware.search([query])[0]
            if len(search_results) == 0:
                return "No matching pages found", None, "--"

            # Each hit is a tuple (score, doc_id, collection_name); doc_id is
            # zero-based while the page image files are numbered from 1.
            top_hit = search_results[0]
            page_num = top_hit[1] + 1
            coll_num = top_hit[2]

            print(f"Retrieved page number: {page_num}")

            img_path = f"pages/{coll_num}/page_{page_num}.png"
            path = f"pages/{coll_num}/page_{page_num}"

            print(f"Retrieved image path: {img_path}")

            rag_response = rag.get_answer_from_gemini(query, [img_path])

            return path, img_path, rag_response

        except Exception as e:
            return f"Error during search: {str(e)}", None, "--"

    def delete(state, choice):
        """Remove a document's page folder and drop its Milvus collection.

        Args:
            state: Receives the instance when called as a bound method; it
                is not used.
            choice: Name of the document folder / collection to delete.

        Returns:
            A status message describing the outcome.
        """
        # An empty choice would resolve to "pages/" and wipe every document.
        if not choice:
            return "No document selected"
        # Reject anything that is not a plain folder name so the delete can
        # never escape the pages directory.
        if (
            not isinstance(choice, str)
            or choice in (".", "..")
            or os.path.basename(choice) != choice
        ):
            return "Invalid document name"

        path = f"pages/{choice}"
        if not os.path.exists(path):
            return "Directory not found"

        # Connect before touching the disk so a connection failure leaves the
        # page folder intact.
        # NOTE(review): uri is unchanged from the original; confirm it matches
        # the Milvus deployment in use.
        client = MilvusClient(uri="localhost")
        shutil.rmtree(path)
        client.drop_collection(collection_name=choice)
        return f"Deleted {choice}"

    def list_downloaded_hf_models(state):
        """List model repositories present in the local Hugging Face cache.

        Looks in ``$HF_HUB_CACHE`` when set, then in ``$HF_HOME/hub`` and
        ``$HF_HOME`` when ``HF_HOME`` is set, otherwise in
        ``~/.cache/huggingface/hub``.

        Args:
            state: Receives the instance when called as a bound method; it
                is not used.

        Returns:
            Repository ids such as ``"org/name"``, without duplicates.
        """
        candidates = []
        hub_cache = os.getenv('HF_HUB_CACHE')
        if hub_cache:
            candidates.append(Path(hub_cache))
        hf_home = os.getenv('HF_HOME')
        if hf_home:
            # The hub cache lives in the "hub" sub-folder of HF_HOME; HF_HOME
            # itself is still searched because earlier versions looked there.
            candidates.append(Path(hf_home) / 'hub')
            candidates.append(Path(hf_home))
        else:
            candidates.append(Path.home() / '.cache/huggingface/hub')

        model_names = []
        for cache_dir in candidates:
            if not cache_dir.is_dir():
                continue
            for repo_dir in sorted(cache_dir.glob('models--*')):
                # Folder names look like "models--<org>--<name>"; only the
                # "--" separator becomes "/", hyphens inside names are kept.
                model_name = repo_dir.name.split('--', 1)[-1].replace('--', '/', 1)
                if model_name not in model_names:
                    model_names.append(model_name)

        return model_names

    def list_downloaded_ollama_models(state,):
        """List locally available Ollama library models.

        Reads the manifest folder under ``$OLLAMA_MODELS`` when that variable
        is set, otherwise under the fixed per-user Windows path used before.

        Args:
            state: Receives the instance when called as a bound method; it
                is not used.

        Returns:
            Sorted model folder names, or an empty list if the folder cannot
            be read (the reason is printed).
        """
        models_root = os.getenv('OLLAMA_MODELS')
        if models_root:
            base_path = os.path.join(
                models_root, "manifests", "registry.ollama.ai", "library"
            )
        else:
            username = getpass.getuser()
            base_path = f"C:\\Users\\{username}\\NEW_PATH\\manifests\\registry.ollama.ai\\library"

        try:
            with os.scandir(base_path) as entries:
                return sorted(entry.name for entry in entries if entry.is_dir())
        except FileNotFoundError:
            print(f"The directory {base_path} does not exist.")
        except PermissionError:
            print(f"Permission denied to access {base_path}.")
        except Exception as e:
            print(f"An error occurred: {e}")
        # Always hand back a list so it can be used directly as choices.
        return []

    def model_settings(state, hfchoice, ollamachoice, tokensize):
        """Publish the selected model settings through environment variables.

        Args:
            state: Receives the instance when called as a bound method; it
                is not used.
            hfchoice: Retrieval model name, stored in ``colpali``.
            ollamachoice: RAG model name, stored in ``ollama``.
            tokensize: Maximum tokens per response, stored in ``tokens``.

        Returns:
            The status string ``"abc"``.
        """
        # os.environ only accepts strings; unset dropdowns arrive as None and
        # are skipped so any previous value stays in place.
        if hfchoice is not None:
            os.environ['colpali'] = str(hfchoice)
        if ollamachoice is not None:
            os.environ['ollama'] = str(ollamachoice)
        if tokensize is not None:
            # Numeric values are normalised to a whole number before storing.
            if isinstance(tokensize, str):
                os.environ['tokens'] = tokensize
            else:
                os.environ['tokens'] = str(int(tokensize))
        return "abc"



def create_ui():
    """Build the Gradio Blocks interface and wire it to a PDFSearchApp.

    The interface has four tabs (upload, query, data settings, model
    settings). Dropdown choices and the file list are computed once, when
    the interface is built.

    Returns:
        The ``gr.Blocks`` instance, ready to be launched.
    """
    app = PDFSearchApp()

    # display_file_list() yields a list of names on success and an error
    # message string otherwise; keep the two apart so the message is shown as
    # text instead of being split into single-character dropdown choices.
    listing = app.display_file_list()
    if isinstance(listing, str):
        indexed_names = []
        file_list_text = listing
    else:
        indexed_names = list(listing)
        file_list_text = ", ".join(indexed_names)

    hf_models = app.list_downloaded_hf_models() or []
    # May be None when the Ollama manifest folder cannot be read.
    ollama_models = app.list_downloaded_ollama_models() or []

    with gr.Blocks(css="footer{display:none !important}") as demo:
        state = gr.State(value={"user_uuid": None})

        gr.Markdown("# Collar Multimodal RAG Demo")
        gr.Markdown("Made by Collar")

        with gr.Tab("Upload PDF"):
            with gr.Column():
                max_pages_input = gr.Slider(
                    minimum=1,
                    maximum=10000,
                    value=20,
                    step=10,
                    label="Max pages to extract and index per document"
                )
                file_input = gr.Files(label="Upload PDFs")
                file_list = gr.Textbox(label="Uploaded Files", interactive=False, value=file_list_text)
                status = gr.Textbox(label="Indexing Status", interactive=False)

        with gr.Tab("Query"):
            with gr.Column():
                query_input = gr.Textbox(label="Enter query")
                # TODO: add a "number of results" slider once
                # search_documents makes use of its num_results parameter.
                search_btn = gr.Button("Query")
                llm_answer = gr.Textbox(label="RAG Response", interactive=False)
                path = gr.Textbox(label="Link To Document Page", interactive=False)
                images = gr.Image(label="Top page matching query")

        with gr.Tab("Data Settings"):  # deletion of indexed documents
            with gr.Column():
                choice = gr.Dropdown(indexed_names, label="Choice")
                delete_button = gr.Button("Delete Document From DB")
                status1 = gr.Textbox(label="Deletion Status", interactive=False)

        with gr.Tab("AI Model Settings"):  # model selection and reply length
            with gr.Column():
                hfchoice = gr.Dropdown(hf_models, label="Visual Document Retrieval (VDR) Model")
                ollamachoice = gr.Dropdown(ollama_models, label="Secondary Visual Retrieval-Augmented Generation (RAG) Model")
                tokensize = gr.Slider(
                    minimum=256,
                    maximum=4096,
                    # Start at the lower bound; the previous default of 20
                    # was outside the slider's range.
                    value=256,
                    step=10,
                    label="Max tokens per response (Reply Length)"
                )
                model_button = gr.Button("Update Settings")
                status2 = gr.Textbox(label="Update Status", interactive=False)

        # Event handlers
        file_input.change(
            fn=app.upload_and_convert,
            inputs=[state, file_input, max_pages_input],
            outputs=[status]
        )

        # Querying works without uploading first: documents indexed earlier
        # are searched directly.
        search_btn.click(
            fn=app.search_documents,
            inputs=[state, query_input],
            outputs=[path, images, llm_answer]
        )

        delete_button.click(
            fn=app.delete,
            inputs=[choice],
            outputs=[status1]
        )

        model_button.click(
            fn=app.model_settings,
            inputs=[hfchoice, ollamachoice, tokensize],
            outputs=[status2]
        )

    return demo

if __name__ == "__main__":
    # Build the interface and start the Gradio server only when this file is
    # run as a script.
    demo = create_ui()
    demo.launch()