"""Gradio demo that loads a PDF with LangChain and reports progress in the UI."""
import os
import re
import shutil
import time

import gradio as gr
from langchain.document_loaders import PyPDFLoader
# from langchain.document_loaders import PyMuPDFLoader

ABS_PATH = os.path.dirname(os.path.abspath(__file__))
DB_DIR = os.path.join(ABS_PATH, "db")

# JavaScript snippet; currently unused because the `_js=on_load` argument on
# `demo.load(...)` below is commented out.
on_load = """ async()=>{ console.log("HELLO"); } """

# Compiled once: matches any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RUN = re.compile(r"\s+")


def get_documents():
    """Load "doc1.pdf" with PyPDFLoader and return whatever `.load()` yields."""
    return PyPDFLoader("doc1.pdf").load()


# Alternative loader kept for reference (requires the PyMuPDFLoader import):
#
#     loader = PyMuPDFLoader(
#         "example.pdf",
#         extract_images=True,
#     )
#     docs = loader.load()


def extract_pdfs(x, request: gr.Request, progress=gr.Progress()):
    """Reset the index directory, then load and normalise the PDF documents.

    Args:
        x: Value of the input component wired to this handler; not used.
        request: The Gradio request object; only printed.
        progress: Gradio progress tracker used to report per-document progress.

    Returns:
        A tuple ``(documents, all_text)``: the loaded documents with their
        ``page_content`` whitespace-normalised in place, and the
        concatenation of every normalised ``page_content``.
    """
    progress(0, desc="Test", unit="Files")
    print("request", request)

    # Delete existing index directory and recreate the directory.
    if os.path.exists(DB_DIR):
        shutil.rmtree(DB_DIR, ignore_errors=True)
        # rmtree errors are ignored above, so the directory may still exist;
        # exist_ok avoids a FileExistsError in that case.
        os.makedirs(DB_DIR, exist_ok=True)

    documents = []
    page_texts = []
    for num, doc in enumerate(progress.tqdm(get_documents())):
        # The raw (un-normalised) content is printed before it is rewritten.
        print(f" {num} DocPg : ", doc.page_content)
        doc.page_content = replace_newlines_and_spaces(doc.page_content)
        documents.append(doc)
        page_texts.append(doc.page_content)
        # Short pause per document; presumably so the progress bar is
        # visible in the UI - TODO confirm it is still wanted.
        time.sleep(0.1)

    return documents, "".join(page_texts)


def replace_newlines_and_spaces(text):
    """Return `text` with every run of whitespace collapsed to one space.

    Newlines count as whitespace, so they are replaced as well. Leading and
    trailing whitespace is collapsed to a single space, not stripped.
    """
    return _WHITESPACE_RUN.sub(" ", text)


def test(x, request: gr.Request, progress=gr.Progress()):
    """Dummy handler that steps a progress bar over a fixed string.

    Args:
        x: Value of the input component wired to this handler; not used.
        request: The Gradio request object; only printed.
        progress: Gradio progress tracker.

    Returns:
        The fixed string that was iterated over.
    """
    progress(0, desc="Test", unit="Files")
    print("request", request)
    a = "abcdefghijklmnopqrstuv"
    for _ in progress.tqdm(a, desc="TEST", unit="Files"):
        time.sleep(0.1)
    return a


def _extract_pdfs_on_load(request: gr.Request, progress=gr.Progress()):
    """Run `extract_pdfs` for the page-load event, which has no inputs.

    `extract_pdfs` declares a leading input parameter `x`, but the load event
    is registered with `inputs=None`; this wrapper supplies `None` for it so
    the request and progress objects reach the right parameters.
    """
    return extract_pdfs(None, request, progress)


with gr.Blocks() as demo:
    selected = gr.Dataframe(
        interactive=False,
        col_count=(1, "fixed"),
        headers=["Selected Files"],
    )
    prog = gr.HTML(value="\n\nProcessing...\n\n")
    # gr.Interface(test, inputs=[selected])
    b = gr.Button()
    b.click(test, selected, prog)
    # NOTE(review): extract_pdfs returns (documents, all_text), which lands on
    # [prog (HTML), selected (Dataframe)] in that order - confirm this mapping
    # is what is intended.
    demo.load(
        _extract_pdfs_on_load,
        inputs=None,
        outputs=[prog, selected],
    )  # , _js=on_load)

demo.launch()