tcy6 committed
Commit d111a09 · 1 Parent(s): 4784e2e

Upload app.py

Files changed (1): app.py (+46 -43)
app.py CHANGED
@@ -58,58 +58,60 @@ def get_image_md5(img: Image.Image):
     hex_digest = hash_md5.hexdigest()
     return hex_digest
 
-def calculate_md5_from_pdf_path(pdf_file_path):
-    hash_md5 = hashlib.md5()
-    with open(pdf_file_path, "rb") as f:
-        file_content = f.read()
-        hash_md5.update(file_content)
-    return hash_md5.hexdigest()
-
 @spaces.GPU
-def add_pdf_gradio(pdf_file_path, progress=gr.Progress()):
+def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
     global model, tokenizer
     model.eval()
-    print(f"pdf_file_path: {pdf_file_path}")
-
-    knowledge_base_name = calculate_md5_from_pdf_path(pdf_file_path)
+
+    print(pdf_file_list)
 
+    pdf_file_list = sorted(pdf_file_list)
+    knowledge_base_name = str(int(time.time()))
     this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
     os.makedirs(this_cache_dir, exist_ok=True)
+    global_image_md5s = []
 
-    with open(os.path.join(this_cache_dir, f"src.pdf"), 'wb') as file1:
-        with open(pdf_file_path, "rb") as file2:
-            file1.write(file2.read())
+    for pdf_file_path in pdf_file_list:
+        with open(os.path.join(this_cache_dir, os.path.basename(pdf_file_path)), 'wb') as file1:
+            with open(pdf_file_path, "rb") as file2:
+                file1.write(file2.read())
 
-    dpi = 200
-    doc = fitz.open(pdf_file_path)
-
-    reps_list = []
-    images = []
-    image_md5s = []
-
-    for page in progress.tqdm(doc):
-        # with self.lock: # because we hope one 16G gpu only process one image at the same time
-        pix = page.get_pixmap(dpi=dpi)
-        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        image_md5 = get_image_md5(image)
-        image_md5s.append(image_md5)
-        with torch.no_grad():
-            reps = encode([image])
-        reps_list.append(reps)
-        images.append(image)
-
-    for idx in range(len(images)):
-        image = images[idx]
-        image_md5 = image_md5s[idx]
-        cache_image_path = os.path.join(this_cache_dir, f"{image_md5}.png")
-        image.save(cache_image_path)
-
-    np.save(os.path.join(this_cache_dir, f"reps.npy"), reps_list)
+    for pdf_file_path in pdf_file_list:
+
+        print(f"Processing {pdf_file_path}")
+
+        dpi = 200
+        doc = fitz.open(pdf_file_path)
+
+        image_md5s = []
+        reps_list = []
+        images = []
+
+        for page in progress.tqdm(doc):
+            # with self.lock: # because we hope one 16G gpu only process one image at the same time
+            pix = page.get_pixmap(dpi=dpi)
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            image_md5 = get_image_md5(image)
+            image_md5s.append(image_md5)
+            with torch.no_grad():
+                reps = encode([image])
+            reps_list.append(reps)
+            images.append(image)
+
+        for idx in range(len(images)):
+            image = images[idx]
+            image_md5 = image_md5s[idx]
+            cache_image_path = os.path.join(this_cache_dir, f"{image_md5}.png")
+            image.save(cache_image_path)
+
+        np.save(os.path.join(this_cache_dir, f"{os.path.basename(pdf_file_path).split('.')[0]}.npy"), reps_list)
+
+        global_image_md5s.extend(image_md5s)
 
     with open(os.path.join(this_cache_dir, f"md5s.txt"), 'w') as f:
-        for item in image_md5s:
+        for item in global_image_md5s:
             f.write(item+'\n')
-
+
     return knowledge_base_name
 
 @spaces.GPU
@@ -128,7 +130,8 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
         for line in f:
             md5s.append(line.rstrip('\n'))
 
-    doc_reps = np.load(os.path.join(target_cache_dir, f"reps.npy"))
+    doc_list = [f for f in os.listdir(target_cache_dir) if f.endswith('.npy')]
+    doc_list = sorted(doc_list)
 
     query_with_instruction = "Represent this query for retrieving relevant document: " + query
     with torch.no_grad():
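
Note: the hunk above swaps the single reps.npy load for a sorted list of per-PDF .npy files, mirroring the per-file np.save in the new add_pdf_gradio. The rest of retrieve_gradio is not shown in this commit, so the snippet below is only a rough sketch of how those arrays could be stitched back into one page-level matrix; the helper name load_all_page_reps and the axis-0 page layout are assumptions, not code from app.py.

import os
import numpy as np

def load_all_page_reps(target_cache_dir):
    # Hypothetical helper (not part of this commit): rebuild one flat array of
    # page representations from the per-PDF .npy files written by add_pdf_gradio.
    doc_list = sorted(f for f in os.listdir(target_cache_dir) if f.endswith('.npy'))
    per_pdf_reps = [np.load(os.path.join(target_cache_dir, name)) for name in doc_list]
    # Assumes each .npy loads as an array whose first axis is the page index.
    return np.concatenate(per_pdf_reps, axis=0)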
 
@@ -262,7 +265,7 @@ with gr.Blocks() as app:
     gr.Markdown("Thank you very much to [@bokesyo](https://huggingface.co/bokesyo) for writing the code.")
 
     with gr.Row():
-        file_input = gr.File(file_types=["pdf"], label="Step 1: Upload PDF")
+        file_input = gr.File(file_types=["pdf"], file_count="multiple", label="Step 1: Upload PDF")
         file_result = gr.Text(label="Knowledge Base ID (remember it, it is re-usable!)")
         process_button = gr.Button("Process PDF (Don't click until PDF uploaded successfully)")
 
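
The last hunk turns the upload widget into a multi-file input. With file_count="multiple", Gradio passes the click handler a list of uploaded files rather than a single path, which is what the new add_pdf_gradio(pdf_file_list, ...) signature expects. The event wiring itself is not part of this diff; the sketch below is a stripped-down illustration of that interaction (the stub handler and the demo block are hypothetical, not lines from app.py), assuming a Gradio version where gr.File hands back file paths.

import gradio as gr

# Stub standing in for the real add_pdf_gradio: it only reports how many file
# paths Gradio delivered once file_count="multiple" is set on the gr.File input.
def add_pdf_gradio(pdf_file_list):
    return f"received {len(pdf_file_list or [])} file(s)"

with gr.Blocks() as demo:
    file_input = gr.File(file_types=["pdf"], file_count="multiple", label="Step 1: Upload PDF")
    file_result = gr.Text(label="Knowledge Base ID")
    process_button = gr.Button("Process PDF")
    # Because the component holds a list, the handler receives every uploaded
    # PDF in a single call, matching the loop over pdf_file_list in app.py.
    process_button.click(add_pdf_gradio, inputs=file_input, outputs=file_result)

if __name__ == "__main__":
    demo.launch()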