tcy6 committed
Commit d111a09 · 1 Parent(s): 4784e2e

Upload app.py

Files changed (1): app.py (+46 -43)
app.py CHANGED
@@ -58,58 +58,60 @@ def get_image_md5(img: Image.Image):
     hex_digest = hash_md5.hexdigest()
     return hex_digest
 
-def calculate_md5_from_pdf_path(pdf_file_path):
-    hash_md5 = hashlib.md5()
-    with open(pdf_file_path, "rb") as f:
-        file_content = f.read()
-        hash_md5.update(file_content)
-    return hash_md5.hexdigest()
-
 @spaces.GPU
-def add_pdf_gradio(pdf_file_path, progress=gr.Progress()):
+def add_pdf_gradio(pdf_file_list, progress=gr.Progress()):
     global model, tokenizer
     model.eval()
-    print(f"pdf_file_path: {pdf_file_path}")
-
-    knowledge_base_name = calculate_md5_from_pdf_path(pdf_file_path)
+
+    print(pdf_file_list)
 
+    pdf_file_list = sorted(pdf_file_list)
+    knowledge_base_name = str(int(time.time()))
     this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
     os.makedirs(this_cache_dir, exist_ok=True)
+    global_image_md5s = []
 
-    with open(os.path.join(this_cache_dir, f"src.pdf"), 'wb') as file1:
-        with open(pdf_file_path, "rb") as file2:
-            file1.write(file2.read())
+    for pdf_file_path in pdf_file_list:
+        with open(os.path.join(this_cache_dir, os.path.basename(pdf_file_path)), 'wb') as file1:
+            with open(pdf_file_path, "rb") as file2:
+                file1.write(file2.read())
 
-    dpi = 200
-    doc = fitz.open(pdf_file_path)
-
-    reps_list = []
-    images = []
-    image_md5s = []
-
-    for page in progress.tqdm(doc):
-        # with self.lock: # because we hope one 16G gpu only process one image at the same time
-        pix = page.get_pixmap(dpi=dpi)
-        image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-        image_md5 = get_image_md5(image)
-        image_md5s.append(image_md5)
-        with torch.no_grad():
-            reps = encode([image])
-        reps_list.append(reps)
-        images.append(image)
-
-    for idx in range(len(images)):
-        image = images[idx]
-        image_md5 = image_md5s[idx]
-        cache_image_path = os.path.join(this_cache_dir, f"{image_md5}.png")
-        image.save(cache_image_path)
-
-    np.save(os.path.join(this_cache_dir, f"reps.npy"), reps_list)
+    for pdf_file_path in pdf_file_list:
+
+        print(f"Processing {pdf_file_path}")
+
+        dpi = 200
+        doc = fitz.open(pdf_file_path)
+
+        image_md5s = []
+        reps_list = []
+        images = []
+
+        for page in progress.tqdm(doc):
+            # with self.lock: # because we hope one 16G gpu only process one image at the same time
+            pix = page.get_pixmap(dpi=dpi)
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            image_md5 = get_image_md5(image)
+            image_md5s.append(image_md5)
+            with torch.no_grad():
+                reps = encode([image])
+            reps_list.append(reps)
+            images.append(image)
+
+        for idx in range(len(images)):
+            image = images[idx]
+            image_md5 = image_md5s[idx]
+            cache_image_path = os.path.join(this_cache_dir, f"{image_md5}.png")
+            image.save(cache_image_path)
+
+        np.save(os.path.join(this_cache_dir, f"{os.path.basename(pdf_file_path).split('.')[0]}.npy"), reps_list)
+
+        global_image_md5s.extend(image_md5s)
 
     with open(os.path.join(this_cache_dir, f"md5s.txt"), 'w') as f:
-        for item in image_md5s:
+        for item in global_image_md5s:
             f.write(item+'\n')
-
+
     return knowledge_base_name
 
 @spaces.GPU
@@ -128,7 +130,8 @@ def retrieve_gradio(knowledge_base: str, query: str, topk: int):
         for line in f:
             md5s.append(line.rstrip('\n'))
 
-    doc_reps = np.load(os.path.join(target_cache_dir, f"reps.npy"))
+    doc_list = [f for f in os.listdir(target_cache_dir) if f.endswith('.npy')]
+    doc_list = sorted(doc_list)
 
     query_with_instruction = "Represent this query for retrieving relevant document: " + query
     with torch.no_grad():
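
Note: the hunk above swaps the single reps.npy load for a sorted list of per-PDF .npy files, mirroring the per-file np.save in the new add_pdf_gradio. The rest of retrieve_gradio is not shown in this commit, so the snippet below is only a rough sketch of how those arrays could be stitched back into one page-level matrix; the helper name load_all_page_reps and the axis-0 page layout are assumptions, not code from app.py.

import os
import numpy as np

def load_all_page_reps(target_cache_dir):
    # Hypothetical helper (not part of this commit): rebuild one flat array of
    # page representations from the per-PDF .npy files written by add_pdf_gradio.
    doc_list = sorted(f for f in os.listdir(target_cache_dir) if f.endswith('.npy'))
    per_pdf_reps = [np.load(os.path.join(target_cache_dir, name)) for name in doc_list]
    # Assumes each .npy loads as an array whose first axis is the page index.
    return np.concatenate(per_pdf_reps, axis=0)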
 
@@ -262,7 +265,7 @@ with gr.Blocks() as app:
     gr.Markdown("Thank you very much to [@bokesyo](https://huggingface.co/bokesyo) for writing the code.")
 
     with gr.Row():
-        file_input = gr.File(file_types=["pdf"], label="Step 1: Upload PDF")
+        file_input = gr.File(file_types=["pdf"], file_count="multiple", label="Step 1: Upload PDF")
         file_result = gr.Text(label="Knowledge Base ID (remember it, it is re-usable!)")
         process_button = gr.Button("Process PDF (Don't click until PDF uploaded successfully)")
 
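
The last hunk turns the upload widget into a multi-file input. With file_count="multiple", Gradio passes the click handler a list of uploaded files rather than a single path, which is what the new add_pdf_gradio(pdf_file_list, ...) signature expects. The event wiring itself is not part of this diff; the sketch below is a stripped-down illustration of that interaction (the stub handler and the demo block are hypothetical, not lines from app.py), assuming a Gradio version where gr.File hands back file paths.

import gradio as gr

# Stub standing in for the real add_pdf_gradio: it only reports how many file
# paths Gradio delivered once file_count="multiple" is set on the gr.File input.
def add_pdf_gradio(pdf_file_list):
    return f"received {len(pdf_file_list or [])} file(s)"

with gr.Blocks() as demo:
    file_input = gr.File(file_types=["pdf"], file_count="multiple", label="Step 1: Upload PDF")
    file_result = gr.Text(label="Knowledge Base ID")
    process_button = gr.Button("Process PDF")
    # Because the component holds a list, the handler receives every uploaded
    # PDF in a single call, matching the loop over pdf_file_list in app.py.
    process_button.click(add_pdf_gradio, inputs=file_input, outputs=file_result)

if __name__ == "__main__":
    demo.launch()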