ndurner committed on
Commit
fd9a199
·
1 Parent(s): 8b1250d

improve OCR quality

Browse files
Files changed (1) hide show
  1. llm.py +50 -16
llm.py CHANGED
@@ -118,31 +118,65 @@ class LLM:
118
  def _process_pdf_img(self, pdf_fn: str):
119
  pdf = fitz.open(pdf_fn)
120
  message_parts = []
 
 
 
 
121
 
122
  for page in pdf.pages():
123
- # Create a transformation matrix for rendering at the calculated scale
124
- mat = fitz.Matrix(0.6, 0.6)
125
-
126
- # Render the page to a pixmap
127
- pix = page.get_pixmap(matrix=mat, alpha=False)
 
 
128
 
129
- # Convert pixmap to PIL Image
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
131
 
132
- # Convert PIL Image to bytes
133
- img_byte_arr = io.BytesIO()
134
- img.save(img_byte_arr, format='PNG')
135
- img_byte_arr = img_byte_arr.getvalue()
136
-
137
- # Append the message parts
138
- message_parts.append({"text": f"Page {page.number} of file '{pdf_fn}'"})
 
 
 
 
 
 
139
  message_parts.append({"image": {
140
- "format": "png",
141
- "source": {"bytes": img_byte_arr}
142
  }})
143
 
144
  pdf.close()
145
-
146
  return message_parts
147
 
148
  def _encode_image(self, image_data):
 
118
  def _process_pdf_img(self, pdf_fn: str):
119
  pdf = fitz.open(pdf_fn)
120
  message_parts = []
121
+ page_scales = {} # Cache for similar page sizes
122
+
123
+ def calculate_tokens(width, height):
124
+ return (width * height) / 750
125
 
126
  for page in pdf.pages():
127
+ page_rect = page.rect
128
+ orig_width = page_rect.width
129
+ orig_height = page_rect.height
130
+ page_key = (orig_width, orig_height)
131
+
132
+ # Use cached scale as starting point if available
133
+ scale = page_scales.get(page_key, 1.0)
134
 
135
+ while True:
136
+ # Render with current scale
137
+ mat = fitz.Matrix(scale, scale)
138
+ pix = page.get_pixmap(matrix=mat, alpha=False)
139
+
140
+ # Check actual rendered dimensions
141
+ actual_tokens = calculate_tokens(pix.width, pix.height)
142
+ actual_long_edge = max(pix.width, pix.height)
143
+
144
+ if actual_long_edge <= 1568 and actual_tokens <= 1600:
145
+ # We found a good scale, cache it
146
+ if page_key not in page_scales:
147
+ page_scales[page_key] = scale
148
+ break
149
+
150
+ # Calculate new scale factor based on both constraints
151
+ if actual_long_edge > 1568:
152
+ scale_factor = min(1568 / actual_long_edge, 0.9)
153
+ else:
154
+ scale_factor = min(math.sqrt(1600 / actual_tokens), 0.9)
155
+
156
+ scale *= scale_factor
157
+
158
+ # Convert to PIL Image
159
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
160
 
161
+ # Handle compression
162
+ quality = 95
163
+ while True:
164
+ buffer = io.BytesIO()
165
+ img.save(buffer, format="webp", quality=quality)
166
+ img_bytes = buffer.getvalue()
167
+
168
+ if len(img_bytes) <= 5 * 1024 * 1024 or quality <= 20:
169
+ break
170
+
171
+ quality = max(int(quality * 0.9), 20)
172
+
173
+ message_parts.append({"text": f"Page {page.number + 1} of file '{pdf_fn}'"})
174
  message_parts.append({"image": {
175
+ "format": "webp",
176
+ "source": {"bytes": img_bytes}
177
  }})
178
 
179
  pdf.close()
 
180
  return message_parts
181
 
182
  def _encode_image(self, image_data):