Spaces:
Runtime error
Runtime error
improve OCR quality
Browse files
llm.py
CHANGED
@@ -118,31 +118,65 @@ class LLM:
|
|
118 |
def _process_pdf_img(self, pdf_fn: str):
|
119 |
pdf = fitz.open(pdf_fn)
|
120 |
message_parts = []
|
|
|
|
|
|
|
|
|
121 |
|
122 |
for page in pdf.pages():
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
|
|
|
|
128 |
|
129 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
130 |
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
|
131 |
|
132 |
-
#
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
message_parts.append({"image": {
|
140 |
-
"format": "
|
141 |
-
"source": {"bytes":
|
142 |
}})
|
143 |
|
144 |
pdf.close()
|
145 |
-
|
146 |
return message_parts
|
147 |
|
148 |
def _encode_image(self, image_data):
|
|
|
118 |
def _process_pdf_img(self, pdf_fn: str):
    """Render each page of a PDF to a size-capped WebP image.

    Every page is rasterised, shrunk until it satisfies both the
    long-edge and the estimated-token budget, and then WebP-encoded.
    Pages are never upscaled: rendering starts at scale 1.0 (or at the
    scale already found for an identically sized page) and only shrinks.

    Args:
        pdf_fn: Path of the PDF file to open.

    Returns:
        A flat list with two entries per page, in page order: a
        ``{"text": ...}`` caption naming the page and file, followed by
        an ``{"image": {"format": "webp", "source": {"bytes": ...}}}``
        entry holding the encoded page.

    Raises:
        Whatever ``fitz.open``, the page rendering or the image encoding
        raise; the document is closed before the exception propagates.
    """
    # NOTE(review): these limits look like a vision-model provider's image
    # guidance (long edge, ~pixels/750 tokens, 5 MiB payload) -- confirm
    # against the API this message list is sent to.
    max_long_edge = 1568
    max_tokens = 1600
    pixels_per_token = 750
    max_image_bytes = 5 * 1024 * 1024
    start_quality = 95
    min_quality = 20
    # Each retry shrinks by at least 10% so the fitting loop always converges.
    max_shrink_step = 0.9

    def calculate_tokens(width, height):
        """Estimate the token cost of an image of the given pixel size."""
        return (width * height) / pixels_per_token

    message_parts = []
    page_scales = {}  # (page width, page height) -> scale that fit

    pdf = fitz.open(pdf_fn)
    try:
        for page in pdf.pages():
            page_rect = page.rect
            page_key = (page_rect.width, page_rect.height)

            # Same-sized pages reuse the scale found earlier as a starting
            # point, which usually avoids re-rendering them several times.
            scale = page_scales.get(page_key, 1.0)

            while True:
                pix = page.get_pixmap(
                    matrix=fitz.Matrix(scale, scale), alpha=False
                )

                # Judge the rendered pixmap, not the requested size: the
                # renderer rounds dimensions to whole pixels.
                actual_tokens = calculate_tokens(pix.width, pix.height)
                actual_long_edge = max(pix.width, pix.height)

                if (
                    actual_long_edge <= max_long_edge
                    and actual_tokens <= max_tokens
                ):
                    page_scales.setdefault(page_key, scale)
                    break

                # Address the long-edge violation first; otherwise shrink
                # by the square root of the area (token) overshoot.
                if actual_long_edge > max_long_edge:
                    scale_factor = min(
                        max_long_edge / actual_long_edge, max_shrink_step
                    )
                else:
                    scale_factor = min(
                        math.sqrt(max_tokens / actual_tokens),
                        max_shrink_step,
                    )
                scale *= scale_factor

            img = Image.frombytes(
                "RGB", [pix.width, pix.height], pix.samples
            )

            # Lower the WebP quality until the payload fits. At the minimum
            # quality the image is accepted even if it is still too large.
            quality = start_quality
            while True:
                buffer = io.BytesIO()
                img.save(buffer, format="webp", quality=quality)
                img_bytes = buffer.getvalue()

                if len(img_bytes) <= max_image_bytes or quality <= min_quality:
                    break

                quality = max(int(quality * 0.9), min_quality)

            message_parts.append(
                {"text": f"Page {page.number + 1} of file '{pdf_fn}'"}
            )
            message_parts.append({"image": {
                "format": "webp",
                "source": {"bytes": img_bytes},
            }})
    finally:
        # Release the document handle even when rendering or encoding fails.
        pdf.close()

    return message_parts
|
181 |
|
182 |
def _encode_image(self, image_data):
|