Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -7,7 +7,7 @@ import re
|
|
7 |
import base64
|
8 |
import mimetypes
|
9 |
from datasets import Dataset
|
10 |
-
from huggingface_hub import HfApi,
|
11 |
import huggingface_hub
|
12 |
import os
|
13 |
from mistralai import Mistral
|
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
|
|
18 |
|
19 |
# --- Mistral OCR Setup ---
|
20 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
21 |
-
hf_token_global = None
|
22 |
client = None
|
23 |
|
24 |
if not api_key:
|
@@ -112,18 +112,33 @@ def perform_ocr_file(file_obj: Any) -> Tuple[str, str, Dict[str, str]]:
|
|
112 |
uploaded_file_id = None
|
113 |
|
114 |
if file_ext == '.pdf':
|
115 |
-
|
|
|
|
|
|
|
116 |
logger.info(f"Uploading PDF {file_name} to Mistral...")
|
117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
uploaded_file_id = uploaded_pdf.id
|
|
|
|
|
119 |
signed_url_response = client.files.get_signed_url(file_id=uploaded_file_id)
|
120 |
ocr_response = client.ocr.process(
|
121 |
model="mistral-ocr-latest",
|
122 |
document={"type": "document_url", "document_url": signed_url_response.url},
|
123 |
include_image_base64=True
|
124 |
)
|
125 |
-
|
126 |
-
|
|
|
|
|
|
|
|
|
127 |
|
128 |
elif file_ext in ['.png', '.jpg', '.jpeg', '.webp', '.bmp']:
|
129 |
with open(file_path, "rb") as f:
|
|
|
7 |
import base64
|
8 |
import mimetypes
|
9 |
from datasets import Dataset
|
10 |
+
from huggingface_hub import HfApi, get_token
|
11 |
import huggingface_hub
|
12 |
import os
|
13 |
from mistralai import Mistral
|
|
|
18 |
|
19 |
# --- Mistral OCR Setup ---
|
20 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
21 |
+
hf_token_global = None
|
22 |
client = None
|
23 |
|
24 |
if not api_key:
|
|
|
112 |
uploaded_file_id = None
|
113 |
|
114 |
if file_ext == '.pdf':
|
115 |
+
try:
|
116 |
+
with open(file_path, "rb") as f:
|
117 |
+
file_content = f.read()
|
118 |
+
|
119 |
logger.info(f"Uploading PDF {file_name} to Mistral...")
|
120 |
+
files = {
|
121 |
+
"file": (file_name, file_content, "application/pdf")
|
122 |
+
}
|
123 |
+
uploaded_pdf = client.files.upload(
|
124 |
+
file=files["file"],
|
125 |
+
purpose="ocr"
|
126 |
+
)
|
127 |
uploaded_file_id = uploaded_pdf.id
|
128 |
+
logger.info(f"PDF uploaded successfully. File ID: {uploaded_file_id}")
|
129 |
+
|
130 |
signed_url_response = client.files.get_signed_url(file_id=uploaded_file_id)
|
131 |
ocr_response = client.ocr.process(
|
132 |
model="mistral-ocr-latest",
|
133 |
document={"type": "document_url", "document_url": signed_url_response.url},
|
134 |
include_image_base64=True
|
135 |
)
|
136 |
+
finally:
|
137 |
+
if uploaded_file_id:
|
138 |
+
try:
|
139 |
+
client.files.delete(file_id=uploaded_file_id)
|
140 |
+
except Exception as delete_err:
|
141 |
+
logger.warning(f"Failed to delete temporary file {uploaded_file_id}: {delete_err}")
|
142 |
|
143 |
elif file_ext in ['.png', '.jpg', '.jpeg', '.webp', '.bmp']:
|
144 |
with open(file_path, "rb") as f:
|