Svngoku committed on
Commit
d8af6ac
·
verified ·
1 Parent(s): f4eeb19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -6
app.py CHANGED
@@ -7,7 +7,7 @@ import re
7
  import base64
8
  import mimetypes
9
  from datasets import Dataset
10
- from huggingface_hub import HfApi, login, get_token
11
  import huggingface_hub
12
  import os
13
  from mistralai import Mistral
@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
18
 
19
  # --- Mistral OCR Setup ---
20
  api_key = os.environ.get("MISTRAL_API_KEY")
21
- hf_token_global = None # Store HF token globally
22
  client = None
23
 
24
  if not api_key:
@@ -112,18 +112,33 @@ def perform_ocr_file(file_obj: Any) -> Tuple[str, str, Dict[str, str]]:
112
  uploaded_file_id = None
113
 
114
  if file_ext == '.pdf':
115
- with open(file_path, "rb") as f:
 
 
 
116
  logger.info(f"Uploading PDF {file_name} to Mistral...")
117
- uploaded_pdf = client.files.upload(file=(file_name, f), purpose="ocr")
 
 
 
 
 
 
118
  uploaded_file_id = uploaded_pdf.id
 
 
119
  signed_url_response = client.files.get_signed_url(file_id=uploaded_file_id)
120
  ocr_response = client.ocr.process(
121
  model="mistral-ocr-latest",
122
  document={"type": "document_url", "document_url": signed_url_response.url},
123
  include_image_base64=True
124
  )
125
- if uploaded_file_id:
126
- client.files.delete(file_id=uploaded_file_id)
 
 
 
 
127
 
128
  elif file_ext in ['.png', '.jpg', '.jpeg', '.webp', '.bmp']:
129
  with open(file_path, "rb") as f:
 
7
  import base64
8
  import mimetypes
9
  from datasets import Dataset
10
+ from huggingface_hub import HfApi, get_token
11
  import huggingface_hub
12
  import os
13
  from mistralai import Mistral
 
18
 
19
  # --- Mistral OCR Setup ---
20
  api_key = os.environ.get("MISTRAL_API_KEY")
21
+ hf_token_global = None
22
  client = None
23
 
24
  if not api_key:
 
112
  uploaded_file_id = None
113
 
114
  if file_ext == '.pdf':
115
+ try:
116
+ with open(file_path, "rb") as f:
117
+ file_content = f.read()
118
+
119
  logger.info(f"Uploading PDF {file_name} to Mistral...")
120
+ files = {
121
+ "file": (file_name, file_content, "application/pdf")
122
+ }
123
+ uploaded_pdf = client.files.upload(
124
+ file=files["file"],
125
+ purpose="ocr"
126
+ )
127
  uploaded_file_id = uploaded_pdf.id
128
+ logger.info(f"PDF uploaded successfully. File ID: {uploaded_file_id}")
129
+
130
  signed_url_response = client.files.get_signed_url(file_id=uploaded_file_id)
131
  ocr_response = client.ocr.process(
132
  model="mistral-ocr-latest",
133
  document={"type": "document_url", "document_url": signed_url_response.url},
134
  include_image_base64=True
135
  )
136
+ finally:
137
+ if uploaded_file_id:
138
+ try:
139
+ client.files.delete(file_id=uploaded_file_id)
140
+ except Exception as delete_err:
141
+ logger.warning(f"Failed to delete temporary file {uploaded_file_id}: {delete_err}")
142
 
143
  elif file_ext in ['.png', '.jpg', '.jpeg', '.webp', '.bmp']:
144
  with open(file_path, "rb") as f: