# File size: 9,037 bytes (revision 3ade8fc)
import requests
import logging
import os
import sys
from typing import List, Dict, Any
from langchain_core.documents import Document
from open_webui.env import SRC_LOG_LEVELS, GLOBAL_LOG_LEVEL
# Configure root logging to write to stdout at the level given by
# GLOBAL_LOG_LEVEL (imported from open_webui.env).
logging.basicConfig(stream=sys.stdout, level=GLOBAL_LOG_LEVEL)
# Module-level logger; its level comes from the "RAG" entry of SRC_LOG_LEVELS.
log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["RAG"])
class MistralLoader:
    """
    Loads documents by processing them through the Mistral OCR API.

    The workflow run by ``load()`` is: upload the local file, request a
    temporary signed URL for it, submit that URL to the OCR endpoint, turn
    each returned page into a ``Document``, and finally delete the uploaded
    file again.
    """

    BASE_API_URL = "https://api.mistral.ai/v1"

    def __init__(self, api_key: str, file_path: str, timeout: float = 300.0):
        """
        Initializes the loader.

        Args:
            api_key: Your Mistral API key.
            file_path: The local path to the PDF file to process.
            timeout: Seconds to wait for each HTTP request before giving up.
                Without a timeout ``requests`` waits indefinitely, so a
                stalled connection would block ``load()`` forever.

        Raises:
            ValueError: If ``api_key`` is empty or ``timeout`` is not positive.
            FileNotFoundError: If ``file_path`` does not exist.
        """
        if not api_key:
            raise ValueError("API key cannot be empty.")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found at {file_path}")
        if timeout <= 0:
            raise ValueError("Timeout must be a positive number of seconds.")

        self.api_key = api_key
        self.file_path = file_path
        self.timeout = timeout
        self.headers = {"Authorization": f"Bearer {self.api_key}"}

    def _handle_response(self, response: requests.Response) -> Dict[str, Any]:
        """
        Checks response status and returns the JSON body as a dict.

        Returns:
            The decoded JSON object, or an empty dict when the response has
            status 204 or no body.

        Raises:
            requests.exceptions.HTTPError: For 4xx/5xx responses.
            ValueError: If the body is not valid JSON or is not a JSON object.
        """
        try:
            response.raise_for_status()  # Raises HTTPError for bad responses (4xx or 5xx)
            # Handle potential empty responses for certain successful requests (e.g., DELETE)
            if response.status_code == 204 or not response.content:
                return {}
            payload = response.json()
        except requests.exceptions.HTTPError as http_err:
            log.error(
                "HTTP error occurred: %s - Response: %s", http_err, response.text
            )
            raise
        except ValueError as json_err:
            # Must come before the RequestException clause: the decode error
            # raised by Response.json() is both a ValueError and a
            # RequestException, so the broader clause would otherwise shadow
            # this one and the response text would never be logged.
            log.error(
                "JSON decode error: %s - Response: %s", json_err, response.text
            )
            raise
        except requests.exceptions.RequestException as req_err:
            log.error("Request exception occurred: %s", req_err)
            raise

        # Callers use .get() on the result, so fail clearly on non-object JSON.
        if not isinstance(payload, dict):
            raise ValueError(
                f"Expected a JSON object in response, got {type(payload).__name__}."
            )
        return payload

    def _upload_file(self) -> str:
        """
        Uploads the file to Mistral for OCR processing.

        Returns:
            The file ID reported by the upload response.

        Raises:
            ValueError: If the upload response contains no ``id``.
            Exception: Any I/O or HTTP error is logged and re-raised.
        """
        log.info("Uploading file to Mistral API")
        url = f"{self.BASE_API_URL}/files"
        file_name = os.path.basename(self.file_path)

        try:
            with open(self.file_path, "rb") as f:
                response = requests.post(
                    url,
                    headers=self.headers.copy(),  # Avoid modifying self.headers
                    files={"file": (file_name, f, "application/pdf")},
                    data={"purpose": "ocr"},
                    timeout=self.timeout,
                )

            response_data = self._handle_response(response)
            file_id = response_data.get("id")
            if not file_id:
                raise ValueError("File ID not found in upload response.")
            log.info("File uploaded successfully. File ID: %s", file_id)
            return file_id
        except Exception as e:
            log.error("Failed to upload file: %s", e)
            raise

    def _get_signed_url(self, file_id: str) -> str:
        """
        Retrieves a temporary signed URL for the uploaded file.

        Args:
            file_id: ID of a previously uploaded file.

        Returns:
            The signed URL from the response.

        Raises:
            ValueError: If the response contains no ``url``.
            Exception: Any HTTP error is logged and re-raised.
        """
        log.info("Getting signed URL for file ID: %s", file_id)
        url = f"{self.BASE_API_URL}/files/{file_id}/url"
        params = {"expiry": 1}
        signed_url_headers = {**self.headers, "Accept": "application/json"}

        try:
            response = requests.get(
                url,
                headers=signed_url_headers,
                params=params,
                timeout=self.timeout,
            )
            response_data = self._handle_response(response)
            signed_url = response_data.get("url")
            if not signed_url:
                raise ValueError("Signed URL not found in response.")
            log.info("Signed URL received.")
            return signed_url
        except Exception as e:
            log.error("Failed to get signed URL: %s", e)
            raise

    def _process_ocr(self, signed_url: str) -> Dict[str, Any]:
        """
        Sends the signed URL to the OCR endpoint for processing.

        Args:
            signed_url: URL from which the OCR service fetches the document.

        Returns:
            The decoded OCR response.

        Raises:
            Exception: Any HTTP or decoding error is logged and re-raised.
        """
        log.info("Processing OCR via Mistral API")
        url = f"{self.BASE_API_URL}/ocr"
        ocr_headers = {
            **self.headers,
            "Content-Type": "application/json",
            "Accept": "application/json",
        }
        payload = {
            "model": "mistral-ocr-latest",
            "document": {
                "type": "document_url",
                "document_url": signed_url,
            },
            "include_image_base64": False,
        }

        try:
            response = requests.post(
                url, headers=ocr_headers, json=payload, timeout=self.timeout
            )
            ocr_response = self._handle_response(response)
            log.info("OCR processing done.")
            log.debug("OCR response: %s", ocr_response)
            return ocr_response
        except Exception as e:
            log.error("Failed during OCR processing: %s", e)
            raise

    def _delete_file(self, file_id: str) -> None:
        """
        Deletes the file from Mistral storage.

        This is best-effort cleanup: any failure is logged and swallowed so
        that it never masks the outcome of the OCR workflow itself.

        Args:
            file_id: ID of the uploaded file to remove.
        """
        log.info("Deleting uploaded file ID: %s", file_id)
        url = f"{self.BASE_API_URL}/files/{file_id}"

        try:
            response = requests.delete(
                url, headers=self.headers, timeout=self.timeout
            )
            delete_response = self._handle_response(response)
            log.info("File deleted successfully: %s", delete_response)
        except Exception as e:
            # Log error but don't halt execution if deletion fails
            log.error("Failed to delete file ID %s: %s", file_id, e)

    def _build_documents(self, pages_data: List[Dict[str, Any]]) -> List[Document]:
        """Converts OCR page entries into Documents, skipping incomplete pages."""
        total_pages = len(pages_data)
        documents = []
        for page_data in pages_data:
            page_content = page_data.get("markdown")
            page_index = page_data.get("index")  # API uses 0-based index
            if page_content is None or page_index is None:
                log.warning(
                    "Skipping page due to missing 'markdown' or 'index'. Data: %s",
                    page_data,
                )
                continue
            documents.append(
                Document(
                    page_content=page_content,
                    metadata={
                        "page": page_index,  # 0-based index from API
                        "page_label": page_index + 1,  # 1-based label
                        "total_pages": total_pages,
                    },
                )
            )
        return documents

    def load(self) -> List[Document]:
        """
        Executes the full OCR workflow: upload, get URL, process OCR, delete file.

        Failures are not raised to the caller. Any exception during the
        workflow is logged and reported as a single Document whose content
        describes the error.

        Returns:
            A list of Document objects, one for each page processed, or a
            single placeholder Document when no usable page was returned or
            an error occurred.
        """
        file_id = None
        try:
            # 1. Upload file
            file_id = self._upload_file()
            # 2. Get Signed URL
            signed_url = self._get_signed_url(file_id)
            # 3. Process OCR
            ocr_response = self._process_ocr(signed_url)

            # 4. Process results
            pages_data = ocr_response.get("pages")
            if not pages_data:
                log.warning("No pages found in OCR response.")
                return [Document(page_content="No text content found", metadata={})]

            documents = self._build_documents(pages_data)
            if not documents:
                # Case where pages existed but none had valid markdown/index
                log.warning(
                    "OCR response contained pages, but none had valid content/index."
                )
                return [
                    Document(
                        page_content="No text content found in valid pages",
                        metadata={},
                    )
                ]
            return documents
        except Exception as e:
            log.error("An error occurred during the loading process: %s", e)
            return [Document(page_content=f"Error during processing: {e}", metadata={})]
        finally:
            # 5. Delete file (attempt even if prior steps failed after upload).
            # _delete_file logs and swallows its own failures, so no extra
            # guard is needed here and the original result/error is preserved.
            if file_id:
                self._delete_file(file_id)
|