Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,11 +11,73 @@ from huggingface_hub import HfApi, get_token
|
|
11 |
import huggingface_hub
|
12 |
import os
|
13 |
from mistralai import Mistral
|
|
|
14 |
|
15 |
# Configure logging
|
16 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
17 |
logger = logging.getLogger(__name__)
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
# --- Mistral OCR Setup ---
|
20 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
21 |
hf_token_global = None
|
@@ -286,12 +348,16 @@ def get_hf_token(explicit_token: str = None) -> str:
|
|
286 |
return None
|
287 |
|
288 |
def process_file_and_save(
|
289 |
-
file_objs:
|
290 |
strip_headers: bool, hf_token: str, repo_name: str
|
291 |
) -> str:
|
292 |
"""Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
|
|
|
293 |
if not file_objs:
|
294 |
return "Error: No files uploaded."
|
|
|
|
|
|
|
295 |
if not repo_name or '/' not in repo_name:
|
296 |
return "Error: Invalid repository name (use 'username/dataset-name')."
|
297 |
|
@@ -443,6 +509,12 @@ with gr.Blocks(title="Mistral OCR & Dataset Creator",
|
|
443 |
gr.Markdown("*Requires MISTRAL_API_KEY or HF token*")
|
444 |
|
445 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
446 |
initial_token = get_hf_token()
|
447 |
if not initial_token and not client:
|
448 |
print("\nWARNING: Neither Mistral API key nor HF token found.")
|
@@ -452,4 +524,4 @@ if __name__ == "__main__":
|
|
452 |
share=os.getenv('GRADIO_SHARE', 'False').lower() == 'true',
|
453 |
debug=True,
|
454 |
auth_message="Provide a valid Hugging Face token if prompted"
|
455 |
-
)
|
|
|
11 |
import huggingface_hub
|
12 |
import os
|
13 |
from mistralai import Mistral
|
14 |
+
import gradio_client.utils as client_utils
|
15 |
|
16 |
# Configure logging
|
17 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
20 |
+
# --- Patch Gradio's get_type function to handle boolean schemas ---
def patched_get_type(schema: Any) -> str:
    """Patched version of get_type to handle boolean schemas.

    Translates a single JSON-schema fragment into a Python-type string.
    Unlike the stock gradio_client helper, a bare boolean schema
    (``true`` / ``false`` in JSON Schema) is reported as ``"bool"``
    instead of raising.
    """
    # Boolean schemas are legal JSON Schema; short-circuit before any
    # dict-style membership tests, which would raise TypeError on a bool.
    if isinstance(schema, bool):
        return "bool"
    if "const" in schema:
        return f"Literal[{schema['const']!r}]"
    if "enum" in schema:
        literals = ", ".join(repr(choice) for choice in schema["enum"])
        return f"Literal[{literals}]"
    if "type" not in schema:
        return "Any"
    declared = schema["type"]
    if isinstance(declared, list):
        # Multi-type schemas: drop "null" and join the rest as a Union.
        non_null = [t for t in declared if t != "null"]
        return f"Union[{', '.join(non_null)}]"
    if declared == "array":
        item_schema = schema.get("items", {})
        inner = patched_json_schema_to_python_type(item_schema, schema.get("$defs"))
        return f"List[{inner}]"
    # Scalar JSON types map directly onto Python names; any unrecognized
    # type string is passed through unchanged (same as the original).
    scalar_map = {
        "object": "Dict[str, Any]",
        "null": "None",
        "integer": "int",
        "number": "float",
        "boolean": "bool",
    }
    return scalar_map.get(declared, declared)
|
48 |
+
|
49 |
+
def patched_json_schema_to_python_type(schema: Any, defs: Dict[str, Any] = None) -> str:
    """Patched version of json_schema_to_python_type to use patched_get_type.

    Recursively converts a JSON-schema fragment into a Python-type string
    for Gradio's API-docs rendering.

    Args:
        schema: A JSON-schema fragment — a dict, or a bare boolean
            (``true``/``false`` are valid schemas per the JSON Schema spec).
        defs: Optional ``$defs`` mapping used to resolve ``$ref`` entries.

    Returns:
        A Python type expression as a string (e.g. ``"List[int]"``).
    """
    defs = defs or {}
    # BUGFIX: a boolean schema previously crashed at `"$ref" in schema`
    # (TypeError: argument of type 'bool' is not iterable) before ever
    # reaching the bool-aware patched_get_type. `true` accepts any value;
    # `false` already mapped to "Any" via the falsy check below, so
    # returning "Any" for both is backward-compatible.
    if isinstance(schema, bool):
        return "Any"
    if not schema:
        return "Any"
    if "$ref" in schema:
        # Resolve "#/$defs/Name" style references; unknown refs fall back
        # to an empty schema, i.e. "Any".
        ref = schema["$ref"].split("/")[-1]
        return patched_json_schema_to_python_type(defs.get(ref, {}), defs)
    if "anyOf" in schema:
        types = [
            patched_json_schema_to_python_type(s, defs) for s in schema["anyOf"]
        ]
        # "None" members are dropped, matching Optional-style flattening.
        return f"Union[{', '.join(t for t in types if t != 'None')}]"
    if "type" in schema and schema["type"] == "array":
        items = schema.get("items", {})
        elements = patched_json_schema_to_python_type(items, defs)
        return f"List[{elements}]"
    if "type" in schema and schema["type"] == "object":
        if "properties" in schema:
            des = [
                f"{n}: {patched_json_schema_to_python_type(v, defs)}{client_utils.get_desc(v)}"
                for n, v in schema["properties"].items()
            ]
            return f"Dict[str, Union[{', '.join(des)}]]"
        if "additionalProperties" in schema:
            return f"Dict[str, {patched_json_schema_to_python_type(schema['additionalProperties'], defs)}]"
        return "Dict[str, Any]"
    # Leaf schema: delegate scalar/enum/const handling to patched_get_type.
    return patched_get_type(schema)
|
77 |
+
|
78 |
+
# Override Gradio's json_schema_to_python_type with the patched implementation
# so the app's API-schema introspection tolerates boolean JSON schemas.
# NOTE(review): this monkey-patches a gradio_client internal — presumably a
# workaround for a specific gradio_client version; confirm it is still needed
# when upgrading Gradio.
client_utils.json_schema_to_python_type = patched_json_schema_to_python_type
|
80 |
+
|
81 |
# --- Mistral OCR Setup ---
|
82 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
83 |
hf_token_global = None
|
|
|
348 |
return None
|
349 |
|
350 |
def process_file_and_save(
|
351 |
+
file_objs: Any, chunk_size: int, chunk_overlap: int,
|
352 |
strip_headers: bool, hf_token: str, repo_name: str
|
353 |
) -> str:
|
354 |
"""Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
|
355 |
+
# Handle case where file_objs is a single file or None
|
356 |
if not file_objs:
|
357 |
return "Error: No files uploaded."
|
358 |
+
if not isinstance(file_objs, list):
|
359 |
+
file_objs = [file_objs]
|
360 |
+
|
361 |
if not repo_name or '/' not in repo_name:
|
362 |
return "Error: Invalid repository name (use 'username/dataset-name')."
|
363 |
|
|
|
509 |
gr.Markdown("*Requires MISTRAL_API_KEY or HF token*")
|
510 |
|
511 |
if __name__ == "__main__":
|
512 |
+
import gradio
|
513 |
+
logger.info(f"Using Gradio version: {gradio.__version__}")
|
514 |
+
if not gradio.__version__.startswith("4."):
|
515 |
+
logger.warning("Gradio version is not 4.x. Updating to the latest version is recommended.")
|
516 |
+
print("Consider running: pip install --upgrade gradio")
|
517 |
+
|
518 |
initial_token = get_hf_token()
|
519 |
if not initial_token and not client:
|
520 |
print("\nWARNING: Neither Mistral API key nor HF token found.")
|
|
|
524 |
share=os.getenv('GRADIO_SHARE', 'False').lower() == 'true',
|
525 |
debug=True,
|
526 |
auth_message="Provide a valid Hugging Face token if prompted"
|
527 |
+
)
|