Svngoku committed
Commit 2156f14 · verified · 1 Parent(s): 3165e1e

Update app.py

Files changed (1): app.py (+74, -2)
app.py CHANGED
@@ -11,11 +11,73 @@ from huggingface_hub import HfApi, get_token
 import huggingface_hub
 import os
 from mistralai import Mistral
+import gradio_client.utils as client_utils
 
 # Configure logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
+# --- Patch Gradio's get_type function to handle boolean schemas ---
+def patched_get_type(schema: Any) -> str:
+    """Patched version of get_type to handle boolean schemas."""
+    if isinstance(schema, bool):
+        return "bool"
+    if "const" in schema:
+        return f"Literal[{repr(schema['const'])}]"
+    if "enum" in schema:
+        return f"Literal[{', '.join(repr(v) for v in schema['enum'])}]"
+    if "type" not in schema:
+        return "Any"
+    type_ = schema["type"]
+    if isinstance(type_, list):
+        return f"Union[{', '.join(t for t in type_ if t != 'null')}]"
+    if type_ == "array":
+        items = schema.get("items", {})
+        return f"List[{patched_json_schema_to_python_type(items, schema.get('$defs'))}]"
+    if type_ == "object":
+        return "Dict[str, Any]"
+    if type_ == "null":
+        return "None"
+    if type_ == "integer":
+        return "int"
+    if type_ == "number":
+        return "float"
+    if type_ == "boolean":
+        return "bool"
+    return type_
+
+def patched_json_schema_to_python_type(schema: Any, defs: Dict[str, Any] = None) -> str:
+    """Patched version of json_schema_to_python_type to use patched_get_type."""
+    defs = defs or {}
+    if not schema:
+        return "Any"
+    if "$ref" in schema:
+        ref = schema["$ref"].split("/")[-1]
+        return patched_json_schema_to_python_type(defs.get(ref, {}), defs)
+    if "anyOf" in schema:
+        types = [
+            patched_json_schema_to_python_type(s, defs) for s in schema["anyOf"]
+        ]
+        return f"Union[{', '.join(t for t in types if t != 'None')}]"
+    if "type" in schema and schema["type"] == "array":
+        items = schema.get("items", {})
+        elements = patched_json_schema_to_python_type(items, defs)
+        return f"List[{elements}]"
+    if "type" in schema and schema["type"] == "object":
+        if "properties" in schema:
+            des = [
+                f"{n}: {patched_json_schema_to_python_type(v, defs)}{client_utils.get_desc(v)}"
+                for n, v in schema["properties"].items()
+            ]
+            return f"Dict[str, Union[{', '.join(des)}]]"
+        if "additionalProperties" in schema:
+            return f"Dict[str, {patched_json_schema_to_python_type(schema['additionalProperties'], defs)}]"
+        return "Dict[str, Any]"
+    return patched_get_type(schema)
+
+# Override Gradio's json_schema_to_python_type
+client_utils.json_schema_to_python_type = patched_json_schema_to_python_type
+
 # --- Mistral OCR Setup ---
 api_key = os.environ.get("MISTRAL_API_KEY")
 hf_token_global = None
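For context, the JSON Schema spec allows a sub-schema to be the bare boolean true or false, and some Gradio component API schemas reportedly emit exactly that; a converter that immediately runs an `in` membership test on such a value fails with "TypeError: argument of type 'bool' is not iterable". The sketch below is illustrative only (not part of the commit) and assumes the patched helpers defined in the hunk above are in scope:

# Illustrative sketch (not from app.py): why bare boolean schemas need a guard.
bool_schema = True  # a complete, legal JSON Schema meaning "accept anything"

try:
    _ = "const" in bool_schema  # the kind of membership test the unpatched helper performs first
except TypeError as exc:
    print(f"unpatched failure mode: {exc}")  # argument of type 'bool' is not iterable

print(patched_get_type(bool_schema))  # -> "bool", thanks to the isinstance(schema, bool) guard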
 
@@ -286,12 +348,16 @@ def get_hf_token(explicit_token: str = None) -> str:
     return None
 
 def process_file_and_save(
-    file_objs: List[Any], chunk_size: int, chunk_overlap: int,
+    file_objs: Any, chunk_size: int, chunk_overlap: int,
     strip_headers: bool, hf_token: str, repo_name: str
 ) -> str:
     """Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
+    # Handle case where file_objs is a single file or None
     if not file_objs:
         return "Error: No files uploaded."
+    if not isinstance(file_objs, list):
+        file_objs = [file_objs]
+
     if not repo_name or '/' not in repo_name:
         return "Error: Invalid repository name (use 'username/dataset-name')."
 
@@ -443,6 +509,12 @@ with gr.Blocks(title="Mistral OCR & Dataset Creator",
     gr.Markdown("*Requires MISTRAL_API_KEY or HF token*")
 
 if __name__ == "__main__":
+    import gradio
+    logger.info(f"Using Gradio version: {gradio.__version__}")
+    if not gradio.__version__.startswith("4."):
+        logger.warning("Gradio version is not 4.x. Updating to the latest version is recommended.")
+        print("Consider running: pip install --upgrade gradio")
+
     initial_token = get_hf_token()
     if not initial_token and not client:
         print("\nWARNING: Neither Mistral API key nor HF token found.")
 
@@ -452,4 +524,4 @@ if __name__ == "__main__":
         share=os.getenv('GRADIO_SHARE', 'False').lower() == 'true',
         debug=True,
         auth_message="Provide a valid Hugging Face token if prompted"
- )
+ )
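The launch arguments in the final hunk are otherwise untouched; the share flag is read from the GRADIO_SHARE environment variable with a case-insensitive string compare, so only a value of "true" enables a public link. A quick sketch of that convention (illustrative, mirroring the context line shown above):

import os

# Mirrors the launch() convention shown in the diff: only GRADIO_SHARE=true (any case) turns sharing on.
share = os.getenv('GRADIO_SHARE', 'False').lower() == 'true'
print(share)  # False unless the variable is exported, e.g.  GRADIO_SHARE=true python app.py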