Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,11 +11,73 @@ from huggingface_hub import HfApi, get_token
|
|
11 |
import huggingface_hub
|
12 |
import os
|
13 |
from mistralai import Mistral
|
|
|
14 |
|
15 |
# Configure logging
|
16 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
17 |
logger = logging.getLogger(__name__)
|
18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
# --- Mistral OCR Setup ---
|
20 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
21 |
hf_token_global = None
|
@@ -286,12 +348,16 @@ def get_hf_token(explicit_token: str = None) -> str:
|
|
286 |
return None
|
287 |
|
288 |
def process_file_and_save(
|
289 |
-
file_objs:
|
290 |
strip_headers: bool, hf_token: str, repo_name: str
|
291 |
) -> str:
|
292 |
"""Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
|
|
|
293 |
if not file_objs:
|
294 |
return "Error: No files uploaded."
|
|
|
|
|
|
|
295 |
if not repo_name or '/' not in repo_name:
|
296 |
return "Error: Invalid repository name (use 'username/dataset-name')."
|
297 |
|
@@ -443,6 +509,12 @@ with gr.Blocks(title="Mistral OCR & Dataset Creator",
|
|
443 |
gr.Markdown("*Requires MISTRAL_API_KEY or HF token*")
|
444 |
|
445 |
if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
446 |
initial_token = get_hf_token()
|
447 |
if not initial_token and not client:
|
448 |
print("\nWARNING: Neither Mistral API key nor HF token found.")
|
@@ -452,4 +524,4 @@ if __name__ == "__main__":
|
|
452 |
share=os.getenv('GRADIO_SHARE', 'False').lower() == 'true',
|
453 |
debug=True,
|
454 |
auth_message="Provide a valid Hugging Face token if prompted"
|
455 |
-
)
|
|
|
11 |
import huggingface_hub
|
12 |
import os
|
13 |
from mistralai import Mistral
|
14 |
+
import gradio_client.utils as client_utils
|
15 |
|
16 |
# Configure logging
|
17 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
18 |
logger = logging.getLogger(__name__)
|
19 |
|
20 |
+
# --- Patch Gradio's get_type function to handle boolean schemas ---
def patched_get_type(schema: Any) -> str:
    """Patched version of get_type to handle boolean schemas.

    Translates a single JSON-schema fragment into a Python-type string.
    Unlike the stock gradio_client helper, a bare boolean schema
    (``true`` / ``false`` in JSON Schema) is reported as ``"bool"``
    instead of raising.
    """
    # Boolean schemas are legal JSON Schema; short-circuit before any
    # dict-style membership tests, which would raise TypeError on a bool.
    if isinstance(schema, bool):
        return "bool"
    if "const" in schema:
        return f"Literal[{schema['const']!r}]"
    if "enum" in schema:
        literals = ", ".join(repr(choice) for choice in schema["enum"])
        return f"Literal[{literals}]"
    if "type" not in schema:
        return "Any"
    declared = schema["type"]
    if isinstance(declared, list):
        # Multi-type schemas: drop "null" and join the rest as a Union.
        non_null = [t for t in declared if t != "null"]
        return f"Union[{', '.join(non_null)}]"
    if declared == "array":
        item_schema = schema.get("items", {})
        inner = patched_json_schema_to_python_type(item_schema, schema.get("$defs"))
        return f"List[{inner}]"
    # Scalar JSON types map directly onto Python names; any unrecognized
    # type string is passed through unchanged (same as the original).
    scalar_map = {
        "object": "Dict[str, Any]",
        "null": "None",
        "integer": "int",
        "number": "float",
        "boolean": "bool",
    }
    return scalar_map.get(declared, declared)
|
48 |
+
|
49 |
+
def patched_json_schema_to_python_type(schema: Any, defs: Dict[str, Any] = None) -> str:
    """Patched version of json_schema_to_python_type to use patched_get_type.

    Recursively converts a JSON-schema fragment into a Python-type string
    for Gradio's API-docs rendering.

    Args:
        schema: A JSON-schema fragment — a dict, or a bare boolean
            (``true``/``false`` are valid schemas per the JSON Schema spec).
        defs: Optional ``$defs`` mapping used to resolve ``$ref`` entries.

    Returns:
        A Python type expression as a string (e.g. ``"List[int]"``).
    """
    defs = defs or {}
    # BUGFIX: a boolean schema previously crashed at `"$ref" in schema`
    # (TypeError: argument of type 'bool' is not iterable) before ever
    # reaching the bool-aware patched_get_type. `true` accepts any value;
    # `false` already mapped to "Any" via the falsy check below, so
    # returning "Any" for both is backward-compatible.
    if isinstance(schema, bool):
        return "Any"
    if not schema:
        return "Any"
    if "$ref" in schema:
        # Resolve "#/$defs/Name" style references; unknown refs fall back
        # to an empty schema, i.e. "Any".
        ref = schema["$ref"].split("/")[-1]
        return patched_json_schema_to_python_type(defs.get(ref, {}), defs)
    if "anyOf" in schema:
        types = [
            patched_json_schema_to_python_type(s, defs) for s in schema["anyOf"]
        ]
        # "None" members are dropped, matching Optional-style flattening.
        return f"Union[{', '.join(t for t in types if t != 'None')}]"
    if "type" in schema and schema["type"] == "array":
        items = schema.get("items", {})
        elements = patched_json_schema_to_python_type(items, defs)
        return f"List[{elements}]"
    if "type" in schema and schema["type"] == "object":
        if "properties" in schema:
            des = [
                f"{n}: {patched_json_schema_to_python_type(v, defs)}{client_utils.get_desc(v)}"
                for n, v in schema["properties"].items()
            ]
            return f"Dict[str, Union[{', '.join(des)}]]"
        if "additionalProperties" in schema:
            return f"Dict[str, {patched_json_schema_to_python_type(schema['additionalProperties'], defs)}]"
        return "Dict[str, Any]"
    # Leaf schema: delegate scalar/enum/const handling to patched_get_type.
    return patched_get_type(schema)
|
77 |
+
|
78 |
+
# Override Gradio's json_schema_to_python_type with the patched implementation
# so the app's API-schema introspection tolerates boolean JSON schemas.
# NOTE(review): this monkey-patches a gradio_client internal — presumably a
# workaround for a specific gradio_client version; confirm it is still needed
# when upgrading Gradio.
client_utils.json_schema_to_python_type = patched_json_schema_to_python_type
|
80 |
+
|
81 |
# --- Mistral OCR Setup ---
|
82 |
api_key = os.environ.get("MISTRAL_API_KEY")
|
83 |
hf_token_global = None
|
|
|
348 |
return None
|
349 |
|
350 |
def process_file_and_save(
|
351 |
+
file_objs: Any, chunk_size: int, chunk_overlap: int,
|
352 |
strip_headers: bool, hf_token: str, repo_name: str
|
353 |
) -> str:
|
354 |
"""Orchestrates OCR, chunking, and saving to Hugging Face for multiple files."""
|
355 |
+
# Handle case where file_objs is a single file or None
|
356 |
if not file_objs:
|
357 |
return "Error: No files uploaded."
|
358 |
+
if not isinstance(file_objs, list):
|
359 |
+
file_objs = [file_objs]
|
360 |
+
|
361 |
if not repo_name or '/' not in repo_name:
|
362 |
return "Error: Invalid repository name (use 'username/dataset-name')."
|
363 |
|
|
|
509 |
gr.Markdown("*Requires MISTRAL_API_KEY or HF token*")
|
510 |
|
511 |
if __name__ == "__main__":
|
512 |
+
import gradio
|
513 |
+
logger.info(f"Using Gradio version: {gradio.__version__}")
|
514 |
+
if not gradio.__version__.startswith("4."):
|
515 |
+
logger.warning("Gradio version is not 4.x. Updating to the latest version is recommended.")
|
516 |
+
print("Consider running: pip install --upgrade gradio")
|
517 |
+
|
518 |
initial_token = get_hf_token()
|
519 |
if not initial_token and not client:
|
520 |
print("\nWARNING: Neither Mistral API key nor HF token found.")
|
|
|
524 |
share=os.getenv('GRADIO_SHARE', 'False').lower() == 'true',
|
525 |
debug=True,
|
526 |
auth_message="Provide a valid Hugging Face token if prompted"
|
527 |
+
)
|