deenasun committed on
Commit
f37f939
·
1 Parent(s): 8da3927

Fix for catching Gradio FileData objects when they are passed from API calls as strings

Browse files
__pycache__/asl_gloss.cpython-311.pyc CHANGED
Binary files a/__pycache__/asl_gloss.cpython-311.pyc and b/__pycache__/asl_gloss.cpython-311.pyc differ
 
__pycache__/document_parsing.cpython-311.pyc CHANGED
Binary files a/__pycache__/document_parsing.cpython-311.pyc and b/__pycache__/document_parsing.cpython-311.pyc differ
 
app.py CHANGED
@@ -199,7 +199,8 @@ def cleanup_temp_video(file_path):
199
  def determine_input_type(input_data):
200
  """
201
  Determine the type of input data and return a standardized format.
202
- Returns: (input_type, processed_data) where input_type is 'text', 'file_path', or 'file_object'
 
203
  """
204
  if isinstance(input_data, str):
205
  # Check if it's a file path (contains file extension)
@@ -209,11 +210,20 @@ def determine_input_type(input_data):
209
  elif input_data.startswith('{') and 'gradio.FileData' in input_data:
210
  try:
211
  import ast
212
- # Safely evaluate the string as a dictionary
213
- file_data = ast.literal_eval(input_data)
 
 
 
 
 
 
214
  if isinstance(file_data, dict) and 'path' in file_data:
 
215
  return 'file_path', file_data['path']
216
- except (ValueError, SyntaxError):
 
 
217
  pass
218
  else:
219
  return 'text', input_data.strip()
@@ -255,9 +265,7 @@ def process_input(input_data):
255
  async def parse_vectorize_and_search_unified(input_data):
256
  """
257
  Unified function that handles both text and file inputs
258
- """
259
- print(f"Input type: {type(input_data)}")
260
-
261
  # Process the input to get gloss
262
  gloss = process_input(input_data)
263
  if not gloss:
@@ -356,7 +364,6 @@ def predict_unified(input_data):
356
  "message": "Please provide text or upload a document"
357
  }, None
358
 
359
- print("Input", input_data, type(input_data))
360
  # Use the unified processing function
361
  result = parse_vectorize_and_search_unified_sync(input_data)
362
 
@@ -444,7 +451,8 @@ def predict(text, file):
444
  "message": "Please provide either text or upload a file"
445
  }, None
446
 
447
- print("Input", input_data)
 
448
  # Process using the unified function
449
  return predict_unified(input_data)
450
 
 
199
  def determine_input_type(input_data):
200
  """
201
  Determine the type of input data and return a standardized format.
202
+ Returns: (input_type, processed_data) where input_type is 'text',
203
+ 'file_path', or 'file_object'
204
  """
205
  if isinstance(input_data, str):
206
  # Check if it's a file path (contains file extension)
 
210
  elif input_data.startswith('{') and 'gradio.FileData' in input_data:
211
  try:
212
  import ast
213
+ import json
214
+ # Try to parse as JSON first
215
+ try:
216
+ file_data = json.loads(input_data)
217
+ except json.JSONDecodeError:
218
+ # Fall back to ast.literal_eval for safer parsing
219
+ file_data = ast.literal_eval(input_data)
220
+
221
  if isinstance(file_data, dict) and 'path' in file_data:
222
+ print(f"Parsed FileData: {file_data}")
223
  return 'file_path', file_data['path']
224
+ except (ValueError, SyntaxError, json.JSONDecodeError) as e:
225
+ print(f"Error parsing FileData string: {e}")
226
+ print(f"Input data: {input_data}")
227
  pass
228
  else:
229
  return 'text', input_data.strip()
 
265
  async def parse_vectorize_and_search_unified(input_data):
266
  """
267
  Unified function that handles both text and file inputs
268
+ """
 
 
269
  # Process the input to get gloss
270
  gloss = process_input(input_data)
271
  if not gloss:
 
364
  "message": "Please provide text or upload a document"
365
  }, None
366
 
 
367
  # Use the unified processing function
368
  result = parse_vectorize_and_search_unified_sync(input_data)
369
 
 
451
  "message": "Please provide either text or upload a file"
452
  }, None
453
 
454
+ print("Input to the prediction function", input_data)
455
+ print("Input type:", type(input))
456
  # Process using the unified function
457
  return predict_unified(input_data)
458
 
asl_gloss.py CHANGED
@@ -10,7 +10,6 @@ that preserves the spatial and grammatical structure of ASL.
10
  import os
11
  import sys
12
  import argparse
13
- import json
14
  from typing import Optional, Dict, Any
15
  from pathlib import Path
16
 
 
10
  import os
11
  import sys
12
  import argparse
 
13
  from typing import Optional, Dict, Any
14
  from pathlib import Path
15
 
document_parsing.py CHANGED
@@ -66,7 +66,61 @@ class DocumentParser:
66
  '.doc': 'application/msword',
67
  '.epub': 'application/epub+zip'
68
  }
69
- return extension_map.get(extension, 'unknown')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
72
  """
 
66
  '.doc': 'application/msword',
67
  '.epub': 'application/epub+zip'
68
  }
69
+
70
+ mime_type = extension_map.get(extension, 'unknown')
71
+
72
+ # If no extension or unknown extension, try to detect by content
73
+ if mime_type == 'unknown':
74
+ mime_type = self._detect_mime_by_content(file_path)
75
+
76
+ return mime_type
77
+
78
+ def _detect_mime_by_content(self, file_path: Union[str, Path]) -> str:
79
+ """
80
+ Detect MIME type by reading file content.
81
+
82
+ Args:
83
+ file_path: Path to the file
84
+
85
+ Returns:
86
+ MIME type string
87
+ """
88
+ try:
89
+ with open(file_path, 'rb') as f:
90
+ # Read first 1024 bytes to detect file type
91
+ header = f.read(1024)
92
+
93
+ # PDF detection
94
+ if header.startswith(b'%PDF'):
95
+ return 'application/pdf'
96
+
97
+ # ZIP-based formats (DOCX, EPUB)
98
+ if header.startswith(b'PK\x03\x04'):
99
+ # Check if it's EPUB by looking for mimetype file
100
+ try:
101
+ import zipfile
102
+ with zipfile.ZipFile(file_path, 'r') as zf:
103
+ if 'mimetype' in zf.namelist():
104
+ with zf.open('mimetype') as mf:
105
+ mimetype = mf.read().decode('utf-8').strip()
106
+ if mimetype == 'application/epub+zip':
107
+ return 'application/epub+zip'
108
+ # If not EPUB, assume DOCX
109
+ return 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
110
+ except:
111
+ pass
112
+
113
+ # Plain text detection (try to decode as UTF-8)
114
+ try:
115
+ header.decode('utf-8')
116
+ return 'text/plain'
117
+ except UnicodeDecodeError:
118
+ pass
119
+
120
+ except Exception as e:
121
+ logger.warning(f"Error detecting MIME type by content: {e}")
122
+
123
+ return 'unknown'
124
 
125
  def extract_text(self, file_path: Union[str, Path]) -> Optional[str]:
126
  """