deenasun committed on
Commit 26288e8
1 Parent(s): 505e15a

refactor process_input function for clearer file type detection and data processing

Files changed (1)
app.py +31 -64
app.py CHANGED
@@ -162,63 +162,42 @@ def cleanup_temp_video(file_path):
         print(f"Error cleaning up file: {e}")
 
 
-def process_input(input_data):
-    """Process input data to extract text for ASL conversion"""
+def determine_input_type(input_data):
+    """
+    Determine the type of input data and return a standardized format.
+    Returns: (input_type, processed_data) where input_type is 'text', 'file_path', or 'file_object'
+    """
     if isinstance(input_data, str):
         # Check if it's a file path (contains file extension)
         if any(ext in input_data.lower() for ext in ['.pdf', '.txt', '.docx', '.doc', '.epub']):
-            # It's a file path - extract text directly
-            try:
-                print(f"Processing file path: {input_data}")
-                from document_parsing import DocumentParser
-                parser = DocumentParser()
-                extracted_text = parser.extract_text(input_data)
-                if extracted_text:
-                    print(f"Extracted {len(extracted_text)} characters from file")
-                    # Convert the extracted text to ASL gloss
-                    gloss = asl_converter.asl_converter.convert_text(extracted_text)
-                    print(f"Converted gloss: {gloss[:100]}...")
-                    return gloss
-                else:
-                    print("No text extracted from file")
-                    return None
-            except Exception as e:
-                print(f"Error processing file path: {e}")
-                return None
+            return 'file_path', input_data
         else:
-            # Direct text input
-            return input_data.strip()
+            return 'text', input_data.strip()
     elif isinstance(input_data, dict) and 'path' in input_data:
         # This is a gradio.FileData object from API calls
-        try:
-            print(f"Processing API file: {input_data['path']}")
-
-            # Read the file content from the blob path
-            with open(input_data['path'], 'rb') as f:
-                file_content = f.read()
-
-            # Check if it's a text file or binary document
-            if input_data.get('mime_type') == 'text/plain' or input_data['path'].endswith('.txt'):
-                # Text file - decode as text
-                text_content = file_content.decode('utf-8')
-                print(f"Extracted {len(text_content)} characters from text file")
-                # Convert text to ASL gloss
-                gloss = asl_converter.asl_converter.convert_text(text_content)
-            else:
-                # Binary document - use document converter
-                gloss = asl_converter.convert_document(input_data['path'])
-
-            print(f"Converted gloss: {gloss[:100]}...")  # Show first 100 chars
-            return gloss
-        except Exception as e:
-            print(f"Error processing API file: {e}")
-            return None
+        return 'file_path', input_data['path']
     elif hasattr(input_data, 'name'):
-        # File input - extract text from document
+        # This is a regular file object
+        return 'file_path', input_data.name
+    else:
+        return 'unknown', None
+
+
+def process_input(input_data):
+    """
+    Extract text content from various input types.
+    Returns the text content ready for ASL conversion.
+    """
+    input_type, processed_data = determine_input_type(input_data)
+
+    if input_type == 'text':
+        return processed_data
+    elif input_type == 'file_path':
         try:
-            print(f"Processing file: {input_data.name}")
-            gloss = asl_converter.convert_document(input_data.name)
-            print(f"Converted gloss: {gloss[:100]}...")  # Show first 100 chars
+            print(f"Processing file: {processed_data}")
+            # Use document converter for all file types
+            gloss = asl_converter.convert_document(processed_data)
+            print(f"Converted gloss: {gloss[:100]}...")
             return gloss
         except Exception as e:
             print(f"Error processing file: {e}")
@@ -228,6 +207,7 @@ def process_input(input_data):
     return None
 
 
+
 async def parse_vectorize_and_search_unified(input_data):
     """
    Unified function that handles both text and file inputs
@@ -411,21 +391,8 @@ def predict(text, file):
         # Use text input
         input_data = text.strip()
     elif file is not None:
-        # Handle different file input types
-        if isinstance(file, dict) and 'path' in file:
-            # This is a gradio.FileData object from API calls
-            print(f"Processing API file: {file}")
-            input_data = file
-        elif hasattr(file, 'name'):
-            # This is a regular file object
-            print(f"Processing regular file: {file.name}")
-            input_data = file
-        else:
-            print(f"Unknown file type: {type(file)}")
-            return {
-                "status": "error",
-                "message": "Unsupported file format"
-            }, None
+        # Use file input - let the centralized processor handle the type
+        input_data = file
     else:
         # No input provided
         return {
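
For reference, a minimal sketch of how the refactored dispatch is expected to behave. It assumes determine_input_type can be imported from app.py (importing the module also pulls in its Gradio and converter dependencies); the expected tuples in the comments follow directly from the branches added in this commit, and the example inputs are illustrative values, not files from the repository.

# Illustrative sketch only - not part of this commit.
from app import determine_input_type

# Plain text (no recognized extension) is routed as 'text' and stripped
print(determine_input_type("  HELLO WORLD  "))
# ('text', 'HELLO WORLD')

# A path-like string with a recognized extension is routed as 'file_path'
print(determine_input_type("books/story.pdf"))
# ('file_path', 'books/story.pdf')

# A gradio FileData-style dict from an API call is reduced to its temp path
print(determine_input_type({"path": "/tmp/upload.txt", "mime_type": "text/plain"}))
# ('file_path', '/tmp/upload.txt')

# Anything else (no extension match, no 'path' key, no .name attribute) is 'unknown'
print(determine_input_type(12345))
# ('unknown', None)

# process_input() then returns the stripped text directly for 'text' inputs, or hands
# the path to asl_converter.convert_document() and returns the gloss for 'file_path' inputs.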