refactor process_input function for clearer file type detection and data processing
app.py (changed)
@@ -162,63 +162,42 @@ def cleanup_temp_video(file_path):
         print(f"Error cleaning up file: {e}")
 
 
-def process_input(input_data):
-    """
+def determine_input_type(input_data):
+    """
+    Determine the type of input data and return a standardized format.
+    Returns: (input_type, processed_data) where input_type is 'text', 'file_path', or 'file_object'
+    """
     if isinstance(input_data, str):
         # Check if it's a file path (contains file extension)
         if any(ext in input_data.lower() for ext in ['.pdf', '.txt', '.docx', '.doc', '.epub']):
-
-            try:
-                print(f"Processing file path: {input_data}")
-                from document_parsing import DocumentParser
-                parser = DocumentParser()
-                extracted_text = parser.extract_text(input_data)
-                if extracted_text:
-                    print(f"Extracted {len(extracted_text)} characters from file")
-                    # Convert the extracted text to ASL gloss
-                    gloss = asl_converter.asl_converter.convert_text(extracted_text)
-                    print(f"Converted gloss: {gloss[:100]}...")
-                    return gloss
-                else:
-                    print("No text extracted from file")
-                    return None
-            except Exception as e:
-                print(f"Error processing file path: {e}")
-                return None
+            return 'file_path', input_data
         else:
-
-            return input_data.strip()
+            return 'text', input_data.strip()
     elif isinstance(input_data, dict) and 'path' in input_data:
         # This is a gradio.FileData object from API calls
-        try:
-            print(f"Processing API file: {input_data['path']}")
-
-            # Read the file content from the blob path
-            with open(input_data['path'], 'rb') as f:
-                file_content = f.read()
-
-            # Check if it's a text file or binary document
-            if input_data.get('mime_type') == 'text/plain' or input_data['path'].endswith('.txt'):
-                # Text file - decode as text
-                text_content = file_content.decode('utf-8')
-                print(f"Extracted {len(text_content)} characters from text file")
-                # Convert text to ASL gloss
-                gloss = asl_converter.asl_converter.convert_text(text_content)
-            else:
-                # Binary document - use document converter
-                gloss = asl_converter.convert_document(input_data['path'])
-
-            print(f"Converted gloss: {gloss[:100]}...")  # Show first 100 chars
-            return gloss
-        except Exception as e:
-            print(f"Error processing API file: {e}")
-            return None
+        return 'file_path', input_data['path']
     elif hasattr(input_data, 'name'):
-        #
+        # This is a regular file object
+        return 'file_path', input_data.name
+    else:
+        return 'unknown', None
+
+
+def process_input(input_data):
+    """
+    Extract text content from various input types.
+    Returns the text content ready for ASL conversion.
+    """
+    input_type, processed_data = determine_input_type(input_data)
+
+    if input_type == 'text':
+        return processed_data
+    elif input_type == 'file_path':
         try:
-            print(f"Processing file: {
-
-
+            print(f"Processing file: {processed_data}")
+            # Use document converter for all file types
+            gloss = asl_converter.convert_document(processed_data)
+            print(f"Converted gloss: {gloss[:100]}...")
             return gloss
         except Exception as e:
             print(f"Error processing file: {e}")
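
For reference, the new helper can be exercised on its own. Below is a minimal standalone sketch: determine_input_type is copied verbatim from the hunk above, while the sample inputs (the file names and the FileData-style dict) are invented purely for illustration.

from types import SimpleNamespace


def determine_input_type(input_data):
    """
    Determine the type of input data and return a standardized format.
    Returns: (input_type, processed_data) where input_type is 'text', 'file_path', or 'file_object'
    """
    if isinstance(input_data, str):
        # Check if it's a file path (contains file extension)
        if any(ext in input_data.lower() for ext in ['.pdf', '.txt', '.docx', '.doc', '.epub']):
            return 'file_path', input_data
        else:
            return 'text', input_data.strip()
    elif isinstance(input_data, dict) and 'path' in input_data:
        # This is a gradio.FileData object from API calls
        return 'file_path', input_data['path']
    elif hasattr(input_data, 'name'):
        # This is a regular file object
        return 'file_path', input_data.name
    else:
        return 'unknown', None


# Illustrative inputs only; the paths and names below are invented.
print(determine_input_type("  hello world  "))                    # ('text', 'hello world')
print(determine_input_type("docs/story.pdf"))                     # ('file_path', 'docs/story.pdf')
print(determine_input_type({'path': '/tmp/gradio/blob.txt'}))     # ('file_path', '/tmp/gradio/blob.txt')
print(determine_input_type(SimpleNamespace(name='upload.docx')))  # ('file_path', 'upload.docx')
print(determine_input_type(12345))                                # ('unknown', None)

Note that the docstring mentions a 'file_object' type, but every file-like branch currently normalizes to 'file_path', so 'file_object' is never actually returned.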
@@ -228,6 +207,7 @@ def process_input(input_data):
         return None
 
 
+
 async def parse_vectorize_and_search_unified(input_data):
     """
     Unified function that handles both text and file inputs
@@ -411,21 +391,8 @@ def predict(text, file):
         # Use text input
         input_data = text.strip()
     elif file is not None:
-        #
-
-            # This is a gradio.FileData object from API calls
-            print(f"Processing API file: {file}")
-            input_data = file
-        elif hasattr(file, 'name'):
-            # This is a regular file object
-            print(f"Processing regular file: {file.name}")
-            input_data = file
-        else:
-            print(f"Unknown file type: {type(file)}")
-            return {
-                "status": "error",
-                "message": "Unsupported file format"
-            }, None
+        # Use file input - let the centralized processor handle the type
+        input_data = file
     else:
         # No input provided
         return {
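
The predict() change follows directly from the centralized detection: any non-text input is handed to process_input unchanged. The rough sketch below continues the one above (it reuses determine_input_type from that sketch); asl_converter is stubbed because that module is not part of this diff, and the trailing return None is an assumption based on the later context line in process_input.

from types import SimpleNamespace

# Stand-in for the real converter module, which this diff does not show.
asl_converter = SimpleNamespace(convert_document=lambda path: f"GLOSS FOR {path}")


def process_input(input_data):
    """Extract text content from various input types (as in the hunk above)."""
    input_type, processed_data = determine_input_type(input_data)
    if input_type == 'text':
        return processed_data
    elif input_type == 'file_path':
        try:
            print(f"Processing file: {processed_data}")
            gloss = asl_converter.convert_document(processed_data)
            return gloss
        except Exception as e:
            print(f"Error processing file: {e}")
    return None  # assumed fallback; the second hunk shows process_input ending in return None


# What predict() now does for any uploaded file, whether it arrives as a
# FileData-style dict from an API call or as a regular file object:
for file in ({'path': '/tmp/gradio/blob.pdf'}, SimpleNamespace(name='upload.docx')):
    input_data = file                 # no per-type branching in predict() anymore
    print(process_input(input_data))  # type detection happens in determine_input_type

One behavioral difference worth noting: the old branch returned an "Unsupported file format" error directly from predict(), whereas unrecognized inputs are now passed through to process_input and classified there as 'unknown'.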