enhance VLM and fix bugs with step extraction
- app.py +1 -1
- mini_agents.py +1 -1
- vlm_tools.py +69 -9
app.py
CHANGED
```diff
@@ -53,7 +53,7 @@ class BasicAgent:
         fixed_answer = self.agent.run(question)

         # Log steps
-        all_steps = self.agent.master_agent.memory.
+        all_steps = self.agent.master_agent.memory.steps
         for step in all_steps:
             if isinstance(step, ActionStep):
                 step_class = "ActionStep"
```
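For context, the fixed line pulls the run's step history from the agent's memory. Below is a minimal sketch of that logging loop, assuming smolagents' top-level `ActionStep` export and its `memory.steps` list; the `log_steps` helper, the `print` target, and the `model_output` attribute access are this sketch's own assumptions, not the app's actual code:

```python
from smolagents import ActionStep

def log_steps(agent) -> None:
    # memory.steps holds the chronological step records from the last run();
    # the diff reads the same list via self.agent.master_agent.memory.steps
    for step in agent.memory.steps:
        if isinstance(step, ActionStep):
            step_class = "ActionStep"
            print(f"{step_class}: {getattr(step, 'model_output', None)}")
```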
mini_agents.py
CHANGED
```diff
@@ -49,7 +49,7 @@ AUTHORIZED_IMPORTS = [
     # Data processing
     "numpy", "pandas", "sklearn", "scipy", "math", "hmmlearn",
     # File handling
-    "base64", "io", "json", "os", "pickle",
+    "base64", "io", "json", "os", "pickle", "openpyxl", "pyxlsb",
     # Visualization
     "pyplot", "matplotlib", "matplotlib.pyplot",
     # Utilities
```
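Authorizing openpyxl and pyxlsb matters because pandas delegates Excel parsing to them: openpyxl handles modern .xlsx workbooks and pyxlsb handles binary .xlsb workbooks. A minimal sketch of what sandboxed agent code can now run (the file names are placeholders):

```python
import pandas as pd

# pandas infers the engine from the file extension, but it can be pinned:
# openpyxl parses .xlsx workbooks, pyxlsb parses binary .xlsb workbooks.
df_xlsx = pd.read_excel("workbook.xlsx", engine="openpyxl")
df_xlsb = pd.read_excel("workbook.xlsb", engine="pyxlsb")
print(df_xlsx.head())
print(df_xlsb.head())
```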
vlm_tools.py
CHANGED
```diff
@@ -304,12 +304,40 @@ class ObjectDetectionTool(Tool):
     name = "object_detection"
     description = """
     Detect objects in a list of images.
-
-
-
+
+    Input Requirements:
+    - Input must be a list of images, where each image is a base64-encoded string
+    - Each base64 string must be properly padded (length must be a multiple of 4)
+    - Images will be resized to 416x416 pixels during processing
+    - Images should be in RGB or BGR format (3 channels)
+    - Supported image formats: JPG, PNG
+
+    Processing:
+    - Images are automatically resized to 416x416
+    - Images are normalized to [0,1] range
+    - Model expects input shape: [1, 3, 416, 416] (batch, channels, height, width)
+
+    Output:
+    - Returns a list of detected objects for each image
+    - Each detection includes: (label, confidence, bounding_box)
+    - Bounding boxes are in format: [x, y, width, height]
+    - Confidence threshold: 0.5
+    - NMS threshold: 0.4
+
+    Example input format:
+    ["base64_encoded_image1", "base64_encoded_image2"]
+
+    Example output format:
+    [
+        [("person", 0.95, [100, 200, 50, 100]), ("car", 0.88, [300, 400, 80, 60])],  # detections for image1
+        [("dog", 0.92, [150, 250, 40, 80])]  # detections for image2
+    ]
     """
     inputs = {
-        "images": {
+        "images": {
+            "type": "any",
+            "description": "List of base64-encoded images. Each image must be a valid base64 string with proper padding (length multiple of 4). Images will be resized to 416x416."
+        }
     }
     output_type = "any"
```
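The padding, resize, normalization, and shape requirements in the new docstring translate into a short preprocessing routine. A hedged sketch of what such a pipeline typically looks like with OpenCV; the `preprocess_for_detection` name and the exact decode path are illustrative, not the tool's actual code:

```python
import base64

import cv2
import numpy as np

def preprocess_for_detection(b64_image: str) -> np.ndarray:
    # Repair base64 padding: the encoded length must be a multiple of 4
    b64_image += "=" * (-len(b64_image) % 4)
    buf = np.frombuffer(base64.b64decode(b64_image), dtype=np.uint8)
    img = cv2.imdecode(buf, cv2.IMREAD_COLOR)        # BGR, 3 channels
    if img is None:
        raise ValueError("not a decodable JPG/PNG image")
    img = cv2.resize(img, (416, 416))                # model's fixed input size
    img = img.astype(np.float32) / 255.0             # normalize to [0, 1]
    return np.transpose(img, (2, 0, 1))[np.newaxis]  # [1, 3, 416, 416]
```

The second hunk applies the same documentation pattern to OCRTool: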
```diff
@@ -390,14 +418,46 @@ class ObjectDetectionTool(Tool):

 class OCRTool(Tool):
     description = """
-    Scan an image for text.
-
-
-
+    Scan an image for text using OCR (Optical Character Recognition).
+
+    Input Requirements:
+    - Input must be a list of images, where each image is a base64-encoded string
+    - Each base64 string must be properly padded (length must be a multiple of 4)
+    - Images should be in RGB or BGR format (3 channels)
+    - Supported image formats: JPG, PNG
+    - For best results:
+        * Text should be clear and well-lit
+        * Image should have good contrast
+        * Text should be properly oriented
+        * Avoid blurry or distorted images
+
+    Processing:
+    - Uses Tesseract OCR engine
+    - Automatically handles text orientation
+    - Supports multiple languages (default: English)
+    - Processes each image independently
+
+    Output:
+    - Returns a list of text strings, one for each input image
+    - Empty string is returned if no text is detected
+    - Text is returned in the order it appears in the image
+    - Line breaks are preserved in the output
+
+    Example input format:
+    ["base64_encoded_image1", "base64_encoded_image2"]
+
+    Example output format:
+    [
+        "This is text from image 1\nSecond line of text",  # text from image1
+        "Text from image 2"  # text from image2
+    ]
     """
     name = "ocr_scan"
     inputs = {
-        "images": {
+        "images": {
+            "type": "any",
+            "description": "List of base64-encoded images. Each image must be a valid base64 string with proper padding (length multiple of 4). Images should be clear and well-lit for best OCR results."
+        }
     }
     output_type = "any"
```
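The OCR docstring names Tesseract, so the per-image loop it describes would typically go through the pytesseract bindings with Pillow for decoding. A minimal sketch under those assumptions; the `ocr_scan_images` helper name is illustrative:

```python
import base64
import io

import pytesseract
from PIL import Image

def ocr_scan_images(images: list[str]) -> list[str]:
    texts = []
    for b64 in images:
        b64 += "=" * (-len(b64) % 4)  # repair base64 padding
        img = Image.open(io.BytesIO(base64.b64decode(b64)))
        # image_to_string preserves line breaks and yields only
        # whitespace when nothing is detected, hence the strip()
        texts.append(pytesseract.image_to_string(img).strip())
    return texts
```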