huytofu92 committed · Commit f864d65 · 1 Parent(s): ff1ad14

enhance VLM and fix bugs with step extraction

Files changed (3)
  1. app.py +1 -1
  2. mini_agents.py +1 -1
  3. vlm_tools.py +69 -9
app.py CHANGED
@@ -53,7 +53,7 @@ class BasicAgent:
         fixed_answer = self.agent.run(question)
 
         # Log steps
-        all_steps = self.agent.master_agent.memory.get_full_steps()
+        all_steps = self.agent.master_agent.memory.steps
         for step in all_steps:
             if isinstance(step, ActionStep):
                 step_class = "ActionStep"
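
The likely motivation, assuming the smolagents memory API: get_full_steps() returns serialized dicts, so the isinstance(step, ActionStep) check below never matched, while memory.steps yields the step objects themselves. A minimal sketch of the repaired loop (the helper name is hypothetical):

from smolagents import ActionStep, PlanningStep  # assumed import path

def log_steps(agent) -> list[str]:
    """Label each memory step by its concrete type."""
    labels = []
    for step in agent.master_agent.memory.steps:  # step objects, not dicts
        if isinstance(step, ActionStep):
            labels.append("ActionStep")
        elif isinstance(step, PlanningStep):
            labels.append("PlanningStep")
        else:
            labels.append(type(step).__name__)
    return labels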
mini_agents.py CHANGED
@@ -49,7 +49,7 @@ AUTHORIZED_IMPORTS = [
     # Data processing
     "numpy", "pandas", "sklearn", "scipy", "math", "hmmlearn",
     # File handling
-    "base64", "io", "json", "os", "pickle",
+    "base64", "io", "json", "os", "pickle", "openpyxl", "pyxlsb",
     # Visualization
     "pyplot", "matplotlib", "matplotlib.pyplot",
     # Utilities
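
The two new entries are the engines pandas needs for Excel workbooks, so authorizing them lets generated code read .xlsx and .xlsb files. A minimal sketch, with hypothetical file names:

import pandas as pd

# openpyxl backs .xlsx reads; pyxlsb backs the binary .xlsb format
df_xlsx = pd.read_excel("report.xlsx", engine="openpyxl")
df_xlsb = pd.read_excel("report.xlsb", engine="pyxlsb")
print(df_xlsx.head())
print(df_xlsb.head())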
vlm_tools.py CHANGED
@@ -304,12 +304,40 @@ class ObjectDetectionTool(Tool):
     name = "object_detection"
     description = """
     Detect objects in a list of images.
-    It takes a list of images as input and returns
-    a list of detected objects with labels, confidence, and bounding boxes.
-    The output type will be List[List[str]]
+
+    Input Requirements:
+    - Input must be a list of images, where each image is a base64-encoded string
+    - Each base64 string must be properly padded (length must be a multiple of 4)
+    - Images will be resized to 416x416 pixels during processing
+    - Images should be in RGB or BGR format (3 channels)
+    - Supported image formats: JPG, PNG
+
+    Processing:
+    - Images are automatically resized to 416x416
+    - Images are normalized to [0,1] range
+    - Model expects input shape: [1, 3, 416, 416] (batch, channels, height, width)
+
+    Output:
+    - Returns a list of detected objects for each image
+    - Each detection includes: (label, confidence, bounding_box)
+    - Bounding boxes are in format: [x, y, width, height]
+    - Confidence threshold: 0.5
+    - NMS threshold: 0.4
+
+    Example input format:
+    ["base64_encoded_image1", "base64_encoded_image2"]
+
+    Example output format:
+    [
+        [("person", 0.95, [100, 200, 50, 100]), ("car", 0.88, [300, 400, 80, 60])],  # detections for image1
+        [("dog", 0.92, [150, 250, 40, 80])]  # detections for image2
+    ]
     """
     inputs = {
-        "images": {"type": "any", "description": "The list of images to detect objects in. Must be a List[str] or a List[np.ndarray]"}
+        "images": {
+            "type": "any",
+            "description": "List of base64-encoded images. Each image must be a valid base64 string with proper padding (length multiple of 4). Images will be resized to 416x416."
+        }
     }
     output_type = "any"
 
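For reference, a minimal sketch of the preprocessing contract the new description pins down (decode base64, resize to 416x416, scale to [0,1], reshape to [1, 3, 416, 416]); the helper name is hypothetical and Pillow/numpy are assumed to be available:

import base64
import io

import numpy as np
from PIL import Image

def preprocess(b64_image: str) -> np.ndarray:
    # b64decode raises binascii.Error when the string is not properly padded
    raw = base64.b64decode(b64_image, validate=True)
    img = Image.open(io.BytesIO(raw)).convert("RGB")  # force 3 channels
    img = img.resize((416, 416))                      # model input size
    arr = np.asarray(img, dtype=np.float32) / 255.0   # normalize to [0, 1]
    arr = arr.transpose(2, 0, 1)[np.newaxis, ...]     # HWC -> [1, 3, 416, 416]
    assert arr.shape == (1, 3, 416, 416)
    return arr

Since base64.b64encode always emits padded output, the multiple-of-4 rule only trips on strings that were truncated or assembled by hand.
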
@@ -390,14 +418,46 @@
 
 class OCRTool(Tool):
     description = """
-    Scan an image for text.
-    It takes a list of images as input and returns
-    a list of text in the images.
-    The output type will be List[List[str]]
+    Scan an image for text using OCR (Optical Character Recognition).
+
+    Input Requirements:
+    - Input must be a list of images, where each image is a base64-encoded string
+    - Each base64 string must be properly padded (length must be a multiple of 4)
+    - Images should be in RGB or BGR format (3 channels)
+    - Supported image formats: JPG, PNG
+    - For best results:
+        * Text should be clear and well-lit
+        * Image should have good contrast
+        * Text should be properly oriented
+        * Avoid blurry or distorted images
+
+    Processing:
+    - Uses Tesseract OCR engine
+    - Automatically handles text orientation
+    - Supports multiple languages (default: English)
+    - Processes each image independently
+
+    Output:
+    - Returns a list of text strings, one for each input image
+    - Empty string is returned if no text is detected
+    - Text is returned in the order it appears in the image
+    - Line breaks are preserved in the output
+
+    Example input format:
+    ["base64_encoded_image1", "base64_encoded_image2"]
+
+    Example output format:
+    [
+        "This is text from image 1\nSecond line of text",  # text from image1
+        "Text from image 2"  # text from image2
+    ]
     """
     name = "ocr_scan"
     inputs = {
-        "images": {"type": "any", "description": "The list of images to scan for text. Must be a List[str] or a List[np.ndarray]"}
+        "images": {
+            "type": "any",
+            "description": "List of base64-encoded images. Each image must be a valid base64 string with proper padding (length multiple of 4). Images should be clear and well-lit for best OCR results."
+        }
     }
     output_type = "any"
 
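
Finally, a usage sketch against the documented contracts of both tools; instantiating and calling the tools directly (smolagents-style callable tools) and the file names are assumptions:

import base64

def to_b64(path: str) -> str:
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("ascii")  # output is always padded

images = [to_b64("street.jpg"), to_b64("receipt.png")]  # hypothetical files

detections = ObjectDetectionTool()(images=images)
for label, confidence, bbox in detections[0]:
    x, y, w, h = bbox  # documented as [x, y, width, height]
    print(f"{label}: {confidence:.2f} at ({x}, {y}, {w}, {h})")

texts = OCRTool()(images=images)
print(texts[1] or "<no text detected>")  # empty string when no text is found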