Final_Assignment_Template

Sleeping

App Files Community

huytofu92 committed on May 19

Commit

5b6dc13

1 Parent(s): 33b9b1f

Add new requirements and enhance tools

Browse files

Files changed (3) hide show

audio_tools.py +2 -9
requirements.txt +7 -1
vlm_tools.py +102 -32

audio_tools.py CHANGED Viewed

@@ -27,21 +27,14 @@ class TranscribeAudioTool(Tool):
 transcribe_audio_tool = TranscribeAudioTool()
 @tool
-def audio_to_base64(file_path_or_key: str, state: dict) -> str:
     """
     Convert an audio file to base64 format
     Args:
-        file_path_or_key: Path to the audio file or a key in the state dictionary
-        state: The state dictionary containing file paths
     Returns:
         The audio file in base64 format
     """
-    # Check if the input is a key in the state dictionary
-    if file_path_or_key in state:
-        file_path = state[file_path_or_key]
-    else:
-        file_path = file_path_or_key
     # Load the audio file
     try:
         audio = AudioSegment.from_file(file_path)

 transcribe_audio_tool = TranscribeAudioTool()
 @tool
+def audio_to_base64(file_path: str) -> str:
     """
     Convert an audio file to base64 format
     Args:
+        file_path: Path to the audio file
     Returns:
         The audio file in base64 format
     """
     # Load the audio file
     try:
         audio = AudioSegment.from_file(file_path)

requirements.txt CHANGED Viewed

@@ -7,6 +7,7 @@ anyio==4.9.0
 arxiv==2.2.0
 attrs==25.3.0
 beautifulsoup4==4.13.4
 cachetools==5.5.2
 certifi==2025.4.26
 charset-normalizer==3.4.2
@@ -18,6 +19,7 @@ cycler==0.12.1
 dataclasses-json==0.6.7
 defusedxml==0.7.1
 deprecation==2.1.0
 duckduckgo_search==8.0.2
 eyeD3==0.9.8
 fastapi==0.115.12
@@ -28,7 +30,7 @@ filetype==1.2.0
 flatbuffers==25.2.10
 fonttools==4.58.0
 frozenlist==1.6.0
-fsspec==2025.3.2
 google-api-core==2.24.2
 google-api-python-client==2.169.0
 google-auth==2.40.1
@@ -72,6 +74,7 @@ matplotlib==3.10.3
 mdurl==0.1.2
 mpmath==1.3.0
 multidict==6.4.3
 mypy_extensions==1.1.0
 narwhals==1.39.1
 numpy==2.2.5
@@ -86,6 +89,7 @@ primp==0.15.0
 propcache==0.3.1
 proto-plus==1.26.1
 protobuf==6.31.0
 pyasn1==0.6.1
 pyasn1_modules==0.4.2
 pyAudioAnalysis==0.3.14
@@ -137,6 +141,8 @@ uvicorn==0.34.2
 websockets==15.0.1
 wikipedia==1.4.0
 Wikipedia-API==0.8.1
 yarl==1.20.0
 youtube-transcript-api==1.0.3
 zstandard==0.23.0

 arxiv==2.2.0
 attrs==25.3.0
 beautifulsoup4==4.13.4
+bs4==0.0.2
 cachetools==5.5.2
 certifi==2025.4.26
 charset-normalizer==3.4.2
 dataclasses-json==0.6.7
 defusedxml==0.7.1
 deprecation==2.1.0
+dill==0.3.8
 duckduckgo_search==8.0.2
 eyeD3==0.9.8
 fastapi==0.115.12
 flatbuffers==25.2.10
 fonttools==4.58.0
 frozenlist==1.6.0
+fsspec==2025.3.0
 google-api-core==2.24.2
 google-api-python-client==2.169.0
 google-auth==2.40.1
 mdurl==0.1.2
 mpmath==1.3.0
 multidict==6.4.3
+multiprocess==0.70.16
 mypy_extensions==1.1.0
 narwhals==1.39.1
 numpy==2.2.5
 propcache==0.3.1
 proto-plus==1.26.1
 protobuf==6.31.0
+pyarrow==20.0.0
 pyasn1==0.6.1
 pyasn1_modules==0.4.2
 pyAudioAnalysis==0.3.14
 websockets==15.0.1
 wikipedia==1.4.0
 Wikipedia-API==0.8.1
+xxhash==3.5.0
 yarl==1.20.0
+youtube-dl==2021.12.17
 youtube-transcript-api==1.0.3
 zstandard==0.23.0

vlm_tools.py CHANGED Viewed

@@ -73,39 +73,26 @@ def post_processing(onnx_output, classes, original_shape, conf_threshold=0.5, nm
     return detected_objects
 @tool
-def extract_frames_from_video(video_path: str) -> list:
     """
-    Extract frames from a video
     Args:
         video_path: The path to the video file
     Returns:
-        A list of frames as numpy arrays
     """
     cap = cv2.VideoCapture(video_path)
-    frames = []
     while cap.isOpened():
-        ret, frame = cap.read()
         if not ret:
             break
-        frames.append(frame)
     cap.release()
-    return frames
 @tool
-def download_image(image_url: str)->str:
-    """
-    Download an image from a url
-    Args:
-        image_url: The url of the image to download
-    Returns:
-        The image as a base64 string
-    """
-    response = requests.get(image_url)
-    image = base64.b64encode(response.content).decode('utf-8')
-    return image
-@tool
-def get_image_from_file(file_path: str)->str:
     """
     Get an image from a file
     Args:
@@ -138,6 +125,89 @@ def get_image_from_file(file_path: str)->str:
             image = base64.b64encode(buffer.getvalue()).decode('utf-8')
     return image
 @tool
 def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)->str:
     """
@@ -166,13 +236,13 @@ names_path = "vlm_assets/obj.names"
 class ObjectDetectionTool(Tool):
     name = "object_detection"
     description = """
-        Detect objects in a list of frames (images).
-        It takes a list of frames (images) as input and returns
         a list of detected objects with labels, confidence, and bounding boxes.
         The output type will be List[List[str]]
     """
     inputs = {
-        "frames": {"type": "any", "description": "The list of frames (images) to detect objects in. Must be a List[str] or a List[np.ndarray]"}
     }
     output_type = "any"
@@ -197,10 +267,10 @@ class ObjectDetectionTool(Tool):
             'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
         ]
-    def forward(self, frames: any)->any:
         detected_objects = []
-        for frame in frames:
-            img = pre_processing(frame)
             # Preprocess the image
             blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
@@ -214,20 +284,20 @@ class ObjectDetectionTool(Tool):
 class OCRTool(Tool):
     description = """
     Scan an image for text.
-    It takes a list of frames (images) as input and returns
     a list of text in the images.
     The output type will be List[List[str]]
     """
     name = "ocr_scan"
     inputs = {
-        "frames": {"type": "any", "description": "The list of frames (images) to scan for text. Must be a List[str] or a List[np.ndarray]"}
     }
     output_type = "any"
-    def forward(self, frames: any)->any:
         scanned_text = []
-        for frame in frames:
-            image_data = base64.b64decode(frame)
             img = Image.open(BytesIO(image_data))
             scanned_text.append(pytesseract.image_to_string(img))
         return scanned_text

     return detected_objects
 @tool
+def extract_images_from_video(video_path: str) -> list:
     """
+    Extract images (frames) from a video
     Args:
         video_path: The path to the video file
     Returns:
+        A list of images (frames) as numpy arrays
     """
     cap = cv2.VideoCapture(video_path)
+    images = []
     while cap.isOpened():
+        ret, image = cap.read()
         if not ret:
             break
+        images.append(image)
     cap.release()
+    return images
 @tool
+def get_image_from_file_path(file_path: str)->str:
     """
     Get an image from a file
     Args:
             image = base64.b64encode(buffer.getvalue()).decode('utf-8')
     return image
+@tool
+def get_video_from_file_path(file_path: str)->str:
+    """
+    Get a video from a file using cv2 and BytesIO
+    Args:
+        file_path: The path to the file
+    Returns:
+        The video as a base64 string
+    """
+    try:
+        # Use cv2 to read the video
+        cap = cv2.VideoCapture(file_path)
+        if not cap.isOpened():
+            raise FileNotFoundError(f"Could not read video at {file_path}")
+        # Get video properties
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        # Create a BytesIO buffer to store the images (frames)
+        images = []
+        while cap.isOpened():
+            ret, image = cap.read()
+            if not ret:
+                break
+            # Convert frame to jpg and store in memory
+            _, buffer = cv2.imencode('.jpg', image)
+            images.append(buffer.tobytes())
+        # Release the video capture
+        cap.release()
+        # Combine all images into a single buffer
+        with BytesIO() as buffer:
+            # Write each image to the buffer
+            for image_data in images:
+                buffer.write(image_data)
+            # Encode to base64
+            video_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+    except Exception as e:
+        current_file_path = os.path.abspath(__file__)
+        current_file_dir = os.path.dirname(current_file_path)
+        file_path = os.path.join(current_file_dir, file_path.replace("Final_Assignment_Template", ""))
+        # Try again with the new path
+        cap = cv2.VideoCapture(file_path)
+        if not cap.isOpened():
+            raise FileNotFoundError(f"Could not read video at {file_path}")
+        # Get video properties
+        fps = cap.get(cv2.CAP_PROP_FPS)
+        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        # Create a BytesIO buffer to store the images (frames)
+        images = []
+        while cap.isOpened():
+            ret, image = cap.read()
+            if not ret:
+                break
+            # Convert image to jpg and store in memory
+            _, buffer = cv2.imencode('.jpg', image)
+            images.append(buffer.tobytes())
+        # Release the video capture
+        cap.release()
+        # Combine all images into a single buffer
+        with BytesIO() as buffer:
+            # Write each image to the buffer
+            for image_data in images:
+                buffer.write(image_data)
+            # Encode to base64
+            video_base64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
+    return video_base64
 @tool
 def image_processing(image: str, brightness: float = 1.0, contrast: float = 1.0)->str:
     """
 class ObjectDetectionTool(Tool):
     name = "object_detection"
     description = """
+        Detect objects in a list of images.
+        It takes a list of images as input and returns
         a list of detected objects with labels, confidence, and bounding boxes.
         The output type will be List[List[str]]
     """
     inputs = {
+        "images": {"type": "any", "description": "The list of images to detect objects in. Must be a List[str] or a List[np.ndarray]"}
     }
     output_type = "any"
             'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
         ]
+    def forward(self, images: any)->any:
         detected_objects = []
+        for image in images:
+            img = pre_processing(image)
             # Preprocess the image
             blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
 class OCRTool(Tool):
     description = """
     Scan an image for text.
+    It takes a list of images as input and returns
     a list of text in the images.
     The output type will be List[List[str]]
     """
     name = "ocr_scan"
     inputs = {
+        "images": {"type": "any", "description": "The list of images to scan for text. Must be a List[str] or a List[np.ndarray]"}
     }
     output_type = "any"
+    def forward(self, images: any)->any:
         scanned_text = []
+        for image in images:
+            image_data = base64.b64decode(image)
             img = Image.open(BytesIO(image_data))
             scanned_text.append(pytesseract.image_to_string(img))
         return scanned_text