KoonJamesZ committed
Commit 604a2d4 · verified · 1 Parent(s): a36377d

Create app.py

Files changed (1)
  1. app.py +334 -0
app.py ADDED
@@ -0,0 +1,334 @@
+ from transformers import (
+     Qwen2VLForConditionalGeneration,
+     AutoModelForSpeechSeq2Seq,
+     AutoTokenizer,
+     AutoProcessor,
+     pipeline,
+ )
+ from qwen_vl_utils import process_vision_info
+ import torch
+ import uuid
+ import os
+ from moviepy.editor import VideoFileClip
+ import cv2
+ from ultralytics import YOLO
+ from heapq import heappush, heappushpop
+ import numpy as np
+ import gradio as gr
+
+ # # default: Load the model on the available device(s)
+ # model = Qwen2VLForConditionalGeneration.from_pretrained(
+ #     "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+ # )
+
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving,
+ # especially in multi-image and video scenarios.
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "Qwen/Qwen2-VL-7B-Instruct",
+     torch_dtype=torch.bfloat16,
+     attn_implementation="flash_attention_2",
+     device_map="auto",
+ )
+
+ # default processor
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+
+ # The default range for the number of visual tokens per image is 4-16384. You can set
+ # min_pixels and max_pixels according to your needs, e.g. a token range of 256-1280,
+ # to balance speed and memory usage.
+ # min_pixels = 256 * 28 * 28
+ # max_pixels = 1280 * 28 * 28
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+
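+ # --- Speech-to-text setup ---
+ # whisper-large-v3 transcribes the audio track extracted from uploaded videos.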
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ model_id = "openai/whisper-large-v3"
+
+ model_whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ )
+ model_whisper.to(device)
+
+ processor_whisper = AutoProcessor.from_pretrained(model_id)
+
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model_whisper,
+     tokenizer=processor_whisper.tokenizer,
+     feature_extractor=processor_whisper.feature_extractor,
+     torch_dtype=torch_dtype,
+     device=device,
+     return_timestamps=True,
+ )
+
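+ # `pipe(audio_path)` returns a dict: with return_timestamps=True it holds the full
+ # transcript under "text" plus timestamped "chunks"; only "text" is used below.
+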
+ output_directory = "temp"  # Replace with your desired output directory
+ os.makedirs(output_directory, exist_ok=True)
+
+ def extract_audio(video_path):
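+     """Extract the audio track from `video_path`, transcribe it with Whisper,
+     and return the transcript text (an empty string on any failure)."""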
+     try:
+         # Load the video file
+         video = VideoFileClip(video_path)
+
+         # Extract the audio track
+         audio = video.audio
+
+         # Generate a unique filename using uuid
+         unique_filename = f"{uuid.uuid4()}.mp3"
+         audio_output_path = f"{output_directory}/{unique_filename}"
+
+         # Save the audio to the unique file
+         audio.write_audiofile(audio_output_path)
+         video.close()  # Release the underlying ffmpeg readers
+
+         result = pipe(audio_output_path)
+
+         os.remove(audio_output_path)
+
+         return result["text"]
+
+     except Exception as e:
+         print(f"Error: {str(e)}")
+         return ""
+
+ output_dir = '/content/images'
+ model_yolo = YOLO('/content/drive/MyDrive/CCIB-AI-YOLO/runs/detect/train/weights/best.pt')
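+ # NOTE: these are Colab/Google Drive paths; adjust `output_dir` and the weights path
+ # to match the deployment environment (e.g. a local folder and a bundled best.pt).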
+
+ def extract_top_weapon_frames(video_path, threshold=30):
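+     """Find the two frames of `video_path` with the highest-confidence weapon detections.
+
+     Frames are pre-filtered by frame differencing (mean absolute pixel difference
+     above `threshold`) so YOLO only runs where the scene visibly changes. A size-2
+     min-heap keeps the best detections. Returns a dict with 'original' and 'boxed'
+     lists of saved image paths (both empty if nothing is detected).
+     """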
+     os.makedirs(output_dir, exist_ok=True)
+     saved_paths = {
+         'original': [],  # Paths for original frames
+         'boxed': []      # Paths for frames with boxes
+     }
+
+     weapon_classes = ['weapon', 'knife']
+     # Min-heap of (confidence, frame_number, original_frame, boxed_frame).
+     # frame_number is a unique tie-breaker so equal confidences never force a
+     # comparison between numpy frame arrays (which would raise a TypeError).
+     top_frames = []
+
+     cap = cv2.VideoCapture(video_path)
+     if not cap.isOpened():
+         print("Error: Could not open video.")
+         return saved_paths
+
+     ret, prev_frame = cap.read()
+     if not ret:
+         print("Error: Could not read the first frame.")
+         return saved_paths
+
+     prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
+     frame_number = 0
+
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+         frame_diff = cv2.absdiff(gray, prev_gray)
+         mean_diff = frame_diff.mean()
+
+         if mean_diff > threshold:
+             print(f"Processing frame {frame_number}")
+             results = model_yolo.predict(source=frame, show=False)
+
+             frame_max_conf = 0
+             frame_with_boxes = frame.copy()
+
+             for result in results:
+                 for box in result.boxes:
+                     class_id = int(box.cls[0])
+                     class_name = model_yolo.names[class_id]
+                     confidence = float(box.conf[0])
+
+                     if class_name in weapon_classes:
+                         frame_max_conf = max(frame_max_conf, confidence)
+                         x1, y1, x2, y2 = map(int, box.xyxy[0])
+                         cv2.rectangle(frame_with_boxes, (x1, y1), (x2, y2), (0, 255, 0), 2)
+                         label = f"{class_name} ({confidence:.2f})"
+                         cv2.putText(frame_with_boxes, label, (x1, y1 - 10),
+                                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
+
+             if frame_max_conf > 0:
+                 if len(top_frames) < 2:
+                     heappush(top_frames, (frame_max_conf, frame_number, frame.copy(), frame_with_boxes))
+                 elif frame_max_conf > top_frames[0][0]:
+                     heappushpop(top_frames, (frame_max_conf, frame_number, frame.copy(), frame_with_boxes))
+
+         prev_gray = gray
+         frame_number += 1
+
+     # Save the top 2 frames (both original and with boxes)
+     for confidence, _, original_frame, boxed_frame in sorted(top_frames, reverse=True):
+         # Save original frame
+         original_filename = f"{uuid.uuid4()}.jpg"
+         original_path = os.path.join(output_dir, original_filename)
+         cv2.imwrite(original_path, original_frame)
+         saved_paths['original'].append(original_path)
+
+         # Save frame with boxes
+         boxed_filename = f"{uuid.uuid4()}.jpg"
+         boxed_path = os.path.join(output_dir, boxed_filename)
+         cv2.imwrite(boxed_path, boxed_frame)
+         saved_paths['boxed'].append(boxed_path)
+
+         print(f"Saved frame pair with confidence {confidence:.3f}")
+
+     cap.release()
+     return saved_paths
+
+ def detect_weapon_image(source_image_path):
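+     """Run YOLO weapon detection on a single image, save the annotated result(s)
+     to `output_dir`, and return the list of saved file paths."""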
+     # Ensure the output directory exists
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Run YOLO predictions
+     results = model_yolo.predict(source=source_image_path, save=False, show=False)
+
+     # List to store paths to saved images
+     saved_paths = []
+
+     for result in results:
+         # Get the annotated image
+         annotated_img = result.plot()
+
+         # Generate a unique filename using UUID
+         unique_filename = f"{uuid.uuid4()}.jpg"
+         output_path = os.path.join(output_dir, unique_filename)
+
+         # Save the annotated image
+         cv2.imwrite(output_path, annotated_img)
+         saved_paths.append(output_path)
+
+     return saved_paths
+
+ def response(messages):
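+     """Run Qwen2-VL-7B-Instruct on a chat-style `messages` list (system + user turns
+     containing image and text items) and return the decoded reply as a string."""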
+     # Preparation for inference
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = inputs.to("cuda")
+
+     # Inference: generation of the output
+     generated_ids = model.generate(**inputs, max_new_tokens=1024)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+     return output_text[0]
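+
+ # Example of the `messages` structure accepted by response() (hypothetical values):
+ #   [{"role": "system", "content": system_prompt},
+ #    {"role": "user", "content": [
+ #        {"type": "image", "image": "file:///tmp/frame.jpg"},
+ #        {"type": "text", "text": "Describe this image."}]}]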
+
+
+ system_prompt = """
+ Analyze the image for illegal items or contraband. Detect and categorize objects like guns, knives, drugs, and hidden compartments. Highlight areas of interest and provide:
+
+ 1. A detailed explanation in Thai describing the illegal items and their context.
+ 2. A JSON output summarizing the findings.
+
+ Output example:
+ 1. Explanation (Thai): (detailed explanation in Thai describing the illegal items and their context)
+ 2. JSON: [{"category": "weapon", "type": "gun"}]
+ """
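+ # The model is prompted for a Thai explanation plus a JSON summary; process_inputs()
+ # passes its reply back verbatim to the Gradio textbox.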
+
+
+ def is_mp4_file(file_path):
+     # Guard against file_path being None when no file is uploaded.
+     return bool(file_path) and os.path.isfile(file_path) and file_path.lower().endswith(".mp4")
+
+ def process_inputs(text_input, file_input):
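+     """Gradio handler. For .mp4 uploads, extract key frames and a transcript and send
+     both to Qwen2-VL; for image uploads, send the image directly. Returns a tuple of
+     (analysis text, list of annotated image paths for the gallery)."""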
+     if is_mp4_file(file_input):
+         extract_images_from_video = extract_top_weapon_frames(file_input)
+         transcription = extract_audio(file_input)
+
+         try:
+             # Prepare image content for the messages
+             image_content = []
+
+             # Add the first extracted frame if available
+             if extract_images_from_video['original']:
+                 image_content.append({
+                     "type": "image",
+                     "image": f"file://{extract_images_from_video['original'][0]}"
+                 })
+
+             # Add the second extracted frame if available
+             if len(extract_images_from_video['original']) > 1:
+                 image_content.append({
+                     "type": "image",
+                     "image": f"file://{extract_images_from_video['original'][1]}"
+                 })
+
+             # Create the messages list with the available content
+             messages = [
+                 {"role": "system", "content": system_prompt},
+                 {
+                     "role": "user",
+                     "content": [
+                         *image_content,  # Unpack available image content
+                         {"type": "text", "text": f"Content From Social Media Post: {text_input}."},
+                         {"type": "text", "text": f"This is the transcription from the video: {transcription}"}
+                     ]
+                 }
+             ]
+
+             # Return the model response and the available boxed images (empty list if none)
+             return response(messages), extract_images_from_video.get('boxed', [])
+
+         except Exception as e:
+             return f"Error: {str(e)}", []
+
+     else:
+         try:
+             # Build a single-image message from the uploaded file
+             messages = [
+                 {"role": "system", "content": system_prompt},
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image",
+                             "image": f"file://{file_input}",
+                         },
+                         {"type": "text", "text": f"Content From Social Media Post: {text_input}."},
+                     ],
+                 }
+             ]
+
+             result = response(messages)
+             detect_weapon = detect_weapon_image(file_input)
+             # Optionally, delete the temporary file after processing
+
+             return result, detect_weapon
+         except Exception as e:
+             # Handle any exceptions and return the error
+             return f"Error: {str(e)}", []
+
+ # Create the Gradio interface
+ demo = gr.Interface(
+     fn=process_inputs,
+     inputs=[
+         gr.Textbox(
+             label="Text Input",
+             placeholder="Enter your text here...",
+             lines=3
+         ),
+         gr.File(
+             label="File Upload",
+             file_types=[".mp4", ".png", ".jpeg", ".jpg"],
+             type="filepath"
+         )
+     ],
+     outputs=[
+         gr.Textbox(label="Process Results", lines=8),
+         gr.Gallery(label="Generated images", show_label=False, elem_id="gallery", columns=[2], rows=[1], object_fit="contain", height="auto")
+     ],
+     title="Text and File Input Processor (Qwen2-VL-7B-Instruct)",
+     description="Enter text and/or upload a file to process them together",
+ )
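+ # process_inputs() returns (analysis_text, image_paths), matching the Textbox and
+ # Gallery outputs declared above.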
+
+ if __name__ == "__main__":
+     demo.launch()