"""Gradio app: object detection on images/videos via the DINO-X cloud API."""

import os

import cv2
import gradio as gr
import numpy as np
import supervision as sv
from pathlib import Path
from dds_cloudapi_sdk import Config, Client, TextPrompt
from dds_cloudapi_sdk.tasks.dinox import DinoxTask
from dds_cloudapi_sdk.tasks.detection import DetectionTask
from dds_cloudapi_sdk.tasks.types import DetectionTarget

# Constants
# SECURITY NOTE(review): a live API token was hard-coded here. Prefer the
# DDS_API_TOKEN environment variable; the literal remains only as a
# backward-compatible fallback and should be rotated and removed.
API_TOKEN = os.getenv("DDS_API_TOKEN", "361d32fa5ce22649133660c65cfcaf22")
TEXT_PROMPT = "wheel . eye . helmet . mouse . mouth . vehicle . steering wheel . ear . nose"
VID_PROMPT = "wheel . mouse . pot . acquariam . box"
TEMP_DIR = "./temp"
OUTPUT_DIR = "./outputs"

# Ensure working directories exist before any processing runs.
os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)


def initialize_dino_client():
    """Create and return a DDS cloud-API client configured with API_TOKEN."""
    config = Config(API_TOKEN)
    return Client(config)


def get_class_mappings(text_prompt):
    """Split a '.'-separated prompt into class names and a name -> id map.

    Args:
        text_prompt: prompt such as "wheel . eye . helmet".

    Returns:
        Tuple (classes, class_name_to_id): the lower-cased class names in
        prompt order, and a dict mapping each name to its index.
    """
    # Strip BEFORE the truthiness test so a trailing ". " in the prompt
    # does not produce a whitespace-only "class" (original tested `if x`
    # on the unstripped token).
    classes = [x.strip().lower() for x in text_prompt.split('.') if x.strip()]
    class_name_to_id = {name: idx for idx, name in enumerate(classes)}
    return classes, class_name_to_id


def process_predictions(predictions, class_name_to_id):
    """Convert DINO-X prediction objects into plain arrays for supervision.

    Args:
        predictions: iterable of SDK objects carrying .bbox, .category,
            .score and optionally .mask (RLE-encoded).
        class_name_to_id: mapping produced by get_class_mappings().

    Returns:
        Dict with 'boxes' ((N, 4) float array), 'masks' (stacked mask array
        or None), 'class_ids' (int array), 'class_names' and 'confidences'.
        Predictions whose category is absent from the mapping are skipped
        instead of raising KeyError (which previously aborted the frame).
    """
    boxes = []
    masks = []
    confidences = []
    class_names = []
    class_ids = []

    for obj in predictions:
        cls_name = obj.category.lower().strip()
        # Guard: the server can return a category that was not in the
        # prompt; skipping keeps boxes/masks/labels aligned.
        if cls_name not in class_name_to_id:
            continue
        boxes.append(obj.bbox)
        if hasattr(obj, 'mask') and obj.mask:
            masks.append(DetectionTask.rle2mask(
                DetectionTask.string2rle(obj.mask.counts),
                obj.mask.size
            ))
        class_names.append(cls_name)
        class_ids.append(class_name_to_id[cls_name])
        confidences.append(obj.score)

    # reshape(-1, 4) preserves the (N, 4) shape even for an empty frame,
    # so sv.Detections does not reject a shape-(0,) array.
    return {
        'boxes': np.array(boxes, dtype=float).reshape(-1, 4),
        'masks': np.array(masks) if masks else None,
        'class_ids': np.array(class_ids, dtype=int),
        'class_names': class_names,
        'confidences': confidences
    }


def process_image(image_path, prompt=TEXT_PROMPT):
    """Run DINO-X detection on one image and save an annotated copy.

    Args:
        image_path: path to a local image file.
        prompt: '.'-separated class prompt.

    Returns:
        Path of the annotated image on success, or an error-message string
        on failure (the Gradio handler displays it as-is).
    """
    try:
        client = initialize_dino_client()
        _, class_name_to_id = get_class_mappings(prompt)

        # Upload the image and run the cloud detection task.
        image_url = client.upload_file(image_path)
        task = DinoxTask(
            image_url=image_url,
            prompts=[TextPrompt(text=prompt)],
            bbox_threshold=0.25,
            targets=[DetectionTarget.BBox, DetectionTarget.Mask]
        )
        client.run_task(task)

        results = process_predictions(task.result.objects, class_name_to_id)

        img = cv2.imread(image_path)
        # cv2.imread returns None (no exception) on an unreadable file;
        # without this guard the .copy() below raises AttributeError.
        if img is None:
            return f"Error processing image: could not read {image_path}"

        detections = sv.Detections(
            xyxy=results['boxes'],
            mask=results['masks'].astype(bool) if results['masks'] is not None else None,
            class_id=results['class_ids']
        )
        labels = [
            f"{name} {conf:.2f}"
            for name, conf in zip(results['class_names'], results['confidences'])
        ]

        # Draw boxes, then labels, then (when present) masks.
        annotated_frame = sv.BoxAnnotator().annotate(scene=img.copy(), detections=detections)
        annotated_frame = sv.LabelAnnotator().annotate(
            scene=annotated_frame, detections=detections, labels=labels
        )
        if results['masks'] is not None:
            annotated_frame = sv.MaskAnnotator().annotate(
                scene=annotated_frame, detections=detections
            )

        output_path = os.path.join(OUTPUT_DIR, "result.jpg")
        cv2.imwrite(output_path, annotated_frame)
        return output_path

    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"Error processing image: {str(e)}"


def process_video(video_path, prompt=VID_PROMPT):
    """Run DINO-X detection on a video, annotating every 3rd frame.

    Frames that are not sent to the API are now written to the output
    unannotated, so the result keeps the source duration and frame rate.
    (The original dropped them entirely, producing a video that played
    roughly 3x too fast at the declared fps.)

    Args:
        video_path: path to a local video file.
        prompt: '.'-separated class prompt.

    Returns:
        Path of the annotated video on success, or an error-message string
        on failure.
    """
    cap = None
    out = None
    try:
        client = initialize_dino_client()
        _, class_name_to_id = get_class_mappings(prompt)

        cap = cv2.VideoCapture(video_path)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Some containers report 0 fps; fall back to a sane default so the
        # VideoWriter is not created with an invalid frame rate.
        fps = int(cap.get(cv2.CAP_PROP_FPS)) or 30

        output_path = os.path.join(OUTPUT_DIR, "result.mp4")
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        frame_count = 0
        temp_frame_path = os.path.join(TEMP_DIR, "temp_frame.jpg")

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            # Only every 3rd frame goes to the cloud API (cost/latency);
            # the rest pass through unannotated to preserve timing.
            if frame_count % 3 != 0:
                out.write(frame)
                continue

            cv2.imwrite(temp_frame_path, frame)
            image_url = client.upload_file(temp_frame_path)
            task = DinoxTask(
                image_url=image_url,
                prompts=[TextPrompt(text=prompt)],
                bbox_threshold=0.25
            )
            client.run_task(task)

            results = process_predictions(task.result.objects, class_name_to_id)
            detections = sv.Detections(
                xyxy=results['boxes'],
                class_id=results['class_ids']
            )
            labels = [
                f"{name} {conf:.2f}"
                for name, conf in zip(results['class_names'], results['confidences'])
            ]

            annotated_frame = sv.BoxAnnotator().annotate(scene=frame.copy(), detections=detections)
            annotated_frame = sv.LabelAnnotator().annotate(
                scene=annotated_frame, detections=detections, labels=labels
            )
            out.write(annotated_frame)

        if os.path.exists(temp_frame_path):
            os.remove(temp_frame_path)
        return output_path

    except Exception as e:
        return f"Error processing video: {str(e)}"
    finally:
        # Always release capture/writer handles, even when a mid-loop
        # exception occurs (the original leaked both on error).
        if cap is not None:
            cap.release()
        if out is not None:
            out.release()


def process_input(input_file, prompt=TEXT_PROMPT):
    """Dispatch an uploaded file to the image or video pipeline by extension.

    Args:
        input_file: value from gr.File — either a tempfile wrapper exposing
            .name or (in newer Gradio versions) a plain path string.
        prompt: detection prompt forwarded to the chosen pipeline.

    Returns:
        Result path from the pipeline, or an error-message string.
    """
    if input_file is None:
        return "Please provide an input file"

    # Newer Gradio versions pass a plain str path instead of a file object.
    file_path = input_file if isinstance(input_file, str) else input_file.name
    extension = os.path.splitext(file_path)[1].lower()

    if extension in ('.jpg', '.jpeg', '.png'):
        return process_image(file_path, prompt)
    if extension in ('.mp4', '.avi', '.mov'):
        return process_video(file_path, prompt)
    return "Unsupported file format. Please use jpg/jpeg/png for images or mp4/avi/mov for videos."


# Create Gradio interface.
# NOTE(review): video results are .mp4 paths but the output component is
# gr.Image, so annotated videos will not render correctly in the UI —
# consider gr.File or gr.Video; left unchanged to preserve the interface.
demo = gr.Interface(
    fn=process_input,
    inputs=[
        gr.File(
            label="Upload Image/Video",
            file_types=["image", "video"]
        ),
        gr.Textbox(
            label="Detection Prompt",
            value=TEXT_PROMPT,
            lines=2
        )
    ],
    outputs=gr.Image(label="Detection Result"),
    title="DINO-X Object Detection",
    description="Upload an image or video to detect objects using DINO-X. You can modify the detection prompt to specify what objects to look for.",
    examples=[
        ["assets/demo.png", TEXT_PROMPT],
        ["assets/demo.mp4", VID_PROMPT]
    ],
    cache_examples=True
)

if __name__ == "__main__":
    demo.launch()