# -*- coding: utf-8 -*-

"""## hugging face funcs"""

import io

import matplotlib.pyplot as plt
import requests
import inflect
from PIL import Image


def load_image_from_url(url):
    return Image.open(requests.get(url, stream=True).raw)


def render_results_in_image(in_pil_img, in_results):
    plt.figure(figsize=(16, 10))
    plt.imshow(in_pil_img)
    ax = plt.gca()

    for prediction in in_results:
        x, y = prediction['box']['xmin'], prediction['box']['ymin']
        w = prediction['box']['xmax'] - prediction['box']['xmin']
        h = prediction['box']['ymax'] - prediction['box']['ymin']

        ax.add_patch(plt.Rectangle((x, y), w, h, fill=False, color="green", linewidth=2))
        ax.text(
            x,
            y,
            f"{prediction['label']}: {round(prediction['score']*100, 1)}%",
            color='red'
        )

    plt.axis("off")

    # Save the modified image to a BytesIO object
    img_buf = io.BytesIO()
    plt.savefig(img_buf, format='png', bbox_inches='tight', pad_inches=0)
    img_buf.seek(0)
    modified_image = Image.open(img_buf)

    # Close the plot to prevent it from being displayed
    plt.close()

    return modified_image


def summarize_predictions_natural_language(predictions):
    summary = {}
    p = inflect.engine()

    for prediction in predictions:
        label = prediction['label']
        if label in summary:
            summary[label] += 1
        else:
            summary[label] = 1

    result_string = "In this image, there are "
    for i, (label, count) in enumerate(summary.items()):
        count_string = p.number_to_words(count)
        result_string += f"{count_string} {label}"
        if count > 1:
            result_string += "s"

        result_string += " "

        if i == len(summary) - 2:
            result_string += "and "

    # Remove the trailing comma and space
    result_string = result_string.rstrip(', ') + "."

    return result_string
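# render_results_in_image and summarize_predictions_natural_language assume the
# output format of the transformers object-detection pipeline: a list of dicts
# with 'score', 'label' and a 'box' of pixel coordinates. A minimal sketch of
# that shape (the values below are made up for illustration, not real model output):
_example_detections = [
    {"score": 0.98, "label": "cat",
     "box": {"xmin": 40, "ymin": 70, "xmax": 310, "ymax": 330}},
    {"score": 0.91, "label": "remote",
     "box": {"xmin": 330, "ymin": 75, "xmax": 370, "ymax": 185}},
]
# e.g. summarize_predictions_natural_language(_example_detections)
# returns "In this image, there are one cat and one remote."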
##### To ignore warnings #####
import warnings
import logging
from transformers import logging as hf_logging


def ignore_warnings():
    # Ignore specific Python warnings
    warnings.filterwarnings("ignore", message="Some weights of the model checkpoint")
    warnings.filterwarnings("ignore", message="Could not find image processor class")
    warnings.filterwarnings("ignore", message="The `max_size` parameter is deprecated")

    # Adjust logging for libraries using the logging module
    logging.basicConfig(level=logging.ERROR)
    hf_logging.set_verbosity_error()


########
import numpy as np
import torch
import matplotlib.pyplot as plt


def show_mask(mask, ax, random_color=False):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30/255, 144/255, 255/255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


def show_box(box, ax):
    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))


def show_boxes_on_image(raw_image, boxes):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()


def show_points_on_image(raw_image, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    plt.axis('on')
    plt.show()


def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None):
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()


def show_points(coords, labels, ax, marker_size=375):
    pos_points = coords[labels == 1]
    neg_points = coords[labels == 0]
    ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*',
               s=marker_size, edgecolor='white', linewidth=1.25)
    ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*',
               s=marker_size, edgecolor='white', linewidth=1.25)


def fig2img(fig):
    """Convert a Matplotlib figure to a PIL Image and return it"""
    import io
    buf = io.BytesIO()
    fig.savefig(buf)
    buf.seek(0)
    img = Image.open(buf)
    return img


def show_mask_on_image(raw_image, mask, return_image=False):
    if not isinstance(mask, torch.Tensor):
        mask = torch.Tensor(mask)

    if len(mask.shape) == 4:
        mask = mask.squeeze()

    fig, axes = plt.subplots(1, 1, figsize=(15, 15))

    mask = mask.cpu().detach()
    axes.imshow(np.array(raw_image))
    show_mask(mask, axes)
    axes.axis("off")
    plt.show()

    if return_image:
        fig = plt.gcf()
        return fig2img(fig)


def show_pipe_masks_on_image(raw_image, outputs, return_image=False):
    plt.imshow(np.array(raw_image))
    ax = plt.gca()
    for mask in outputs["masks"]:
        show_mask(mask, ax=ax, random_color=True)
    plt.axis("off")
    plt.show()

    if return_image:
        fig = plt.gcf()
        return fig2img(fig)
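# A quick, optional sanity check for the display helpers above, kept behind a
# flag so nothing extra runs on import. The blank image, the square mask and
# the point/box coordinates are synthetic placeholders, not model output.
RUN_HELPER_DEMO = False  # assumption: set to True to try the demo locally
if RUN_HELPER_DEMO:
    _demo_image = Image.new("RGB", (256, 256), color="white")
    _demo_mask = np.zeros((256, 256), dtype=np.float32)
    _demo_mask[64:192, 64:192] = 1.0  # a centred square "segment"
    # show_mask_on_image expects a single (H, W) mask per call
    show_mask_on_image(_demo_image, _demo_mask)
    # points are [x, y] pixel coordinates, boxes are [xmin, ymin, xmax, ymax]
    show_points_and_boxes_on_image(_demo_image, [[64, 64, 192, 192]], [[100, 100]])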
"""## imports"""

from transformers import pipeline
from transformers import SamModel, SamProcessor
from transformers import BlipForImageTextRetrieval
from transformers import AutoProcessor
from transformers.utils import logging

logging.set_verbosity_error()
# ignore_warnings()

import io
import matplotlib.pyplot as plt
import requests
import inflect
from PIL import Image
import os
import gradio as gr
import time

"""# Object detection

## hugging face model ("facebook/detr-resnet-50"), 167MB
"""

od_pipe = pipeline("object-detection", "facebook/detr-resnet-50")
chosen_model = pipeline("object-detection", "hustvl/yolos-small")

"""## gradio funcs"""

def get_object_detection_prediction(model_name, raw_image):
    model = od_pipe
    if "chosen-model" in model_name:
        model = chosen_model

    start = time.time()
    pipeline_output = model(raw_image)
    end = time.time()

    elapsed_result = f'{model_name} object detection elapsed {end-start} seconds'
    print(elapsed_result)

    processed_image = render_results_in_image(raw_image, pipeline_output)
    return [processed_image, elapsed_result]

"""# Image segmentation

## hugging face models: Zigeng/SlimSAM-uniform-77 (segmentation) 39MB, Intel/dpt-hybrid-midas (depth) 490MB
"""

hugging_face_segmentation_pipe = pipeline("mask-generation", "Zigeng/SlimSAM-uniform-77")
hugging_face_segmentation_model = SamModel.from_pretrained("Zigeng/SlimSAM-uniform-77")
hugging_face_segmentation_processor = SamProcessor.from_pretrained("Zigeng/SlimSAM-uniform-77")
hugging_face_depth_estimator = pipeline(task="depth-estimation", model="Intel/dpt-hybrid-midas")

"""## chosen models: facebook/sam-vit-base (segmentation) 375MB, LiheYoung/depth-anything-small-hf (depth) 100MB"""

chosen_name = "facebook/sam-vit-base"
chosen_segmentation_pipe = pipeline("mask-generation", chosen_name)
chosen_segmentation_model = SamModel.from_pretrained(chosen_name)
chosen_segmentation_processor = SamProcessor.from_pretrained(chosen_name)
chosen_depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf")

"""## gradio funcs"""

# Single point prompt (pixel coordinates) used for the pretrained SAM models
input_points = [[[1600, 700]]]

def segment_image_pretrained(model_name, raw_image):
    processor = hugging_face_segmentation_processor
    model = hugging_face_segmentation_model
    if "chosen" in model_name:
        processor = chosen_segmentation_processor
        model = chosen_segmentation_model

    start = time.time()
    inputs = processor(raw_image, input_points=input_points, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    predicted_masks = processor.image_processor.post_process_masks(
        outputs.pred_masks,
        inputs["original_sizes"],
        inputs["reshaped_input_sizes"]
    )
    predicted_mask = predicted_masks[0]
    end = time.time()

    elapsed_result = f'{model_name} pretrained image segmentation elapsed {end-start} seconds'
    print(elapsed_result)

    results = []
    for i in range(3):
        results.append(show_mask_on_image(raw_image, predicted_mask[:, i], return_image=True))
    results.append(elapsed_result)
    return results

def segment_image(model_name, raw_image):
    model = hugging_face_segmentation_pipe
    if "chosen" in model_name:
        print("chosen model used")
        model = chosen_segmentation_pipe

    start = time.time()
    output = model(raw_image, points_per_batch=32)
    end = time.time()

    elapsed_result = f'{model_name} raw image segmentation elapsed {end-start} seconds'
    print(elapsed_result)

    return [show_pipe_masks_on_image(raw_image, output, return_image=True), elapsed_result]

def depth_image(model_name, input_image):
    depth_estimator = hugging_face_depth_estimator
    print(model_name)
    if "chosen" in model_name:
        print("chosen model used")
        depth_estimator = chosen_depth_estimator

    start = time.time()
    out = depth_estimator(input_image)
    prediction = torch.nn.functional.interpolate(
        out["predicted_depth"].unsqueeze(0).unsqueeze(0),
        size=input_image.size[::-1],
        mode="bicubic",
        align_corners=False,
    )
    end = time.time()

    elapsed_result = f'{model_name} Depth Estimation elapsed {end-start} seconds'
    print(elapsed_result)

    output = prediction.squeeze().numpy()
    formatted = (output * 255 / np.max(output)).astype("uint8")
    depth = Image.fromarray(formatted)
    return [depth, elapsed_result]
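# The gradio callbacks above pick a model by substring match on the dropdown
# label: names containing "chosen" route to the alternative model, everything
# else falls back to the hugging-face default. A guarded sketch of calling them
# directly; the COCO sample URL is only an example input, not part of the app.
RUN_SEGMENTATION_DEMO = False  # assumption: set to True to try the demo locally
if RUN_SEGMENTATION_DEMO:
    _demo_img = load_image_from_url("http://images.cocodataset.org/val2017/000000039769.jpg")
    _masks_img, _seg_time = segment_image("hugging-face(Zigeng/SlimSAM-uniform-77)", _demo_img)
    _depth_img, _depth_time = depth_image("chosen-model(LiheYoung/depth-anything-small-hf)", _demo_img)
    print(_seg_time, _depth_time)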
"""# Image retrieval

## hugging face model: Salesforce/blip-itm-base-coco 900MB
"""

hugging_face_retrieval_model = BlipForImageTextRetrieval.from_pretrained(
    "Salesforce/blip-itm-base-coco")
hugging_face_retrieval_processor = AutoProcessor.from_pretrained(
    "Salesforce/blip-itm-base-coco")

"""## chosen model: Salesforce/blip-itm-base-flickr 900MB"""

chosen_retrieval_model = BlipForImageTextRetrieval.from_pretrained(
    "Salesforce/blip-itm-base-flickr")
chosen_retrieval_processor = AutoProcessor.from_pretrained(
    "Salesforce/blip-itm-base-flickr")

"""## gradio func"""

def retrieve_image(model_name, raw_image, predict_text):
    processor = hugging_face_retrieval_processor
    model = hugging_face_retrieval_model
    if "chosen" in model_name:
        processor = chosen_retrieval_processor
        model = chosen_retrieval_model

    # Time both the preprocessing and the model forward pass
    start = time.time()
    inputs = processor(images=raw_image, text=predict_text, return_tensors="pt")
    itm_scores = model(**inputs)[0]
    itm_score = torch.nn.functional.softmax(itm_scores, dim=1)
    end = time.time()

    elapsed_result = f"{model_name} image retrieval elapsed {end-start} seconds"
    print(elapsed_result)

    return [f"""\
The image and text are matched \
with a probability of {itm_score[0][1]:.4f}""", elapsed_result]
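# BlipForImageTextRetrieval returns one logit pair per image-text pair from its
# image-text-matching head; softmax over the last dimension gives
# [no-match, match] probabilities, which is why retrieve_image reads
# itm_score[0][1]. A guarded standalone sketch (caption and URL are arbitrary examples):
RUN_RETRIEVAL_DEMO = False  # assumption: set to True to try the demo locally
if RUN_RETRIEVAL_DEMO:
    _demo_img = load_image_from_url("http://images.cocodataset.org/val2017/000000039769.jpg")
    _inputs = hugging_face_retrieval_processor(
        images=_demo_img, text="two cats lying on a couch", return_tensors="pt")
    with torch.no_grad():
        _logits = hugging_face_retrieval_model(**_inputs)[0]  # shape (1, 2)
    _probs = torch.nn.functional.softmax(_logits, dim=1)
    print(f"match probability: {_probs[0][1]:.4f}")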
"""# gradio"""

with gr.Blocks() as object_detection_tab:
    gr.Markdown("# Detect objects on image")
    gr.Markdown("Upload an image, choose model, press button.")

    with gr.Row():
        with gr.Column():
            # Input components
            input_image = gr.Image(label="Upload Image", type="pil")
            model_selector = gr.Dropdown(
                ["hugging-face(facebook/detr-resnet-50)", "chosen-model(hustvl/yolos-small)"],
                label="Select Model")
        with gr.Column():
            # Output components
            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
            output_image = gr.Image(label="Output Image", type="pil")

    # Process button
    process_btn = gr.Button("Detect objects")

    # Connect the input components to the processing function
    process_btn.click(
        fn=get_object_detection_prediction,
        inputs=[model_selector, input_image],
        outputs=[output_image, elapsed_result]
    )

with gr.Blocks() as image_segmentation_detection_tab:
    gr.Markdown("# Image segmentation")
    gr.Markdown("Upload an image, choose model, press button.")

    with gr.Row():
        with gr.Column():
            # Input components
            input_image = gr.Image(label="Upload Image", type="pil")
            model_selector = gr.Dropdown(
                ["hugging-face(Zigeng/SlimSAM-uniform-77)", "chosen-model(facebook/sam-vit-base)"],
                label="Select Model")
        with gr.Column():
            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)
            # Output image
            output_image = gr.Image(label="Segmented image", type="pil")

    with gr.Row():
        with gr.Column():
            segment_btn = gr.Button("Segment image(not pretrained)")

    with gr.Row():
        elapsed_result_pretrained_segment = gr.Textbox(label="Seconds elapsed", lines=1)
        with gr.Column():
            segment_pretrained_output_image_1 = gr.Image(label="Segmented image by pretrained model", type="pil")
        with gr.Column():
            segment_pretrained_output_image_2 = gr.Image(label="Segmented image by pretrained model", type="pil")
        with gr.Column():
            segment_pretrained_output_image_3 = gr.Image(label="Segmented image by pretrained model", type="pil")

    with gr.Row():
        with gr.Column():
            segment_pretrained_model_selector = gr.Dropdown(
                ["hugging-face(Zigeng/SlimSAM-uniform-77)", "chosen-model(facebook/sam-vit-base)"],
                label="Select Model")
            segment_pretrained_btn = gr.Button("Segment image(pretrained)")

    with gr.Row():
        with gr.Column():
            depth_output_image = gr.Image(label="Depth image", type="pil")
            elapsed_result_depth = gr.Textbox(label="Seconds elapsed", lines=1)

    with gr.Row():
        with gr.Column():
            depth_model_selector = gr.Dropdown(
                ["hugging-face(Intel/dpt-hybrid-midas)", "chosen-model(LiheYoung/depth-anything-small-hf)"],
                label="Select Model")
            depth_btn = gr.Button("Get image depth")

    segment_btn.click(
        fn=segment_image,
        inputs=[model_selector, input_image],
        outputs=[output_image, elapsed_result]
    )

    segment_pretrained_btn.click(
        fn=segment_image_pretrained,
        inputs=[segment_pretrained_model_selector, input_image],
        outputs=[segment_pretrained_output_image_1,
                 segment_pretrained_output_image_2,
                 segment_pretrained_output_image_3,
                 elapsed_result_pretrained_segment]
    )

    depth_btn.click(
        fn=depth_image,
        inputs=[depth_model_selector, input_image],
        outputs=[depth_output_image, elapsed_result_depth]
    )

with gr.Blocks() as image_retrieval_tab:
    gr.Markdown("# Check if text describes image")
    gr.Markdown("Upload an image, choose model, press button.")

    with gr.Row():
        with gr.Column():
            # Input components
            input_image = gr.Image(label="Upload Image", type="pil")
            text_prediction = gr.TextArea(label="Describe image")
            model_selector = gr.Dropdown(
                ["hugging-face(Salesforce/blip-itm-base-coco)", "chosen-model(Salesforce/blip-itm-base-flickr)"],
                label="Select Model")
        with gr.Column():
            # Output components
            output_result = gr.Textbox(label="Probability result", lines=3)
            elapsed_result = gr.Textbox(label="Seconds elapsed", lines=1)

    # Process button
    process_btn = gr.Button("Check if text matches image")

    # Connect the input components to the processing function
    process_btn.click(
        fn=retrieve_image,
        inputs=[model_selector, input_image, text_prediction],
        outputs=[output_result, elapsed_result]
    )

with gr.Blocks() as app:
    gr.TabbedInterface(
        [object_detection_tab, image_segmentation_detection_tab, image_retrieval_tab],
        ["Object detection", "Image segmentation", "Retrieve image"],
    )

app.launch(share=True, debug=True)
app.close()