{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "!pip install -q gradio_client gradio torch torchaudio transformers"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "CA8lr7CjQnU5",
        "outputId": "1a2dfb9c-8e3d-4177-cc19-e17ceb6da284"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.5/16.5 MB\u001b[0m \u001b[31m89.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.9/92.9 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.7/138.7 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m381.9/381.9 kB\u001b[0m \u001b[31m24.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.7/59.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m76.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h  Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
            "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
            "lida 0.0.10 requires kaleido, which is not installed.\n",
            "llmx 0.0.15a0 requires cohere, which is not installed.\n",
            "llmx 0.0.15a0 requires openai, which is not installed.\n",
            "llmx 0.0.15a0 requires tiktoken, which is not installed.\n",
            "tensorflow-probability 0.22.0 requires typing-extensions<4.6.0, but you have typing-extensions 4.8.0 which is incompatible.\u001b[0m\u001b[31m\n",
            "\u001b[0m"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import gradio as gr\n",
        "from transformers import pipeline\n",
        "import numpy as np\n",
        "import requests\n",
        "from PIL import Image\n",
        "from transformers import BlipProcessor, BlipForQuestionAnswering\n",
        "\n",
        "transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n",
        "\n",
        "processor = BlipProcessor.from_pretrained(\"jaimik69/blip_finetuned\")\n",
        "model = BlipForQuestionAnswering.from_pretrained(\"jaimik69/blip_finetuned\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ExsvuGVbQpHK",
        "outputId": "98d83b01-4cd0-4507-bc93-942e998a6c85"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "\n",
        "def transcribe(audio, img_url):\n",
        "    sr, y = audio\n",
        "    y = y.astype(np.float32)\n",
        "    y /= np.max(np.abs(y))\n",
        "    prompt = transcriber({\"sampling_rate\": sr, \"raw\": y})[\"text\"]\n",
        "    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')\n",
        "    inputs = processor(raw_image, text=prompt, return_tensors=\"pt\")\n",
        "    generated_ids = model.generate(**inputs, max_new_tokens=10)\n",
        "    aqa_ans = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()\n",
        "\n",
        "    return prompt, aqa_ans, gr.Image(img_url)\n",
        "\n",
        "demo = gr.Interface(\n",
        "    transcribe,\n",
        "    [gr.Audio(sources=[\"microphone\"], label = \"User Audio Input\"), gr.Textbox(label=\"User Image URL\")],\n",
        "    [gr.Textbox(label=\"Question\"), gr.Textbox(label=\"Answer\"), gr.Image(label = \"User Image\")],\n",
        "    title = 'Vox Helios', theme = 'dark-grass',\n",
        "    description = 'An Audio Question Answering Project'\n",
        ")\n",
        "\n",
        "if __name__ == \"__main__\":\n",
        "    demo.launch(debug=True,auth=(\"sai\", \"letmein\"))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 719
        },
        "id": "V0vKhLvCQyUj",
        "outputId": "9ce5a8c8-26aa-4dd7-caba-56583799f6a0"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py:528: UserWarning: Cannot load dark-grass. Caught Exception: The space dark-grass does not exist\n",
            "  warnings.warn(f\"Cannot load {theme}. Caught Exception: {str(e)}\")\n"
          ]
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n",
            "\n",
            "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n",
            "Running on public URL: https://90c36f56b801550f7e.gradio.live\n",
            "\n",
            "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "<div><iframe src=\"https://90c36f56b801550f7e.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Keyboard interruption in main thread... closing server.\n",
            "Killing tunnel 127.0.0.1:7861 <> https://90c36f56b801550f7e.gradio.live\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "8nnM-fTOhVex"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}