yuukicammy committed
Commit a6394d0 · 1 Parent(s): d3e071f

Changed the app so that it can run on Modal.

.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__
README.md DELETED
@@ -1,11 +0,0 @@
- ---
- title: Vit Gpt2 Image Captioning
- emoji: 👀
- colorFrom: blue
- colorTo: blue
- sdk: gradio
- sdk_version: 3.27.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
app.py DELETED
@@ -1,4 +0,0 @@
- import gradio as gr
-
- gr.Interface.load("models/nlpconnect/vit-gpt2-image-captioning").launch()
-
frontend/app.jsx ADDED
@@ -0,0 +1,119 @@
+ function Spinner({ config }) {
+   const ref = React.useRef(null);
+
+   React.useEffect(() => {
+     const spinner = new Spin.Spinner({
+       lines: 13,
+       color: "#ffffff",
+       ...config,
+     });
+     spinner.spin(ref.current);
+     return () => spinner.stop();
+   }, [ref]);
+
+   return <span ref={ref} />;
+ }
+
+ function Result({ callId, selectedFile }) {
+   const [result, setResult] = React.useState();
+   const [intervalId, setIntervalId] = React.useState();
+
+   React.useEffect(() => {
+     if (result) {
+       clearInterval(intervalId);
+       return;
+     }
+
+     const _intervalID = setInterval(async () => {
+       const resp = await fetch(`/result/${callId}`);
+       if (resp.status === 200) {
+         setResult(await resp.json());
+       }
+     }, 100);
+
+     setIntervalId(_intervalID);
+
+     return () => clearInterval(_intervalID);
+   }, [result]);
+
+   return (
+     <div className="flex items-center content-center justify-center space-x-4">
+       <img src={URL.createObjectURL(selectedFile)} className="h-[300px]" />
+       {!result && <Spinner config={{}} />}
+       {result && (
+         <p className="w-[200px] p-4 bg-zinc-200 rounded-lg whitespace-pre-wrap text-xs font-mono">
+           {JSON.stringify(result, undefined, 1)}
+         </p>
+       )}
+     </div>
+   );
+ }
+
+ function Form({ onSubmit, onFileSelect, selectedFile }) {
+   return (
+     <form className="flex flex-col space-y-4 items-center">
+       <div className="text-2xl font-semibold text-gray-700">ViT-GPT2 Image Captioning</div>
+       <input
+         accept="image/*"
+         type="file"
+         name="file"
+         onChange={onFileSelect}
+         className="block w-full text-sm text-gray-900 bg-gray-50 rounded-lg border border-gray-300 cursor-pointer"
+       />
+       {selectedFile ? (
+         <img src={URL.createObjectURL(selectedFile)} className="h-[300px]" />
+       ) : null}
+       <div>
+         <button
+           type="button"
+           onClick={onSubmit}
+           disabled={!selectedFile}
+           className="bg-indigo-400 disabled:bg-zinc-500 hover:bg-indigo-600 text-white font-bold py-2 px-4 rounded text-sm"
+         >
+           Upload
+         </button>
+       </div>
+     </form>
+   );
+ }
+
+ function App() {
+   const [selectedFile, setSelectedFile] = React.useState();
+   const [callId, setCallId] = React.useState();
+
+   const handleSubmission = async () => {
+     const formData = new FormData();
+     formData.append("image", selectedFile);
+
+     const resp = await fetch("/parse", {
+       method: "POST",
+       body: formData,
+     });
+
+     if (resp.status !== 200) {
+       throw new Error("An error occurred: " + resp.status);
+     }
+     const body = await resp.json();
+     setCallId(body.call_id);
+   };
+
+   return (
+     <div className="absolute inset-0 bg-gradient-to-r from-indigo-300 via-purple-300 to-pink-300">
+       <div className="mx-auto max-w-md py-8">
+         <main className="rounded-xl bg-white p-6">
+           {!callId && (
+             <Form
+               onSubmit={handleSubmission}
+               onFileSelect={(e) => setSelectedFile(e.target.files[0])}
+               selectedFile={selectedFile}
+             />
+           )}
+           {callId && <Result callId={callId} selectedFile={selectedFile} />}
+         </main>
+       </div>
+     </div>
+   );
+ }
+
+ const container = document.getElementById("react");
+ ReactDOM.createRoot(container).render(<App />);
frontend/index.html ADDED
@@ -0,0 +1,22 @@
+ <!DOCTYPE html>
+ <html lang="en">
+
+ <head>
+   <meta charset="utf-8" />
+   <meta name="viewport" content="width=device-width, initial-scale=1" />
+   <title>ViT-GPT2 Image Captioning powered by Modal</title>
+   <script src="https://cdn.tailwindcss.com"></script>
+   <script crossorigin src="https://unpkg.com/react@18/umd/react.development.js"></script>
+   <script crossorigin src="https://unpkg.com/react-dom@18/umd/react-dom.development.js"></script>
+   <script crossorigin src="https://unpkg.com/@babel/standalone/babel.min.js"></script>
+   <script crossorigin src="https://spin.js.org/spin.umd.js"></script>
+   <link rel="stylesheet" href="https://spin.js.org/spin.css" />
+ </head>
+
+ <body class="bg-gray-50">
+   <noscript>You must have JavaScript enabled to use this app.</noscript>
+   <script type="text/babel" src="/app.jsx"></script>
+   <div id="react"></div>
+ </body>
+
+ </html>
vit_gpt2_image_caption_webapp.py ADDED
@@ -0,0 +1,43 @@
+ from pathlib import Path
+
+ import fastapi
+ import fastapi.staticfiles
+
+ from modal import Function, Mount, Stub, asgi_app
+
+ stub = Stub("vit-gpt2-image-caption-webapp")
+ web_app = fastapi.FastAPI()
+
+
+ @web_app.post("/parse")
+ async def parse(request: fastapi.Request):
+     predict_step = Function.lookup("vit-gpt2-image-captioning", "predict_step")
+
+     form = await request.form()
+     image = await form["image"].read()  # type: ignore
+     call = predict_step.spawn(image)
+     return {"call_id": call.object_id}
+
+
+ @web_app.get("/result/{call_id}")
+ async def poll_results(call_id: str):
+     from modal.functions import FunctionCall
+
+     function_call = FunctionCall.from_id(call_id)
+     try:
+         result = function_call.get(timeout=0)
+     except TimeoutError:
+         return fastapi.responses.JSONResponse(content="", status_code=202)
+
+     return result[0]
+
+
+ assets_path = Path(__file__).parent / "frontend"
+
+
+ @stub.function(mounts=[Mount.from_local_dir(assets_path, remote_path="/assets")])
+ @asgi_app()
+ def wrapper():
+     web_app.mount("/", fastapi.staticfiles.StaticFiles(directory="/assets", html=True))
+
+     return web_app
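The two routes above form a spawn-then-poll pattern: POST /parse starts the captioning call with spawn and returns a call_id immediately, while GET /result/{call_id} answers 202 until the result is ready. This keeps every HTTP response fast even though inference takes seconds. Below is a minimal client sketch of that contract, assuming the app is being served (e.g. via "modal serve vit_gpt2_image_caption_webapp.py"); the base URL, the local image path, and the requests dependency are illustrative assumptions, not part of the commit.

# Hypothetical client for the spawn/poll contract above (a sketch, not part of the commit).
import time

import requests  # assumed third-party dependency

BASE_URL = "http://localhost:8000"  # placeholder for wherever the app is served

with open("sample.png", "rb") as f:  # hypothetical local image
    resp = requests.post(f"{BASE_URL}/parse", files={"image": f})
resp.raise_for_status()
call_id = resp.json()["call_id"]

# Poll until the backend stops answering 202 ("result not ready yet").
while True:
    poll = requests.get(f"{BASE_URL}/result/{call_id}")
    if poll.status_code == 200:
        print(poll.json())  # the first generated caption
        break
    time.sleep(0.5)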
vit_gpt2_image_captioning.py ADDED
@@ -0,0 +1,65 @@
+ # https://huggingface.co/nlpconnect/vit-gpt2-image-captioning
+
+ import urllib.request
+ import modal
+
+ stub = modal.Stub("vit-gpt2-image-captioning")
+ volume = modal.SharedVolume().persist("shared_vol")
+ CACHE_PATH = "/root/model_cache"
+
+
+ @stub.function(
+     gpu="any",
+     image=modal.Image.debian_slim().pip_install("Pillow", "transformers", "torch"),
+     shared_volumes={CACHE_PATH: volume},
+     retries=3,
+ )
+ def predict_step(image):
+     import io
+     from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+     import torch
+     from PIL import Image
+
+     model = VisionEncoderDecoderModel.from_pretrained(
+         "nlpconnect/vit-gpt2-image-captioning"
+     )
+     feature_extractor = ViTImageProcessor.from_pretrained(
+         "nlpconnect/vit-gpt2-image-captioning"
+     )
+     tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model.to(device)
+
+     max_length = 16
+     num_beams = 4
+     gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
+     input_img = Image.open(io.BytesIO(image))
+     pixel_values = feature_extractor(
+         images=[input_img], return_tensors="pt"
+     ).pixel_values
+     pixel_values = pixel_values.to(device)
+
+     output_ids = model.generate(pixel_values, **gen_kwargs)
+
+     preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
+     preds = [pred.strip() for pred in preds]
+     return preds
+
+
+ @stub.local_entrypoint()
+ def main():
+     from pathlib import Path
+
+     image_filepath = Path(__file__).parent / "sample.png"
+     if image_filepath.exists():
+         with open(image_filepath, "rb") as f:
+             image = f.read()
+     else:
+         try:
+             image = urllib.request.urlopen(
+                 "https://drive.google.com/uc?id=0B0TjveMhQDhgLTlpOENiOTZ6Y00&export=download"
+             ).read()
+         except urllib.error.URLError as e:
+             raise SystemExit(e.reason)
+     print(predict_step.call(image)[0])
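Because predict_step is deployed as its own Modal app, other code can resolve it by name at runtime, which is exactly how the web app above reaches it via Function.lookup. Below is a minimal sketch of calling the deployed function directly, assuming "modal deploy vit_gpt2_image_captioning.py" has already been run and that a local sample.png exists; both are assumptions, not shown in the commit.

# Hypothetical direct call to the deployed captioning function (a sketch).
from modal import Function

# Resolve by (app name, function name), same as the web app does.
predict_step = Function.lookup("vit-gpt2-image-captioning", "predict_step")

with open("sample.png", "rb") as f:  # hypothetical local image path
    image_bytes = f.read()

# Blocking remote call; predict_step returns a list of stripped captions.
captions = predict_step.call(image_bytes)
print(captions[0])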