Ashrafb committed on
Commit
00465c6
·
verified ·
1 Parent(s): 44ad5df

Rename app.py to main.py

Browse files
Files changed (1) hide show
  1. app.py → main.py +20 -55
app.py → main.py RENAMED
@@ -1162,68 +1162,33 @@ from threading import Thread
1162
  from transformers import TextIteratorStreamer
1163
  import hashlib
1164
  import os
 
 
 
 
 
 
 
1165
 
1166
  model_path = snapshot_download("vikhyatk/moondream1")
1167
  vision_encoder = VisionEncoder(model_path).to(DEVICE, dtype=DTYPE)
1168
  text_model = TextModel(model_path).to(DEVICE, dtype=DTYPE)
1169
 
1170
 
1171
def cached_vision_encoder(image):
    """Encode *image* with the vision encoder, memoizing the result on disk.

    The cache key is a SHA-256 over the raw pixel bytes plus the image's
    size and mode, so two images whose pixel buffers happen to be
    byte-identical but have different shapes/modes do not collide.

    Returns the image embedding tensor on DEVICE with dtype DTYPE.
    """
    # Include size/mode in the digest: PIL's tobytes() carries no shape
    # metadata, so e.g. a 100x200 and a 200x100 image could otherwise
    # share a hash and return the wrong cached embedding.
    hasher = hashlib.sha256(image.tobytes())
    hasher.update(f"{image.size}{image.mode}".encode())
    image_hash = hasher.hexdigest()

    # Check if `image_encoder_cache/{image_hash}.pt` exists, if so load and return it.
    # Otherwise, save the encoded image to `image_encoder_cache/{image_hash}.pt` and return it.
    cache_path = f"image_encoder_cache/{image_hash}.pt"
    if os.path.exists(cache_path):
        # map_location="cpu" keeps the load safe regardless of the device the
        # tensor was saved from; it is moved to the target device afterwards.
        return torch.load(cache_path, map_location="cpu").to(DEVICE, dtype=DTYPE)
    else:
        # Store the embedding as float16 on CPU to keep cache files small.
        image_vec = vision_encoder(image).to("cpu", dtype=torch.float16)
        os.makedirs("image_encoder_cache", exist_ok=True)
        torch.save(image_vec, cache_path)
        return image_vec.to(DEVICE, dtype=DTYPE)
1185
 
 
 
 
 
 
 
 
1186
 
1187
def answer_question(image, question):
    """Stream the model's answer to *question* about *image*.

    Yields progressively longer partial answers as tokens arrive, so the UI
    can render the response incrementally.  The final yield is always the
    complete, cleaned answer.
    """
    yield "Encoding image..."

    streamer = TextIteratorStreamer(text_model.tokenizer, skip_special_tokens=True)
    generation_kwargs = dict(
        image_embeds=cached_vision_encoder(image), question=question, streamer=streamer
    )
    # Run generation in a background thread; the streamer feeds tokens back
    # to this generator as they are produced.
    thread = Thread(target=text_model.answer_question, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        if len(buffer) > 1:
            # Strip a trailing end-of-answer marker ("END") and any partial
            # "<" of a special token still being streamed in.
            yield re.sub("<$", "", re.sub("END$", "", buffer))

    # Reap the generation thread, and always emit the final cleaned buffer:
    # the in-loop `len(buffer) > 1` guard would otherwise swallow answers of
    # length <= 1 entirely.
    thread.join()
    yield re.sub("<$", "", re.sub("END$", "", buffer))
1202
-
1203
-
1204
# Build and launch the Gradio demo UI: one image + one free-form question in,
# a streamed text answer out.  `answer_question` is a generator, so Gradio
# renders the answer incrementally as tokens arrive.
gr.Interface(
    title="🌔 moondream1",
    description="""
    moondream1 is a tiny (1.6B parameter) vision language model trained by
    <a href="https://x.com/vikhyatk">@vikhyatk</a> that performs on par with
    models twice its size. It is trained on the LLaVa training dataset, and
    initialized with SigLIP as the vision tower and Phi-1.5 as the text encoder.
    Check out the <a href="https://huggingface.co/vikhyatk/moondream1">HuggingFace
    model card</a> for more details.
    """,
    fn=answer_question,
    inputs=[gr.Image(type="pil"), gr.Textbox(lines=2, label="Question")],
    # Example images are opened eagerly at import time; cache_examples=False
    # below keeps the app from pre-running the model on them at startup.
    examples=[
        [Image.open("assets/demo-1.jpg"), "Who is the author of this book?"],
        [Image.open("assets/demo-2.jpg"), "What type of food is the girl eating?"],
        [
            Image.open("assets/demo-3.jpg"),
            "What kind of public transportation is in the image?",
        ],
        [Image.open("assets/demo-4.jpg"), "What is the girl looking at?"],
        [Image.open("assets/demo-5.jpg"), "What kind of dog is in the picture?"],
    ],
    outputs=gr.TextArea(label="Answer"),
    allow_flagging="never",
    cache_examples=False,
).launch()
 
1162
  from transformers import TextIteratorStreamer
1163
  import hashlib
1164
  import os
1165
+ from fastapi import FastAPI, File, UploadFile, Form
1166
+ from PIL import Image
1167
+ from io import BytesIO
1168
+ from typing import List
1169
+ from pydantic import BaseModel
1170
+ from fastapi.responses import HTMLResponse, FileResponse
1171
+ from fastapi.staticfiles import StaticFiles
1172
 
1173
# Download the model weights once (cached locally by huggingface_hub) and
# build the two halves of moondream1: the vision encoder and the text model.
# NOTE(review): DEVICE and DTYPE are presumably defined earlier in the file —
# they are not visible in this chunk.
model_path = snapshot_download("vikhyatk/moondream1")
vision_encoder = VisionEncoder(model_path).to(DEVICE, dtype=DTYPE)
text_model = TextModel(model_path).to(DEVICE, dtype=DTYPE)
1176
 
1177
 
 
 
 
1178
 
1179
# Define a FastAPI app serving the question-answering endpoint plus the
# static frontend.
app = FastAPI()


# Define route for answering questions.
@app.post("/upload/")
async def answer(image: UploadFile = File(...), Question: str = Form(...)):
    """Accept an uploaded image and a form-field question; return the answer.

    `Question` keeps its capitalized spelling because it is the public form
    field name clients already send.
    """
    image_bytes = await image.read()
    pil_image = Image.open(BytesIO(image_bytes))
    # NOTE(review): the Gradio-era answer_question was a *generator* yielding
    # progressively longer partial answers.  If the current implementation
    # still streams, returning it directly would put a non-serializable
    # generator in the JSON body — so drain it and keep the last (complete)
    # value.  For a plain string result this is a no-op.
    result = answer_question(pil_image, Question)
    if hasattr(result, "__iter__") and not isinstance(result, (str, bytes)):
        final = ""
        for partial in result:
            final = partial
        result = final
    return {"answer": result}


@app.get("/")
def index() -> FileResponse:
    # NOTE(review): absolute "/app/static/..." presumably matches the
    # container working directory of the deployment — confirm it agrees with
    # StaticFiles(directory="static") below.
    return FileResponse(path="/app/static/index.html", media_type="text/html")


# Mount the static files LAST: FastAPI matches in registration order, so a
# mount on "/" registered before @app.get("/") would shadow the index route.
app.mount("/", StaticFiles(directory="static", html=True), name="static")