# aisatsu-api / main.py
# Author: vumichien (Hugging Face Space)
# Note: the original upload carried blob-viewer residue here
# ("Update main.py / 08ce8d2 / raw / history / blame / 2.03 kB"),
# which is not Python and has been converted into this comment header.
from ultralytics import YOLO
from base64 import b64encode
from speech_recognition import AudioFile, Recognizer
import numpy as np
from scipy.spatial import distance as dist
from sahi.utils.cv import read_image_as_pil
from fastapi import FastAPI, File, UploadFile, Form
from utils import tts, read_image_file, pil_to_base64, base64_to_pil, get_hist
from typing import Optional
from huggingface_hub import hf_hub_download
# Download the YOLOv8-small checkpoint from the Hugging Face Hub (cached locally)
# and load it; this runs once at import/startup time.
model_path = hf_hub_download(repo_id="ultralyticsplus/yolov8s", filename='yolov8s.pt')
model = YOLO(model_path)
# Mapping of class index -> class name; downstream code treats index 0 as the
# person class (COCO ordering — presumably, confirm against the checkpoint).
CLASS = model.model.names
# Default greeting the bot speaks ("Good morning" in Japanese).
# NOTE(review): name has a typo ("defaul"); kept as-is because it is referenced below.
defaul_bot_voice = "γŠγ―γ„γ‚ˆγ†γ”γ–γ„γΎγ™"
# Minimum fraction of the frame a detected person's box must cover to trigger a greeting.
area_thres = 0.3
app = FastAPI()
@app.get("/")
def read_root():
    """Health-check endpoint: confirms the API process is up."""
    status = {"Message": "Application startup complete"}
    return status
@app.post("/aisatsu_api/")
async def predict_api(
        file: UploadFile = File(...),
        last_seen: Optional[str] = Form(None)
):
    """Detect the closest person in the uploaded image and decide whether to greet.

    Args:
        file: Uploaded image the detector runs on.
        last_seen: Optional base64-encoded crop of the previously greeted
            person, used to avoid greeting the same person twice in a row.

    Returns:
        dict with:
            "voice": TTS audio for the greeting, or None when no greeting fires.
            "image": base64 64x64 crop of the closest detected person, or None.
    """
    # Histogram-distance threshold: values below it mean "same person as last_seen".
    # Named once here — previously the literal 0.5 was duplicated as both the
    # diff_value default and the comparison threshold.
    diff_thres = 0.5

    image = read_image_file(await file.read())
    results = model.predict(image, show=False)[0]
    image = read_image_as_pil(image)
    boxes = results.boxes  # fix: dropped unused `masks` local
    area_image = image.width * image.height

    voice_bot = None
    most_close = 0
    out_img = None
    # Default passes the similarity check when there is no last_seen comparison.
    diff_value = diff_thres
    if boxes is not None:
        # Keep the crop of the person whose box covers the largest share of the frame.
        for xyxy, conf, cls in zip(boxes.xyxy, boxes.conf, boxes.cls):
            if int(cls) != 0:  # person class only (index 0 — presumably COCO; confirm)
                continue
            box = xyxy.tolist()
            area_rate = (box[2] - box[0]) * (box[3] - box[1]) / area_image
            if area_rate >= most_close:
                out_img = image.crop(tuple(box)).resize((64, 64))
                most_close = area_rate
    if last_seen is not None:
        last_seen = base64_to_pil(last_seen)
        if out_img is not None:
            # Color-histogram distance: small value => same person still in view.
            diff_value = dist.euclidean(get_hist(out_img), get_hist(last_seen))
    print(most_close, diff_value)
    # Greet only when someone is close enough AND looks different from last_seen.
    if most_close >= area_thres and diff_value >= diff_thres:
        voice_bot = tts(defaul_bot_voice, language="ja")
    return {
        "voice": voice_bot,
        "image": pil_to_base64(out_img) if out_img is not None else None
    }