# NOTE: page-scrape residue, not part of the program (kept as comments so the file parses).
# Hugging Face Space status: Runtime error
# File size: 5,533 Bytes
# Commits: a23f4af e63f3e2 bba21a6 e63f3e2 cb26eb3 e63f3e2
import gradio as gr
import argparse
import datetime
import json
import os
import time
import gradio as gr
import requests
from PIL import Image
from q_align.model.builder import load_pretrained_model
from q_align.conversation import (default_conversation, conv_templates,
SeparatorStyle)
from q_align.constants import LOGDIR
from q_align.utils import (build_logger, server_error_msg,
violates_moderation, moderation_msg)
from q_align.evaluate.scorer import QAlignScorer, QAlignAestheticScorer, QAlignVideoScorer
import gradio as gr
def load_video(video_file):
    """Sample a video at roughly one frame per second and return PIL images.

    Args:
        video_file: Video source passed straight to ``decord.VideoReader``
            (presumably a file path, as supplied by the Gradio video input).

    Returns:
        A list of ``PIL.Image.Image`` objects, one per whole second of video.
        If the clip is shorter than one second, or the reader reports a
        non-positive frame rate, the first frame alone is returned so the
        caller still has something to score.
    """
    # Imported inside the function, so decord is only loaded when this is called.
    from decord import VideoReader

    vr = VideoReader(video_file)
    num_frames = len(vr)
    # Average frame rate of the video as reported by decord.
    fps = vr.get_avg_fps()

    if fps > 0:
        # Frame index at the start of each whole second (1 fps sampling).
        frame_indices = [int(fps * second) for second in range(int(num_frames / fps))]
    else:
        # A zero/invalid frame rate would otherwise divide by zero below.
        frame_indices = []

    if not frame_indices and num_frames > 0:
        # Sub-second clip (or unusable fps): fall back to the first frame
        # instead of handing an empty frame list to the scorer.
        frame_indices = [0]

    frames = vr.get_batch(frame_indices).asnumpy()
    # Iterating the array walks its first axis, i.e. one decoded frame at a time.
    return [Image.fromarray(frame) for frame in frames]
# Checkpoint identifier and target device handed to the model loader.
pretrained = "q-future/one-align"
device = "cuda:0"

# One set of weights is loaded once and shared by all three scorers below.
tokenizer, model, image_processor, _ = load_pretrained_model(
    pretrained,
    None,
    "mplug_owl2",
    device=device,
)

# Keyword arguments common to every scorer constructor.
_shared_components = dict(
    tokenizer=tokenizer,
    model=model,
    image_processor=image_processor,
)
iqa_scorer = QAlignScorer(**_shared_components)
iaa_scorer = QAlignAestheticScorer(**_shared_components)
vqa_scorer = QAlignVideoScorer(**_shared_components)

# Maps the task name shown in the UI to the scorer that handles it.
scorers = {
    "Image Aesthetics (IAA)": iaa_scorer,
    "Image Quality (IQA)": iqa_scorer,
    "Video Quality (VQA)": vqa_scorer,
}

# Rating-level labels and their numeric values, in matching order.
LEVELS = [
    "excellent (5)",
    "good (4)",
    "fair (3)",
    "poor (2)",
    "bad (1)",
]
scores = [5, 4, 3, 2, 1]
def image_classifier(input_img, input_vid, scorer_type):
    """Score an uploaded image or video with the selected Q-Align scorer.

    Args:
        input_img: Uploaded image, or ``None`` if no image was provided.
        input_vid: Uploaded video (passed to ``load_video``), or ``None``.
            When present it takes precedence over ``input_img``.
        scorer_type: Key into ``scorers``; ``None`` selects
            ``"Image Quality (IQA)"``.

    Returns:
        A ``(prob_dict, score)`` tuple: ``prob_dict`` maps each entry of
        ``LEVELS`` to its probability, and ``score`` is the
        probability-weighted sum of ``scores``.

    Raises:
        gr.Error: If neither an image nor a video was uploaded.
    """
    if scorer_type is None:
        scorer_type = "Image Quality (IQA)"
    this_scorer = scorers[scorer_type]

    if input_vid is not None:
        input_ = load_video(input_vid)
    elif input_img is not None:
        input_ = [input_img]
    else:
        # Previously this path left `input_` unbound and crashed with
        # UnboundLocalError; surface a readable message in the UI instead.
        raise gr.Error("Please upload an image or a video before submitting.")

    if "Video" in scorer_type:
        # The video scorer is given a list of videos, each a list of frames.
        input_ = [input_]

    # NOTE(review): presumably the scorer returns one row of level
    # probabilities per input item; mean(0) averages over that first axis.
    probs = this_scorer(input_).mean(0).tolist()
    prob_dict = dict(zip(LEVELS, probs))
    score = sum(weight * prob for weight, prob in zip(scores, probs))
    return prob_dict, score
# HTML header for the demo page: paper title, author list with affiliations,
# and project badge links. Passed to gr.Interface as its `description`.
title_markdown = ("""
<h3 align="center">Q-Align: Teaching LMMs for Visual Scoring via Discrete Text-Defined Levels</h3>
<h3 align="center"> One Unified Model for Visual scoring. </h3>
<h5 align="center">
<a href="https://teowu.github.io/" target="_blank">Haoning Wu</a><sup>1</sup><sup>*</sup><sup>+</sup>,
<a href="https://github.com/zzc-1998" target="_blank">Zicheng Zhang</a><sup>2</sup><sup>*</sup>,
<a href="https://sites.google.com/view/r-panda" target="_blank">Weixia Zhang</a><sup>2</sup>,
<a href="https://chaofengc.github.io" target="_blank">Chaofeng Chen</a><sup>1</sup>,
<a href="https://liaoliang92.github.io" target="_blank">Liang Liao</a><sup>1</sup>,
<a href="https://github.com/lcysyzxdxc" target="_blank">Chunyi Li</a><sup>2</sup>,
</h5>
<h5 align="center">
<a href="https://github.com/YixuanGao98" target="_blank">Yixuan Gao</a><sup>2</sup>,
<a href="https://github.com/AnnanWangDaniel" target="_blank">Annan Wang</a><sup>1</sup>,
<a href="https://github.com/ZhangErliCarl/" target="_blank">Erli Zhang</a><sup>1</sup>,
<a href="https://wenxiusun.com" target="_blank">Wenxiu Sun</a><sup>3</sup>,
<a href="https://scholar.google.com/citations?user=uT9CtPYAAAAJ&hl=en" target="_blank">Qiong Yan</a><sup>3</sup>,
<a href="https://sites.google.com/site/minxiongkuo/" target="_blank">Xiongkuo Min</a><sup>2</sup>,
<a href="https://ee.sjtu.edu.cn/en/FacultyDetail.aspx?id=24&infoid=153&flag=153" target="_blank">Guangtao Zhai</a><sup>2</sup><sup>#</sup>,
<a href="https://personal.ntu.edu.sg/wslin/Home.html" target="_blank">Weisi Lin</a><sup>1</sup><sup>#</sup>
</h5>
<h5 align="center">
<sup>1</sup>Nanyang Technological University, <sup>2</sup>Shanghai Jiao Tong University, <sup>3</sup>Sensetime Research
</h5>
<h5 align="center">
<sup>*</sup>Equal contribution. <sup>+</sup>Project Lead. <sup>#</sup>Corresponding author(s).
</h5>
<h4 align="center"> If you like the OneScorer, please give us a star ✨ on <a href='https://github.com/Q-Future/Q-Align'>GitHub</a> for latest update. </h4>
<h5 align="center">
<div style="display:flex; gap: 0.25rem;" align="center">
<a href='https://q-align.github.io'><img src='https://img.shields.io/badge/Homepage-green'></a>
<a href='https://github.com/Q-Future/Q-Align'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
<a href="https://Q-Future.github.io/Q-Align/fig/Q_Align_v0_1_preview.pdf"><img src="https://img.shields.io/badge/Technical-Report-red"></a>
<a href='https://github.com/Q-Future/Q-Align/stargazers'><img src='https://img.shields.io/github/stars/Q-Future/Q-Align.svg?style=social'></a>
</div>
</h5>
""")
# Input widgets: an image (delivered to the handler as a PIL image) and a video.
input_img = gr.Image(type='pil', label="Upload an Image")
input_vid = gr.Video(label="Upload a Video (will IGNORE the image if a video is uploaded)")

# Output widgets: per-level probabilities and the aggregated numeric score.
labels = gr.Label(label="Probabilities of rating levels:")
number = gr.Number(label="Output score:", info="Range in [1,5]. Higher is better.")

# Task selector; the choices must match the keys of `scorers`.
task_selector = gr.Radio(
    ["Image Aesthetics (IAA)", "Image Quality (IQA)", "Video Quality (VQA)"],
    label="Task",
    info="Which Scorer will you need?",
)

demo = gr.Interface(
    fn=image_classifier,
    inputs=[input_img, input_vid, task_selector],
    outputs=[labels, number],
    title="OneScorer",
    description=title_markdown,
)
# The stray trailing "|" that followed this call was page-scrape residue and
# made the file a syntax error; it has been removed.
demo.launch(share=True)