File size: 2,342 Bytes
1325e72
aa819ab
 
 
1325e72
03d2f46
161b347
03d2f46
161b347
03d2f46
 
aa819ab
61f2a3d
03d2f46
 
 
 
 
 
2941c6d
a2fc719
1325e72
a2fc719
1325e72
 
 
 
 
161b347
 
 
 
1325e72
 
 
 
03d2f46
 
 
1325e72
 
 
 
 
 
03d2f46
 
 
 
161b347
 
03d2f46
a2fc719
03d2f46
 
161b347
 
 
2941c6d
161b347
03d2f46
161b347
2941c6d
 
 
 
161b347
 
 
 
03d2f46
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from gradio import ChatInterface, Textbox, Slider
from spaces import GPU
from threading import Thread
from torch import bfloat16
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor, TextIteratorStreamer, AutoProcessor, BatchFeature
from qwen_vl_utils import process_vision_info

# Hugging Face Hub repository the model and processor are loaded from.
model_path = "Pectics/Softie-VL-7B-250123"

# Load the vision-language model once at module import time (downloads the
# weights on first run). bfloat16 + flash-attention-2 reduce memory/latency;
# device_map="auto" lets transformers place the weights on available devices.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
# Bounds on the image area the processor will resize visual inputs into;
# the 28 * 28 factor presumably matches the model's vision patch size —
# TODO confirm against the Qwen2-VL processor docs.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28
processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)

@GPU
def infer(inputs: BatchFeature, **kwargs):
    """Move the prepared inputs to the GPU and run generation.

    Runs synchronously; callers that want streamed output should invoke
    this from a background thread (see `respond`).

    NOTE(review): this was previously `async def` but was called without
    `await`, so the coroutine never executed — `model.generate` never ran
    and the streamer never produced a single token.
    """
    inputs = inputs.to("cuda")
    model.generate(**inputs, **kwargs)

def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Gradio chat callback.

    Builds the full conversation (system prompt + history + new user turn),
    prepares text/image/video model inputs, then streams the generated
    reply, yielding the accumulated text after each new token.
    """
    messages = [{"role": "system", "content": system_message}]
    for m in history:
        messages.append({"role": m["role"], "content": m["content"]})
    messages.append({"role": "user", "content": message})
    text_inputs = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Collect any image/video attachments referenced by the messages.
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_inputs],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    kwargs = dict(
        streamer=streamer,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    # Generation must run in a background thread: TextIteratorStreamer only
    # yields tokens while generate() is producing them. A direct call would
    # block this generator until generation finished (and the previous
    # un-awaited async call never started generation at all, deadlocking
    # the `for token in streamer` loop below).
    thread = Thread(target=infer, args=(inputs,), kwargs=kwargs)
    thread.start()
    response = ""
    for token in streamer:
        response += token
        yield response
    thread.join()

# Chat UI: wire `respond` to a message-style chat box, plus extra controls
# for the system prompt and the sampling parameters it receives.
_system_box = Textbox(value="You are Softie, a helpful assistant.", label="系统设定")
_max_tokens = Slider(minimum=1, maximum=2048, value=512, step=1, label="最大生成长度")
_temperature = Slider(minimum=0.01, maximum=4.0, value=0.75, step=0.01, label="温度系数(Temperature)")
_top_p = Slider(minimum=0.01, maximum=1.0, value=0.5, step=0.01, label="核取样系数(Top-p)")

app = ChatInterface(
    respond,
    type="messages",
    additional_inputs=[_system_box, _max_tokens, _temperature, _top_p],
)

# Start the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":
    app.launch()