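# Gradio demo for VisualRWKV-v5.0: a CLIP ViT-L/14-336 visual encoder feeding
# image embeddings into an RWKV-1B5 language model for image question answering.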
import gradio as gr
import os, gc
from datetime import datetime
from transformers import CLIPImageProcessor
from huggingface_hub import hf_hub_download
from typing import List, Dict
from dataclasses import dataclass
DEFAULT_IMAGE_TOKEN = "<image>"
ctx_limit = 3500
num_image_embeddings = 4096
title = "rwkv1b5-vitl336p14-577token_mix665k_rwkv"
vision_tower_name = 'openai/clip-vit-large-patch14-336'
os.environ["RWKV_JIT_ON"] = '1'
os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)
from rwkv.model import RWKV
model_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=f"{title}.pth")
model = RWKV(model=model_path, strategy='cpu fp32')
from rwkv.utils import PIPELINE, PIPELINE_ARGS
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
##########################################################################
from modeling import VisualEncoder, EmbeddingMixer, VisualEncoderConfig
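# EmbeddingMixer grafts projected image features into a reserved block of rows in
# RWKV's token-embedding table, so an image can be fed to the language model as a
# run of ordinary token ids (see test() below).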
emb_mixer = EmbeddingMixer(model.w["emb.weight"],
                           num_image_embeddings=num_image_embeddings)
config = VisualEncoderConfig(n_embd=model.args.n_embd,
                             vision_tower_name=vision_tower_name,
                             grid_size=-1)
visual_encoder = VisualEncoder(config)
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
##########################################################################
def generate_prompt(instruction):
    # Normalize line endings and collapse blank lines before applying the chat template.
    instruction = instruction.strip().replace('\r\n', '\n').replace('\n\n', '\n')
    return f"\n{instruction}\n\nAssistant:"
def generate(
    ctx,
    image_ids,
    token_count=200,
    temperature=1.0,
    top_p=0.7,
    presencePenalty=0.1,
    countPenalty=0.1,
):
    args = PIPELINE_ARGS(temperature=max(0.2, float(temperature)), top_p=float(top_p),
                         alpha_frequency=countPenalty,
                         alpha_presence=presencePenalty,
                         token_ban=[],    # ban the generation of some tokens
                         token_stop=[0])  # stop generation whenever you see any token here
    ctx = ctx.strip()
    all_tokens = []
    out_last = 0
    out_str = ''
    occurrence = {}
    state = None
    for i in range(int(token_count)):
        # On the first step, feed the image-token ids followed by the encoded prompt
        # (truncated to the context limit); afterwards, feed back the sampled token.
        if i == 0:
            input_ids = (image_ids + pipeline.encode(ctx))[-ctx_limit:]
        else:
            input_ids = [token]
        out, state = model.forward(input_ids, state)
        # Apply presence/frequency penalties to tokens that have already appeared.
        for n in occurrence:
            out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
        token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
        if token in args.token_stop:
            break
        all_tokens += [token]
        # Decay old penalty counts, then record the new token.
        for xxx in occurrence:
            occurrence[xxx] *= 0.996
        if token not in occurrence:
            occurrence[token] = 1
        else:
            occurrence[token] += 1
        # Only emit text once it decodes cleanly (no UTF-8 replacement character
        # from a partially generated multi-byte sequence).
        tmp = pipeline.decode(all_tokens[out_last:])
        if '\ufffd' not in tmp:
            out_str += tmp
            yield out_str.strip()
            out_last = i + 1
    del out
    del state
    gc.collect()
    yield out_str.strip()
##########################################################################
cur_dir = os.path.dirname(os.path.abspath(__file__))
examples = [
    [
        f"{cur_dir}/examples_extreme_ironing.jpg",
        "What is unusual about this image?",
    ],
    [
        f"{cur_dir}/examples_waterview.jpg",
        "What are the things I should be cautious about when I visit here?",
    ],
]
def test(image, question):
    # Preprocess the uploaded image and encode it into RWKV-sized embeddings.
    image = image_processor(images=image.convert('RGB'), return_tensors='pt')['pixel_values']
    image_features = visual_encoder.encode_images(image.unsqueeze(0))
    # Write the image embeddings into their reserved rows of the embedding table,
    # then address them by their token ids.
    emb_mixer.set_image_embeddings(image_features)
    model.w["emb.weight"] = emb_mixer.get_input_embeddings()
    image_ids = list(range(emb_mixer.image_start_index,
                           emb_mixer.image_start_index + len(image_features)))
    input_text = generate_prompt(question)
    for output in generate(input_text, image_ids):
        yield output
demo = gr.Interface(fn=test,
                    inputs=[gr.Image(type='pil'), "text"],
                    outputs="text",
                    examples=examples,
                    title=title,
                    description="VisualRWKV-v5.0",
                    live=True)
demo.queue(concurrency_count=1, max_size=10)
demo.launch(share=False, enable_queue=True)
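# Note: queue(concurrency_count=...) and launch(enable_queue=...) follow the
# Gradio 3.x API this Space targets; on Gradio 4+ the rough equivalent would be
# demo.queue(max_size=10, default_concurrency_limit=1) and demo.launch(share=False).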