howard-hou committed
Commit 786e086 · 1 Parent(s): c71bb52
Update app.py
app.py CHANGED
@@ -1,32 +1,34 @@
 import gradio as gr
 import os, gc
-
+import torch
 from transformers import CLIPImageProcessor
 from huggingface_hub import hf_hub_download
-DEFAULT_IMAGE_TOKEN = "<image>"
-
 
 ctx_limit = 3500
 num_image_embeddings = 4096
-title =
+title = 'ViusualRWKV-v5'
+rwkv_remote_path = "rwkv1b5-vitl336p14-577token_mix665k_rwkv.pth"
+vision_remote_path = "rwkv1b5-vitl336p14-577token_mix665k_visual.pth"
 vision_tower_name = 'openai/clip-vit-large-patch14-336'
 
 os.environ["RWKV_JIT_ON"] = '1'
 os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)
 
-from
-
-
+from modeling_vision import VisionEncoder, VisionEncoderConfig
+from modeling_rwkv import RWKV
+model_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=rwkv_remote_path)
+model = RWKV(model=model_path, strategy='cpu fp32')
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
 
 ##########################################################################
-
-                             num_image_embeddings=num_image_embeddings)
-config = VisualEncoderConfig(n_embd=model.args.n_embd,
+config = VisionEncoderConfig(n_embd=model.args.n_embd,
                              vision_tower_name=vision_tower_name,
                              grid_size=-1)
-visual_encoder =
+visual_encoder = VisionEncoder(config)
+vision_local_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=vision_remote_path)
+vision_state_dict = torch.load(vision_local_path, map_location='cpu')
+visual_encoder.load_state_dict(vision_state_dict)
 image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
 ##########################################################################
 def generate_prompt(instruction):
@@ -35,7 +37,7 @@ def generate_prompt(instruction):
 
 def generate(
     ctx,
-
+    image_features,
     token_count=200,
     temperature=1.0,
     top_p=0.7,
@@ -52,14 +54,15 @@ def generate(
     out_last = 0
     out_str = ''
     occurrence = {}
-    state = None
-    print(model.w["emb.weight"].shape)
     for i in range(int(token_count)):
         if i == 0:
-            input_ids =
+            input_ids = pipeline.encode(ctx)
+            text_embs = model.w['emb.weight'][input_ids]
+            input_embs = torch.cat((image_features, text_embs), dim=0)[-ctx_limit:]
+            out, state = model.forward(embs=input_embs, state=None)
         else:
             input_ids = [token]
-
+            out, state = model.forward(input_ids, state)
         for n in occurrence:
             out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
 
@@ -100,14 +103,9 @@ examples = [
 ]
 def chatbot(image, question):
     image = image_processor(images=image.convert('RGB'), return_tensors='pt')['pixel_values']
-    image_features = visual_encoder.encode_images(image.unsqueeze(0))
-    emb_mixer.set_image_embeddings(image_features.squeeze(0))
-    model.update_emb_weight(emb_mixer.get_input_embeddings())
-    print(emb_mixer.get_input_embeddings().shape)
-    print(model.w["emb.weight"].shape)
-    image_ids = [i for i in range(emb_mixer.image_start_index, emb_mixer.image_start_index + len(image_features))]
+    image_features = visual_encoder.encode_images(image.unsqueeze(0)).squeeze(0) # [L, D]
     input_text = generate_prompt(question)
-    for output in generate(input_text,
+    for output in generate(input_text, image_features):
         yield output
 
 with gr.Blocks(title=title) as demo:
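Taken together, the change replaces the old EmbeddingMixer approach (patching model.w["emb.weight"] for every request) with a direct embedding prefix: the CLIP-based VisionEncoder turns the image into an [L, D] embedding sequence that is concatenated in front of the prompt's token embeddings and passed to RWKV through forward(embs=...). Below is a minimal standalone sketch of that flow; it assumes the Space's modeling_vision and modeling_rwkv modules and the checkpoints listed above are importable, and the image path, prompt string, and greedy decoding loop are illustrative placeholders rather than part of the commit.

# Minimal sketch of the visual-prefix inference path introduced here.
# Assumes modeling_vision / modeling_rwkv from this Space are on the path;
# "example.jpg" and the prompt are placeholders.
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import CLIPImageProcessor
from modeling_vision import VisionEncoder, VisionEncoderConfig
from modeling_rwkv import RWKV
from rwkv.utils import PIPELINE

vision_tower_name = 'openai/clip-vit-large-patch14-336'
model = RWKV(model=hf_hub_download("howard-hou/visualrwkv-5",
                                   "rwkv1b5-vitl336p14-577token_mix665k_rwkv.pth"),
             strategy='cpu fp32')
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")

config = VisionEncoderConfig(n_embd=model.args.n_embd,
                             vision_tower_name=vision_tower_name,
                             grid_size=-1)
visual_encoder = VisionEncoder(config)
visual_encoder.load_state_dict(torch.load(
    hf_hub_download("howard-hou/visualrwkv-5",
                    "rwkv1b5-vitl336p14-577token_mix665k_visual.pth"),
    map_location='cpu'))
image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)

# Encode the image once into an [L, D] sequence of RWKV-sized embeddings.
pixel_values = image_processor(images=Image.open("example.jpg").convert('RGB'),
                               return_tensors='pt')['pixel_values']
image_features = visual_encoder.encode_images(pixel_values.unsqueeze(0)).squeeze(0)

# First step: prepend the image embeddings to the prompt's token embeddings
# and feed the combined sequence to RWKV as raw embeddings instead of token ids.
input_ids = pipeline.encode("Describe the image.")
text_embs = model.w['emb.weight'][input_ids]
out, state = model.forward(embs=torch.cat((image_features, text_embs), dim=0), state=None)

# Later steps: ordinary token-by-token decoding (greedy here for brevity).
for _ in range(32):
    token = pipeline.sample_logits(out, temperature=1.0, top_p=0.0)
    out, state = model.forward([token], state)
    print(pipeline.decode([token]), end='')

Because the image features enter only through the embs argument of the first forward call, the embedding matrix never has to be modified, and every later decoding step runs over plain token ids exactly as in a text-only RWKV pipeline.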