howard-hou committed
Commit 786e086 · Parent: c71bb52

Update app.py

Files changed (1):
  1. app.py (+21 -23)
app.py CHANGED
@@ -1,32 +1,34 @@
 import gradio as gr
 import os, gc
-from datetime import datetime
+import torch
 from transformers import CLIPImageProcessor
 from huggingface_hub import hf_hub_download
-DEFAULT_IMAGE_TOKEN = "<image>"
-
 
 ctx_limit = 3500
 num_image_embeddings = 4096
-title = "rwkv1b5-vitl336p14-577token_mix665k_rwkv"
+title = 'VisualRWKV-v5'
+rwkv_remote_path = "rwkv1b5-vitl336p14-577token_mix665k_rwkv.pth"
+vision_remote_path = "rwkv1b5-vitl336p14-577token_mix665k_visual.pth"
 vision_tower_name = 'openai/clip-vit-large-patch14-336'
 
 os.environ["RWKV_JIT_ON"] = '1'
 os.environ["RWKV_CUDA_ON"] = '0' # if '1' then use CUDA kernel for seq mode (much faster)
 
-from modeling import UpdatableRWKV, VisualEncoder, EmbeddingMixer, VisualEncoderConfig
-model_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=f"{title}.pth")
-model = UpdatableRWKV(model=model_path, strategy='cpu fp32')
+from modeling_vision import VisionEncoder, VisionEncoderConfig
+from modeling_rwkv import RWKV
+model_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=rwkv_remote_path)
+model = RWKV(model=model_path, strategy='cpu fp32')
 from rwkv.utils import PIPELINE, PIPELINE_ARGS
 pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
 
 ##########################################################################
-emb_mixer = EmbeddingMixer(model.w["emb.weight"],
-                           num_image_embeddings=num_image_embeddings)
-config = VisualEncoderConfig(n_embd=model.args.n_embd,
+config = VisionEncoderConfig(n_embd=model.args.n_embd,
                              vision_tower_name=vision_tower_name,
                              grid_size=-1)
-visual_encoder = VisualEncoder(config)
+visual_encoder = VisionEncoder(config)
+vision_local_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=vision_remote_path)
+vision_state_dict = torch.load(vision_local_path, map_location='cpu')
+visual_encoder.load_state_dict(vision_state_dict)
 image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
 ##########################################################################
 def generate_prompt(instruction):
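This hunk splits the old single-checkpoint UpdatableRWKV setup into two checkpoints from the same repo: the language model (rwkv_remote_path) loaded through modeling_rwkv.RWKV, and the vision encoder (vision_remote_path) restored with load_state_dict. A quick way to sanity-check the vision checkpoint before loading it, using only the download and load calls already shown in the diff (the printed keys depend on whatever the checkpoint actually contains):

```python
import torch
from huggingface_hub import hf_hub_download

# Download the vision checkpoint exactly as app.py does.
vision_local_path = hf_hub_download(
    repo_id="howard-hou/visualrwkv-5",
    filename="rwkv1b5-vitl336p14-577token_mix665k_visual.pth",
)
state_dict = torch.load(vision_local_path, map_location='cpu')

# List a few parameter names and shapes; assumes the values are tensors.
for name, tensor in list(state_dict.items())[:5]:
    print(name, tuple(tensor.shape))
```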
@@ -35,7 +37,7 @@ def generate_prompt(instruction):
 
 def generate(
     ctx,
-    image_ids,
+    image_features,
     token_count=200,
     temperature=1.0,
     top_p=0.7,
@@ -52,14 +54,15 @@ def generate(
     out_last = 0
     out_str = ''
     occurrence = {}
-    state = None
-    print(model.w["emb.weight"].shape)
     for i in range(int(token_count)):
         if i == 0:
-            input_ids = (image_ids + pipeline.encode(ctx))[-ctx_limit:]
+            input_ids = pipeline.encode(ctx)
+            text_embs = model.w['emb.weight'][input_ids]
+            input_embs = torch.cat((image_features, text_embs), dim=0)[-ctx_limit:]
+            out, state = model.forward(embs=input_embs, state=None)
         else:
             input_ids = [token]
-        out, state = model.forward(input_ids, state)
+            out, state = model.forward(input_ids, state)
         for n in occurrence:
             out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
 
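This hunk is the core of the commit: instead of addressing image content through reserved token ids, the first forward pass now feeds embeddings directly. Text ids are looked up in emb.weight, the image features are prepended, and the joint sequence is truncated to ctx_limit; per the diff, the embs= keyword is the entry point modeling_rwkv exposes for this. A shape check with illustrative sizes (577 tokens and 2048 dimensions are assumptions for a ViT-L/336 grid with a 1.5B RWKV, not values read from the checkpoint):

```python
import torch

ctx_limit = 3500
image_features = torch.randn(577, 2048)   # [L, D] from visual_encoder.encode_images
text_embs = torch.randn(12, 2048)         # emb.weight rows for the encoded prompt
input_embs = torch.cat((image_features, text_embs), dim=0)[-ctx_limit:]
print(input_embs.shape)                   # torch.Size([589, 2048]), under ctx_limit
```

Note that the [-ctx_limit:] slice keeps the tail of the sequence, so a prompt long enough to overflow the context would truncate the image features first.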
@@ -100,14 +103,9 @@ examples = [
 ]
 def chatbot(image, question):
     image = image_processor(images=image.convert('RGB'), return_tensors='pt')['pixel_values']
-    image_features = visual_encoder.encode_images(image.unsqueeze(0))
-    emb_mixer.set_image_embeddings(image_features.squeeze(0))
-    model.update_emb_weight(emb_mixer.get_input_embeddings())
-    print(emb_mixer.get_input_embeddings().shape)
-    print(model.w["emb.weight"].shape)
-    image_ids = [i for i in range(emb_mixer.image_start_index, emb_mixer.image_start_index + len(image_features))]
+    image_features = visual_encoder.encode_images(image.unsqueeze(0)).squeeze(0) # [L, D]
     input_text = generate_prompt(question)
-    for output in generate(input_text, image_ids):
+    for output in generate(input_text, image_features):
         yield output
 
 with gr.Blocks(title=title) as demo:
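With the EmbeddingMixer plumbing and the debug prints gone, chatbot() reduces to encode, prompt, generate. It is a generator that yields the decoded text as it grows (each yield is the accumulated out_str), so it can be exercised outside Gradio; a minimal driver, with a placeholder image path and question:

```python
from PIL import Image

image = Image.open("example.jpg")   # placeholder path
answer = ""
for answer in chatbot(image, "Describe the image in detail."):
    pass                            # each yield is the full text decoded so far
print(answer)
```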
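Taken together, the commit swaps slot-patching for direct embedding injection: the old code appears to write image features into reserved rows of emb.weight (via EmbeddingMixer and update_emb_weight) and then reference them by synthetic ids, while the new code concatenates the features with text embeddings at the input. A self-contained sketch of why the two produce the same prefill while the new one never mutates the embedding table (all names and sizes are illustrative, not taken from the repo):

```python
import torch

vocab, n_embd, n_img = 65536, 2048, 577
emb_weight = torch.randn(vocab + n_img, n_embd)   # text rows plus reserved image rows
image_features = torch.randn(n_img, n_embd)
text_ids = [11, 42, 7]

# Old scheme: copy features into the reserved rows, address them by id.
emb_weight[vocab:vocab + n_img] = image_features
image_ids = list(range(vocab, vocab + n_img))
old_inputs = emb_weight[image_ids + text_ids]

# New scheme: concatenate features with text embeddings directly.
new_inputs = torch.cat((image_features, emb_weight[text_ids]), dim=0)

print(torch.allclose(old_inputs, new_inputs))  # True; same prefill, no table writes
```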