Spaces:

howard-hou
/

VisualRWKV-Gradio-1

Runtime error

howard-hou committed on Jan 7, 2024

Commit

88c85c4

1 Parent(s): 9ee77be

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import os
 os.environ["RWKV_JIT_ON"] = '1'
 os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
 from modeling_rwkv import RWKV
 import gc
@@ -14,6 +15,7 @@ from huggingface_hub import hf_hub_download
 from pynvml import *
 nvmlInit()
 gpu_h = nvmlDeviceGetHandleByIndex(0)
 ctx_limit = 3500
 title = 'ViusualRWKV-v5'
@@ -36,8 +38,7 @@ vision_local_path = hf_hub_download(repo_id="howard-hou/visualrwkv-5", filename=
 vision_state_dict = torch.load(vision_local_path, map_location='cpu')
 visual_encoder.load_state_dict(vision_state_dict)
 image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
-if torch.cuda.is_available():
-    visual_encoder = visual_encoder.cuda()
 ##########################################################################
 def generate_prompt(instruction):
     instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
@@ -135,6 +136,7 @@ def compute_image_state(image):
         image_state = image_cache[base64_image]
     else:
         image = image_processor(images=image.convert('RGB'), return_tensors='pt')['pixel_values']
         image_features = visual_encoder.encode_images(image.unsqueeze(0)).squeeze(0) # [L, D]
         # apply layer norm to image feature, very important
         image_features = F.layer_norm(image_features,

 import os
 os.environ["RWKV_JIT_ON"] = '1'
 os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
+# make sure cuda dir is in the same level as modeling_rwkv.py
 from modeling_rwkv import RWKV
 import gc
 from pynvml import *
 nvmlInit()
 gpu_h = nvmlDeviceGetHandleByIndex(0)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 ctx_limit = 3500
 title = 'ViusualRWKV-v5'
 vision_state_dict = torch.load(vision_local_path, map_location='cpu')
 visual_encoder.load_state_dict(vision_state_dict)
 image_processor = CLIPImageProcessor.from_pretrained(vision_tower_name)
+visual_encoder = visual_encoder.to(device)
 ##########################################################################
 def generate_prompt(instruction):
     instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
         image_state = image_cache[base64_image]
     else:
         image = image_processor(images=image.convert('RGB'), return_tensors='pt')['pixel_values']
+        image = image.to(device)
         image_features = visual_encoder.encode_images(image.unsqueeze(0)).squeeze(0) # [L, D]
         # apply layer norm to image feature, very important
         image_features = F.layer_norm(image_features,