joy-caption-pre-alpha

Sleeping

bobber committed on Dec 28, 2024

Commit

83f96b0

verified ·

1 Parent(s): eb70836

Update app.py

add vlm_prompt

Files changed (1) hide show

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ import os
 CLIP_PATH = "google/siglip-so400m-patch14-384"
-VLM_PROMPT = "A descriptive caption for this image:\n"
 MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
 CHECKPOINT_PATH = Path("wpkklhc6")
 TITLE = "<h1><center>JoyCaption Pre-Alpha (2024-07-30a)</center></h1>"
@@ -63,7 +63,7 @@ image_adapter.to("cuda")
 @spaces.GPU()
 @torch.no_grad()
-def stream_chat(input_image: Image.Image):
 	torch.cuda.empty_cache()
 	# Preprocess image
@@ -71,7 +71,10 @@ def stream_chat(input_image: Image.Image):
 	image = image.to('cuda')
 	# Tokenize the prompt
-	prompt = tokenizer.encode(VLM_PROMPT, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
 	# Embed image
 	with torch.amp.autocast_mode.autocast('cuda', enabled=True):
@@ -121,8 +124,18 @@ with gr.Blocks() as demo:
 		with gr.Column():
 			output_caption = gr.Textbox(label="Caption")
-	run_button.click(fn=stream_chat, inputs=[input_image], outputs=[output_caption])
 if __name__ == "__main__":

 CLIP_PATH = "google/siglip-so400m-patch14-384"
+VLM_PROMPT = "A descriptive caption for this image:"
 MODEL_PATH = "meta-llama/Meta-Llama-3.1-8B"
 CHECKPOINT_PATH = Path("wpkklhc6")
 TITLE = "<h1><center>JoyCaption Pre-Alpha (2024-07-30a)</center></h1>"
 @spaces.GPU()
 @torch.no_grad()
+def stream_chat(input_image: Image.Image, vlm_prompt):
 	torch.cuda.empty_cache()
 	# Preprocess image
 	image = image.to('cuda')
 	# Tokenize the prompt
+    if not vlm_prompt:
+        vlm_prompt = VLM_PROMPT
+    vlm_prompt = vlm_prompt + "\n"
+	prompt = tokenizer.encode(vlm_prompt, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
 	# Embed image
 	with torch.amp.autocast_mode.autocast('cuda', enabled=True):
 		with gr.Column():
 			output_caption = gr.Textbox(label="Caption")
+    with gr.Row():
+        vlm_prompt = gr.Text(
+            label="VLM Prompt",
+            show_label=False,
+            max_lines=1,
+            placeholder="Enter your VLM prompt",
+            container=False,
+            value="A descriptive caption for this image:",
+        )
+	run_button.click(fn=stream_chat, inputs=[input_image, vlm_prompt], outputs=[output_caption])
 if __name__ == "__main__":