BullseyeMxP committed
Commit 1a23e22 • Parent(s): a351b6b
Update app.py
app.py CHANGED
@@ -114,20 +114,25 @@ assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTr
 print("Loading LLM")
 print("Loading VLM's custom text model")
 
-# Configure 4-bit quantization
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_compute_dtype=torch.float16,
     bnb_4bit_use_double_quant=True,
 )
 
 text_model = AutoModelForCausalLM.from_pretrained(
     CHECKPOINT_PATH / "text_model",
     device_map="auto",
     quantization_config=bnb_config,
-    torch_dtype=torch.float16
 )
 text_model.gradient_checkpointing_enable()
 text_model.eval()
 text_model = torch.compile(text_model)
@@ -140,15 +145,27 @@ image_adapter.eval()
 image_adapter.to("cuda")
 image_adapter = torch.compile(image_adapter)
 
 @spaces.GPU()
 @torch.no_grad()
 def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str | int, extra_options: list[str], name_input: str, custom_prompt: str) -> tuple[str, str]:
     torch.cuda.empty_cache()
     gc.collect()
 
-    #
     length = None if caption_length == "any" else caption_length
-
     if isinstance(length, str):
         try:
             length = int(length)
@@ -176,57 +193,42 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str
 
     if custom_prompt.strip() != "":
         prompt_str = custom_prompt.strip()
-
-    # For debugging
-    print(f"Prompt: {prompt_str}")
 
-    #
     image = input_image.resize((384, 384), Image.LANCZOS)
-    image = image.convert('RGB')
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
-    pixel_values = TVF.normalize(pixel_values, [0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
     pixel_values = pixel_values.to('cuda', dtype=torch.float16)
 
-    #
-    with torch.amp.
         vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
         embedded_images = image_adapter(vision_outputs.hidden_states)
         embedded_images = embedded_images.to('cuda', dtype=torch.float16)
 
-    # Build the conversation
     convo = [
-        {
-            "role": "system",
-            "content": "You are a helpful image captioner.",
-        },
-        {
-            "role": "user",
-            "content": prompt_str,
-        },
     ]
 
-    # Format
-    convo_string = tokenizer.apply_chat_template(convo, tokenize
-    assert isinstance(convo_string, str)
-
-    # Tokenize the conversation
     convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False)
     prompt_tokens = tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False)
-
-    convo_tokens = convo_tokens.squeeze(0)
     prompt_tokens = prompt_tokens.squeeze(0)
 
-    # Calculate
     eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[0].tolist()
-
-
-    preamble_len = eot_id_indices[1] - prompt_tokens.shape[0]  # Number of tokens before the prompt
 
-    #
-    convo_tokens = convo_tokens.unsqueeze(0).to('cuda')
     convo_embeds = text_model.model.embed_tokens(convo_tokens)
 
-    # Construct the input
     input_embeds = torch.cat([
         convo_embeds[:, :preamble_len],
         embedded_images,
@@ -240,27 +242,31 @@ def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str
     ], dim=1)
     attention_mask = torch.ones_like(input_ids)
 
-    #
-
-
-    with torch.amp.autocast_mode.autocast('cuda', dtype=torch.float16):
         generate_ids = text_model.generate(
             input_ids,
             inputs_embeds=input_embeds,
             attention_mask=attention_mask,
             max_new_tokens=300,
             do_sample=True,
-
-
         )
 
-    #
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
 
-    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=
 
     torch.cuda.empty_cache()
     gc.collect()
 
@@ -275,7 +281,7 @@ def process_directory(directory_path, caption_type, caption_length, extra_option
         img_path = os.path.join(directory_path, filename)
         img = Image.open(img_path)
 
-
 
         # Save caption to a .txt file
         txt_filename = os.path.splitext(filename)[0] + '.txt'
@@ -284,9 +290,29 @@ def process_directory(directory_path, caption_type, caption_length, extra_option
             f.write(caption)
 
         processed_images.append(img_path)
-        captions.append(
 
-    return processed_images, captions
 
 # Custom CSS for a futuristic, neon-inspired theme
 custom_css = """
@@ -439,27 +465,7 @@ with gr.Blocks(css=custom_css) as demo:
 
     with gr.Row():
         output_gallery = gr.Gallery(label="Processed Images", elem_classes="output-box")
-        output_text = gr.
-
-    def process_and_display(images, caption_type, caption_length, extra_options, name_input, custom_prompt):
-        processed_images = []
-        captions = []
-
-        for img_file in images:
-            img = Image.open(img_file.name)
-            prompt, caption = stream_chat(img, caption_type, caption_length, extra_options, name_input, custom_prompt)
-            processed_images.append(img_file.name)
-            captions.append({"filename": img_file.name, "caption": caption})
-
-        return processed_images, captions
-
-    def process_input(input_images, directory_path, caption_type, caption_length, extra_options, name_input, custom_prompt):
-        if directory_path:
-            return process_directory(directory_path, caption_type, caption_length, extra_options, name_input, custom_prompt)
-        elif input_images:
-            return process_and_display(input_images, caption_type, caption_length, extra_options, name_input, custom_prompt)
-        else:
-            return [], []
 
     run_button.click(
         fn=process_input,
@@ +114,25 @@ (updated file)
 print("Loading LLM")
 print("Loading VLM's custom text model")
 
+# Configure 4-bit quantization with more aggressive settings
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
     bnb_4bit_compute_dtype=torch.float16,
     bnb_4bit_use_double_quant=True,
+    llm_int8_enable_fp32_cpu_offload=True
 )
 
 text_model = AutoModelForCausalLM.from_pretrained(
     CHECKPOINT_PATH / "text_model",
     device_map="auto",
     quantization_config=bnb_config,
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True
 )
+
+# Enable memory efficient attention
+text_model.config.use_memory_efficient_attention = True
 text_model.gradient_checkpointing_enable()
 text_model.eval()
 text_model = torch.compile(text_model)
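For reference, the settings above follow the standard bitsandbytes 4-bit recipe: weights stored as 4-bit NormalFloat (nf4), matmuls computed in float16, and double quantization to compress the quantization constants themselves. A minimal, self-contained sketch of loading a causal LM this way (the model id is a placeholder, not this Space's checkpoint, and note that llm_int8_enable_fp32_cpu_offload is nominally an int8-path option, so its effect on the 4-bit path is worth verifying):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_ID = "some-org/some-llama-checkpoint"  # placeholder id for illustration only

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # store weights in 4 bits
    bnb_4bit_quant_type="nf4",             # NormalFloat4 data type
    bnb_4bit_compute_dtype=torch.float16,  # run matmuls in fp16
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)
model.eval()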
@@ +145,27 @@
 image_adapter.to("cuda")
 image_adapter = torch.compile(image_adapter)
 
+# Optimize CLIP model
+clip_model = clip_model.half()  # Convert to FP16
+clip_model.eval()
+clip_model.requires_grad_(False)
+clip_model = torch.compile(clip_model)
+
+# Optimize image adapter
+image_adapter = image_adapter.half()  # Convert to FP16
+image_adapter.eval()
+image_adapter.requires_grad_(False)
+image_adapter = torch.compile(image_adapter)
+
 @spaces.GPU()
 @torch.no_grad()
 def stream_chat(input_image: Image.Image, caption_type: str, caption_length: str | int, extra_options: list[str], name_input: str, custom_prompt: str) -> tuple[str, str]:
+    # Clear memory at the start
     torch.cuda.empty_cache()
     gc.collect()
 
+    # Build prompt string
     length = None if caption_length == "any" else caption_length
     if isinstance(length, str):
         try:
             length = int(length)
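The CLIP and adapter changes are the usual inference-only preparation: cast to fp16, switch to eval mode, drop gradients, then compile. A small helper capturing that pattern (the function name is just for illustration); note that in the hunk above image_adapter ends up going through torch.compile twice, once before and once after the fp16 cast, where a single compile after the cast would be equivalent:

import torch

def prepare_for_inference(module: torch.nn.Module) -> torch.nn.Module:
    # fp16 weights, eval mode, no grads, then compile for faster repeated forward passes
    module = module.half()
    module.eval()
    module.requires_grad_(False)
    return torch.compile(module)

# e.g. clip_model = prepare_for_inference(clip_model)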
@@ +193,42 @@
 
     if custom_prompt.strip() != "":
         prompt_str = custom_prompt.strip()
 
+    # Resize image to exact dimensions needed
     image = input_image.resize((384, 384), Image.LANCZOS)
+    image = image.convert('RGB')
     pixel_values = TVF.pil_to_tensor(image).unsqueeze(0) / 255.0
+    pixel_values = TVF.normalize(pixel_values, [0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
     pixel_values = pixel_values.to('cuda', dtype=torch.float16)
 
+    # Process image with optimized memory usage
+    with torch.amp.autocast('cuda', dtype=torch.float16):
         vision_outputs = clip_model(pixel_values=pixel_values, output_hidden_states=True)
         embedded_images = image_adapter(vision_outputs.hidden_states)
         embedded_images = embedded_images.to('cuda', dtype=torch.float16)
 
+    # Build the conversation with minimal overhead
     convo = [
+        {"role": "system", "content": "You are a helpful image captioner."},
+        {"role": "user", "content": prompt_str},
     ]
 
+    # Format and tokenize efficiently
+    convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
     convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False)
     prompt_tokens = tokenizer.encode(prompt_str, return_tensors="pt", add_special_tokens=False, truncation=False)
+
+    convo_tokens = convo_tokens.squeeze(0)
     prompt_tokens = prompt_tokens.squeeze(0)
 
+    # Calculate injection point
     eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[0].tolist()
+    preamble_len = eot_id_indices[1] - prompt_tokens.shape[0]
 
+    # Prepare input tensors efficiently
+    convo_tokens = convo_tokens.unsqueeze(0).to('cuda')
     convo_embeds = text_model.model.embed_tokens(convo_tokens)
 
     input_embeds = torch.cat([
         convo_embeds[:, :preamble_len],
         embedded_images,
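The preamble_len arithmetic above locates the injection point for the image: in a Llama-3-style chat template the second <|eot_id|> closes the user turn, so subtracting the prompt length from its index gives the number of tokens that precede the prompt, and the image embeddings are spliced in at exactly that position. A toy illustration with made-up token ids:

# 9 stands in for <|eot_id|>; everything else is arbitrary.
convo_toks = [1, 2, 3, 9, 4, 5, 6, 7, 9]   # system turn, then user turn ending in <|eot_id|>
prompt_toks = [6, 7]                        # the user prompt tokens, right before the 2nd <|eot_id|>

eot_positions = [i for i, t in enumerate(convo_toks) if t == 9]
preamble_len = eot_positions[1] - len(prompt_toks)   # -> 6 tokens precede the prompt

# stream_chat then builds the input as:
# [convo_embeds[:, :preamble_len], embedded_images, convo_embeds[:, preamble_len:]]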
@@ +242,31 @@
     ], dim=1)
     attention_mask = torch.ones_like(input_ids)
 
+    # Generate with optimized settings
+    with torch.amp.autocast('cuda', dtype=torch.float16):
         generate_ids = text_model.generate(
             input_ids,
             inputs_embeds=input_embeds,
             attention_mask=attention_mask,
             max_new_tokens=300,
             do_sample=True,
+            use_cache=True,
+            pad_token_id=tokenizer.pad_token_id,
+            num_beams=1,  # Disable beam search for faster generation
+            temperature=0.7,  # Lower temperature for more focused generation
+            top_p=0.9,  # Nucleus sampling for efficiency
+            repetition_penalty=1.2,  # Prevent repetition
         )
 
+    # Process output efficiently
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
         generate_ids = generate_ids[:, :-1]
 
+    caption = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
 
+    # Clear memory
+    del vision_outputs, embedded_images, input_embeds, generate_ids
     torch.cuda.empty_cache()
     gc.collect()
 
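Of the new generation arguments, temperature and top_p only take effect because do_sample=True: each step's logits are sharpened by the temperature and then restricted to the smallest set of tokens whose probability mass reaches top_p before sampling. A rough standalone sketch of that step (not the transformers implementation):

import torch

def sample_next_token(logits: torch.Tensor, temperature: float = 0.7, top_p: float = 0.9) -> int:
    probs = torch.softmax(logits / temperature, dim=-1)          # temperature < 1 sharpens the distribution
    sorted_probs, sorted_idx = probs.sort(descending=True)
    keep = sorted_probs.cumsum(dim=-1) - sorted_probs < top_p    # smallest prefix covering top_p mass
    sorted_probs[~keep] = 0.0
    sorted_probs = sorted_probs / sorted_probs.sum()
    return int(sorted_idx[torch.multinomial(sorted_probs, 1)])

num_beams=1 keeps this plain sampling rather than beam search, and repetition_penalty=1.2 additionally down-weights tokens that already appear in the output.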
@@ +281,7 @@
         img_path = os.path.join(directory_path, filename)
         img = Image.open(img_path)
 
+        _, caption = stream_chat(img, caption_type, caption_length, extra_options, name_input, custom_prompt)
 
         # Save caption to a .txt file
         txt_filename = os.path.splitext(filename)[0] + '.txt'
@@ +290,29 @@
             f.write(caption)
 
         processed_images.append(img_path)
+        captions.append(caption)
 
+    return processed_images, "\n\n".join(captions)  # Join captions with double newline for readability
+
+def process_and_display(images, caption_type, caption_length, extra_options, name_input, custom_prompt):
+    processed_images = []
+    captions = []
+
+    for img_file in images:
+        img = Image.open(img_file.name)
+        _, caption = stream_chat(img, caption_type, caption_length, extra_options, name_input, custom_prompt)
+        processed_images.append(img_file.name)
+        captions.append(caption)
+
+    return processed_images, "\n\n".join(captions)  # Join captions with double newline for readability
+
+def process_input(input_images, directory_path, caption_type, caption_length, extra_options, name_input, custom_prompt):
+    if directory_path:
+        return process_directory(directory_path, caption_type, caption_length, extra_options, name_input, custom_prompt)
+    elif input_images:
+        return process_and_display(input_images, caption_type, caption_length, extra_options, name_input, custom_prompt)
+    else:
+        return [], ""
 
 # Custom CSS for a futuristic, neon-inspired theme
 custom_css = """
@@ +465,7 @@
 
     with gr.Row():
         output_gallery = gr.Gallery(label="Processed Images", elem_classes="output-box")
+        output_text = gr.Textbox(label="Generated Captions", elem_classes="output-box", lines=10)
 
     run_button.click(
         fn=process_input,
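The diff is cut off at the click handler, but for orientation the wiring presumably resembles the sketch below: process_input returns a (image paths, joined captions) pair that feeds the Gallery and the new Textbox. The input component names here are assumptions, not the Space's actual variable names:

# Hypothetical wiring, mirroring run_button.click(fn=process_input, ...)
run_button.click(
    fn=process_input,
    inputs=[input_images, directory_path, caption_type, caption_length,
            extra_options, name_input, custom_prompt],
    outputs=[output_gallery, output_text],
)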