Caption-Captain

App Files Files Community

Severian committed 24 days ago

Commit

4992d18

•

1 Parent(s): 348afd0

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -37

app.py CHANGED Viewed

@@ -182,25 +182,35 @@ def preprocess_image(input_image: Image.Image) -> torch.Tensor:
 def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
-    embedded_bos = text_model.model.embed_tokens(torch.tensor([[tokenizer.bos_token_id]], device=text_model.device, dtype=torch.int64))
-    eot_embed = image_adapter.get_eot_embedding().unsqueeze(0).to(dtype=text_model.dtype)
-    inputs_embeds = torch.cat([
-        embedded_bos.expand(image_features.shape[0], -1, -1),
-        image_features.to(dtype=embedded_bos.dtype),
-        prompt_embeds.expand(image_features.shape[0], -1, -1),
-        eot_embed.expand(image_features.shape[0], -1, -1),
-    ], dim=1)
     input_ids = torch.cat([
-        torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long),
         torch.zeros((1, image_features.shape[1]), dtype=torch.long),
-        prompt,
-        torch.tensor([[tokenizer.convert_tokens_to_ids("<|eot_id|>")]], dtype=torch.long),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
-    generate_ids = text_model.generate(input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, suppress_tokens=None)
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
@@ -476,9 +486,9 @@ with gr.Blocks(theme="Hev832/Applio", css=css, fill_width=True, fill_height=True
                     )
                 with gr.Row():
-                    username = gr.Textbox(label="Username", placeholder="Enter your username")
                 with gr.Row():
-                    password = gr.Textbox(label="Password", type="password", placeholder="Enter your password")
                 with gr.Row():
                     login_button = gr.Button("Login", size="sm")
                 login_message = gr.Markdown(visible=False)
@@ -558,29 +568,29 @@ with gr.Blocks(theme="Hev832/Applio", css=css, fill_width=True, fill_height=True
                     value="long",
                 )
-            with gr.Accordion("Extra Options", open=True):
-                extra_options = gr.CheckboxGroup(
-                    choices=[
-                        "If there is a person/character in the image you must refer to them as {name}.",
-                        "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
-                        "Include information about lighting.",
-                        "Include information about camera angle.",
-                        "Include information about whether there is a watermark or not.",
-                        "Include information about whether there are JPEG artifacts or not.",
-                        "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.",
-                        "Do NOT include anything sexual; keep it PG.",
-                        "Do NOT mention the image's resolution.",
-                        "You MUST include information about the subjective aesthetic quality of the image from low to very high.",
-                        "Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry.",
-                        "Do NOT mention any text that is in the image.",
-                        "Specify the depth of field and whether the background is in focus or blurred.",
-                        "If applicable, mention the likely use of artificial or natural lighting sources.",
-                        "Do NOT use any ambiguous language.",
-                        "Include whether the image is sfw, suggestive, or nsfw.",
-                        "ONLY describe the most important elements of the image."
-                    ],
-                    label="Select Extra Options"
-                )
                 name_input = gr.Textbox(label="Person/Character Name (if applicable)")
                 gr.Markdown("**Note:** Name input is only used if an Extra Option is selected that requires it.")

 def generate_caption(text_model, tokenizer, image_features, prompt_str: str, max_new_tokens: int = 300) -> str:
     prompt = tokenizer.encode(prompt_str, return_tensors='pt', padding=False, truncation=False, add_special_tokens=False)
     prompt_embeds = text_model.model.embed_tokens(prompt.to('cuda'))
+    convo = [
+        {"role": "system", "content": "You are a helpful image captioner."},
+        {"role": "user", "content": prompt_str},
+    ]
+    convo_string = tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
+    convo_tokens = tokenizer.encode(convo_string, return_tensors="pt", add_special_tokens=False, truncation=False)
+    convo_tokens = convo_tokens.squeeze(0)
+    eot_id_indices = (convo_tokens == tokenizer.convert_tokens_to_ids("<|eot_id|>")).nonzero(as_tuple=True)[0].tolist()
+    assert len(eot_id_indices) == 2, f"Expected 2 <|eot_id|> tokens, got {len(eot_id_indices)}"
+    preamble_len = eot_id_indices[1] - prompt.shape[1]
+    convo_embeds = text_model.model.embed_tokens(convo_tokens.unsqueeze(0).to('cuda'))
+    input_embeds = torch.cat([
+        convo_embeds[:, :preamble_len],
+        image_features.to(dtype=convo_embeds.dtype),
+        convo_embeds[:, preamble_len:],
+    ], dim=1).to('cuda')
     input_ids = torch.cat([
+        convo_tokens[:preamble_len].unsqueeze(0),
         torch.zeros((1, image_features.shape[1]), dtype=torch.long),
+        convo_tokens[preamble_len:].unsqueeze(0),
     ], dim=1).to('cuda')
     attention_mask = torch.ones_like(input_ids)
+    generate_ids = text_model.generate(input_ids, inputs_embeds=input_embeds, attention_mask=attention_mask, max_new_tokens=max_new_tokens, do_sample=True, suppress_tokens=None)
     generate_ids = generate_ids[:, input_ids.shape[1]:]
     if generate_ids[0][-1] == tokenizer.eos_token_id or generate_ids[0][-1] == tokenizer.convert_tokens_to_ids("<|eot_id|>"):
                     )
                 with gr.Row():
+                    username = gr.Textbox(label="Username", placeholder="Enter your username", value="ugd")
                 with gr.Row():
+                    password = gr.Textbox(label="Password", type="password", placeholder="Enter your password", value="ugd!")
                 with gr.Row():
                     login_button = gr.Button("Login", size="sm")
                 login_message = gr.Markdown(visible=False)
                     value="long",
                 )
+                with gr.Accordion("Extra Options", open=True):
+                    extra_options = gr.CheckboxGroup(
+                        choices=[
+                            "If there is a person/character in the image you must refer to them as {name}.",
+                            "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
+                            "Include information about lighting.",
+                            "Include information about camera angle.",
+                            "Include information about whether there is a watermark or not.",
+                            "Include information about whether there are JPEG artifacts or not.",
+                            "If it is a photo you MUST include information about what camera was likely used and details such as aperture, shutter speed, ISO, etc.",
+                            "Do NOT include anything sexual; keep it PG.",
+                            "Do NOT mention the image's resolution.",
+                            "You MUST include information about the subjective aesthetic quality of the image from low to very high.",
+                            "Include information on the image's composition style, such as leading lines, rule of thirds, or symmetry.",
+                            "Do NOT mention any text that is in the image.",
+                            "Specify the depth of field and whether the background is in focus or blurred.",
+                            "If applicable, mention the likely use of artificial or natural lighting sources.",
+                            "Do NOT use any ambiguous language.",
+                            "Include whether the image is sfw, suggestive, or nsfw.",
+                            "ONLY describe the most important elements of the image."
+                        ],
+                        label="Select Extra Options"
+                    )
                 name_input = gr.Textbox(label="Person/Character Name (if applicable)")
                 gr.Markdown("**Note:** Name input is only used if an Extra Option is selected that requires it.")