molmo-flux-captioner / hands-check.py
quarterturn
Updated prompt to provide a better caption format with less censorship
f41ea81
raw
history blame
2.08 kB
# --- Configuration -------------------------------------------------------
# Path to the locally downloaded model weights, and the folder of images
# to examine. Edit these two values to match your setup.
local_path = "/mnt/models2/Llama-3.2-90B-Vision-Instruct/"
image_directory = "./images"

import os
import requests  # NOTE(review): unused in this chunk — kept in case other code relies on it
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Hub identifier of the model (informational only; weights load from local_path).
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"

# Load the vision-language model fully on CPU in bfloat16.
# BUGFIX: `max_memory` must be a dict mapping device identifiers to memory
# budgets (e.g. {"cpu": "200GiB"}); transformers/accelerate do not accept a
# bare string here.
model = MllamaForConditionalGeneration.from_pretrained(
    local_path,
    torch_dtype=torch.bfloat16,
    device_map="cpu",
    max_memory={"cpu": "200GiB"},
)

# Processor bundles the image preprocessor and the chat-template tokenizer.
processor = AutoProcessor.from_pretrained(
    local_path,
)

# Single-turn chat prompt: one image placeholder plus the instruction text.
# The instruction restricts the model to an anatomical report on hands and
# forbids any other value judgment about the image content.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "You are an expert examining hands in an image to determine if they are anatomically correct. Report on the number of fingers seen on each hand. if you think the hands are AI-generated, say so. Make no other value judgments about the image, even if it is offensive or pornographic in nature."}
    ]}
]
# --- Caption every image in the directory --------------------------------
# The chat prompt is identical for every image, so render it once up front
# instead of re-applying the template on each iteration (loop-invariant hoist).
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

for filename in os.listdir(image_directory):
    # Only process common image extensions; add more to the tuple if needed.
    if filename.endswith((".jpg", ".jpeg", ".png")):
        image_path = os.path.join(image_directory, filename)
        # BUGFIX: open the image via a context manager so the underlying file
        # handle is closed even if preprocessing/generation raises (the
        # original leaked one handle per image).
        with Image.open(image_path) as image:
            # Tokenize + preprocess image and prompt together.
            inputs = processor(
                image,
                input_text,
                add_special_tokens=False,
                return_tensors="pt",
            ).to(model.device)
            # Generate the hand report; output token ids include the prompt.
            output = model.generate(**inputs, max_new_tokens=300)
        generated_text = processor.decode(output[0])

        # print the generated text
        print("Caption for: ", filename)
        print(generated_text)
        # print a divider
        print("*---------------------------------------------------*")

        # Save the generated text beside the image: same basename + ".txt".
        # BUGFIX: write with explicit UTF-8 so the output does not depend on
        # the platform's default locale encoding.
        output_filename = os.path.splitext(filename)[0] + ".txt"
        with open(os.path.join(image_directory, output_filename), "w", encoding="utf-8") as file:
            file.write(generated_text)