import os

import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Local snapshot of the weights; model_id records the Hugging Face Hub repo
# the snapshot was downloaded from.
local_path = "/mnt/models2/Llama-3.2-90B-Vision-Instruct/"
model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"

image_directory = "./images"

model = MllamaForConditionalGeneration.from_pretrained(
    local_path,
    torch_dtype=torch.bfloat16,
    device_map="cpu",
    # max_memory takes a dict keyed by device rather than a bare string, and it
    # is only consulted when the device map is computed automatically.
    max_memory={"cpu": "200GiB"},
)
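
# A hedged alternative (an assumption, not part of the original script): with
# enough GPUs, accelerate can shard the 90B checkpoint instead of pinning it
# to CPU. The device indices and memory caps below are illustrative only.
#
#   model = MllamaForConditionalGeneration.from_pretrained(
#       local_path,
#       torch_dtype=torch.bfloat16,
#       device_map="auto",
#       max_memory={0: "80GiB", 1: "80GiB", "cpu": "200GiB"},
#   )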

processor = AutoProcessor.from_pretrained(local_path)

# The {"type": "image"} placeholder tells the chat template where to put the
# <|image|> token; use one placeholder per image passed to the processor.
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "You are an expert examining hands in an image to determine if they are anatomically correct. Report on the number of fingers seen on each hand. If you think the hands are AI-generated, say so. Make no other value judgments about the image, even if it is offensive or pornographic in nature."},
    ]},
]

# The prompt is identical for every image, so render it once outside the loop.
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)

for filename in os.listdir(image_directory):
    if filename.endswith((".jpg", ".jpeg", ".png")):
        image_path = os.path.join(image_directory, filename)
        # Convert to RGB so PNGs with an alpha channel process cleanly.
        image = Image.open(image_path).convert("RGB")

        inputs = processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(model.device)

        output = model.generate(**inputs, max_new_tokens=300)
        # Decode only the newly generated tokens so the echoed prompt and
        # special tokens stay out of the caption.
        generated_text = processor.decode(
            output[0][inputs["input_ids"].shape[-1]:],
            skip_special_tokens=True,
        )

        print("Caption for: ", filename)
        print(generated_text)
        print("*---------------------------------------------------*")

        # Save the caption next to the image as <name>.txt.
        output_filename = os.path.splitext(filename)[0] + ".txt"
        with open(os.path.join(image_directory, output_filename), "w") as file:
            file.write(generated_text)
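
# Possible hardening, sketched as an assumption rather than part of the
# original script: skip files PIL cannot decode instead of crashing mid-batch
# (UnidentifiedImageError subclasses OSError, so one except clause covers both).
#
#   try:
#       image = Image.open(image_path).convert("RGB")
#   except OSError:
#       print(f"Skipping unreadable file: {filename}")
#       continue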