# molmo-flux-captioner / caption.py
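"""Caption every image in ./images with the local Molmo-7B-D-0924 checkpoint,
printing each caption and saving it as a .txt file next to the image."""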
import os

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available. Using CUDA.")
else:
    device = torch.device("cpu")
    print("GPU is not available. Using CPU.")
# load the processor and model from a local path; trust_remote_code is required
# because Molmo ships custom modeling and processing code with the checkpoint
local_path = "./model/Molmo-7B-D-0924"
processor = AutoProcessor.from_pretrained(
    local_path,
    local_files_only=True,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)
model = AutoModelForCausalLM.from_pretrained(
    local_path,
    local_files_only=True,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)
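# cast the weights to bfloat16 to halve memory use; the autocast context below
# runs generation in the same dtype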
model.to(dtype=torch.bfloat16)
# directory containing the images
image_directory = "./images"

# iterate over the images in the directory
for filename in os.listdir(image_directory):
    if filename.lower().endswith((".jpg", ".jpeg", ".png")):  # add more image extensions if needed
        image_path = os.path.join(image_directory, filename)
        image = Image.open(image_path).convert("RGB")  # normalize RGBA/palette images to RGB
        # process the image and prompt text
        inputs = processor.process(
            images=[image],
            text="Describe what you see in vivid detail, without line breaks. Include information about the pose of characters, their facial expression, their height, body type, weight, the position of their limbs, and the direction of their gaze, the color of their eyes, hair, and skin. If you know a person or place name, provide it. If you know the name of an artist who may have created what you see, provide that. Do not provide opinions or value judgements. Limit your response to 276 words to avoid your description getting cut off.",
        )
        # move inputs to the correct device and make a batch of size 1
        inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
        inputs["images"] = inputs["images"].to(torch.bfloat16)
        # generate output; at most 500 new tokens; stop when <|endoftext|> is generated
        with torch.autocast(device_type=device.type, enabled=True, dtype=torch.bfloat16):
            output = model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=500, stop_strings="<|endoftext|>"),
                tokenizer=processor.tokenizer,
            )
        # keep only the newly generated tokens and decode them to text
        generated_tokens = output[0, inputs["input_ids"].size(1):]
        generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
        # print the generated caption, followed by a divider
        print("Caption for:", filename)
        print(generated_text)
        print("*---------------------------------------------------*")
        # save the generated text next to the image as <name>.txt
        output_filename = os.path.splitext(filename)[0] + ".txt"
        with open(os.path.join(image_directory, output_filename), "w") as file:
            file.write(generated_text)
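# Expected layout (per the paths above): the Molmo-7B-D-0924 checkpoint lives in
# ./model/Molmo-7B-D-0924 and the images to caption live in ./images; run with:
#   python caption.py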