SANA-1.5
Collection
SANA-1.5: Efficient Scaling of Training-Time and Inference-Time Compute in Linear Diffusion Transformer
•
6 items
•
Updated
•
2
Dependency setup:
# Other transformers versions may also work, but we have not tested them.
pip install transformers==4.46 accelerate opencv-python torchvision einops pillow
pip install git+https://github.com/bfshi/scaling_on_scales.git
import PIL.Image
from termcolor import colored
from transformers import AutoConfig, AutoModel
# Model identifier handed to the transformers Auto* loaders below.
model_path = "Efficient-Large-Model/NVILA-Lite-2B-Verifier"

# Option 1: fetch the configuration first, then instantiate the model from it.
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_config(config, trust_remote_code=True)

# Option 2: load directly with from_pretrained. This rebinds `model`, so the
# instance created in option 1 is replaced by this one.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    device_map="auto",
)

# First token id the model's tokenizer emits for each answer word (encoded
# without special tokens); used later to index into the returned scores.
yes_id, no_id = (
    model.tokenizer.encode(answer, add_special_tokens=False)[0]
    for answer in ("yes", "no")
)
# Paths of the two candidate images to compare.
# NOTE: there must be no comma after the closing bracket -- a trailing comma
# there would turn `files` into a 1-tuple wrapping the list, so `files[0]`
# would be the whole list and `files[1]` would raise IndexError.
files = [
    "output/sana_test_prompt/0.png",
    "output/sana_test_prompt/1.png",
]

# Placeholder: replace with the text prompt the images should match.
prompt = "YOUR_GENERATED_PROMPT"

# Embed the raw prompt in the yes/no instruction sent to the model.
# `prompt` is rebound to the full instruction text from here on.
prompt = f"""You are an AI assistant specializing in image analysis and ranking. Your task is to analyze and compare image based on how well they match the given prompt.
The given prompt is:{prompt}. Please consider the prompt and the image to make a decision and response directly with 'yes' or 'no'.
"""
# Query the model once per candidate image, passing the image together with
# the instruction prompt. Each call's result is unpacked into an answer
# (r1 / r2) and its accompanying scores (scores1 / scores2).
# The images are opened in `with` blocks so the underlying file handles are
# closed as soon as each call has finished.
with PIL.Image.open(files[0]) as image:
    r1, scores1 = model.generate_content([image, prompt])

with PIL.Image.open(files[1]) as image:
    r2, scores2 = model.generate_content([image, prompt])
# Pick the image to keep from the two answers and their scores.
# `scoresN[0][0, token_id]` is read as the score of `token_id` for image N.
if r1 == r2:
    if r1 == "yes":
        # Both answers are "yes": keep the image with the higher "yes" score
        # (a tie falls through to the second image).
        if scores1[0][0, yes_id] > scores2[0][0, yes_id]:
            selected_file = files[0]
        else:
            selected_file = files[1]
    else:
        # Both answers agree but are not "yes": keep the image with the lower
        # "no" score (a tie falls through to the second image).
        if scores1[0][0, no_id] < scores2[0][0, no_id]:
            selected_file = files[0]
        else:
            selected_file = files[1]
else:
    # The answers differ: keep the first image if it got "yes"; otherwise
    # keep the second one (r2 itself is not inspected here).
    if r1 == "yes":
        selected_file = files[0]
    else:
        selected_file = files[1]