LinB203 committed
Commit 0c8d55e · Parent: e24ee61
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. app.py +777 -0
  2. univa/__init__.py +0 -0
  3. univa/dataset/__init__.py +8 -0
  4. univa/dataset/data_collator.py +156 -0
  5. univa/dataset/llava_dataset.py +312 -0
  6. univa/dataset/qwen2vl_dataset.py +658 -0
  7. univa/eval/__init__.py +0 -0
  8. univa/eval/configuration_eval.py +55 -0
  9. univa/eval/dpgbench/README.md +65 -0
  10. univa/eval/dpgbench/__init__.py +0 -0
  11. univa/eval/dpgbench/dpgbench.yaml +18 -0
  12. univa/eval/dpgbench/eval_prompts/dpgbench.csv +0 -0
  13. univa/eval/dpgbench/eval_prompts/dpgbench_prompts.json +0 -0
  14. univa/eval/dpgbench/requirements.txt +32 -0
  15. univa/eval/dpgbench/step1_gen_samples.py +248 -0
  16. univa/eval/dpgbench/step2_compute_dpg_bench.py +269 -0
  17. univa/eval/gedit/README.md +71 -0
  18. univa/eval/gedit/__init__.py +0 -0
  19. univa/eval/gedit/gedit.yaml +20 -0
  20. univa/eval/gedit/gedit_edit.json +0 -0
  21. univa/eval/gedit/secret_t2.env +0 -0
  22. univa/eval/gedit/step0_prepare_gedit.py +85 -0
  23. univa/eval/gedit/step1_gen_samples.py +260 -0
  24. univa/eval/gedit/step2_gedit_bench.py +178 -0
  25. univa/eval/gedit/step3_calculate_statistics.py +153 -0
  26. univa/eval/gedit/viescore/__init__.py +115 -0
  27. univa/eval/gedit/viescore/mllm_tools/__init__.py +0 -0
  28. univa/eval/gedit/viescore/mllm_tools/gemini.py +147 -0
  29. univa/eval/gedit/viescore/mllm_tools/idefics2_eval.py +43 -0
  30. univa/eval/gedit/viescore/mllm_tools/mantis_idefics2_eval.py +43 -0
  31. univa/eval/gedit/viescore/mllm_tools/minicpmv_eval.py +42 -0
  32. univa/eval/gedit/viescore/mllm_tools/openai.py +184 -0
  33. univa/eval/gedit/viescore/mllm_tools/qwen25vl_eval.py +121 -0
  34. univa/eval/gedit/viescore/mllm_tools/utils.py +65 -0
  35. univa/eval/gedit/viescore/parse_prompt.py +20 -0
  36. univa/eval/gedit/viescore/utils.py +362 -0
  37. univa/eval/gedit/viescore/vie_prompts.py +406 -0
  38. univa/eval/genai/README.md +47 -0
  39. univa/eval/genai/__init__.py +0 -0
  40. univa/eval/genai/eval_prompts/genai1600/genai_image.json +0 -0
  41. univa/eval/genai/eval_prompts/genai1600/genai_skills.json +4872 -0
  42. univa/eval/genai/eval_prompts/genai527/genai_image.json +0 -0
  43. univa/eval/genai/eval_prompts/genai527/genai_skills.json +1482 -0
  44. univa/eval/genai/genai1600.yaml +18 -0
  45. univa/eval/genai/genai527.yaml +18 -0
  46. univa/eval/genai/step1_gen_samples.py +269 -0
  47. univa/eval/genai/step2_run_model.py +113 -0
  48. univa/eval/genai/t2v_metrics/__init__.py +13 -0
  49. univa/eval/genai/t2v_metrics/clipscore.py +21 -0
  50. univa/eval/genai/t2v_metrics/constants.py +8 -0
app.py ADDED
@@ -0,0 +1,777 @@
+ import gradio as gr
+ import sys
+ sys.path.append("..")
+ from transformers import AutoProcessor, SiglipImageProcessor, SiglipVisionModel, T5EncoderModel, BitsAndBytesConfig
+ from univa.models.qwen2p5vl.modeling_univa_qwen2p5vl import UnivaQwen2p5VLForConditionalGeneration
+ from univa.utils.flux_pipeline import FluxPipeline
+ from univa.utils.get_ocr import get_ocr_result
+ from univa.utils.denoiser_prompt_embedding_flux import encode_prompt
+ from qwen_vl_utils import process_vision_info
+ from univa.utils.anyres_util import dynamic_resize, concat_images_adaptive
+ import torch
+ from torch import nn
+ import os
+ import uuid
+ import base64
+ from typing import Dict
+ from PIL import Image, ImageDraw, ImageFont
+ import spaces
+ import argparse
+ import gc
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="Model and component paths")
+
+     parser.add_argument("--model_path", type=str, default="LanguageBind/UniWorld-V1", help="Path to the UniWorld-V1 model")
+     parser.add_argument("--flux_path", type=str, default="black-forest-labs/FLUX.1-dev", help="Path to the FLUX.1-dev model")
+     parser.add_argument("--siglip_path", type=str, default="google/siglip2-so400m-patch16-512", help="Path to the SigLIP2 model")
+     parser.add_argument("--server_name", type=str, default="127.0.0.1", help="Server IP address")
+     parser.add_argument("--server_port", type=int, default=6812, help="Server port")
+     parser.add_argument("--share", action="store_true", help="Whether to create a public share link")
+     parser.add_argument("--nf4", action="store_true", help="Whether to load with NF4 quantization")
+     parser.add_argument("--zh", action="store_true", help="Whether to use the Chinese UI")
+     parser.add_argument("--offload", action="store_true", help="Whether to enable sequential CPU offload")
+
+     return parser.parse_args()
+
+
+ def add_plain_text_watermark(
+     img: Image.Image,
+     text: str,
+     margin: int = 50,
+     font_size: int = 30,
+ ):
+     if img.mode != "RGB":
+         img = img.convert("RGB")
+
+     draw = ImageDraw.Draw(img)
+     font = ImageFont.truetype("DejaVuSans.ttf", font_size)
+     bbox = draw.textbbox((0, 0), text)
+     text_width = bbox[2] - bbox[0]
+     text_height = bbox[3] - bbox[1]
+
+     x = img.width - text_width - int(3.3 * margin)
+     y = img.height - text_height - margin
+
+     draw.text((x, y), text, font=font, fill=(255, 255, 255))
+     return img
+
+
+ css = """
+ .table-wrap table tr td:nth-child(3) > div {
+     max-height: 150px;      /* cap the cell height; adjust as needed */
+     overflow-y: auto;       /* vertical scrollbar on overflow */
+     white-space: pre-wrap;  /* wrap lines automatically */
+     word-break: break-all;  /* break long words mid-word */
+ }
+ .table-wrap table tr td:nth-child(2) > div {
+     max-width: 150px;
+     white-space: pre-wrap;
+     word-break: break-all;
+     overflow-x: auto;
+ }
+ .table-wrap table tr th:nth-child(2) {
+     max-width: 150px;
+     white-space: normal;
+     word-break: keep-all;
+     overflow-x: auto;
+ }
+ .table-wrap table tr td:nth-last-child(-n+8) > div {
+     max-width: 130px;
+     white-space: pre-wrap;
+     word-break: break-all;
+     overflow-x: auto;
+ }
+ .table-wrap table tr th:nth-last-child(-n+8) {
+     max-width: 130px;
+     white-space: normal;
+     word-break: keep-all;
+     overflow-x: auto;
+ }
+ """
+
+
+ def img2b64(image_path):
+     with open(image_path, "rb") as f:
+         b64 = base64.b64encode(f.read()).decode()
+     data_uri = f"data:image/jpeg;base64,{b64}"
+     return data_uri
+
+ @spaces.GPU
+ def initialize_models(args):
+     os.makedirs("tmp", exist_ok=True)
+     # Paths
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     quantization_config = BitsAndBytesConfig(
+         load_in_4bit=True,
+         bnb_4bit_compute_dtype=torch.bfloat16,
+         bnb_4bit_quant_type="nf4",
+     )
+
+     # Load main model and task head
+     model = UnivaQwen2p5VLForConditionalGeneration.from_pretrained(
+         args.model_path,
+         torch_dtype=torch.bfloat16,
+         attn_implementation="flash_attention_2",
+         quantization_config=quantization_config if args.nf4 else None,
+     ).to(device)
+     task_head = nn.Sequential(
+         nn.Linear(3584, 10240),
+         nn.SiLU(),
+         nn.Dropout(0.3),
+         nn.Linear(10240, 2)
+     ).to(device)
+     task_head.load_state_dict(torch.load(os.path.join(args.model_path, 'task_head_final.pt')))
+     task_head.eval()
+
+     processor = AutoProcessor.from_pretrained(
+         args.model_path,
+         min_pixels=448*448,
+         max_pixels=448*448,
+     )
+     if args.nf4:
+         text_encoder_2 = T5EncoderModel.from_pretrained(
+             args.flux_path,
+             subfolder="text_encoder_2",
+             quantization_config=quantization_config,
+             torch_dtype=torch.bfloat16,
+         )
+         pipe = FluxPipeline.from_pretrained(
+             args.flux_path,
+             transformer=model.denoise_tower.denoiser,
+             text_encoder_2=text_encoder_2,
+             torch_dtype=torch.bfloat16,
+         ).to(device)
+     else:
+         pipe = FluxPipeline.from_pretrained(
+             args.flux_path,
+             transformer=model.denoise_tower.denoiser,
+             torch_dtype=torch.bfloat16,
+         ).to(device)
+     if args.offload:
+         pipe.enable_model_cpu_offload()
+         pipe.enable_vae_slicing()
+     tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
+     text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
+
+     # Optional SigLIP
+     siglip_processor, siglip_model = None, None
+     siglip_processor = SiglipImageProcessor.from_pretrained(args.siglip_path)
+     siglip_model = SiglipVisionModel.from_pretrained(
+         args.siglip_path,
+         torch_dtype=torch.bfloat16,
+     ).to(device)
+
+     return {
+         'model': model,
+         'task_head': task_head,
+         'processor': processor,
+         'pipe': pipe,
+         'tokenizers': tokenizers,
+         'text_encoders': text_encoders,
+         'siglip_processor': siglip_processor,
+         'siglip_model': siglip_model,
+         'device': device,
+     }
+
+
+ args = parse_args()
+ state = initialize_models(args)
+
+ @spaces.GPU
+ def process_large_image(raw_img):
+     if raw_img is None:
+         return raw_img
+     img = Image.open(raw_img).convert("RGB")
+
+     max_side = max(img.width, img.height)
+     if max_side > 1024:
+         scale = 1024 / max_side
+         new_w = int(img.width * scale)
+         new_h = int(img.height * scale)
+         print(f'resize img {img.size} to {(new_w, new_h)}')
+         img = img.resize((new_w, new_h), resample=Image.LANCZOS)
+         save_path = f"tmp/{uuid.uuid4().hex}.png"
+         img.save(save_path)
+         return save_path
+     else:
+         return raw_img
+
+ @spaces.GPU
+ def chat_step(image1, image2, text, height, width, steps, guidance,
+               ocr_enhancer, joint_with_t5, enhance_generation, enhance_understanding,
+               seed, num_imgs, history_state, progress=gr.Progress()):
+
+     try:
+         convo = history_state['conversation']
+         image_paths = history_state['history_image_paths']
+         cur_ocr_i = history_state['cur_ocr_i']
+         cur_genimg_i = history_state['cur_genimg_i']
+
+         # image1 = process_large_image(image1)
+         # image2 = process_large_image(image2)
+         # Build content
+         content = []
+         if text:
+             ocr_text = ''
+             if ocr_enhancer:  # gather OCR text from the uploaded images
+                 ocr_texts = []
+                 for img in (image1, image2):
+                     if img:
+                         ocr_texts.append(get_ocr_result(img, cur_ocr_i))
+                         cur_ocr_i += 1
+                 ocr_text = '\n'.join(ocr_texts)
+             content.append({'type':'text','text': text + ocr_text})
+         for img in (image1, image2):
+             if img:
+                 content.append({'type':'image','image':img,'min_pixels':448*448,'max_pixels':448*448})
+                 image_paths.append(img)
+
+         convo.append({'role':'user','content':content})
+
+         # Prepare inputs
+         chat_text = state['processor'].apply_chat_template(convo,
+             tokenize=False, add_generation_prompt=True)
+         chat_text = '<|im_end|>\n'.join(chat_text.split('<|im_end|>\n')[1:])
+         image_inputs, video_inputs = process_vision_info(convo)
+         inputs = state['processor'](
+             text=[chat_text], images=image_inputs, videos=video_inputs,
+             padding=True, return_tensors='pt'
+         ).to(state['device'])
+
+         # Model forward & task head
+         with torch.no_grad():
+             outputs = state['model'](**inputs, return_dict=True, output_hidden_states=True)
+             hidden = outputs.hidden_states[-1]
+             mask = inputs.input_ids == 77091  # locate the assistant-role token
+             vecs = hidden[mask][-1:]
+         task_res = state['task_head'](vecs.float())[0]
+         print(task_res)
+         # Branch decision
+         if enhance_generation:
+             do_image = True
+         elif enhance_understanding:
+             do_image = False
+         else:
+             do_image = (task_res[0] < task_res[1])
+
+         seed = int(seed)
+         if seed == -1:
+             seed = torch.Generator(device="cpu").seed()
+         torch.manual_seed(seed)
+         # Generate
+         if do_image:
+             # image generation pipeline
+             siglip_hs = None
+             if state['siglip_processor'] and image_paths:
+                 vals = [state['siglip_processor'].preprocess(
+                             images=Image.open(p).convert('RGB'), do_resize=True,
+                             return_tensors='pt', do_convert_rgb=True
+                         ).pixel_values.to(state['device'])
+                         for p in image_paths]
+                 siglip_hs = state['siglip_model'](torch.concat(vals)).last_hidden_state
+
+             with torch.no_grad():
+                 lvlm = state['model'](
+                     inputs.input_ids, pixel_values=getattr(inputs,'pixel_values',None),
+                     attention_mask=inputs.attention_mask,
+                     image_grid_thw=getattr(inputs,'image_grid_thw',None),
+                     siglip_hidden_states=siglip_hs,
+                     output_type='denoise_embeds'
+                 )
+                 prm_embeds, pooled = encode_prompt(
+                     state['text_encoders'], state['tokenizers'],
+                     text if joint_with_t5 else '', 256, state['device'], 1
+                 )
+             emb = torch.concat([lvlm, prm_embeds], dim=1) if joint_with_t5 else lvlm
+
+             def diffusion_to_gradio_callback(_pipeline, step_idx: int, timestep: int, tensor_dict: Dict):
+                 # 1) update the Gradio progress bar
+                 frac = (step_idx + 1) / float(steps)
+                 progress(frac)
+
+                 return tensor_dict
+
+             with torch.no_grad():
+                 img = state['pipe'](
+                     prompt_embeds=emb, pooled_prompt_embeds=pooled,
+                     height=height, width=width,
+                     num_inference_steps=steps,
+                     guidance_scale=guidance,
+                     generator=torch.Generator(device='cuda').manual_seed(seed),
+                     num_images_per_prompt=num_imgs,
+                     callback_on_step_end=diffusion_to_gradio_callback,
+                     # callback_on_step_end_tensor_inputs=["latents", "prompt_embeds"],
+                 ).images
+             # img = [add_plain_text_watermark(im, 'Open-Sora Plan 2.0 Generated') for im in img]
+             img = concat_images_adaptive(img)
+             save_path = f"tmp/{uuid.uuid4().hex}.png"
+             img.save(save_path)
+             convo.append({'role':'assistant','content':[{'type':'image','image':save_path}]})
+             cur_genimg_i += 1
+             progress(1.0)
+             bot_msg = (None, save_path)
+         else:
+             # text generation
+             gen_ids = state['model'].generate(**inputs, max_new_tokens=128)
+             out = state['processor'].batch_decode(
+                 [g[len(inputs.input_ids[0]):] for g in gen_ids], skip_special_tokens=True
+             )[0]
+             convo.append({'role':'assistant','content':[{'type':'text','text':out}]})
+             bot_msg = (None, out)
+
+         chat_pairs = []
+         for msg in convo:
+             if msg['role']=='user':
+                 parts = []
+                 for c in msg['content']:
+                     if c['type']=='text': parts.append(c['text'])
+                     if c['type']=='image': parts.append(f"![user image]({img2b64(c['image'])})")
+                 chat_pairs.append(("\n".join(parts), None))
+             else:
+                 parts = []
+                 for c in msg['content']:
+                     if c['type']=='text': parts.append(c['text'])
+                     if c['type']=='image': parts.append(f"![assistant image]({img2b64(c['image'])})")
+                 chat_pairs[-1] = (chat_pairs[-1][0], parts[-1])
+
+         # Update state
+         history_state.update({
+             'conversation': convo,
+             'history_image_paths': image_paths,
+             'cur_ocr_i': cur_ocr_i,
+             'cur_genimg_i': cur_genimg_i
+         })
+         return chat_pairs, history_state, seed
+     except Exception as e:
+         # Catch all exceptions, return an error message, and suggest clearing the history before retrying
+         error_msg = f"An error occurred: {e}. Please click \"Clear History\" to reset the conversation and try again."
+         chat_pairs = [(None, error_msg)]
+         # Leave history_state unchanged so the user can clear it themselves
+         return chat_pairs, history_state, seed
+
+ def copy_seed_for_user(real_seed):
+     # Copy the hidden seed_holder value into the visible seed Textbox
+     return real_seed
+
+ def clear_inputs():
+     # Clear img1 and img2 with None; clear text_in and seed with empty strings
+     return None, None, "", ""
+
+ @spaces.GPU
+ def clear_history():
+     gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()
+         torch.cuda.ipc_collect()
+     # Default prompt and seed
+     default_prompt = "Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement."
+     default_seed = "-1"
+
+     # 1. The chatbot is cleared with gr.update(value=[])
+     # 2. The state is reset to its initial dict
+     # 3. The prompt and seed are likewise reset with gr.update()
+     return (
+         gr.update(value=[]),              # clear the chat box
+         {'conversation':[],               # reset the state
+          'history_image_paths':[],
+          'cur_ocr_i':0,
+          'cur_genimg_i':0},
+         gr.update(value=None),            # reset image1
+         gr.update(value=None),            # reset image2
+         gr.update(value=default_prompt),  # reset the prompt textbox
+         gr.update(value=default_seed),    # reset the seed textbox
+     )
+
+
+ if __name__ == '__main__':
+     # Gradio UI
+     with gr.Blocks(
+         theme=gr.themes.Soft(),
+         css=css
+     ) as demo:
+
+         gr.Markdown(
+             """
+             <div style="text-align:center;">
+
+             # 🎉 UniWorld-V1 Chat Interface 🎉
+
+             ### Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding
+
+             **Usage Guide:**
+             - It is recommended to perform inference on four images concurrently to offer varied selections.
+             - Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.
+             </div>
+             """,
+             elem_classes="header-text",
+         )
+         with gr.Row():
+             with gr.Column():
+                 chatbot = gr.Chatbot(
+                     max_height=100000, min_height=700,
+                     height=None,
+                     resizable=True,
+                     show_copy_button=True
+                 )
+                 text_in = gr.Textbox(label="Instruction", value="Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.")
+             with gr.Column():
+                 with gr.Row():
+                     img1 = gr.Image(type='filepath', label="Image 1", height=256, width=256)
+                     img2 = gr.Image(type='filepath', label="Image 2 (Optional reference)", height=256, width=256, visible=True)
+                 seed = gr.Textbox(label="Seed (-1 for random)", value="-1")
+                 seed_holder = gr.Textbox(visible=False)
+                 with gr.Row():
+                     num_imgs = gr.Slider(1, 4, 4, step=1, label="Num Images")
+                 with gr.Row():
+                     height = gr.Slider(256, 2048, 1024, step=64, label="Height")
+                     width = gr.Slider(256, 2048, 1024, step=64, label="Width")
+                 with gr.Row():
+                     steps = gr.Slider(8, 50, 30, step=1, label="Inference steps")
+                     guidance = gr.Slider(1.0, 10.0, 4.0, step=0.1, label="Guidance scale")
+                 with gr.Accordion("Advanced Options", open=True, visible=True):
+                     with gr.Row():
+                         enhance_gen_box = gr.Checkbox(value=False, label="Enhance Generation")
+                         enhance_und_box = gr.Checkbox(value=False, label="Enhance Understanding")
+                     with gr.Row():
+                         ocr_box = gr.Checkbox(value=False, label="Enhance Text Rendering")
+                         t5_box = gr.Checkbox(value=True, label="Enhance Current Turn")
+                 with gr.Row():
+                     submit = gr.Button("Send", variant="primary")
+                     clear = gr.Button("Clear History", variant="primary")
+         with gr.Row():
+             with gr.Column(1, min_width=0):
+                 gr.Markdown(
+                     """
+                     **🖼️ Visual Perception & Feature Extraction**
+                     - Canny Edge Detection
+                     - Mini-Line Segment Detection
+                     - Normal Map Generation
+                     - Sketch Generation
+                     - Holistically-Nested Edge Detection
+                     - Depth Estimation
+                     - Human Pose Estimation
+                     - Object Detection (Boxes)
+                     - Semantic Segmentation (Masks)
+                     """
+                 )
+             with gr.Column(1, min_width=0):
+                 gr.Markdown(
+                     """
+                     **✂️ Image Editing & Manipulation**
+                     - Add Elements
+                     - Adjust Attributes
+                     - Change Background
+                     - Remove Objects
+                     - Replace Regions
+                     - Perform Actions
+                     - Restyle
+                     - Compose Scenes
+                     """
+                 )
+             with gr.Column(1, min_width=0):
+                 gr.Markdown(
+                     """
+                     **🔄 Cross-Modal Synthesis & Transformation**
+                     - Text→Image Synthesis
+                     - Image‑to‑Image Translation
+                     - Multi‑Image Combination
+                     - Extract IP Features
+                     - IP Feature Composition
+                     """
+                 )
+             with gr.Column(1, min_width=0):
+                 gr.Markdown(
+                     """
+                     **🤖 Visual & Textual QA**
+                     - Image‑Text QA
+                     - Text‑Text QA
+                     """
+                 )
+         anchor_pixels = 1024*1024
+         # Dynamic resize callback
+         def update_size(i1, i2):
+             shapes = []
+             for p in (i1, i2):
+                 if p:
+                     im = Image.open(p)
+                     w, h = im.size
+                     shapes.append((w, h))
+             if not shapes:
+                 return gr.update(), gr.update()
+             if len(shapes) == 1:
+                 w, h = shapes[0]
+             else:
+                 w = sum(s[0] for s in shapes) / len(shapes)
+                 h = sum(s[1] for s in shapes) / len(shapes)
+             new_h, new_w = dynamic_resize(int(h), int(w), 'any_11ratio', anchor_pixels=anchor_pixels)
+             return gr.update(value=new_h), gr.update(value=new_w)
+         img1.change(fn=update_size, inputs=[img1, img2], outputs=[height, width])
+         img2.change(fn=update_size, inputs=[img1, img2], outputs=[height, width])
+
+         # Mutual exclusivity of the two enhance checkboxes
+         enhance_gen_box.change(
+             lambda g: gr.update(value=False) if g else gr.update(),
+             inputs=[enhance_gen_box], outputs=[enhance_und_box]
+         )
+         enhance_und_box.change(
+             lambda u: gr.update(value=False) if u else gr.update(),
+             inputs=[enhance_und_box], outputs=[enhance_gen_box]
+         )
+         state_ = gr.State({'conversation':[], 'history_image_paths':[], 'cur_ocr_i':0, 'cur_genimg_i':0})
+
+         progress_bar = gr.Progress()
+         gr.on(
+             triggers=[submit.click, text_in.submit],
+             fn=chat_step,
+             inputs=[img1, img2, text_in, height, width, steps, guidance,
+                     ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs, state_,
+                     ],
+             outputs=[chatbot, state_, seed_holder],
+             scroll_to_output=True
+         ).then(
+             fn=copy_seed_for_user,
+             inputs=[seed_holder],  # read the hidden seed_holder
+             outputs=[seed]         # write to the visible seed Textbox
+         )
+
+         clear.click(
+             fn=clear_history,
+             inputs=[],
+             outputs=[chatbot, state_, img1, img2, text_in, seed]
+         )
+
+         # ========== Validation Examples ==========
+         example_height, example_width = 1024, 1024
+         gr.Examples(
+             examples_per_page=100,
+             examples=[
+                 # text-to-image
+                 [None, None,
+                  "Generate an adorable golden retriever puppy playing in a sunny park, "
+                  "with fluffy fur, big round eyes, and a happy expression. "
+                  "The background should have green grass, some flowers, and a blue sky with white clouds.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # NIKE color swap
+                 ["assets/nike_src.jpg", None,
+                  "Switch the product's color from black, black to white, white, making sure the transition is crisp and clear.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # style transfer (Ghibli)
+                 ["assets/gradio/origin.png", None,
+                  "Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 ["assets/gradio/origin.png", None,
+                  "Remove the bicycle located in the lower center region of the image.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # blur
+                 ["assets/gradio/blur.jpg", None,
+                  "Remove blur, make it clear.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 #
+                 ["assets/gradio/00004614_tgt.jpg", None,
+                  "Add the ingrid fair isle cashmere turtleneck sweater to the person.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+                 #
+                 ["assets/gradio/00006581_tgt.jpg", None,
+                  "Place the belvoir broderie anglaise linen tank on the person in a way that complements their appearance and style.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+                 #
+                 ["assets/gradio/00008153_tgt.jpg", None,
+                  "Integrate may cashmere tank on body.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+                 #
+                 ["assets/gradio/00002315_src.jpg", None,
+                  "Strip away all context and distractions, leaving the pointelle-trimmed cashmere t-shirt floating on a neutral background.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+                 #
+                 ["assets/gradio/00002985_src.jpg", None,
+                  "Generate an image containing only the henry shearling jacket, free from any other visual elements.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 ["assets/gradio/origin.png", None,
+                  "Add a cat in the center of image.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # image+image-to-image (compose)
+                 ["assets/00182555_target.jpg",
+                  "assets/00182555_InstantStyle_ref_1.jpg",
+                  "Adapt Image1's content to fit the aesthetic of Image2.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # replace object
+                 ["assets/replace_src.png", None,
+                  "replace motorcycle located in the lower center region of the image with a black bicycle",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # segmentation
+                 ["assets/seg_src.jpg", None,
+                  "Segment the giraffe from the background.\n",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # detection
+                 ["assets/det_src.jpg", None,
+                  "Please depict the vase accurately",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # image-to-canny
+                 ["assets/canny_image.jpg", None,
+                  "Generate a Canny edge map for this image.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # image-to-mlsd
+                 ["assets/mlsd_image.jpg", None,
+                  "Render an MLSD detection overlay for this input image.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # image-to-normal
+                 ["assets/normal_image.jpg", None,
+                  "Convert the input texture into a tangent-space normal map.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # image-to-sketch
+                 ["assets/sketch_image.jpg", None,
+                  "Transform this image into a hand-drawn charcoal sketch.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # image-to-hed
+                 ["assets/hed_image.jpg", None,
+                  "Produce a holistically-nested boundary probability map of this image.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # image-to-depth
+                 ["assets/depth_image.jpg", None,
+                  "Estimate depth with a focus on background structure.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+                 # image-to-image (reconstruction)
+                 ["assets/rec.jpg", None,
+                  "Simply reconstruct the original image with no enhancements.",
+                  example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
+
+             ],
+             inputs=[img1, img2, text_in, height, width, steps, guidance,
+                     ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs],
+         )
+         # ==============================================
+
+ UI_TRANSLATIONS = {
+     "🎉 UniWorld-V1 Chat Interface 🎉": "🎉 UniWorld-V1 聊天界面 🎉",
+     "Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding":
+         "解锁尖端视觉感知,特征提取,编辑,合成和理解",
+     "Usage Guide:": "使用指南:",
+     "It is recommended to perform inference on four images concurrently to offer varied selections.": "建议同时进行四张图像的推理,以提供多选。",
+     "Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.": "已上传的图像将自动调整大小,但手动指定与原始图像差异太大的分辨率并不建议。",
+     "🖼️ Visual Perception & Feature Extraction": "🖼️ 视觉感知与特征提取",
+     "Canny Edge Detection": "Canny边缘检测",
+     "Mini-Line Segment Detection": "微型行段检测",
+     "Normal Map Generation": "生成法线图",
+     "Sketch Generation": "手绘生成",
+     "Holistically-Nested Edge Detection": "整体嵌套边缘检测",
+     "Depth Estimation": "深度估计",
+     "Human Pose Estimation": "人体姿势估计",
+     "Object Detection (Boxes)": "对象检测(框)",
+     "Semantic Segmentation (Masks)": "语义分割(蒙版)",
+     "✂️ Image Editing & Manipulation": "✂️ 图像编辑与操作",
+     "Add Elements": "添加元素",
+     "Adjust Attributes": "调整属性",
+     "Change Background": "更改背景",
+     "Remove Objects": "删除对象",
+     "Replace Regions": "替换区域",
+     "Perform Actions": "执行操作",
+     "Restyle": "重绘风格",
+     "Compose Scenes": "组合场景",
+     "🔄 Cross-Modal Synthesis & Transformation": "🔄 跨模态综合与转换",
+     "Text→Image Synthesis": "文本→图像综合",
+     "Image‑to‑Image Translation": "图像-图像转换",
+     "Multi‑Image Combination": "多图像组合",
+     "Extract IP Features": "提取IP特征",
+     "IP Feature Composition": "IP特征组合",
+     "🤖 Visual & Textual QA": "🤖 视觉和文字质量检查",
+     "Image‑Text QA": "图像-文本质量检查",
+     "Text‑Text QA": "文本-文本质量检查",
+     "Image 1": "图像 1",
+     "Image 2 (Optional reference)": "图像 2 (可选参考)",
+     "Instruction": "指令",
+     "Seed (-1 for random)": "种子 (-1为随机)",
+     "Num Images": "图像数量",
+     "Height": "高度",
+     "Width": "宽度",
+     "Inference steps": "推理步数",
+     "Guidance scale": "引导缩放",
+     "Advanced Options": "高级选项",
+     "Enhance Generation": "增强生成",
+     "Enhance Understanding": "增强理解",
+     "Enhance Text Rendering": "增强文本渲染",
+     "Enhance Current Turn": "增强当前轮次",
+     "Send": "发送",
+     "Clear History": "清除历史记录",
+ }
+
+
+ def apply_localization(block):
+     def process_component(component):
+         if not component:
+             return
+
+         for attr in ['label', 'info', 'placeholder']:
+             if hasattr(component, attr):
+                 text = getattr(component, attr)
+                 if text in UI_TRANSLATIONS:
+                     setattr(component, attr, UI_TRANSLATIONS[text])
+
+         if hasattr(component, 'value'):
+             value = component.value
+             if isinstance(value, str) and value in UI_TRANSLATIONS:
+                 component.value = UI_TRANSLATIONS[value]
+
+         if isinstance(component, gr.Markdown):
+             for en, zh in UI_TRANSLATIONS.items():
+                 component.value = component.value.replace(en, zh)
+
+         if hasattr(component, 'children'):
+             for child in component.children:
+                 process_component(child)
+
+     process_component(block)
+     return block
+
+
+ if __name__ == "__main__":
+     if args.zh:
+         demo = apply_localization(demo)
+     demo.title = "UniWorld-V1"
+     demo.launch(
+         allowed_paths=["/"],
+         server_name=args.server_name,
+         server_port=args.server_port,
+         share=args.share,
+         inbrowser=True,
+     )
+
+
+ '''
+ MODEL_PATH="/mnt/data/lb/Remake/FlowWorld/checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_0p4_lr1e-5_mask_refstyle_extract_resume_run3/checkpoint-12000/model_ema"
+ FLUX_PATH="/mnt/data/checkpoints/black-forest-labs/FLUX.1-dev"
+ SIGLIP_PATH="/mnt/data/checkpoints/google/siglip2-so400m-patch16-512"
+ CUDA_VISIBLE_DEVICES=2 python app.py \
+     --model_path ${MODEL_PATH} \
+     --flux_path ${FLUX_PATH} \
+     --siglip_path ${SIGLIP_PATH}
+ '''
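
A minimal, self-contained sketch of the task-head routing used in chat_step above: the head maps the 3584-dim hidden state at the assistant token to two logits, and generation wins when the second logit is larger. The layer sizes are taken from initialize_models; the random vector below is a placeholder for `hidden[mask][-1:]`, and the head is untrained here (the app loads `task_head_final.pt`).

    import torch
    from torch import nn

    task_head = nn.Sequential(
        nn.Linear(3584, 10240),
        nn.SiLU(),
        nn.Dropout(0.3),
        nn.Linear(10240, 2),
    )
    task_head.eval()  # disable dropout, as in the app

    vec = torch.randn(1, 3584)              # placeholder for hidden[mask][-1:]
    with torch.no_grad():
        logits = task_head(vec.float())[0]
    do_image = bool(logits[0] < logits[1])  # same comparison as chat_step
    print(f"route to image generation: {do_image}")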
univa/__init__.py ADDED
File without changes
univa/dataset/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .llava_dataset import LlavaDataset
+ from .qwen2vl_dataset import Qwen2VLDataset
+
+ DATASET_TYPE = {
+     'llava': LlavaDataset,
+     'qwen2vl': Qwen2VLDataset,
+     'qwen2p5vl': Qwen2VLDataset,
+ }
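
For orientation, a tiny usage sketch (an assumed call pattern, not a call site from the repo): a training script can resolve the dataset class from this registry by a config string.

    from univa.dataset import DATASET_TYPE

    dataset_cls = DATASET_TYPE['qwen2p5vl']  # -> Qwen2VLDataset
    print(dataset_cls.__name__)              # "Qwen2VLDataset"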
univa/dataset/data_collator.py ADDED
@@ -0,0 +1,156 @@
+ from typing import List, Dict
+ from transformers import PreTrainedTokenizer
+ import torch
+ import torch.nn.functional as F
+
+ def pad_list_of_tensors(tensor_list, padding_value=0):
+     # tensor_list: list of tensors, each of shape (b, c, h, w)
+
+     # if all entries are empty lists, every sample in this batch is t2i
+     if all(not isinstance(tensor, torch.Tensor) for tensor in tensor_list):
+         return []
+     else:
+         for tmp_tensor in tensor_list:
+             if isinstance(tmp_tensor, torch.Tensor):
+                 # found a tensor
+                 break
+         # pad in a zero tensor when the batch mixes t2i with other tasks;
+         # t2i can be treated as unconditional (no-reference-image) editing
+         tensor_list = [
+             torch.zeros_like(tmp_tensor) if isinstance(tensor, list) else tensor for tensor in tensor_list
+         ]
+         assert all(tensor.shape[1] == tensor_list[0].shape[1] for tensor in tensor_list)
+         # find the maximum b, h, w
+         max_b = max(tensor.shape[0] for tensor in tensor_list)
+         max_c = tensor_list[0].shape[1]  # assume c is the same for all tensors
+         max_h = max(tensor.shape[2] for tensor in tensor_list)
+         max_w = max(tensor.shape[3] for tensor in tensor_list)
+
+         padded_tensors = []
+         for tensor in tensor_list:
+             b, c, h, w = tensor.shape
+             pad_b = max_b - b
+             pad_h = max_h - h
+             pad_w = max_w - w
+
+             # pad h and w first (the last two dims)
+             tensor = F.pad(tensor, (0, pad_w, 0, pad_h), value=padding_value)
+             # then pad the b dim (the first dim) up to (max_b, c, h, w)
+             if pad_b > 0:
+                 padding_shape = (pad_b, c, max_h, max_w)
+                 pad_tensor = torch.full(padding_shape, fill_value=padding_value, dtype=tensor.dtype, device=tensor.device)
+                 tensor = torch.cat([tensor, pad_tensor], dim=0)
+
+             padded_tensors.append(tensor)
+
+         # finally stack into (B, b_max, c, h_max, w_max)
+         return torch.stack(padded_tensors)
+
+ def resize_list_of_tensors(weights):
+     # suppose weights is your list of [1, H, W] tensors
+     # 1) find the max height and width
+     heights = [w.shape[-2] for w in weights]
+     widths = [w.shape[-1] for w in weights]
+     max_h, max_w = max(heights), max(widths)
+
+     # 2) interpolate each mask to (max_h, max_w)
+     resized = []
+     for w in weights:
+         # F.interpolate expects a 4D tensor: (N, C, H, W)
+         w_4d = w.unsqueeze(0)  # -> [1, 1, H, W]
+         w_4d = w_4d.unsqueeze(0) if w_4d.ndim == 3 else w_4d
+         # but since w is already [1, H, W], one unsqueeze is enough:
+         # w_4d = w.unsqueeze(0)  # [1, 1, H, W]
+
+         w_resized = F.interpolate(
+             w_4d, size=(max_h, max_w), mode='nearest'
+         )
+         # back to [1, H', W']
+         w_resized = w_resized.squeeze(0)
+         resized.append(w_resized)
+
+     # 3) stack into a single tensor [N, 1, max_h, max_w]
+     weights = torch.stack(resized)  # -> [N, 1, max_h, max_w]
+     return weights
+
+ class DataCollator:
+     def __init__(self, tokenizer: PreTrainedTokenizer, padding_side='right'):
+         self.tokenizer = tokenizer
+         self.padding_side = padding_side
+
+     def __call__(self, instances: List[Dict]) -> Dict:
+         input_ids = [instance["input_ids"][0] for instance in instances]
+         labels = [instance["labels"][0] for instance in instances]
+         image_position = [instance["image_position"] for instance in instances]
+
+         pixel_values = [
+             instance["pixel_values"] for instance in instances if len(instance["pixel_values"]) > 0
+         ]
+         pixel_values = torch.cat(pixel_values) if len(pixel_values) > 0 else None
+
+         image_grid_thw = [
+             instance["image_grid_thw"] for instance in instances if len(instance["image_grid_thw"]) > 0
+         ]
+         image_grid_thw = torch.cat(image_grid_thw) if len(image_grid_thw) > 0 else None
+
+         pil_pixel_values = [
+             instance["pil_pixel_values"] for instance in instances
+         ]
+
+         prompts = [instance["prompt"] for instance in instances]
+
+         ref_pixel_values = [
+             instance["ref_pixel_values"] for instance in instances
+         ]
+         ref_pixel_values = pad_list_of_tensors(ref_pixel_values, padding_value=0)
+
+         siglip_pixel_values = [
+             instance["siglip_pixel_values"] for instance in instances if len(instance["siglip_pixel_values"]) > 0
+         ]
+         siglip_pixel_values = torch.cat(siglip_pixel_values, dim=0) if len(siglip_pixel_values) > 0 else []
+
+         input_ids = torch.nn.utils.rnn.pad_sequence(
+             input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id,
+             padding_side=self.padding_side,
+         )
+         labels = torch.nn.utils.rnn.pad_sequence(
+             labels, batch_first=True, padding_value=-100,
+             padding_side=self.padding_side,
+         )
+         attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
+
+         weights = [
+             instance["weights"] for instance in instances if len(instance["weights"]) > 0
+         ]
+         if len(weights) > 0:
+             if all([i.shape == weights[0].shape for i in weights]):
+                 weights = torch.stack(weights)
+             else:
+                 weights = [i.unsqueeze(0) for i in weights]
+         else:
+             weights = None
+
+         generated_image = [
+             instance["generated_image"] for instance in instances if len(instance["generated_image"]) > 0
+         ]
+         if len(generated_image) > 0:
+             if all([i.shape == generated_image[0].shape for i in generated_image]):
+                 generated_image = torch.stack(generated_image)
+             else:
+                 generated_image = [i.unsqueeze(0) for i in generated_image]
+         else:
+             generated_image = []
+         return {
+             "input_ids": input_ids,
+             "pixel_values": pixel_values,
+             "labels": labels,
+             "attention_mask": attention_mask,
+             "image_position": image_position,
+             "image_grid_thw": image_grid_thw,
+             "prompts": prompts,
+             "ref_pixel_values": ref_pixel_values,
+             "pil_pixel_values": pil_pixel_values,
+             "siglip_pixel_values": siglip_pixel_values,
+             "weights": weights,
+             "generated_image": generated_image,
+         }
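
A self-contained smoke test (assumed, not from the repo) showing the collator's padding behavior. `_Tok` is a hypothetical stand-in exposing only the `pad_token_id` the collator reads, and each fake instance carries the keys `__call__` expects; note that `pad_sequence(..., padding_side=...)` requires a PyTorch version that supports that keyword, as the collator itself does.

    import torch

    class _Tok:
        pad_token_id = 0  # hypothetical minimal tokenizer stand-in

    def _fake_instance(n):
        ids = torch.arange(1, n + 1).unsqueeze(0)  # ids start at 1 to avoid the pad id
        return {
            "input_ids": ids, "labels": ids.clone(), "image_position": [],
            "pixel_values": [], "image_grid_thw": [], "pil_pixel_values": [],
            "prompt": ["demo"], "ref_pixel_values": [],
            "siglip_pixel_values": [], "weights": [], "generated_image": [],
        }

    collator = DataCollator(tokenizer=_Tok(), padding_side='right')
    batch = collator([_fake_instance(5), _fake_instance(8)])
    print(batch["input_ids"].shape)        # torch.Size([2, 8]) after right-padding
    print(batch["attention_mask"].sum(1))  # tensor([5, 8])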
univa/dataset/llava_dataset.py ADDED
@@ -0,0 +1,312 @@
+ from typing import Any, Callable, Optional, List
+
+ import torch
+ from transformers import PreTrainedTokenizer
+ from torch.utils.data import Dataset
+ from tqdm import tqdm
+ import json
+ import os
+ from PIL import Image
+ from univa.utils.prompter import Prompter
+ import numpy as np
+ from einops import rearrange
+ import random
+ from univa.utils.constant import SPACIAL_TOKEN, GENERATE_TOKEN
+
+ class LlavaDataset(Dataset):
+     def __init__(
+         self,
+         dataset_type: str,
+         data_txt: str,
+         tokenizer: PreTrainedTokenizer,
+         prompter: Prompter,
+         image_processor: Callable,
+         processor: Callable = None,
+         min_pixels: int = 384*384,
+         max_pixels: int = 384*384,
+         image_token_length: int = 729,
+         only_generated_task: bool = False,
+         drop_prompt_rate: float = 0.2,
+     ):
+         assert dataset_type == 'llava'
+         with open(data_txt, "r") as f:
+             self.datasets = [line.strip() for line in f.readlines()]
+
+         self.data = []
+         self._load_data()
+         self.tokenizer = tokenizer
+         self.prompter = prompter
+         self.image_token_length = image_token_length
+         self.image_token = SPACIAL_TOKEN[dataset_type]['image_token']
+         self.image_begin_token = SPACIAL_TOKEN[dataset_type]['image_begin_token']
+         self.image_end_token = SPACIAL_TOKEN[dataset_type]['image_end_token']
+         self.generated_image_token = GENERATE_TOKEN
+         self.image_processor = image_processor
+
+         self.only_generated_task = only_generated_task  # For denoiser training
+         self.drop_prompt_rate = drop_prompt_rate
+         if self.drop_prompt_rate > 0:
+             assert self.only_generated_task, (
+                 "Only generated task is supported when drop_prompt_rate is greater than 0"
+             )
+
+         # Add the image token if it does not exist.
+         if self.image_token not in self.tokenizer.get_vocab():
+             self.tokenizer.add_special_tokens(
+                 {"additional_special_tokens": [self.image_token]}
+             )
+         self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
+
+         self.image_begin_token_id = self.tokenizer.convert_tokens_to_ids(
+             self.image_begin_token
+         )
+         assert isinstance(self.image_begin_token_id, int), (
+             f"tokenizer misses image begin token `{self.image_begin_token}`"
+         )
+         self.image_end_token_id = self.tokenizer.convert_tokens_to_ids(
+             self.image_end_token
+         )
+         assert isinstance(self.image_end_token_id, int), (
+             f"tokenizer misses image end token `{self.image_end_token}`"
+         )
+
+     def _load_data(self):
+         for dataset in self.datasets:
+             image_root, json_file = dataset.split(",")
+
+             # Load json file
+             with open(json_file, "r") as f:
+                 data = json.load(f)
+
+             dataset_data = []
+             for line in tqdm(data):
+                 # Ensure `image` is a list
+                 if isinstance(line["image"], str):
+                     line["image"] = [line["image"]]
+                 assert isinstance(line["image"], list), (
+                     "`image` must be a str or a list."
+                 )
+
+                 # Convert image paths to absolute paths
+                 line["image"] = [
+                     os.path.join(image_root, image_path) for image_path in line["image"]
+                 ]
+
+                 dataset_data.append(line)
+
+             print(f"Load {len(dataset_data)} data from {json_file}.")
+             self.data.extend(dataset_data)
+
+     def __len__(self):
+         return len(self.data)
+
+     def __getitem__(self, idx):
+         try:
+             data: Any = self.data[idx]
+
+             # Reformat the conversation to the format of prompter
+             conversations = []
+             prompt = ""
+             for item in data["conversations"]:
+                 if item["from"] == "human":
+                     role = self.prompter.user_role
+                     prompt = item["value"]  # keep the latest user prompt
+                 elif item["from"] == "gpt":
+                     role = self.prompter.assistant_role
+                 else:
+                     raise ValueError(f"Unknown role: {item['from']}")
+                 conversations.append({"from": role, "value": item["value"]})
+             assert prompt != ""
+
+             # Make prompt
+             drop_condition = False
+             if self.only_generated_task:
+                 if self.drop_prompt_rate < random.random():  # keep the prompt
+                     prompt_list = self.prompter.get_train_prompt(conversations)
+                 else:
+                     drop_condition = True
+                     # Drop the prompt
+                     prompt_list = [
+                         {
+                             "from": self.prompter.system_role,
+                             "value": "You are a helpful assistant.",
+                         },
+                         {
+                             "from": self.prompter.user_role,
+                             "value": "Generate an image.",
+                         },
+                         {
+                             "from": self.prompter.assistant_role,
+                             "value": self.generated_image_token,
+                         },
+                     ]
+                     prompt_list = self.prompter.get_train_prompt(prompt_list)
+             else:
+                 prompt_list = self.prompter.get_train_prompt(conversations)
+
+             input_ids = []
+             labels = []
+             has_generated_image = False
+             for item in prompt_list:
+                 item["prompt"] = item["prompt"].replace('<image>', self.image_token)
+                 if self.generated_image_token in item["prompt"]:  # check for the generated image token
+                     assert item["from"] == self.prompter.assistant_role, (
+                         "Generated image token must be in assistant role"
+                     )
+                     assert (
+                         f"{self.generated_image_token}{self.prompter.eos_token}"
+                         in item["prompt"]
+                     ), "Generated image token must be at the end of the prompt"
+
+                     # Replace the generated image token with the image begin token, dropping the eos token
+                     item["prompt"] = item["prompt"].replace(
+                         f"{self.generated_image_token}{self.prompter.eos_token}",
+                         self.image_begin_token,
+                     )
+                     has_generated_image = True
+
+                 tokenized_item = self.tokenizer(
+                     item["prompt"],
+                     return_tensors="pt",
+                     truncation=False,
+                 )
+                 if item["is_labels"]:  # If this prompt is labels
+                     labels.append(tokenized_item.input_ids)
+                 else:
+                     labels.append(torch.full_like(tokenized_item.input_ids, -100))
+                 input_ids.append(tokenized_item.input_ids)
+
+             if (
+                 self.only_generated_task and not has_generated_image
+             ):  # For denoiser training
+                 raise ValueError(
+                     f"Only-generated-task mode requires a generated image token, but this prompt does not contain one: {prompt_list[0]['prompt']}"
+                 )
+
+             input_ids = torch.cat(input_ids, dim=1)
+             labels = torch.cat(labels, dim=1)
+
+             # Load images
+             if has_generated_image:
+                 if not drop_condition:
+                     image_slice = data["image"][:-1]
+                 else:
+                     image_slice = []
+             else:
+                 image_slice = data["image"]
+             image_dict = self._load_image(image_slice, image_processor=self.image_processor, image_token_lengths=self.image_token_length)
+             image_token_lengths = image_dict['image_token_lengths']
+             pixel_values = image_dict['pixel_values']
+             image_grid_thw = image_dict['image_grid_thw']
+
+             # Repeat the image token to image_token_length
+             # and record the positions of image tokens.
+             input_ids, labels, image_position = self._process_image_token(
+                 input_ids,
+                 labels=labels,
+                 image_token_id=self.image_token_id,
+                 image_begin_token_id=self.image_begin_token_id,
+                 image_end_token_id=self.image_end_token_id,
+                 image_token_lengths=image_token_lengths,
+             )
+
+             return_data = {
+                 "input_ids": input_ids,
+                 "labels": labels,
+                 "pixel_values": pixel_values,
+                 "image_position": image_position,
+                 "image_grid_thw": image_grid_thw,
+                 "prompt": [prompt],
+             }
+
+             if has_generated_image:  # If this item is a generation task
+                 image = Image.open(data["image"][-1]).convert("RGB")
+                 image_tensor = torch.tensor(np.array(image)) / 255.0  # scale to 0-1
+                 image_tensor = rearrange(image_tensor, "h w c -> c h w")
+                 return_data["generated_image"] = image_tensor
+
+             return return_data
+         except Exception as e:
+             print(f'Error with {e}')
+             return self.__getitem__(random.randint(0, self.__len__() - 1))
+
+     @staticmethod
+     def _load_image(
+         image_slice: List[str],
+         max_pixels: int = 448*448,
+         min_pixels: int = 448*448,
+         processor: Callable = None,
+         image_processor: Callable = None,
+         image_token_lengths: int = 729,
+         image_token: str = '<image>',
+     ):
+         # the images tensor shape is (b, c, h, w)
+         images = []
+         for image_path in image_slice:  # the last (generated) image is excluded upstream
+             image = Image.open(image_path).convert("RGB")
+             image = image_processor(
+                 image, return_tensors="pt"
+             ).pixel_values
+             images.append(image)
+         if len(images) > 0:
+             images = torch.cat(images)
+         image_token_lengths = len(images) * [image_token_lengths]
+         return {'pixel_values': images, 'image_grid_thw': [], 'image_token_lengths': image_token_lengths}
+
+     @staticmethod
+     def _process_image_token(
+         input_ids: torch.Tensor,
+         image_token_id: int,
+         image_begin_token_id: int,
+         image_end_token_id: int,
+         image_token_lengths: List[int],
+         labels: Optional[torch.Tensor] = None,
+     ):
+         # Find the indices of the image tokens
+         image_token_indices = (input_ids == image_token_id).nonzero(as_tuple=True)
+         image_position = []
+         offset = 0
+         cur_i = 0
+         if isinstance(image_token_lengths, int):
+             image_token_lengths = [image_token_lengths] * len(image_token_indices[1])
+         for idx in image_token_indices[1]:
+             image_token_length = image_token_lengths[cur_i]
+             adjusted_idx = idx + offset
+             assert input_ids[0, adjusted_idx] == image_token_id
+
+             # Add the image begin and end tokens
+             input_ids = torch.cat(
+                 [
+                     input_ids[:, :adjusted_idx],
+                     input_ids.new_full(
+                         (1, 1), image_begin_token_id
+                     ),  # image begin token
+                     input_ids.new_full(
+                         (1, image_token_length), image_token_id
+                     ),  # repeat the image token image_token_length times
+                     input_ids.new_full((1, 1), image_end_token_id),  # image end token
+                     input_ids[:, adjusted_idx + 1:],
+                 ],
+                 dim=1,
+             )
+             if labels is not None:
+                 labels = torch.cat(
+                     [
+                         labels[:, :adjusted_idx],
+                         labels.new_full(
+                             (1, 1), image_begin_token_id
+                         ),  # make the begin token a label
+                         labels.new_full((1, image_token_length), -100),
+                         labels.new_full((1, 1), -100),
+                         labels[:, adjusted_idx + 1:],
+                     ],
+                     dim=1,
+                 )
+
+             adjusted_idx += 1  # skip the image begin token
+             image_position.append(adjusted_idx.item())
+             offset += image_token_length - 1
+             offset += 2  # begin and end tokens
+             cur_i += 1  # advance to the next image's token length
+
+         return input_ids, labels, image_position
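
A quick illustration of `_process_image_token` with toy ids (7/8/9 are hypothetical begin/end/image token ids, not real vocab entries): a single placeholder image token expands into begin + N repeated image tokens + end, and `image_position` records where each run of image tokens starts.

    import torch

    IMG, BEGIN, END = 9, 7, 8
    ids = torch.tensor([[1, 2, IMG, 3]])
    new_ids, _, pos = LlavaDataset._process_image_token(
        ids, image_token_id=IMG, image_begin_token_id=BEGIN,
        image_end_token_id=END, image_token_lengths=[4],
    )
    print(new_ids.tolist())  # [[1, 2, 7, 9, 9, 9, 9, 8, 3]]
    print(pos)               # [3] -> first image token sits right after BEGIN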
univa/dataset/qwen2vl_dataset.py ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Callable, Optional, List
2
+
3
+ import torch
4
+ from transformers import PreTrainedTokenizer
5
+ from torch.utils.data import Dataset
6
+ from tqdm import tqdm
7
+ import json
8
+ import os
9
+ from PIL import Image
10
+ from univa.utils.prompter import Prompter
11
+ import numpy as np
12
+ from einops import rearrange
13
+ import random
14
+ # from qwen_vl_utils.vision_process import fetch_image, fetch_video
15
+ from qwen_vl_utils.vision_process import to_rgb, smart_resize, fetch_video
16
+ from univa.utils.constant import SPACIAL_TOKEN, GENERATE_TOKEN
17
+ from univa.utils.get_mask import get_weight_mask
18
+ from univa.utils.get_ocr import get_ocr_result
19
+ from fractions import Fraction
20
+ from torchvision.transforms import functional
21
+ from torchvision import transforms
22
+ from io import BytesIO
23
+ import base64
24
+ import requests
25
+ import torch
26
+ from PIL import Image
27
+ from torchvision import io, transforms
28
+ from typing import Optional
29
+
30
+
31
+ def get_aspect_ratio(img):
32
+ width, height = img.size
33
+ return Fraction(width, height).limit_denominator()
34
+
35
+ def has_same_aspect_ratio(img1, img2):
36
+ if not isinstance(img1, Image.Image):
37
+ img1 = Image.open(img1).convert('RGB')
38
+ if not isinstance(img2, Image.Image):
39
+ img2 = Image.open(img2).convert('RGB')
40
+ ratio1 = get_aspect_ratio(img1)
41
+ ratio2 = get_aspect_ratio(img2)
42
+ return ratio1 == ratio2
43
+
44
+ def has_same_resolution(img1, img2):
45
+ if not isinstance(img1, Image.Image):
46
+ img1 = Image.open(img1).convert('RGB')
47
+ if not isinstance(img2, Image.Image):
48
+ img2 = Image.open(img2).convert('RGB')
49
+ return img1.size == img2.size
50
+
51
+ class Qwen2VLDataset(Dataset):
52
+ def __init__(
53
+ self,
54
+ dataset_type: str,
55
+ data_txt: str,
56
+ transform: Callable,
57
+ tokenizer: PreTrainedTokenizer,
58
+ prompter: Prompter,
59
+ image_processor: Callable,
60
+ processor: Callable = None,
61
+ min_pixels: int = 384*384,
62
+ max_pixels: int = 384*384,
63
+ image_token_length: int = 729,
64
+ only_generated_task: bool = False,
65
+ drop_prompt_rate: float = 0.0,
66
+ joint_ref_feature: bool = False,
67
+ anyres: bool = False,
68
+ mask_weight_type: str = 'log',
69
+ siglip_processor: Callable = None,
70
+ ocr_enhancer: bool = False,
71
+ random_data: bool = False,
72
+ maxnum_per_data: int = -1,
73
+ notry: bool = False,
74
+ ):
75
+ assert dataset_type == 'qwen2vl' or dataset_type == 'qwen2p5vl', "dataset_type == 'qwen2vl' or dataset_type == 'qwen2p5vl'"
76
+ with open(data_txt, "r") as f:
77
+ self.datasets = [line.strip() for line in f.readlines()]
78
+
79
+ self.data = []
80
+ self._load_data(maxnum_per_data)
81
+
82
+ self.transform = transform
83
+ self.processor = processor
84
+ self.tokenizer = processor.tokenizer
85
+ self.prompter = prompter
86
+ self.min_pixels = min_pixels
87
+ self.max_pixels = max_pixels
88
+ self.image_token = SPACIAL_TOKEN[dataset_type]['image_token']
89
+ self.image_begin_token = SPACIAL_TOKEN[dataset_type]['image_begin_token']
90
+ self.image_end_token = SPACIAL_TOKEN[dataset_type]['image_end_token']
91
+ self.generated_image_token = GENERATE_TOKEN
92
+ self.image_processor = processor.image_processor
93
+ # self.factor = 4 if joint_ref_feature else 1
94
+ self.factor = 2
95
+
96
+ self.only_generated_task = only_generated_task # For denoiser training
97
+ self.drop_prompt_rate = drop_prompt_rate
98
+ if self.drop_prompt_rate > 0:
99
+ assert self.only_generated_task, (
100
+ "Only generated task is supported when drop_prompt_rate > 0"
101
+ )
102
+ self.mask_weight_type = mask_weight_type
103
+ self.siglip_processor = siglip_processor
104
+ self.ocr_enhancer = ocr_enhancer
105
+ self.random_data = random_data
106
+ self.notry = notry
107
+
108
+ # Add image token if not exists.
109
+ assert self.image_token in self.tokenizer.get_vocab()
110
+ self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
111
+
112
+ self.image_begin_token_id = self.tokenizer.convert_tokens_to_ids(
113
+ self.image_begin_token
114
+ )
115
+ assert isinstance(self.image_begin_token_id, int), (
116
+ f"tokenizer miss image begin token `{self.image_begin_token}`"
117
+ )
118
+ self.image_end_token_id = self.tokenizer.convert_tokens_to_ids(
119
+ self.image_end_token
120
+ )
121
+ assert isinstance(self.image_end_token_id, int), (
122
+ f"tokenizer miss image end token `{self.image_end_token}`"
123
+ )
124
+
125
+     def _load_data(self, maxnum_per_data=-1):
+         for dataset in self.datasets:
+             image_root, json_file, need_weight = dataset.split(",")
+
+             # Load json file
+             with open(json_file, "r") as f:
+                 data = json.load(f)
+             if maxnum_per_data > 0 and maxnum_per_data < len(data):
+                 print(f'original data: {len(data)}, sample: {maxnum_per_data}')
+                 data = random.sample(data, maxnum_per_data)
+             dataset_data = []
+             for line in tqdm(data):
+                 if "image" not in line:
+                     line["image"] = []
+                 # Ensure `image` is a list
+                 if isinstance(line["image"], str):
+                     line["image"] = [line["image"]]
+                 assert isinstance(line["image"], list), (
+                     "`image` must be a str or a list."
+                 )
+
+                 # Convert image paths to absolute paths
+                 line["need_weight"] = need_weight
+                 line["image"] = [
+                     os.path.join(image_root, image_path) for image_path in line["image"]
+                 ]
+                 dataset_data.append(line)
+
+             print(f"Load {len(dataset_data)} data from {json_file}.")
+             self.data.extend(dataset_data)
+
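+     # Illustrative note (added; paths and field values are hypothetical): based on
+     # the parsing above, each line of `data_txt` looks like
+     #     /path/to/image_root,/path/to/annotations.json,true
+     # and each record in the JSON file roughly like
+     #     {"conversations": [{"from": "human", "value": "<image>\nDescribe it."},
+     #                        {"from": "gpt", "value": "..."}],
+     #      "image": ["relative/path.png"]}
+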
+     def __len__(self):
+         return len(self.data)
+
+     def _get_random_data(self):
+         prompt = self.prompter(
+             [
+                 {"from": "system", "value": "You are a helpful assistant."},
+                 {
+                     "from": "user",
+                     "value": f"test an image {self.image_token}",
+                 },
+             ]
+         )
+         input_ids = self.tokenizer.batch_encode_plus(
+             [prompt], return_tensors="pt", truncation=False,
+         ).input_ids
+         labels = input_ids
+
+         width, height = 448, 448
+         random_data = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
+         image = Image.fromarray(random_data, 'RGB')
+
+         image_slice = [image]
+         image_dict = self._load_image(
+             image_slice, self.max_pixels, self.min_pixels,
+             processor=self.processor, image_token=self.image_token,
+             factor=self.factor,
+             last_image=image,
+             vae_image_transform=self.transform,
+             drop_prompt=False,
+             prompt=prompt,
+             mask_weight_type=self.mask_weight_type,
+             siglip_processor=self.siglip_processor,
+         )
+
+         image_token_lengths = image_dict['image_token_lengths']
+         pixel_values = image_dict['pixel_values']
+         image_grid_thw = image_dict['image_grid_thw']
+         ref_pixel_values = image_dict['ref_pixel_values']
+         pil_pixel_values = image_dict['pil_pixel_values']
+         siglip_pixel_values = image_dict['siglip_pixel_values']
+         weights = image_dict['weights']
+
+         input_ids, labels, image_position = self._process_image_token(
+             input_ids,
+             labels=labels,
+             image_token_id=self.image_token_id,
+             image_begin_token_id=self.image_begin_token_id,
+             image_end_token_id=self.image_end_token_id,
+             image_token_lengths=image_token_lengths,
+         )
+
+         generated_image = torch.randn(3, 512, 512)
+
+         return_data = {
+             "input_ids": input_ids,
+             "labels": labels,
+             "pixel_values": pixel_values,
+             "image_position": image_position,
+             "image_grid_thw": image_grid_thw,
+             "prompt": prompt,
+             "ref_pixel_values": ref_pixel_values,
+             "pil_pixel_values": pil_pixel_values,
+             "siglip_pixel_values": siglip_pixel_values,
+             "weights": weights,
+             "generated_image": generated_image,
+         }
+         return return_data
+
+     def getitem(self, data):
+         # Reformat the conversation to the prompter's format
+         conversations = []
+         prompt = ""
+         for item in data["conversations"]:
+             if item["from"] == "human":
+                 role = self.prompter.user_role
+                 prompt = item["value"]
+             elif item["from"] == "gpt":
+                 role = self.prompter.assistant_role
+             else:
+                 raise ValueError(f"Unknown role: {item['from']}")
+             conversations.append({"from": role, "value": item["value"]})
+         assert prompt != "", "prompt must not be empty"
+         # The last user instruction will be used for the T5 embedding
+         prompt = prompt.replace('<image>', '').replace('\n', '')
+
+         # Make prompt
+         drop_prompt = False
+         if self.only_generated_task:
+             if self.drop_prompt_rate < random.random():  # Keep the prompt with probability 1 - drop_prompt_rate
+                 prompt_list = self.prompter.get_train_prompt(conversations)
+             else:  # Drop the prompt
+                 drop_prompt = True
+                 num_images = (''.join([i['value'] for i in conversations])).count('<image>')
+                 prompt_list = [
+                     {
+                         "from": self.prompter.system_role,
+                         "value": "You are a helpful assistant.",
+                     },
+                     {
+                         "from": self.prompter.user_role,
+                         # "value": f"{num_images * '<image>'} Generate an image.",
+                         "value": "Generate an image.",
+                     },
+                     {
+                         "from": self.prompter.assistant_role,
+                         "value": self.generated_image_token,
+                     },
+                 ]
+                 prompt_list = self.prompter.get_train_prompt(prompt_list)
+         else:
+             prompt_list = self.prompter.get_train_prompt(conversations)
+
+         input_ids = []
+         labels = []
+         has_generated_image = False
+         cur_i = 0
+         for item in prompt_list:
+             item["prompt"] = item["prompt"].replace('<image>', self.image_token)
+
+             if self.generated_image_token in item["prompt"]:
+                 assert item["from"] == self.prompter.assistant_role, (
+                     "Generated image token must be in assistant role"
+                 )
+                 assert (
+                     f"{self.generated_image_token}{self.prompter.eos_token}"
+                     in item["prompt"]
+                 ), "Generated image token must be at the end of the prompt"
+
+                 # Replace the generated image token (plus its eos token) with the image begin token
+                 item["prompt"] = item["prompt"].replace(
+                     f"{self.generated_image_token}{self.prompter.eos_token}",
+                     self.image_begin_token,
+                 )
+                 has_generated_image = True
+
+             if self.ocr_enhancer and (self.image_token in item["prompt"]):
+                 # print('item["prompt"]', item["prompt"])
+                 if not has_generated_image:
+                     num_img = item["prompt"].count(self.image_token)
+                     ocr_sentences = []
+                     for i in range(num_img):
+                         ocr_sentences.append(get_ocr_result(data["image"][cur_i], cur_i))
+                         cur_i += 1
+                     ocr_sentences = '\n'.join(ocr_sentences)
+                     if len(ocr_sentences.split()) > 256:
+                         print(f'ocr_sentences too long ({len(ocr_sentences.split())} words), truncating to the first 256')
+                         ocr_sentences = ' '.join(ocr_sentences.split()[:256])
+                     # ocr_sentences = ''
+                     assert item['prompt'][-len(self.prompter.eos_token):] == self.prompter.eos_token, \
+                         "prompt must end with the eos token"
+                     assert item['prompt'].count(self.prompter.eos_token) == 1, \
+                         "prompt must contain exactly one eos token"
+                     item["prompt"] = item["prompt"].replace(self.prompter.eos_token, f'{ocr_sentences} {self.prompter.eos_token}')
+
+             tokenized_item = self.tokenizer(
+                 item["prompt"],
+                 return_tensors="pt",
+                 truncation=True,
+                 max_length=1024,
+             )
+             if item["is_labels"]:  # This turn is supervised
+                 labels.append(tokenized_item.input_ids)
+             else:
+                 labels.append(torch.full_like(tokenized_item.input_ids, -100))
+             input_ids.append(tokenized_item.input_ids)
+
+         if (
+             self.only_generated_task and not has_generated_image
+         ):  # For denoiser training
+             raise ValueError(
+                 f"Only-generated-task mode requires a generated image token, but this prompt contains none: {prompt_list[0]['prompt']}"
+             )
+
+         input_ids = torch.cat(input_ids, dim=1)
+         labels = torch.cat(labels, dim=1)
+
+         # Load images
+         if has_generated_image:
+             # Generation task: process all images except the last one, which is the target to generate
+             image_slice = data["image"][:-1]
+         else:
+             # Understanding task
+             image_slice = data["image"]
+
+         image_dict = self._load_image(
+             image_slice, self.max_pixels, self.min_pixels,
+             processor=self.processor, image_token=self.image_token,
+             factor=self.factor,
+             last_image=data["image"][-1] if has_generated_image else None,
+             vae_image_transform=self.transform,
+             drop_prompt=drop_prompt,
+             prompt=prompt,
+             mask_weight_type=self.mask_weight_type,
+             siglip_processor=self.siglip_processor,
+             need_weight=data['need_weight'],
+         )
+
+         image_token_lengths = image_dict['image_token_lengths']
+         pixel_values = image_dict['pixel_values']
+         image_grid_thw = image_dict['image_grid_thw']
+         ref_pixel_values = image_dict['ref_pixel_values']
+         pil_pixel_values = image_dict['pil_pixel_values']
+         siglip_pixel_values = image_dict['siglip_pixel_values']
+         weights = image_dict['weights']
+
+         input_ids, labels, image_position = self._process_image_token(
+             input_ids,
+             labels=labels,
+             image_token_id=self.image_token_id,
+             image_begin_token_id=self.image_begin_token_id,
+             image_end_token_id=self.image_end_token_id,
+             image_token_lengths=image_token_lengths,
+         )
+
+         return_data = {
+             "input_ids": input_ids,
+             "labels": labels,
+             "pixel_values": pixel_values,
+             "image_position": image_position,
+             "image_grid_thw": image_grid_thw,
+             "prompt": prompt,
+             "ref_pixel_values": ref_pixel_values,
+             "pil_pixel_values": pil_pixel_values,
+             "siglip_pixel_values": siglip_pixel_values,
+             "weights": weights,
+         }
+
+         if has_generated_image:  # Generation task: also load the target image
+             image = Image.open(data["image"][-1]).convert("RGB")
+             # if self.anyres:
+             #     image = image.resize(pil_pixel_values[-1].size)
+             image_tensor = torch.tensor(np.array(image)) / 255.0  # scale to [0, 1]
+             image_tensor = rearrange(image_tensor, "h w c -> c h w")
+             return_data["generated_image"] = self.transform(image_tensor)
+         else:
+             return_data["generated_image"] = []
+         return return_data
+
+     def __getitem__(self, idx):
+         if self.random_data:
+             return self._get_random_data()
+
+         data: Any = self.data[idx]
+         if self.notry:
+             return self.getitem(data)
+         try:
+             return self.getitem(data)
+         except Exception as e:
+             print(f'Error with {e}')
+             return self.__getitem__(random.randint(0, self.__len__() - 1))
+
+     @staticmethod
+     def _load_image(
+         image_slice: List[str],
+         max_pixels: int = 448*448,
+         min_pixels: int = 448*448,
+         processor: Callable = None,
+         image_processor: Callable = None,
+         image_token_lengths: int = 729,
+         image_token: str = '<|image_pad|>',
+         factor: int = 1,
+         last_image: Optional[str] = None,
+         vae_image_transform: Callable = None,
+         drop_prompt: bool = False,
+         prompt: str = '',
+         mask_weight_type: str = None,
+         siglip_processor: Callable = None,
+         need_weight: str = 'true',
+     ):
+         resize_ref_image = False
+         pil_pixel_values_last = []
+         if last_image is not None:
+             last_vision_infos = dict(
+                 image=last_image, min_pixels=min_pixels, max_pixels=max_pixels
+             )
+             # last_image will be resized by the qwenvl processor automatically,
+             # so the generated image has variable resolution
+             last_image_inputs, last_video_inputs = process_vision_info([last_vision_infos], factor=factor)
+
+             # record the size that the qwenvl processor will actually use
+             pil_pixel_values_last.append(last_image_inputs[0])
+
+             # Not all reference images share one resolution. If multiple reference
+             # images have different resolutions, resize them to match last_image
+             # (the image to generate).
+             if not all([has_same_resolution(image_path, last_image) for image_path in image_slice]):
+                 resize_ref_image = True
+                 resize_w, resize_h = last_image_inputs[0].size
+
+         image_token_lengths = []
+         pixel_values = []
+         image_grid_thw = []
+         ref_pixel_values = []
+         pil_pixel_values = []
+         siglip_pixel_values = []
+         # The last image (the generation target) is not part of image_slice
+         for image_path in image_slice:
+             vision_infos = dict(image=image_path, min_pixels=min_pixels, max_pixels=max_pixels)
+
+             # if the reference images differ in resolution, resize them to match the generated image (last_image)
+             if resize_ref_image:
+                 vision_infos.update(
+                     dict(resized_height=resize_h, resized_width=resize_w)
+                 )
+             image_inputs, video_inputs = process_vision_info([vision_infos], factor=factor)
+             inputs = processor(text=[f'dummy {image_token}'], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")
+
+             if not drop_prompt:
+                 pixel_values.append(inputs.pixel_values)  # inputs.pixel_values shape is (token, dim)
+                 image_grid_thw.append(inputs.image_grid_thw)  # image_grid_thw List[int, int, int]
+             image_token_length = (inputs.input_ids[0] == processor.tokenizer.convert_tokens_to_ids(image_token)).sum()
+             image_token_lengths.append(image_token_length)
+
+             image_tensor = torch.tensor(np.array(image_inputs[0])) / 255.0  # scale to [0, 1]
+             image_tensor = rearrange(image_tensor, "h w c -> 1 c h w")
+             if vae_image_transform is not None:
+                 # image_tensor has already been resized by the qwenvl processor
+                 image_tensor = (image_tensor - 0.5) / 0.5  # shift [0, 1] to [-1, 1]
+             pil_pixel_values.append(image_inputs[0])
+
+             if siglip_processor is not None:
+                 siglip_pixel_value = siglip_processor.preprocess(
+                     images=Image.open(image_path).convert('RGB') if isinstance(image_path, str) else image_path,
+                     do_resize=True, return_tensors="pt", do_convert_rgb=True
+                 ).pixel_values  # 1 c h w
+                 if drop_prompt:
+                     siglip_pixel_values.append(torch.zeros_like(siglip_pixel_value))
+                 else:
+                     siglip_pixel_values.append(siglip_pixel_value)
+             # use a zero image as the unconditional reference image
+             if drop_prompt:
+                 ref_pixel_values.append(torch.zeros_like(image_tensor))
+             else:
+                 ref_pixel_values.append(image_tensor)
+
+         # If a sample has multiple images, concatenate them,
+         # e.g. pixel_values[0] (n1, 1176) and pixel_values[1] (n2, 1176) become (n1+n2, 1176)
+         if len(pixel_values) > 0:
+             pixel_values = torch.concat(pixel_values)
+             image_grid_thw = torch.concat(image_grid_thw)  # (b, 3), the grid of t, h, w
+         # if len(ref_pixel_values) > 0:
+         #     ref_pixel_values = torch.concat(ref_pixel_values)  # b c h w
+         ref_pixel_values = []
+         if len(siglip_pixel_values) > 0:
+             siglip_pixel_values = torch.concat(siglip_pixel_values)  # b c h w
+
+         pil_pixel_values = pil_pixel_values + pil_pixel_values_last
+
+         if mask_weight_type is not None:
+             _, weights = get_weight_mask(pil_pixel_values, prompt, mask_weight_type, need_weight)
+             if need_weight.lower() == 'false':
+                 assert torch.all(weights == 1)
+         else:
+             weights = []
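+         # Shape summary (added for clarity, following the annotations above):
+         # pixel_values is (total_image_tokens, 1176) across all reference images,
+         # image_grid_thw is (n_images, 3) holding each image's (t, h, w) grid,
+         # siglip_pixel_values is (n_images, c, h, w), and pil_pixel_values also
+         # carries the PIL image of the generation target at the end.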
+         return {
+             'pixel_values': pixel_values,
+             'image_grid_thw': image_grid_thw,
+             'image_token_lengths': image_token_lengths,
+             'ref_pixel_values': ref_pixel_values,
+             'pil_pixel_values': pil_pixel_values,
+             'siglip_pixel_values': siglip_pixel_values,
+             'weights': weights,
+         }
+
+     @staticmethod
+     def _process_image_token(
+         input_ids: torch.Tensor,
+         image_token_id: int,
+         image_begin_token_id: int,
+         image_end_token_id: int,
+         image_token_lengths: List[int],
+         labels: Optional[torch.Tensor] = None,
+     ):
+         # Find the indices of the image tokens
+         image_token_indices = (input_ids == image_token_id).nonzero(as_tuple=True)
+         # assert len(image_token_lengths) == image_token_indices[1].numel()
+         image_position = []
+         offset = 0
+         cur_i = 0
+         if isinstance(image_token_lengths, int):
+             image_token_lengths = [image_token_lengths] * len(image_token_indices[1])
+         for idx in image_token_indices[1]:
+             image_token_length = image_token_lengths[cur_i]
+             adjusted_idx = idx + offset
+             assert input_ids[0, adjusted_idx] == image_token_id
+
+             # Wrap with image begin/end tokens and expand the image token
+             input_ids = torch.cat(
+                 [
+                     input_ids[:, :adjusted_idx],
+                     input_ids.new_full(
+                         (1, 1), image_begin_token_id
+                     ),  # image begin token
+                     input_ids.new_full(
+                         (1, image_token_length), image_token_id
+                     ),  # repeat the image token image_token_length times
+                     input_ids.new_full((1, 1), image_end_token_id),  # image end token
+                     input_ids[:, adjusted_idx + 1 :],
+                 ],
+                 dim=1,
+             )
+             if labels is not None:
+                 labels = torch.cat(
+                     [
+                         labels[:, :adjusted_idx],
+                         labels.new_full(
+                             (1, 1), image_begin_token_id
+                         ),  # make the begin token a label
+                         labels.new_full((1, image_token_length), -100),
+                         labels.new_full((1, 1), -100),
+                         labels[:, adjusted_idx + 1 :],
+                     ],
+                     dim=1,
+                 )
+
+             adjusted_idx += 1  # skip the image begin token
+             image_position.append(adjusted_idx.item())
+             offset += image_token_length - 1
+             offset += 2  # begin and end tokens
+
+             cur_i += 1
+
+         return input_ids, labels, image_position
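+     # Illustrative example (added for clarity): with image_token_length = 4, a row
+     # [..., <image>, ...] expands to [..., <begin>, <image> x 4, <end>, ...]; in the
+     # labels, <begin> stays supervised while the expanded <image> tokens and <end>
+     # are masked with -100, and image_position records the index just after <begin>.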
+
+
+ def fetch_image(ele: dict[str, str | Image.Image], size_factor: int = 28) -> Image.Image:
+     if "image" in ele:
+         image = ele["image"]
+     else:
+         image = ele["image_url"]
+     image_obj = None
+     if isinstance(image, Image.Image):
+         image_obj = image
+     elif image.startswith("http://") or image.startswith("https://"):
+         response = requests.get(image, stream=True)
+         image_obj = Image.open(BytesIO(response.content))
+     elif image.startswith("file://"):
+         image_obj = Image.open(image[7:])
+     elif image.startswith("data:image"):
+         if "base64," in image:
+             _, base64_data = image.split("base64,", 1)
+             data = base64.b64decode(base64_data)
+             image_obj = Image.open(BytesIO(data))
+     else:
+         image_obj = Image.open(image)
+     if image_obj is None:
+         raise ValueError(f"Unrecognized image input; supported forms are local path, http url, base64 and PIL.Image, got {image}")
+     image = to_rgb(image_obj)
+     ## resize
+     if "resized_height" in ele and "resized_width" in ele:
+         resized_height, resized_width = smart_resize(
+             ele["resized_height"],
+             ele["resized_width"],
+             factor=size_factor,
+         )
+     else:
+         width, height = image.size
+         min_pixels = ele.get("min_pixels")
+         max_pixels = ele.get("max_pixels")
+         resized_height, resized_width = smart_resize(
+             height,
+             width,
+             factor=size_factor,
+             min_pixels=min_pixels,
+             max_pixels=max_pixels,
+         )
+     image = image.resize((resized_width, resized_height), resample=Image.Resampling.BICUBIC)
+
+     return image
+
+ def process_vision_info(
+     vision_infos: list,
+     return_video_kwargs: bool = False,
+     factor: int = 1,
+ ) -> tuple[list[Image.Image] | None, list[torch.Tensor | list[Image.Image]] | None, Optional[dict]]:
+
+     ## Read images or videos
+     image_inputs = []
+     video_inputs = []
+     video_sample_fps_list = []
+     for vision_info in vision_infos:
+         if "image" in vision_info or "image_url" in vision_info:
+             image_inputs.append(fetch_image(vision_info, size_factor=28*factor))
+         elif "video" in vision_info:
+             video_input, video_sample_fps = fetch_video(vision_info, return_video_sample_fps=True)
+             video_sample_fps_list.append(video_sample_fps)
+             video_inputs.append(video_input)
+         else:
+             raise ValueError("image, image_url or video should be in content.")
+     if len(image_inputs) == 0:
+         image_inputs = None
+     if len(video_inputs) == 0:
+         video_inputs = None
+     if return_video_kwargs:
+         return image_inputs, video_inputs, {'fps': video_sample_fps_list}
+     return image_inputs, video_inputs
univa/eval/__init__.py ADDED
File without changes
univa/eval/configuration_eval.py ADDED
@@ -0,0 +1,55 @@
+ from dataclasses import dataclass
+ from typing import Optional, List
+
+ @dataclass
+ class EvalConfig:
+     pretrained_lvlm_name_or_path: str
+     pretrained_denoiser_name_or_path: str
+     pretrained_siglip_name_or_path: str
+
+     ocr_enhancer: bool = False
+     joint_with_t5: bool = False
+     only_use_t5: bool = False
+
+     seed: int = 42
+     allow_tf32: bool = False
+
+     output_dir: str = "./output"
+
+     num_images_per_prompt: int = 1
+     num_inference_steps: int = 32
+     guidance_scale: float = 3.5  # Used in Flux
+     num_samples_per_prompt: int = 1
+     height: int = 1024
+     width: int = 1024
+     min_pixels: int = 448*448
+     max_pixels: int = 448*448
+     anyres: str = 'any_11ratio'
+     padding_side: str = 'right'
+
+     local_rank: int = 0
+     world_size: int = 1
+
+     # genai
+     genai_prompt_path: str = "univa/eval/genai/eval_prompts/genai527/genai_image.json"
+
+     # geneval
+     n_samples: int = 4
+     geneval_prompt_path: str = "univa/eval/geneval/evaluation_metadata.jsonl"
+     resized_height: int = 1024
+     resized_width: int = 1024
+
+     # dpgbench
+     dpgbench_prompt_path: str = "univa/eval/dpgbench/dpgbench_prompts.json"
+
+     # wise
+     wise_prompt_path: str = "univa/eval/wise/data"
+
+     # imgedit
+     imgedit_prompt_path: str = "univa/eval/imgedit/basic_edit.json"
+     imgedit_image_dir: str = "/mnt/data/lb/Remake/imgedit_bench_eval_images"
+
+     # gedit
+     gedit_prompt_path: str = "univa/eval/gedit/basic_edit.json"
+     gedit_image_dir: str = "/mnt/data/lb/Remake/gedit_bench_eval_images"
univa/eval/dpgbench/README.md ADDED
@@ -0,0 +1,65 @@
+
+ The original code is from [DPG-Bench](https://github.com/TencentQQGYLab/ELLA).
+
+ ## Requirements and Installation
+
+ > The official environment is **NOT** recommended.
+
+ Prepare the conda environment:
+
+ ```bash
+ conda create -n dpgbench_eval python=3.10 -y
+ conda activate dpgbench_eval
+ ```
+
+ Install the packages:
+
+ ```bash
+ pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124
+ pip install "pip<24.1"
+ pip install -r requirements.txt
+ ```
+
+ ## Eval
+
+ ### Generate samples
+
+ ```bash
+ # switch to the univa env
+ MODEL_PATH='path/to/model'
+ OUTPUT_DIR='path/to/eval_output/dpgbench'
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun \
+     --nproc_per_node 8 \
+     -m step1_gen_samples \
+     dpgbench.yaml \
+     --pretrained_lvlm_name_or_path ${MODEL_PATH} \
+     --output_dir ${OUTPUT_DIR}
+ ```
+
+ ### Evaluation & Summary
+
+ Download the mplug model to `$MPLUG_LOCAL_PATH`:
+
+ ```bash
+ conda activate dpgbench_eval
+ modelscope download --model 'iic/mplug_visual-question-answering_coco_large_en' --local_dir ${MPLUG_LOCAL_PATH}
+ ```
+
+ ```bash
+ conda activate dpgbench_eval
+ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ IMAGE_DIR=${OUTPUT_DIR}
+ accelerate launch --num_machines 1 --num_processes 8 \
+     --multi_gpu --mixed_precision "fp16" \
+     step2_compute_dpg_bench.py \
+     --image_root_path ${IMAGE_DIR} \
+     --resolution 1024 \
+     --pic_num 4 \
+     --res_path ${IMAGE_DIR}.txt \
+     --vqa_model mplug \
+     --mplug_local_path ${MPLUG_LOCAL_PATH} \
+     --csv eval_prompts/dpgbench.csv
+ cat ${IMAGE_DIR}.txt
+ ```
univa/eval/dpgbench/__init__.py ADDED
File without changes
univa/eval/dpgbench/dpgbench.yaml ADDED
@@ -0,0 +1,18 @@
+
+ pretrained_lvlm_name_or_path: /mnt/data/lb/Remake/UniWorld//checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_1p0_lr5e-6_mask_refstyle_extract/checkpoint-20000/model_ema
+ pretrained_denoiser_name_or_path: /mnt/data/checkpoints/black-forest-labs/FLUX.1-dev/
+ pretrained_siglip_name_or_path: /mnt/data/checkpoints/google/siglip2-so400m-patch16-512
+ joint_with_t5: true
+
+ seed: 42
+ allow_tf32: false
+
+ output_dir: /mnt/data/lb/Remake/UniWorld//eval_output/dpgbench
+
+ num_images_per_prompt: 4
+ num_inference_steps: 28
+ guidance_scale: 2.5
+ height: 1024
+ width: 1024
+
+ dpgbench_prompt_path: /mnt/data/lb/Remake/UniWorld//univa/eval/dpgbench/eval_prompts/dpgbench_prompts.json
univa/eval/dpgbench/eval_prompts/dpgbench.csv ADDED
The diff for this file is too large to render. See raw diff
 
univa/eval/dpgbench/eval_prompts/dpgbench_prompts.json ADDED
The diff for this file is too large to render. See raw diff
 
univa/eval/dpgbench/requirements.txt ADDED
@@ -0,0 +1,32 @@
+ accelerate
+ numpy
+ pandas
+ pillow
+ tqdm
+
+ # for modelscope
+ cloudpickle
+ decord>=0.6.0
+ diffusers
+ fairseq
+ ftfy>=6.0.3
+ librosa==0.10.1
+ modelscope
+ opencv-python
+ # compatible with taming-transformers-rom1504
+ rapidfuzz
+ # rouge-score was just recently updated from 0.0.4 to 0.0.7,
+ # which introduced compatibility issues that are being investigated
+ rouge_score<=0.0.4
+ safetensors
+ # scikit-video
+ soundfile
+ taming-transformers-rom1504
+ tiktoken
+ timm
+ tokenizers
+ torchvision
+ transformers
+ transformers_stream_generator
+ unicodedata2
+ zhconv
univa/eval/dpgbench/step1_gen_samples.py ADDED
@@ -0,0 +1,248 @@
+
+ import sys
+ import os
+ root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+ sys.path.append(root)
+ import json
+ import torch
+ import random
+ import subprocess
+ import argparse
+ import numpy as np
+ import pandas as pd
+ import torch.distributed as dist
+ from PIL import Image
+ from tqdm import tqdm
+ from qwen_vl_utils import process_vision_info
+ from torchvision import transforms
+ from transformers import AutoProcessor
+ from transformers import SiglipImageProcessor, SiglipVisionModel
+ from univa.utils.flux_pipeline import FluxPipeline
+ from univa.eval.configuration_eval import EvalConfig
+ from univa.utils.get_ocr import get_ocr_result
+ from univa.utils.denoiser_prompt_embedding_flux import encode_prompt
+ from univa.models.qwen2p5vl.modeling_univa_qwen2p5vl import UnivaQwen2p5VLForConditionalGeneration
+
+ # adapted from https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/random.py#L31
+ def set_seed(seed, rank, device_specific=True):
+     if device_specific:
+         seed += rank
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ def initialize_models(args, device):
+
+     # Load the main model and task head
+     model = UnivaQwen2p5VLForConditionalGeneration.from_pretrained(
+         args.pretrained_lvlm_name_or_path,
+         torch_dtype=torch.bfloat16,
+         attn_implementation="flash_attention_2",
+     ).to(device)
+
+     processor = AutoProcessor.from_pretrained(
+         args.pretrained_lvlm_name_or_path,
+         min_pixels=args.min_pixels,
+         max_pixels=args.max_pixels,
+     )
+
+     # Load the FLUX pipeline
+     pipe = FluxPipeline.from_pretrained(
+         args.pretrained_denoiser_name_or_path,
+         transformer=model.denoise_tower.denoiser,
+         torch_dtype=torch.bfloat16,
+     ).to(device)
+     tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
+     text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
+
+     siglip_processor = SiglipImageProcessor.from_pretrained(args.pretrained_siglip_name_or_path)
+     siglip_model = SiglipVisionModel.from_pretrained(
+         args.pretrained_siglip_name_or_path,
+         torch_dtype=torch.bfloat16,
+     ).to(device)
+
+     return {
+         'model': model,
+         'processor': processor,
+         'pipe': pipe,
+         'tokenizers': tokenizers,
+         'text_encoders': text_encoders,
+         'device': device,
+         'siglip_model': siglip_model,
+         'siglip_processor': siglip_processor,
+     }
+
+
+ def init_gpu_env(args):
+     local_rank = int(os.getenv('RANK', 0))
+     world_size = int(os.getenv('WORLD_SIZE', 1))
+     args.local_rank = local_rank
+     args.world_size = world_size
+     torch.cuda.set_device(local_rank)
+     dist.init_process_group(
+         backend='nccl', init_method='env://',
+         world_size=world_size, rank=local_rank
+     )
+     return args
+
+
+ def run_model_and_return_samples(args, state, text, image1=None, image2=None):
+
+     # Build content
+     convo = []
+     image_paths = []
+     content = []
+     for img in (image1, image2):
+         if img:
+             content.append({'type':'image','image':img,'min_pixels':args.min_pixels,'max_pixels':args.max_pixels})
+             image_paths.append(img)
+     if text:
+         ocr_text = ''
+         if args.ocr_enhancer and content:
+             cur_ocr_i = 0
+             ocr_texts = []
+             for img in (image1, image2):
+                 if img:
+                     ocr_texts.append(get_ocr_result(img, cur_ocr_i))
+                     cur_ocr_i += 1
+             ocr_text = '\n'.join(ocr_texts)
+         content.append({'type':'text','text': text + ocr_text})
+
+     if not args.only_use_t5:
+         convo.append({'role':'user','content':content})
+
+         # Prepare inputs
+         chat_text = state['processor'].apply_chat_template(
+             convo,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         chat_text = '<|im_end|>\n'.join(chat_text.split('<|im_end|>\n')[1:])
+         image_inputs, video_inputs = process_vision_info(convo)
+         inputs = state['processor'](
+             text=[chat_text], images=image_inputs, videos=video_inputs,
+             padding=True, return_tensors='pt'
+         ).to(state['device'])
+
+         # Image generation pipeline
+         siglip_hs = None
+         if state['siglip_processor'] and image_paths:
+             vals = [state['siglip_processor'].preprocess(
+                         images=Image.open(p).convert('RGB'), do_resize=True,
+                         return_tensors='pt', do_convert_rgb=True
+                     ).pixel_values.to(state['device'])
+                     for p in image_paths]
+             siglip_hs = state['siglip_model'](torch.concat(vals)).last_hidden_state
+
+         with torch.no_grad():
+             lvlm = state['model'](
+                 inputs.input_ids, pixel_values=getattr(inputs,'pixel_values',None),
+                 attention_mask=inputs.attention_mask,
+                 image_grid_thw=getattr(inputs,'image_grid_thw',None),
+                 siglip_hidden_states=siglip_hs,
+                 output_type='denoise_embeds'
+             )
+             prm_embeds, pooled = encode_prompt(
+                 state['text_encoders'], state['tokenizers'],
+                 text if args.joint_with_t5 else '', 256, state['device'], 1
+             )
+         emb = torch.concat([lvlm, prm_embeds], dim=1) if args.joint_with_t5 else lvlm
+     else:
+         prm_embeds, pooled = encode_prompt(
+             state['text_encoders'], state['tokenizers'],
+             text, 256, state['device'], 1
+         )
+         emb = prm_embeds
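+     # Note (added for clarity): when joint_with_t5 is set, the LVLM denoise
+     # embeddings and the 256-token T5 prompt embeddings are concatenated along the
+     # sequence dimension (dim=1), which assumes the two share the same hidden size.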
+
+
+     with torch.no_grad():
+         img = state['pipe'](
+             prompt_embeds=emb,
+             pooled_prompt_embeds=pooled,
+             height=args.height,
+             width=args.width,
+             num_inference_steps=args.num_inference_steps,
+             guidance_scale=args.guidance_scale,
+             num_images_per_prompt=args.num_images_per_prompt,
+         ).images
+     return img
+
+
+ def concat_image(images, save_path, args):
+     height = args.height
+     width = args.width
+
+     # Create a new blank image, twice the width and height of a single image
+     new_image = Image.new('RGB', (width * 2, height * 2))
+
+     # Paste the four images into their positions in a 2x2 grid
+     for index in range(4):
+         row = index // 2
+         col = index % 2
+         img = images[index]
+         new_image.paste(img, (col * width, row * height))
+
+     # Save the stitched image
+     new_image.save(save_path)
+
+
+ def main(args):
+
+     args = init_gpu_env(args)
+
+     torch.backends.cuda.matmul.allow_tf32 = False
+     torch.backends.cudnn.allow_tf32 = False
+     if args.allow_tf32:
+         torch.backends.cuda.matmul.allow_tf32 = True
+         torch.backends.cudnn.allow_tf32 = True
+
+     set_seed(args.seed, rank=args.local_rank, device_specific=True)
+     device = torch.cuda.current_device()
+     state = initialize_models(args, device)
+
+     if not os.path.exists(args.output_dir):
+         os.makedirs(args.output_dir)
+
+     with open(args.dpgbench_prompt_path, 'r') as f:
+         data = list(json.load(f).items())
+     data = data[args.local_rank::args.world_size]
+
+     for filename, text_prompt in tqdm(data):
+
+         img_name = filename.replace('.txt', '.png')
+
+         save_path = os.path.join(args.output_dir, img_name)
+         if os.path.exists(save_path):
+             continue
+
+         image = run_model_and_return_samples(args, state, text_prompt, image1=None, image2=None)
+
+         concat_image(image, save_path, args)
+
+
+ if __name__ == "__main__":
+     from omegaconf import OmegaConf
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("config", type=str)
+     parser.add_argument("--pretrained_lvlm_name_or_path", type=str, default=None, required=False)
+     parser.add_argument("--output_dir", type=str, default=None, required=False)
+     args = parser.parse_args()
+
+     config = OmegaConf.load(args.config)
+     schema = OmegaConf.structured(EvalConfig)
+     conf = OmegaConf.merge(schema, config)
+     if args.pretrained_lvlm_name_or_path is not None:
+         assert args.output_dir is not None
+         conf.pretrained_lvlm_name_or_path = args.pretrained_lvlm_name_or_path
+         conf.output_dir = args.output_dir
+     main(conf)
univa/eval/dpgbench/step2_compute_dpg_bench.py ADDED
@@ -0,0 +1,269 @@
+ import argparse
+ import os
+ import os.path as osp
+ import time
+ from collections import defaultdict
+
+ import numpy as np
+ import pandas as pd
+ import torch
+ from accelerate import Accelerator
+ from accelerate.utils import gather_object
+ from PIL import Image
+ from tqdm import tqdm
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser(description="DPG-Bench evaluation.")
+     parser.add_argument(
+         "--image_root_path",
+         type=str,
+         default=None,
+     )
+     parser.add_argument(
+         "--resolution",
+         type=int,
+         default=None,
+     )
+     parser.add_argument(
+         "--csv",
+         type=str,
+         default='eval/eval_prompts/DPGbench/dpg_bench.csv',
+     )
+     parser.add_argument(
+         "--res_path",
+         type=str,
+         default='eval/dpgbench_test/score_result/result.txt',
+     )
+     parser.add_argument(
+         "--pic_num",
+         type=int,
+         default=1,
+     )
+     parser.add_argument(
+         "--vqa_model",
+         type=str,
+         default='mplug',
+     )
+
+     parser.add_argument(
+         "--vqa_model_ckpt",
+         type=str,
+         default='/storage/hxy/t2i/opensora/Open-Sora-Plan/opensora/eval/dpgbench_test/mplug',
+     )
+
+     parser.add_argument(
+         "--mplug_local_path",
+         type=str,
+         default='/storage/hxy/t2i/opensora/Open-Sora-Plan/opensora/eval/dpgbench_test/mplug',
+     )
+
+
+     args = parser.parse_args()
+     return args
+
+
+ class MPLUG(torch.nn.Module):
+     def __init__(self, ckpt='weight/dpgbench', device='gpu'):
+         super().__init__()
+         from modelscope.pipelines import pipeline
+         from modelscope.utils.constant import Tasks
+         self.pipeline_vqa = pipeline(Tasks.visual_question_answering, model=ckpt, device=device)
+
+     def vqa(self, image, question):
+         input_vqa = {'image': image, 'question': question}
+         result = self.pipeline_vqa(input_vqa)
+         return result['text']
+
+ def prepare_dpg_data(args):
+     previous_id = ''
+     current_id = ''
+     question_dict = dict()
+     category_count = defaultdict(int)
+     # CSV columns: 'item_id', 'text', 'keywords', 'proposition_id', 'dependency',
+     # 'category_broad', 'category_detailed', 'tuple', 'question_natural_language'
+     data = pd.read_csv(args.csv)
+     for i, line in data.iterrows():
+         if i == 0:
+             continue
+
+         current_id = line.item_id
+         qid = int(line.proposition_id)
+         dependency_list_str = line.dependency.split(',')
+         dependency_list_int = []
+         for d in dependency_list_str:
+             d_int = int(d.strip())
+             dependency_list_int.append(d_int)
+
+         if current_id == previous_id:
+             question_dict[current_id]['qid2tuple'][qid] = line.tuple
+             question_dict[current_id]['qid2dependency'][qid] = dependency_list_int
+             question_dict[current_id]['qid2question'][qid] = line.question_natural_language
+         else:
+             question_dict[current_id] = dict(
+                 qid2tuple={qid: line.tuple},
+                 qid2dependency={qid: dependency_list_int},
+                 qid2question={qid: line.question_natural_language})
+
+         category = line.question_natural_language.split('(')[0].strip()
+         category_count[category] += 1
+
+         previous_id = current_id
+
+     return question_dict
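+ # Illustrative structure of the returned dict (hypothetical values):
+ # question_dict['sample_001'] = {
+ #     'qid2tuple': {1: 'entity - whole (dog)', 2: 'attribute - color (dog, brown)'},
+ #     'qid2dependency': {1: [0], 2: [1]},
+ #     'qid2question': {1: 'Is there a dog?', 2: 'Is the dog brown?'},
+ # }
+ # A dependency of 0 means the question has no parent question.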
+
+ def crop_image(input_image, crop_tuple=None):
+     if crop_tuple is None:
+         return input_image
+
+     cropped_image = input_image.crop((crop_tuple[0], crop_tuple[1], crop_tuple[2], crop_tuple[3]))
+
+     return cropped_image
+
+ def compute_dpg_one_sample(args, question_dict, image_path, vqa_model, resolution):
+     generated_image = Image.open(image_path)
+     crop_tuples_list = [
+         (0, 0, resolution, resolution),
+         (resolution, 0, resolution*2, resolution),
+         (0, resolution, resolution, resolution*2),
+         (resolution, resolution, resolution*2, resolution*2),
+     ]
+
+     crop_tuples = crop_tuples_list[:args.pic_num]
+     key = osp.basename(image_path).split('.')[0]
+     value = question_dict.get(key, None)
+     qid2tuple = value['qid2tuple']
+     qid2question = value['qid2question']
+     qid2dependency = value['qid2dependency']
+
+     qid2answer = dict()
+     qid2scores = dict()
+     qid2validity = dict()
+
+     scores = []
+     for crop_tuple in crop_tuples:
+         cropped_image = crop_image(generated_image, crop_tuple)
+         for id, question in qid2question.items():
+             answer = vqa_model.vqa(cropped_image, question)
+             qid2answer[id] = answer
+             qid2scores[id] = float(answer == 'yes')
+             with open(args.res_path.replace('.txt', '_detail.txt'), 'a') as f:
+                 f.write(image_path + ', ' + str(crop_tuple) + ', ' + question + ', ' + answer + '\n')
+         qid2scores_orig = qid2scores.copy()
+
+         for id, parent_ids in qid2dependency.items():
+             # zero out a score if any parent question was answered 'no'
+             any_parent_answered_no = False
+             for parent_id in parent_ids:
+                 if parent_id == 0:
+                     continue
+                 if qid2scores[parent_id] == 0:
+                     any_parent_answered_no = True
+                     break
+             if any_parent_answered_no:
+                 qid2scores[id] = 0
+                 qid2validity[id] = False
+             else:
+                 qid2validity[id] = True
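+         # Worked example (added for clarity): if question 2 ("Is the dog brown?")
+         # depends on question 1 ("Is there a dog?") and question 1 was answered
+         # 'no', question 2's score is zeroed here regardless of its own answer.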
+
+         score = sum(qid2scores.values()) / len(qid2scores)
+         scores.append(score)
+     average_score = sum(scores) / len(scores)
+     with open(args.res_path, 'a') as f:
+         f.write(image_path + ', ' + ', '.join(str(i) for i in scores) + ', ' + str(average_score) + '\n')
+
+     return average_score, qid2tuple, qid2scores_orig
+
+
+ def main():
+     args = parse_args()
+
+     accelerator = Accelerator()
+
+     question_dict = prepare_dpg_data(args)
+
+     timestamp = time.time()
+     time_array = time.localtime(timestamp)
+     time_style = time.strftime("%Y%m%d-%H%M%S", time_array)
+     if args.res_path is None:
+         args.res_path = osp.join(args.image_root_path, f'dpg-bench_{time_style}_results.txt')
+     if accelerator.is_main_process:
+         with open(args.res_path, 'w') as f:
+             pass
+         with open(args.res_path.replace('.txt', '_detail.txt'), 'w') as f:
+             pass
+
+     device = str(accelerator.device)
+     if args.vqa_model == 'mplug':
+         vqa_model = MPLUG(args.mplug_local_path, device=device)
+     else:
+         raise NotImplementedError
+     vqa_model = accelerator.prepare(vqa_model)
+     vqa_model = getattr(vqa_model, 'module', vqa_model)
+
+     filename_list = os.listdir(args.image_root_path)
+     num_each_rank = len(filename_list) / accelerator.num_processes
+     local_rank = accelerator.process_index
+     local_filename_list = filename_list[round(local_rank * num_each_rank) : round((local_rank + 1) * num_each_rank)]
+
+     local_scores = []
+     local_category2scores = defaultdict(list)
+     model_id = osp.basename(args.image_root_path)
+     print(f'Start to conduct evaluation of {model_id}')
+     for fn in tqdm(local_filename_list):
+         image_path = osp.join(args.image_root_path, fn)
+         try:
+             # compute the score of one sample
+             score, qid2tuple, qid2scores = compute_dpg_one_sample(
+                 args=args, question_dict=question_dict, image_path=image_path, vqa_model=vqa_model, resolution=args.resolution)
+             local_scores.append(score)
+
+             # summarize scores by category
+             for qid in qid2tuple.keys():
+                 category = qid2tuple[qid].split('(')[0].strip()
+                 qid_score = qid2scores[qid]
+                 local_category2scores[category].append(qid_score)
+
+         except Exception as e:
+             print('Failed filename:', fn, e)
+             continue
+
+     accelerator.wait_for_everyone()
+     global_dpg_scores = gather_object(local_scores)
+     mean_dpg_score = np.mean(global_dpg_scores)
+
+     global_categories = gather_object(list(local_category2scores.keys()))
+     global_categories = set(global_categories)
+     global_category2scores = dict()
+     global_average_scores = []
+     for category in global_categories:
+         local_category_scores = local_category2scores.get(category, [])
+         global_category2scores[category] = gather_object(local_category_scores)
+         global_average_scores.extend(gather_object(local_category_scores))
+
+     global_category2scores_l1 = defaultdict(list)
+     for category in global_categories:
+         l1_category = category.split('-')[0].strip()
+         global_category2scores_l1[l1_category].extend(global_category2scores[category])
+
+     time.sleep(3)
+     if accelerator.is_main_process:
+         output = f'Model: {model_id}\n'
+
+         output += 'L1 category scores:\n'
+         for l1_category in global_category2scores_l1.keys():
+             output += f'\t{l1_category}: {np.mean(global_category2scores_l1[l1_category]) * 100}\n'
+
+         output += 'L2 category scores:\n'
+         for category in sorted(global_categories):
+             output += f'\t{category}: {np.mean(global_category2scores[category]) * 100}\n'
+
+         output += f'Image path: {args.image_root_path}\n'
+         output += f'Save results to: {args.res_path}\n'
+         output += f'DPG-Bench score: {mean_dpg_score * 100}'
+         with open(args.res_path, 'a') as f:
+             f.write(output + '\n')
+         print(output)
+
+
+ if __name__ == "__main__":
+     main()
univa/eval/gedit/README.md ADDED
@@ -0,0 +1,71 @@
+
+ The original code is from [GEdit-Bench](https://github.com/stepfun-ai/Step1X-Edit/blob/main/GEdit-Bench/EVAL.md).
+
+ ## Requirements and Installation
+
+ ```
+ pip install megfile openai
+ ```
+
+ ## Prepare Source Images
+ Prepare the original images and the metadata json following the example code in `step0_prepare_gedit.py`:
+
+ ```bash
+ GEDIT_ASSET="/path/to/gedit_asset"
+ python step0_prepare_gedit.py --save_path ${GEDIT_ASSET} --json_file_path gedit_edit.json
+ ```
+
+ The directory structure of the original images:
+ ```folder
+ ${GEDIT_ASSET}/
+ └── fullset/
+     └── edit_task/
+         ├── cn/   # Chinese instructions
+         │   ├── key1.png
+         │   ├── key2.png
+         │   └── ...
+         └── en/   # English instructions
+             ├── key1.png
+             ├── key2.png
+             └── ...
+ ```
+
+ ## Eval
+
+ ### Generate samples
+
+ ```bash
+ # switch to the univa env
+ MODEL_PATH='path/to/model'
+ OUTPUT_DIR='path/to/eval_output/gedit'
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun \
+     --nproc_per_node 8 \
+     -m step1_gen_samples \
+     gedit.yaml \
+     --pretrained_lvlm_name_or_path ${MODEL_PATH} \
+     --output_dir ${OUTPUT_DIR}
+ ```
+
+ ### Evaluation
+
+ Write your GPT API key to `secret_t2.env`.
+
+ ```bash
+ IMAGE_DIR=${OUTPUT_DIR}
+ python step2_gedit_bench.py \
+     --model_name UniWorld \
+     --save_path ${IMAGE_DIR} \
+     --backbone gpt4o \
+     --source_path ${GEDIT_ASSET}
+ ```
+
+ ### Summary
+ ```bash
+ python step3_calculate_statistics.py \
+     --model_name UniWorld \
+     --save_path ${IMAGE_DIR} \
+     --backbone gpt4o \
+     --language en > ${IMAGE_DIR}.txt
+ cat ${IMAGE_DIR}.txt
+ ```
univa/eval/gedit/__init__.py ADDED
File without changes
univa/eval/gedit/gedit.yaml ADDED
@@ -0,0 +1,20 @@
+ pretrained_lvlm_name_or_path: /mnt/data/lb/Remake/UniWorld//checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_1p0_lr5e-6_mask_refstyle_extract/checkpoint-20000/model_ema
+ pretrained_denoiser_name_or_path: /mnt/data/checkpoints/black-forest-labs/FLUX.1-dev/
+ pretrained_siglip_name_or_path: /mnt/data/checkpoints/google/siglip2-so400m-patch16-512
+ joint_with_t5: false
+
+ seed: 42
+ allow_tf32: false
+
+ output_dir: /mnt/data/lb/Remake/UniWorld//eval_output/gedit
+
+ num_images_per_prompt: 1
+ num_inference_steps: 28
+ guidance_scale: 3.5
+ height: 1024
+ width: 1024
+
+ gedit_prompt_path: gedit_edit.json
+ gedit_image_dir: /mnt/data/lb/Remake/gedit_bench_eval_images
+ resized_height: 1024
+ resized_width: 1024
univa/eval/gedit/gedit_edit.json ADDED
The diff for this file is too large to render. See raw diff
 
univa/eval/gedit/secret_t2.env ADDED
File without changes
univa/eval/gedit/step0_prepare_gedit.py ADDED
@@ -0,0 +1,85 @@
+ import json
+ import os
+ import math
+ import argparse
+ from datasets import Dataset, load_dataset
+
+ # Dataset info structure:
+ # - task_type: string - Type of the task
+ # - key: string - Unique identifier for the sample
+ # - instruction: string - Task instruction/prompt
+ # - instruction_language: string - Language of the instruction
+ # - input_image: Image - Original input image
+ # - input_image_raw: Image - Raw/unprocessed input image
+ # - Intersection_exist: bool - Whether intersection exists
+
+ def calculate_dimensions(target_area, ratio):
+     width = math.sqrt(target_area * ratio)
+     height = width / ratio
+
+     width = round(width / 32) * 32
+     height = round(height / 32) * 32
+
+     new_area = width * height
+     if new_area < target_area:
+         width += 32
+         new_area = width * height
+     elif new_area > target_area:
+         width -= 32
+         new_area = width * height
+
+     return width, height, new_area
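+ # Worked example (added for clarity): calculate_dimensions(512 * 512, 4 / 3)
+ # gives width ~591.2 and height ~443.4, which round to 576 x 448 (multiples of
+ # 32); since 576 * 448 = 258048 undershoots the 262144-pixel target, the width
+ # is bumped by 32, so the function returns (608, 448, 272384).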
+
+ def main(args):
+     # Load dataset
+     dataset = load_dataset("stepfun-ai/GEdit-Bench")
+
+     # Dictionary to store instruction and image paths
+     instruction_image_paths = {}
+
+     for item in dataset['train']:
+         task_type = item['task_type']
+         key = item['key']
+         instruction = item['instruction']
+         instruction_language = item['instruction_language']
+         input_image = item['input_image']
+         input_image_raw = item['input_image_raw']
+         intersection_exist = item['Intersection_exist']
+
+         target_width, target_height, new_area = calculate_dimensions(512 * 512, input_image_raw.width / input_image_raw.height)
+         resize_input_image = input_image_raw.resize((target_width, target_height))
+
+         save_path_fullset_source_image = os.path.join(args.save_path, f"fullset/{task_type}/{instruction_language}/{key}_SRCIMG.png")
+         save_path_fullset = os.path.join(args.save_path, f"fullset/{task_type}/{instruction_language}/{key}.png")
+
+         relative_path = f"fullset/{task_type}/{instruction_language}/{key}.png"
+
+         # Create directories if they don't exist
+         os.makedirs(os.path.dirname(save_path_fullset_source_image), exist_ok=True)
+         os.makedirs(os.path.dirname(save_path_fullset), exist_ok=True)
+
+         # Save the images
+         input_image.save(save_path_fullset_source_image)
+         resize_input_image.save(save_path_fullset)
+
+         # Store instruction and corresponding image path in the dictionary
+         instruction_image_paths[key] = {
+             'prompt': instruction,
+             'id': relative_path,
+             'edit_type': task_type,
+         }
+
+     # Save the dictionary to a JSON file
+     with open(args.json_file_path, 'w') as json_file:
+         json.dump(instruction_image_paths, json_file, indent=4)
+
+     print(f"Instruction and image paths saved to {args.json_file_path}")
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Process and save dataset images and instructions.")
+     parser.add_argument("--save_path", type=str, required=True, help="Directory to save processed images.")
+     parser.add_argument("--json_file_path", type=str, required=True, help="Path to save the JSON file with instruction-image mappings.")
+
+     args = parser.parse_args()
+
+     main(args)
univa/eval/gedit/step1_gen_samples.py ADDED
@@ -0,0 +1,260 @@
+
+ import sys
+ import os
+ root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+ sys.path.append(root)
+ import json
+ import torch
+ import random
+ import subprocess
+ import argparse
+ import numpy as np
+ import pandas as pd
+ import torch.distributed as dist
+ from PIL import Image
+ from tqdm import tqdm
+ from qwen_vl_utils import process_vision_info
+ from torchvision import transforms
+ from transformers import AutoProcessor
+ from transformers import SiglipImageProcessor, SiglipVisionModel
+ from univa.utils.flux_pipeline import FluxPipeline
+ from univa.eval.configuration_eval import EvalConfig
+ from univa.utils.get_ocr import get_ocr_result
+ from univa.utils.denoiser_prompt_embedding_flux import encode_prompt
+ from univa.models.qwen2p5vl.modeling_univa_qwen2p5vl import UnivaQwen2p5VLForConditionalGeneration
+ from univa.utils.anyres_util import dynamic_resize
+
+ # adapted from https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/random.py#L31
+ def set_seed(seed, rank, device_specific=True):
+     if device_specific:
+         seed += rank
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+ def initialize_models(args, device):
+
+     # Load the main model and task head
+     model = UnivaQwen2p5VLForConditionalGeneration.from_pretrained(
+         args.pretrained_lvlm_name_or_path,
+         torch_dtype=torch.bfloat16
+     ).to(device)
+
+     processor = AutoProcessor.from_pretrained(
+         args.pretrained_lvlm_name_or_path,
+         min_pixels=args.min_pixels,
+         max_pixels=args.max_pixels,
+     )
+
+     # Load the FLUX pipeline
+     pipe = FluxPipeline.from_pretrained(
+         args.pretrained_denoiser_name_or_path,
+         transformer=model.denoise_tower.denoiser,
+         torch_dtype=torch.bfloat16,
+     ).to(device)
+     tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
+     text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
+
+     siglip_processor = SiglipImageProcessor.from_pretrained(args.pretrained_siglip_name_or_path)
+     siglip_model = SiglipVisionModel.from_pretrained(
+         args.pretrained_siglip_name_or_path,
+         torch_dtype=torch.bfloat16,
+     ).to(device)
+
+     return {
+         'model': model,
+         'processor': processor,
+         'pipe': pipe,
+         'tokenizers': tokenizers,
+         'text_encoders': text_encoders,
+         'device': device,
+         'siglip_model': siglip_model,
+         'siglip_processor': siglip_processor,
+     }
+
+
+ def init_gpu_env(args):
+     local_rank = int(os.getenv('RANK', 0))
+     world_size = int(os.getenv('WORLD_SIZE', 1))
+     args.local_rank = local_rank
+     args.world_size = world_size
+     torch.cuda.set_device(local_rank)
+     dist.init_process_group(
+         backend='nccl', init_method='env://',
+         world_size=world_size, rank=local_rank
+     )
+     return args
+
+
+ def update_size(i1, i2, anyres='any_11ratio', anchor_pixels=1024*1024):
+     shapes = []
+     for p in (i1, i2):
+         if p:
+             im = Image.open(p)
+             w, h = im.size
+             shapes.append((w, h))
+     if not shapes:
+         return int(anchor_pixels**0.5), int(anchor_pixels**0.5)
+     if len(shapes) == 1:
+         w, h = shapes[0]
+     else:
+         w = sum(s[0] for s in shapes) / len(shapes)
+         h = sum(s[1] for s in shapes) / len(shapes)
+     new_h, new_w = dynamic_resize(int(h), int(w), anyres, anchor_pixels=anchor_pixels)
+     return new_h, new_w
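+ # Illustrative example (hypothetical sizes): for a single 800x600 source image and
+ # anchor_pixels = 1024*1024, update_size passes the 4:3 shape to dynamic_resize to
+ # fit roughly a megapixel budget; with no source images at all, it falls back to a
+ # square of side sqrt(anchor_pixels).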
112
+
113
+ def run_model_and_return_samples(args, state, text, image1=None, image2=None):
114
+
115
+ # Build content
116
+ convo = []
117
+ image_paths = []
118
+ content = []
119
+ if text:
120
+ ocr_text = ''
121
+ if args.ocr_enhancer and content:
122
+ ocr_texts = []
123
+ for img in (image1, image2):
124
+ if img:
125
+ ocr_texts.append(get_ocr_result(img, cur_ocr_i))
126
+ cur_ocr_i += 1
127
+ ocr_text = '\n'.join(ocr_texts)
128
+ content.append({'type':'text','text': text + ocr_text})
129
+ for img in (image1, image2):
130
+ if img:
131
+ content.append({'type':'image','image':img,'min_pixels':args.min_pixels,'max_pixels':args.max_pixels})
132
+ image_paths.append(img)
133
+
134
+ convo.append({'role':'user','content':content})
135
+
136
+ new_h, new_w = update_size(image1, image2, 'any_11ratio', anchor_pixels=args.height * args.width)
137
+
138
+ # Prepare inputs
139
+ chat_text = state['processor'].apply_chat_template(
140
+ convo,
141
+ tokenize=False,
142
+ add_generation_prompt=True
143
+ )
144
+ chat_text = '<|im_end|>\n'.join(chat_text.split('<|im_end|>\n')[1:])
145
+ image_inputs, video_inputs = process_vision_info(convo)
146
+ inputs = state['processor'](
147
+ text=[chat_text], images=image_inputs, videos=video_inputs,
148
+ padding=True, return_tensors='pt'
149
+ ).to(state['device'])
150
+
151
+ # Generate
152
+ # image generation pipeline
153
+ siglip_hs = None
154
+ if state['siglip_processor'] and image_paths:
155
+ vals = [state['siglip_processor'].preprocess(
156
+ images=Image.open(p).convert('RGB'), do_resize=True,
157
+ return_tensors='pt', do_convert_rgb=True
158
+ ).pixel_values.to(state['device'])
159
+ for p in image_paths]
160
+ siglip_hs = state['siglip_model'](torch.concat(vals)).last_hidden_state
161
+
162
+ with torch.no_grad():
163
+ lvlm = state['model'](
164
+ inputs.input_ids, pixel_values=getattr(inputs,'pixel_values',None),
165
+ attention_mask=inputs.attention_mask,
166
+ image_grid_thw=getattr(inputs,'image_grid_thw',None),
167
+ siglip_hidden_states=siglip_hs,
168
+ output_type='denoise_embeds'
169
+ )
170
+ prm_embeds, pooled = encode_prompt(
171
+ state['text_encoders'], state['tokenizers'],
172
+ text if args.joint_with_t5 else '', 256, state['device'], 1
173
+ )
174
+ if args.only_use_t5:
175
+ emb = prm_embeds
176
+ else:
177
+ emb = torch.concat([lvlm, prm_embeds], dim=1) if args.joint_with_t5 else lvlm
178
+
179
+ with torch.no_grad():
180
+ img = state['pipe'](
181
+ prompt_embeds=emb,
182
+ pooled_prompt_embeds=pooled,
183
+ # height=args.height,
184
+ # width=args.width,
185
+ height=new_h,
186
+ width=new_w,
187
+ num_inference_steps=args.num_inference_steps,
188
+ guidance_scale=args.guidance_scale,
189
+ num_images_per_prompt=args.num_images_per_prompt,
190
+ ).images
191
+ return img
192
+
193
+
194
+ def main(args):
195
+
196
+ args = init_gpu_env(args)
197
+
198
+ torch.backends.cuda.matmul.allow_tf32 = False
199
+ torch.backends.cudnn.allow_tf32 = False
200
+ if args.allow_tf32:
201
+ torch.backends.cuda.matmul.allow_tf32 = True
202
+ torch.backends.cudnn.allow_tf32 = True
203
+
204
+ set_seed(args.seed, rank=args.local_rank, device_specific=True)
205
+ device = torch.cuda.current_device()
206
+ state = initialize_models(args, device)
207
+
208
+ # Create the output directory if it doesn't exist
209
+ os.makedirs(args.output_dir, exist_ok=True)
210
+
211
+ # Load the evaluation prompts
212
+ with open(args.gedit_prompt_path, "r") as f:
213
+ data = json.load(f)
214
+
215
+ inference_list = []
216
+
217
+ for key, value in tqdm(data.items()):
218
+ outpath = args.output_dir
219
+ os.makedirs(outpath, exist_ok=True)
220
+
221
+ prompt = value["prompt"]
222
+ image_path = value['id']
223
+ inference_list.append([prompt, outpath, key, image_path])
224
+
225
+ inference_list = inference_list[args.local_rank::args.world_size]
226
+
227
+ for prompt, output_path, key, image_path in tqdm(inference_list):
228
+
229
+ output_path = os.path.join(output_path, image_path)
230
+ real_image_path = os.path.join(args.imgedit_image_dir, image_path)
231
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
232
+
233
+ if os.path.exists(output_path):
234
+ continue
235
+ image = run_model_and_return_samples(args, state, prompt, image1=real_image_path, image2=None)
236
+ image = image[0]
237
+ image = image.resize((args.resized_width, args.resized_height))
238
+ image.save(
239
+ output_path
240
+ )
241
+
242
+
243
+ if __name__ == "__main__":
244
+ import argparse
245
+ from omegaconf import OmegaConf
246
+
247
+ parser = argparse.ArgumentParser()
248
+ parser.add_argument("config", type=str)
249
+ parser.add_argument("--pretrained_lvlm_name_or_path", type=str, default=None, required=False)
250
+ parser.add_argument("--output_dir", type=str, default=None, required=False)
251
+ args = parser.parse_args()
252
+
253
+ config = OmegaConf.load(args.config)
254
+ schema = OmegaConf.structured(EvalConfig)
255
+ conf = OmegaConf.merge(schema, config)
256
+ if args.pretrained_lvlm_name_or_path is not None:
257
+ assert args.output_dir is not None
258
+ conf.pretrained_lvlm_name_or_path = args.pretrained_lvlm_name_or_path
259
+ conf.output_dir = args.output_dir
260
+ main(conf)
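A note on the sharding above: `inference_list[args.local_rank::args.world_size]` stride-slices the prompt list so that rank r processes items r, r + world_size, r + 2*world_size, and so on, giving non-overlapping shards that jointly cover every prompt. A minimal self-contained sketch of the same slicing (the variable names here are illustrative, not from the repo):

    # Stride-based sharding, as used by step1_gen_samples.py above.
    items = list(range(10))                      # stand-in for inference_list
    world_size = 4
    shards = [items[rank::world_size] for rank in range(world_size)]
    # shards == [[0, 4, 8], [1, 5, 9], [2, 6], [3, 7]]
    assert sorted(x for shard in shards for x in shard) == items  # no overlap, full coverage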
univa/eval/gedit/step2_gedit_bench.py ADDED
@@ -0,0 +1,178 @@
+ from viescore import VIEScore
+ import PIL
+ import os
+ import megfile
+ from PIL import Image
+ from tqdm import tqdm
+ from datasets import load_dataset, load_from_disk
+ import sys
+ import csv
+ import threading
+ import time
+ import argparse
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ GROUPS = [
+     "background_change", "color_alter", "material_alter", "motion_change", "ps_human", "style_change",
+     "subject-add", "subject-remove", "subject-replace", "text_change", "tone_transfer"
+ ]
+
+ def process_single_item(item, vie_score, max_retries=10000):
+     # `source_path`, `save_path`, and `group_name` are module-level globals set in __main__
+     instruction = item['instruction']
+     key = item['key']
+     instruction_language = item['instruction_language']
+     intersection_exist = item['Intersection_exist']
+     sample_prefix = key
+     save_path_fullset_source_image = f"{source_path}/fullset/{group_name}/{instruction_language}/{key}_SRCIMG.png"
+     save_path_fullset_result_image = f"{save_path}/fullset/{group_name}/{instruction_language}/{key}.png"
+
+     src_image_path = save_path_fullset_source_image
+     save_path_item = save_path_fullset_result_image
+
+     for retry in range(max_retries):
+         try:
+             pil_image_raw = Image.open(megfile.smart_open(src_image_path, 'rb'))
+             pil_image_edited = Image.open(megfile.smart_open(save_path_item, 'rb')).convert("RGB").resize((pil_image_raw.size[0], pil_image_raw.size[1]))
+
+             text_prompt = instruction
+             score_list = vie_score.evaluate([pil_image_raw, pil_image_edited], text_prompt)
+             sementics_score, quality_score, overall_score = score_list
+
+             print(f"sementics_score: {sementics_score}, quality_score: {quality_score}, overall_score: {overall_score}, instruction_language: {instruction_language}, instruction: {instruction}")
+
+             return {
+                 "source_image": src_image_path,
+                 "edited_image": save_path_item,
+                 "instruction": instruction,
+                 "sementics_score": sementics_score,
+                 "quality_score": quality_score,
+                 "intersection_exist": item['Intersection_exist'],
+                 "instruction_language": item['instruction_language']
+             }
+         except Exception as e:
+             if retry < max_retries - 1:
+                 wait_time = (retry + 1) * 2  # linear backoff: 2s, 4s, 6s, ...
+                 print(f"Error processing {save_path_item} (attempt {retry + 1}/{max_retries}): {e}")
+                 print(f"Waiting {wait_time} seconds before retry...")
+                 time.sleep(wait_time)
+             else:
+                 print(f"Failed to process {save_path_item} after {max_retries} attempts: {e}")
+                 return
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_name", type=str, default="UniWorld")
+     parser.add_argument("--save_path", type=str, default="/mnt/data/lb/Remake/UniWorld//eval_output/stage3_ema/Gedit")
+     parser.add_argument("--backbone", type=str, default="gpt4o", choices=["gpt4o", "qwen25vl"])
+     parser.add_argument("--source_path", type=str, default="/mnt/workspace/lb/Remake/gedit_bench_eval_images")
+     args = parser.parse_args()
+     model_name = args.model_name
+     save_path_dir = args.save_path
+     source_path = args.source_path
+     evaluate_group = [args.model_name]
+     backbone = args.backbone
+
+     vie_score = VIEScore(backbone=backbone, task="tie", key_path='secret_t2.env')
+     max_workers = 5
+     dataset = load_dataset("stepfun-ai/GEdit-Bench")
+
+     for model_name in evaluate_group:
+         save_path = save_path_dir
+
+         save_path_new = os.path.join(save_path_dir, backbone, "eval_results_new")
+         all_csv_list = []  # Store all results for the final combined CSV
+
+         # Load existing processed samples from the final CSV if it exists
+         processed_samples = set()
+         final_csv_path = os.path.join(save_path_new, f"{model_name}_combined_gpt_score.csv")
+         if megfile.smart_exists(final_csv_path):
+             with megfile.smart_open(final_csv_path, 'r', newline='') as f:
+                 reader = csv.DictReader(f)
+                 for row in reader:
+                     # Create a unique identifier for each sample
+                     sample_key = (row['source_image'], row['edited_image'])
+                     processed_samples.add(sample_key)
+             print(f"Loaded {len(processed_samples)} processed samples from existing CSV")
+
+         for group_name in GROUPS:
+             group_csv_list = []
+             group_dataset_list = []
+             for item in tqdm(dataset['train'], desc=f"Processing {model_name} - {group_name}"):
+                 if item['instruction_language'] == 'cn':
+                     continue
+                 if item['task_type'] == group_name:
+                     group_dataset_list.append(item)
+             # Load the existing group CSV if it exists
+             group_csv_path = os.path.join(save_path_new, f"{model_name}_{group_name}_gpt_score.csv")
+             if megfile.smart_exists(group_csv_path):
+                 with megfile.smart_open(group_csv_path, 'r', newline='') as f:
+                     reader = csv.DictReader(f)
+                     group_results = list(reader)
+                     group_csv_list.extend(group_results)
+                 print(f"Loaded existing results for {model_name} - {group_name}")
+
+             print(f"Processing group: {group_name}")
+             print(f"Processing model: {model_name}")
+
+             with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                 futures = []
+                 for item in group_dataset_list:
+                     instruction = item['instruction']
+                     key = item['key']
+                     instruction_language = item['instruction_language']
+                     intersection_exist = item['Intersection_exist']
+                     sample_prefix = key
+                     save_path_fullset_source_image = f"{source_path}/fullset/{group_name}/{instruction_language}/{key}_SRCIMG.png"
+                     save_path_fullset_result_image = f"{save_path}/fullset/{group_name}/{instruction_language}/{key}.png"
+
+                     if not megfile.smart_exists(save_path_fullset_result_image) or not megfile.smart_exists(save_path_fullset_source_image):
+                         print(f"Skipping {sample_prefix}: Source or edited image does not exist")
+                         continue
+
+                     # Skip samples that have already been processed
+                     sample_key = (save_path_fullset_source_image, save_path_fullset_result_image)
+                     if sample_key in processed_samples:
+                         print(f"Skipping already processed sample: {sample_prefix}")
+                         continue
+
+                     future = executor.submit(process_single_item, item, vie_score)
+                     futures.append(future)
+
+                 for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing {model_name} - {group_name}"):
+                     result = future.result()
+                     if result:
+                         group_csv_list.append(result)
+
+             # Save group-specific CSV
+             group_csv_path = os.path.join(save_path_new, f"{model_name}_{group_name}_gpt_score.csv")
+             with megfile.smart_open(group_csv_path, 'w', newline='') as f:
+                 fieldnames = ["source_image", "edited_image", "instruction", "sementics_score", "quality_score", "intersection_exist", "instruction_language"]
+                 writer = csv.DictWriter(f, fieldnames=fieldnames)
+                 writer.writeheader()
+                 for row in group_csv_list:
+                     writer.writerow(row)
+             all_csv_list.extend(group_csv_list)
+
+             print(f"Saved group CSV for {group_name}, length: {len(group_csv_list)}")
+
+         # After processing all groups, save the combined results
+         if not all_csv_list:
+             print(f"Warning: No results for model {model_name}, skipping combined CSV generation")
+             continue
+
+         # Save combined CSV
+         combined_csv_path = os.path.join(save_path_new, f"{model_name}_combined_gpt_score.csv")
+         with megfile.smart_open(combined_csv_path, 'w', newline='') as f:
+             fieldnames = ["source_image", "edited_image", "instruction", "sementics_score", "quality_score", "intersection_exist", "instruction_language"]
+             writer = csv.DictWriter(f, fieldnames=fieldnames)
+             writer.writeheader()
+             for row in all_csv_list:
+                 writer.writerow(row)
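The retry loop in process_single_item above sleeps (retry + 1) * 2 seconds between attempts, i.e. a linear backoff (2 s, 4 s, 6 s, ...) rather than an exponential one. The pattern in isolation, as a hedged sketch where the hypothetical callable `fn` stands in for the VIEScore call:

    import time

    def call_with_linear_backoff(fn, max_retries=5):
        # `fn` is any zero-argument callable that may raise (hypothetical helper).
        for retry in range(max_retries):
            try:
                return fn()
            except Exception as e:
                if retry < max_retries - 1:
                    time.sleep((retry + 1) * 2)  # 2s, 4s, 6s, ...
                else:
                    print(f"giving up after {max_retries} attempts: {e}")
                    return None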
univa/eval/gedit/step3_calculate_statistics.py ADDED
@@ -0,0 +1,153 @@
+ import megfile
+ import os
+ import pandas as pd
+ from collections import defaultdict
+ import sys
+ import numpy as np
+ import math
+
+ GROUPS = [
+     "background_change", "color_alter", "material_alter", "motion_change", "ps_human", "style_change",
+     "subject-add", "subject-remove", "subject-replace", "text_change", "tone_transfer"
+ ]
+
+ def analyze_scores(save_path_dir, evaluate_group, language):
+     results = defaultdict(dict)
+     save_path_new = save_path_dir
+     model_total_score = defaultdict(dict)
+
+     group_dict_sub = {}
+     group_scores_semantics = defaultdict(lambda: defaultdict(list))
+     group_scores_quality = defaultdict(lambda: defaultdict(list))
+     group_scores_overall = defaultdict(lambda: defaultdict(list))
+
+     group_scores_semantics_intersection = defaultdict(lambda: defaultdict(list))
+     group_scores_quality_intersection = defaultdict(lambda: defaultdict(list))
+     group_scores_overall_intersection = defaultdict(lambda: defaultdict(list))
+     length_total = 0
+     save_path_dir_raw = save_path_dir
+
+     for group_name in GROUPS:
+         csv_path = os.path.join(save_path_new, f"{evaluate_group[0]}_{group_name}_gpt_score.csv")
+         csv_file = megfile.smart_open(csv_path)
+         df = pd.read_csv(csv_file)
+
+         filtered_semantics_scores = []
+         filtered_quality_scores = []
+         filtered_overall_scores = []
+         filtered_semantics_scores_intersection = []
+         filtered_quality_scores_intersection = []
+         filtered_overall_scores_intersection = []
+
+         for _, row in df.iterrows():
+             source_image = row['source_image']
+             edited_image = row['edited_image']
+             instruction = row['instruction']
+             semantics_score = row['sementics_score']
+             quality_score = row['quality_score']
+             intersection_exist = row['intersection_exist']
+             instruction_language = row['instruction_language']
+
+             # Keep only rows in the requested language
+             if instruction_language != language:
+                 continue
+
+             # Overall score is the geometric mean of semantics and quality
+             overall_score = math.sqrt(semantics_score * quality_score)
+
+             filtered_semantics_scores.append(semantics_score)
+             filtered_quality_scores.append(quality_score)
+             filtered_overall_scores.append(overall_score)
+             if intersection_exist:
+                 filtered_semantics_scores_intersection.append(semantics_score)
+                 filtered_quality_scores_intersection.append(quality_score)
+                 filtered_overall_scores_intersection.append(overall_score)
+
+         avg_semantics_score = np.mean(filtered_semantics_scores)
+         avg_quality_score = np.mean(filtered_quality_scores)
+         avg_overall_score = np.mean(filtered_overall_scores)
+         group_scores_semantics[evaluate_group[0]][group_name] = avg_semantics_score
+         group_scores_quality[evaluate_group[0]][group_name] = avg_quality_score
+         group_scores_overall[evaluate_group[0]][group_name] = avg_overall_score
+
+         avg_semantics_score_intersection = np.mean(filtered_semantics_scores_intersection)
+         avg_quality_score_intersection = np.mean(filtered_quality_scores_intersection)
+         avg_overall_score_intersection = np.mean(filtered_overall_scores_intersection)
+         group_scores_semantics_intersection[evaluate_group[0]][group_name] = avg_semantics_score_intersection
+         group_scores_quality_intersection[evaluate_group[0]][group_name] = avg_quality_score_intersection
+         group_scores_overall_intersection[evaluate_group[0]][group_name] = avg_overall_score_intersection
+
+     print("\n--- Overall Model Averages ---")
+
+     print("\nSemantics:")
+     for model_name in evaluate_group:
+         model_scores = [group_scores_semantics[model_name][group] for group in GROUPS]
+         model_avg = np.mean(model_scores)
+         group_scores_semantics[model_name]["avg_semantics"] = model_avg
+
+     print("\nSemantics Intersection:")
+     for model_name in evaluate_group:
+         model_scores = [group_scores_semantics_intersection[model_name][group] for group in GROUPS]
+         model_avg = np.mean(model_scores)
+         group_scores_semantics_intersection[model_name]["avg_semantics"] = model_avg
+
+     print("\nQuality:")
+     for model_name in evaluate_group:
+         model_scores = [group_scores_quality[model_name][group] for group in GROUPS]
+         model_avg = np.mean(model_scores)
+         group_scores_quality[model_name]["avg_quality"] = model_avg
+
+     print("\nQuality Intersection:")
+     for model_name in evaluate_group:
+         model_scores = [group_scores_quality_intersection[model_name][group] for group in GROUPS]
+         model_avg = np.mean(model_scores)
+         group_scores_quality_intersection[model_name]["avg_quality"] = model_avg
+
+     print("\nOverall:")
+     for model_name in evaluate_group:
+         model_scores = [group_scores_overall[model_name][group] for group in GROUPS]
+         model_avg = np.mean(model_scores)
+         group_scores_overall[model_name]["avg_overall"] = model_avg
+
+     print("\nOverall Intersection:")
+     for model_name in evaluate_group:
+         model_scores = [group_scores_overall_intersection[model_name][group] for group in GROUPS]
+         model_avg = np.mean(model_scores)
+         group_scores_overall_intersection[model_name]["avg_overall"] = model_avg
+
+     return group_scores_semantics, group_scores_quality, group_scores_overall, group_scores_semantics_intersection, group_scores_quality_intersection, group_scores_overall_intersection
+
+ if __name__ == "__main__":
+     import argparse
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model_name", type=str, default="UniWorld")
+     parser.add_argument("--save_path", type=str, default="/mnt/data/lb/Remake/UniWorld//eval_output/stage3_ema/Gedit")
+     parser.add_argument("--backbone", type=str, default="gpt4o", choices=["gpt4o", "qwen25vl"])
+     parser.add_argument("--language", type=str, default="en", choices=["en", "zh"])
+     args = parser.parse_args()
+     model_name = args.model_name
+     save_path_dir = args.save_path
+     evaluate_group = [args.model_name]
+     backbone = args.backbone
+
+     save_path_new = os.path.join(save_path_dir, backbone, "eval_results_new")
+
+     print("\nOverall:")
+     for model_name in evaluate_group:
+         group_scores_semantics, group_scores_quality, group_scores_overall, group_scores_semantics_intersection, group_scores_quality_intersection, group_scores_overall_intersection = analyze_scores(save_path_new, [model_name], language=args.language)
+         for group_name in GROUPS:
+             print(f"{group_name}: {group_scores_semantics[model_name][group_name]:.3f}, {group_scores_quality[model_name][group_name]:.3f}, {group_scores_overall[model_name][group_name]:.3f}")
+         print(f"Average: {group_scores_semantics[model_name]['avg_semantics']:.3f}, {group_scores_quality[model_name]['avg_quality']:.3f}, {group_scores_overall[model_name]['avg_overall']:.3f}")
+
+         print("\nIntersection:")
+         for group_name in GROUPS:
+             print(f"{group_name}: {group_scores_semantics_intersection[model_name][group_name]:.3f}, {group_scores_quality_intersection[model_name][group_name]:.3f}, {group_scores_overall_intersection[model_name][group_name]:.3f}")
+         print(f"Average Intersection: {group_scores_semantics_intersection[model_name]['avg_semantics']:.3f}, {group_scores_quality_intersection[model_name]['avg_quality']:.3f}, {group_scores_overall_intersection[model_name]['avg_overall']:.3f}")
univa/eval/gedit/viescore/__init__.py ADDED
@@ -0,0 +1,115 @@
+ import sys
+ sys.path.insert(0, 'viescore')
+
+ from utils import (
+     mllm_output_to_dict
+ )
+ import math
+ import vie_prompts
+
+ class VIEScore:
+     def __init__(self, backbone="gpt4o", task="t2i", key_path=None) -> None:
+         self.task = task
+         self.backbone_name = backbone
+
+         if self.task not in ["t2i", "tie", "t2v"]:
+             raise ValueError("task must be one of 't2i', 'tie', or 't2v'")
+
+         if self.backbone_name == "gpt4o":
+             from mllm_tools.openai import GPT4o
+             self.model = GPT4o(key_path, model_name="gpt-4.1")
+         elif self.backbone_name == "gpt4v":
+             from mllm_tools.openai import GPT4v
+             self.model = GPT4v(key_path)
+         elif self.backbone_name == "gemini":
+             from mllm_tools.gemini import Gemini
+             self.model = Gemini()
+         elif self.backbone_name == "idefics2":
+             from mllm_tools.idefics2_eval import Idefics2
+             self.model = Idefics2()
+         elif self.backbone_name == "mantis":
+             from mllm_tools.mantis_idefics2_eval import Mantis
+             self.model = Mantis()
+         elif self.backbone_name == "minicpmv":
+             from mllm_tools.minicpmv_eval import MiniCPMV
+             self.model = MiniCPMV()
+         elif self.backbone_name == "qwen25vl":
+             from mllm_tools.qwen25vl_eval import Qwen25VL
+             self.model = Qwen25VL()
+         else:
+             raise NotImplementedError("backbone not supported")
+         self.context = vie_prompts._context_no_delimit
+         if self.task == "t2i":
+             self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_one_image_gen_rule, vie_prompts._prompts_0shot_t2i_rule_SC])
+             self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_rule_PQ])
+         elif self.task == "tie":
+             self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_two_image_edit_rule, vie_prompts._prompts_0shot_tie_rule_SC])
+             self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_rule_PQ])
+         elif self.task == "t2v":
+             self.SC_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_one_video_gen_rule, vie_prompts._prompts_0shot_t2v_rule_SC])
+             self.PQ_prompt = "\n".join([self.context, vie_prompts._prompts_0shot_t2v_rule_PQ])
+
+     def evaluate(self, image_prompts, text_prompt, extract_overall_score_only=False, extract_all_score=True, echo_output=False):
+         if not isinstance(image_prompts, list):
+             image_prompts = [image_prompts]
+         if self.backbone_name in ['gpt4o', 'gpt4v']:
+             # URL/path inputs are passed through directly; PIL images must be base64-encoded
+             self.model.use_encode = False if isinstance(image_prompts[0], str) else True
+         if self.task == "t2i":
+             _SC_prompt = self.SC_prompt.replace("<prompt>", text_prompt)
+         elif self.task == "tie":
+             _SC_prompt = self.SC_prompt.replace("<instruction>", text_prompt)
+         elif self.task == "t2v":
+             _SC_prompt = self.SC_prompt.replace("<prompt>", text_prompt)
+         SC_prompt_final = self.model.prepare_prompt(image_prompts, _SC_prompt)
+         if self.task == "tie":
+             # Perceptual quality is judged on the edited image only
+             PQ_prompt_final = self.model.prepare_prompt(image_prompts[-1], self.PQ_prompt)
+         else:
+             PQ_prompt_final = self.model.prepare_prompt(image_prompts, self.PQ_prompt)
+
+         results_dict = {}
+
+         SC_dict = False
+         PQ_dict = False
+         tries = 0
+         max_tries = 1
+         while SC_dict is False or PQ_dict is False:
+             tries += 1
+             guess_if_cannot_parse = True if tries > max_tries else False
+             result_SC = self.model.get_parsed_output(SC_prompt_final)
+             result_PQ = self.model.get_parsed_output(PQ_prompt_final)
+             SC_dict = mllm_output_to_dict(result_SC, give_up_parsing=guess_if_cannot_parse)
+             PQ_dict = mllm_output_to_dict(result_PQ, give_up_parsing=guess_if_cannot_parse)
+
+         if SC_dict == "rate_limit_exceeded" or PQ_dict == "rate_limit_exceeded":
+             print("rate_limit_exceeded")
+             raise ValueError("rate_limit_exceeded")
+         results_dict['SC'] = SC_dict
+         results_dict['PQ'] = PQ_dict
+         if echo_output:
+             print("results_dict", results_dict)
+         if extract_all_score:
+             SC_score = min(results_dict['SC']['score'])
+             PQ_score = min(results_dict['PQ']['score'])
+             O_score = math.sqrt(SC_score * PQ_score)
+             return [SC_score, PQ_score, O_score]
+         if extract_overall_score_only:
+             SC_scores = results_dict['SC']['score']
+             PQ_scores = results_dict['PQ']['score']
+             O_score = math.sqrt(min(SC_scores) * min(PQ_scores))
+             return O_score
+         return results_dict
+
+ if __name__ == "__main__":
+     model = VIEScore(backbone="gemini", task="t2i")
+     from datasets import load_dataset
+     dataset = load_dataset("TIGER-Lab/GenAI-Arena-Bench", "image_generation")
+     dataset = dataset["test"]
+     print("Now running the VIEScore model")
+     for idx in range(5):
+         left_image = dataset['left_image'][idx]
+         right_image = dataset['right_image'][idx]
+         prompt = dataset['prompt'][idx]
+         print(model.evaluate(left_image, prompt, extract_all_score=True))
+         print(model.evaluate(right_image, prompt, extract_all_score=True))
+
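A minimal usage sketch for the editing ("tie") task, mirroring how step2_gedit_bench.py drives this class; the two image paths are placeholders and a valid key file is assumed:

    from PIL import Image
    scorer = VIEScore(backbone="gpt4o", task="tie", key_path="secret_t2.env")
    src = Image.open("source.png")      # placeholder: original image
    edited = Image.open("edited.png")   # placeholder: edited image
    sc, pq, overall = scorer.evaluate([src, edited], "replace the cat with a dog")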
univa/eval/gedit/viescore/mllm_tools/__init__.py ADDED
File without changes
univa/eval/gedit/viescore/mllm_tools/gemini.py ADDED
@@ -0,0 +1,147 @@
+ """
+ Install the Google AI Python SDK
+
+ $ pip install google-generativeai
+
+ See the getting started guide for more information:
+ https://ai.google.dev/gemini-api/docs/get-started/python
+ """
+
+ import requests
+ from PIL import Image
+ from io import BytesIO
+ import os
+ from typing import List
+ from urllib.parse import urlparse
+ import google.generativeai as genai
+ import tempfile
+
+ genai.configure(api_key=os.environ["GEMINI_API_KEY"])
+
+ def upload_to_gemini(input, mime_type=None):
+     """Uploads the given file or PIL image to Gemini.
+
+     See https://ai.google.dev/gemini-api/docs/prompting_with_media
+     """
+     if isinstance(input, str):
+         # Input is a file path
+         file = genai.upload_file(input, mime_type=mime_type)
+     elif isinstance(input, Image.Image):
+         # Input is a PIL image: write it to a temporary JPEG first
+         with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp_file:
+             input.save(tmp_file, format="JPEG")
+             tmp_file_path = tmp_file.name
+         file = genai.upload_file(tmp_file_path, mime_type=mime_type or "image/jpeg")
+         os.remove(tmp_file_path)
+     else:
+         raise ValueError("Unsupported input type. Must be a file path or PIL Image.")
+     return file
+
+ def save_image_from_url(url, base_save_directory='tmp', file_name=None):
+     # Parse the URL to create a directory path
+     parsed_url = urlparse(url)
+     url_path = os.path.join(parsed_url.netloc, parsed_url.path.lstrip('/'))
+     save_directory = os.path.join(base_save_directory, os.path.dirname(url_path))
+
+     # Create the directory if it doesn't exist
+     if not os.path.exists(save_directory):
+         os.makedirs(save_directory)
+
+     # Get the image from the URL
+     response = requests.get(url)
+     if response.status_code == 200:
+         # Open the image
+         image = Image.open(BytesIO(response.content))
+
+         # Set the file name if not provided
+         if not file_name:
+             file_name = os.path.basename(parsed_url.path)
+
+         # Save the image locally
+         file_path = os.path.join(save_directory, file_name)
+         image.save(file_path)
+
+         return file_path
+     else:
+         raise Exception(f"Failed to retrieve image from URL. Status code: {response.status_code}")
+
+ class Gemini():
+     def __init__(self, model_name="gemini-1.5-pro-latest"):
+         # Create the model
+         # See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
+         generation_config = {
+             "temperature": 1,
+             "top_p": 0.95,
+             "top_k": 64,
+             "max_output_tokens": 8192,
+             "response_mime_type": "text/plain",
+         }
+         safety_settings = [
+             {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
+             {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
+             {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
+             {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
+         ]
+         self.model = genai.GenerativeModel(
+             model_name=model_name,
+             safety_settings=safety_settings,
+             generation_config=generation_config,
+         )
+
+     def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
+         if not isinstance(image_links, list):
+             image_links = [image_links]
+
+         images_prompt = []
+         for image_link in image_links:
+             if isinstance(image_link, str):
+                 image = save_image_from_url(image_link)
+             else:
+                 image = image_link
+             image = upload_to_gemini(image, mime_type="image/jpeg")
+             images_prompt.append(image)
+
+         prompt_content = [images_prompt, text_prompt]
+         return prompt_content
+
+     def get_parsed_output(self, prompt):
+         images_prompt = prompt[0]
+         text_prompt = prompt[1]
+         chat_session = self.model.start_chat(
+             history=[
+                 {
+                     "role": "user",
+                     "parts": images_prompt,
+                 },
+             ]
+         )
+         try:
+             response = chat_session.send_message(text_prompt)
+         except Exception:
+             return "Error in sending message to chat session."
+         return self.extract_response(response)
+
+     def extract_response(self, response):
+         return response.text
+
+ if __name__ == "__main__":
+     model = Gemini()
+     prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
+     print("prompt : \n", prompt)
+     res = model.get_parsed_output(prompt)
+     print("result : \n", res)
univa/eval/gedit/viescore/mllm_tools/idefics2_eval.py ADDED
@@ -0,0 +1,43 @@
+ import os
+ import torch
+ import time
+ from typing import List
+ from transformers import AutoProcessor, AutoModelForVision2Seq
+ from transformers.image_utils import load_image
+ from transformers.utils import is_flash_attn_2_available
+
+
+ class Idefics2():
+     def __init__(self, model_path: str = "HuggingFaceM4/idefics2-8b") -> None:
+         attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
+         print(f"Using {attn_implementation} for attention implementation")
+         self.model = AutoModelForVision2Seq.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, _attn_implementation=attn_implementation).eval()
+         self.processor = AutoProcessor.from_pretrained(model_path)
+
+     def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
+         if not isinstance(image_links, list):
+             image_links = [image_links]
+         messages = [
+             {
+                 "role": "user",
+                 "content": [{"type": "image"}] * len(image_links) + [{"type": "text", "text": text_prompt}]
+             }
+         ]
+         prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
+         images = [load_image(image_link) for image_link in image_links]  # supports PIL images as well
+         inputs = self.processor(text=prompt, images=images, return_tensors="pt")
+         inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+         return inputs
+
+     def get_parsed_output(self, inputs):
+         generate_ids = self.model.generate(**inputs, max_new_tokens=512, num_beams=1)
+         generated_text = self.processor.batch_decode(generate_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+         return generated_text
+
+
+ if __name__ == "__main__":
+     model = Idefics2()
+     prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
+     res = model.get_parsed_output(prompt)
+     print("result : \n", res)
univa/eval/gedit/viescore/mllm_tools/mantis_idefics2_eval.py ADDED
@@ -0,0 +1,43 @@
+ import os
+ import torch
+ import time
+ from typing import List
+ from transformers import AutoProcessor, AutoModelForVision2Seq
+ from transformers.image_utils import load_image
+ from transformers.utils import is_flash_attn_2_available
+
+
+ class Mantis():
+     def __init__(self, model_path: str = "TIGER-Lab/Mantis-8B-Idefics2") -> None:
+         attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
+         print(f"Using {attn_implementation} for attention implementation")
+         self.model = AutoModelForVision2Seq.from_pretrained(model_path, device_map="auto", torch_dtype=torch.float16, _attn_implementation=attn_implementation).eval()
+         self.processor = AutoProcessor.from_pretrained(model_path)
+
+     def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
+         if not isinstance(image_links, list):
+             image_links = [image_links]
+         messages = [
+             {
+                 "role": "user",
+                 "content": [{"type": "image"}] * len(image_links) + [{"type": "text", "text": text_prompt}]
+             }
+         ]
+         prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
+         images = [load_image(image_link) for image_link in image_links]  # supports PIL images as well
+         inputs = self.processor(text=prompt, images=images, return_tensors="pt")
+         inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+         return inputs
+
+     def get_parsed_output(self, inputs):
+         generate_ids = self.model.generate(**inputs, max_new_tokens=512, num_beams=1)
+         generated_text = self.processor.batch_decode(generate_ids[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+         return generated_text
+
+
+ if __name__ == "__main__":
+     model = Mantis()
+     prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
+     res = model.get_parsed_output(prompt)
+     print("result : \n", res)
univa/eval/gedit/viescore/mllm_tools/minicpmv_eval.py ADDED
@@ -0,0 +1,42 @@
+ import os
+ import torch
+ import time
+ from PIL import Image
+ from typing import List
+ from transformers import AutoModel, AutoTokenizer
+ from transformers.utils import is_flash_attn_2_available
+
+ class MiniCPMV():
+     def __init__(self) -> None:
+         attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
+         self.model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True, torch_dtype=torch.float16, device_map='auto', _attn_implementation=attn_implementation).eval()
+         self.tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
+         print(f"Using {attn_implementation} for attention implementation")
+
+     def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
+         if not isinstance(image_links, list):
+             image_links = [image_links]
+         messages = [
+             {
+                 "role": "user",
+                 "content": [{"type": "image"}] * len(image_links) + [{"type": "text", "text": text_prompt}]
+             }
+         ]
+         return messages
+
+     def get_parsed_output(self, inputs):
+         res = self.model.chat(
+             image=None,
+             msgs=inputs,
+             tokenizer=self.tokenizer,
+             sampling=False,  # if sampling=False, beam_search will be used by default
+         )
+         return res
+
+ if __name__ == "__main__":
+     model = MiniCPMV()
+     prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
+     res = model.get_parsed_output(prompt)
+     print("result : \n", res)
univa/eval/gedit/viescore/mllm_tools/openai.py ADDED
@@ -0,0 +1,184 @@
+ import base64
+ import requests
+ from io import BytesIO, StringIO
+ from typing import Union, Optional, Tuple, List
+ from PIL import Image, ImageOps
+ import os
+
+ def get_api_key(file_path):
+     # Read the API key from the first line of the file
+     with open(file_path, 'r') as file:
+         return file.readline().strip()
+
+ # Function to encode the image
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+
+ def pick_next_item(current_item, item_list):
+     # Round-robin: return the item after `current_item`, wrapping around at the end
+     if current_item not in item_list:
+         raise ValueError("Current item is not in the list")
+     current_index = item_list.index(current_item)
+     next_index = (current_index + 1) % len(item_list)
+     return item_list[next_index]
+
+ # Function to encode a PIL image
+ def encode_pil_image(pil_image):
+     # Create an in-memory binary stream
+     image_stream = BytesIO()
+
+     # Save the PIL image to the binary stream in JPEG format (you can change the format if needed)
+     pil_image.save(image_stream, format='JPEG')
+
+     # Get the binary data from the stream and encode it as base64
+     image_data = image_stream.getvalue()
+     base64_image = base64.b64encode(image_data).decode('utf-8')
+     return base64_image
+
+
+ def load_image(image: Union[str, Image.Image], format: str = "RGB", size: Optional[Tuple] = None) -> Image.Image:
+     """
+     Load an image from a given path or URL and convert it to a PIL Image.
+
+     Args:
+         image (Union[str, Image.Image]): The image path, URL, or a PIL Image object to be loaded.
+         format (str, optional): Desired color format of the resulting image. Defaults to "RGB".
+         size (Optional[Tuple], optional): Desired size for resizing the image. Defaults to None.
+
+     Returns:
+         Image.Image: A PIL Image in the specified format and size.
+
+     Raises:
+         ValueError: If the provided image format is not recognized.
+     """
+     if isinstance(image, str):
+         if image.startswith("http://") or image.startswith("https://"):
+             image = Image.open(requests.get(image, stream=True).raw)
+         elif os.path.isfile(image):
+             image = Image.open(image)
+         else:
+             raise ValueError(
+                 f"Incorrect path or url, URLs must start with `http://` or `https://`, and {image} is not a valid path"
+             )
+     elif isinstance(image, Image.Image):
+         pass  # already a PIL image
+     else:
+         raise ValueError(
+             "Incorrect format used for image. Should be an url linking to an image, a local path, or a PIL image."
+         )
+     image = ImageOps.exif_transpose(image)
+     image = image.convert(format)
+     if size is not None:
+         image = image.resize(size, Image.LANCZOS)
+     return image
+
+ class GPT4v():
+     def __init__(self, api_key_path='keys/secret.env', are_images_encoded=False, model_name="gpt-4-vision-preview"):
+         """OpenAI GPT-4-vision model wrapper
+         Args:
+             api_key_path (str or list): Path(s) to the API key file(s). Defaults to 'keys/secret.env'.
+             are_images_encoded (bool): Whether the images are encoded in base64. Defaults to False.
+         """
+         self.multiple_api_keys = False
+         self.current_key_file = None
+         self.key_lists = None
+         if isinstance(api_key_path, list):
+             self.key_lists = api_key_path
+             self.current_key_file = api_key_path[0]
+             self.api_key = get_api_key(self.current_key_file)
+             self.multiple_api_keys = True
+         else:
+             self.api_key = get_api_key(api_key_path)
+
+         if not self.api_key:
+             print("API key not found.")
+             exit(1)
+
+         self.url = "https://api.openai.com/v1/chat/completions"
+         self.model_name = model_name
+         self.use_encode = are_images_encoded
+
+     def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
+         prompt_content = []
+         text_dict = {
+             "type": "text",
+             "text": text_prompt
+         }
+         prompt_content.append(text_dict)
+
+         if not isinstance(image_links, list):
+             image_links = [image_links]
+         for image_link in image_links:
+             image = load_image(image_link)
+             if self.use_encode:
+                 visual_dict = {
+                     "type": "image_url",
+                     "image_url": {"url": f"data:image/jpeg;base64,{encode_pil_image(image)}"}
+                 }
+             else:
+                 visual_dict = {
+                     "type": "image_url",
+                     "image_url": {"url": image_link}
+                 }
+             prompt_content.append(visual_dict)
+         return prompt_content
+
+     def get_parsed_output(self, prompt):
+         payload = {
+             "model": self.model_name,
+             "messages": [
+                 {
+                     "role": "user",
+                     "content": prompt
+                 }
+             ],
+             "max_tokens": 1400
+         }
+         headers = {
+             "Content-Type": "application/json",
+             "Authorization": f"Bearer {self.api_key}"
+         }
+         response = requests.post(self.url, json=payload, headers=headers)
+         return self.extract_response(response)
+
+     def extract_response(self, response):
+         response = response.json()
+         try:
+             return response['choices'][0]['message']['content']
+         except (KeyError, IndexError, TypeError):
+             if response['error']['code'] == 'content_policy_violation':
+                 print("Code is content_policy_violation")
+             elif response['error']['code'] in ('rate_limit_exceeded', 'insufficient_quota'):
+                 print(f"Code is {response['error']['code']}")
+                 print(response['error']['message'])
+                 if self.multiple_api_keys:
+                     new_key = pick_next_item(self.current_key_file, self.key_lists)
+                     self.update_key(new_key)
+                     self.current_key_file = new_key  # override key
+                     print("New key is from the file: ", new_key)
+             else:
+                 print("Code is different")
+                 print(response)
+             return ""
+
+     def update_key(self, key, load_from_file=True):
+         if load_from_file:
+             self.api_key = get_api_key(key)
+         else:
+             self.api_key = key
+
+ class GPT4o(GPT4v):
+     def __init__(self, api_key_path='keys/secret.env', are_images_encoded=False, model_name="gpt-4o-2024-05-13"):
+         super().__init__(api_key_path, are_images_encoded, model_name)
+
+ if __name__ == "__main__":
+     model = GPT4o('secret_t2.env', model_name="gpt-4.1")
+     prompt = model.prepare_prompt(['https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/DiffEdit/sample_34_1.jpg', 'https://chromaica.github.io/Museum/ImagenHub_Text-Guided_IE/input/sample_34_1.jpg'], 'What is difference between two images?')
+     print("prompt : \n", prompt)
+     res = model.get_parsed_output(prompt)
+     print("result : \n", res)
univa/eval/gedit/viescore/mllm_tools/qwen25vl_eval.py ADDED
@@ -0,0 +1,121 @@
+ import os
+ import torch
+ import time
+ from PIL import Image
+ from typing import List
+ from transformers import AutoModel, AutoTokenizer
+ from transformers.utils import is_flash_attn_2_available
+ from transformers import Qwen2_5_VLForConditionalGeneration
+ from qwen_vl_utils import process_vision_info
+ from transformers import AutoProcessor
+ import requests
+ from io import BytesIO
+ import random
+ import numpy as np
+ import base64
+ import magic
+ import megfile
+
+ def process_image(image):
+     # Serialize a PIL image to PNG bytes
+     img_byte_arr = BytesIO()
+     image.save(img_byte_arr, format='PNG')
+     img_byte_arr = img_byte_arr.getvalue()
+     return img_byte_arr
+
+ def convert_image_to_base64(file_content):
+     mime_type = magic.from_buffer(file_content, mime=True)
+     base64_encoded_data = base64.b64encode(file_content).decode('utf-8')
+     return f"data:{mime_type};base64,{base64_encoded_data}"
+
+
+ def set_seed(seed: int):
+     """
+     Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
+
+     Args:
+         seed (`int`): The seed to set.
+     """
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+
+ class Qwen25VL():
+     def __init__(self) -> None:
+         attn_implementation = "flash_attention_2" if is_flash_attn_2_available() else None
+         self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+             "/mnt/jfs-test/pretrained_models/Qwen2.5-VL-72B-Instruct-AWQ",
+             torch_dtype=torch.float16,
+             device_map="auto"
+         ).eval()
+         self.processor = AutoProcessor.from_pretrained("/mnt/jfs-test/pretrained_models/Qwen2.5-VL-72B-Instruct-AWQ")
+         print(f"Using {attn_implementation} for attention implementation")
+
+     def prepare_prompt(self, image_links: List = [], text_prompt: str = ""):
+         if not isinstance(image_links, list):
+             image_links = [image_links]
+
+         image_links_base64 = []
+         for img_link in image_links:
+             if isinstance(img_link, str):
+                 # fixed: the file handle must be opened as a PIL image before re-encoding
+                 pil_img = Image.open(megfile.smart_open(img_link, 'rb'))
+                 image_links_base64.append(convert_image_to_base64(process_image(pil_img)))
+             else:
+                 image_links_base64.append(convert_image_to_base64(process_image(img_link)))
+
+         messages = [
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image", "image": img_link} for img_link in image_links_base64
+                 ] + [{"type": "text", "text": text_prompt}]
+             }
+         ]
+         return messages
+
+     def get_parsed_output(self, messages):
+         set_seed(42)
+         # Prepare the inputs
+         text = self.processor.apply_chat_template(
+             messages, tokenize=False, add_generation_prompt=True
+         )
+         image_inputs, video_inputs = process_vision_info(messages)
+
+         # Process inputs
+         inputs = self.processor(
+             text=[text],
+             images=image_inputs,
+             videos=video_inputs,
+             padding=True,
+             return_tensors="pt"
+         )
+         inputs = inputs.to("cuda")
+
+         # Generate output (greedy decoding; temperature/top_p are ignored when do_sample=False)
+         generation_config = {
+             "max_new_tokens": 512,
+             "num_beams": 1,
+             "do_sample": False,
+             "temperature": 0.1,
+             "top_p": None,
+         }
+         generated_ids = self.model.generate(**inputs, **generation_config)
+         generated_ids_trimmed = [
+             out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         output_text = self.processor.batch_decode(
+             generated_ids_trimmed,
+             skip_special_tokens=True,
+             clean_up_tokenization_spaces=False
+         )
+         return output_text[0] if output_text else ""
+
+ if __name__ == "__main__":
+     model = Qwen25VL()
+     prompt = model.prepare_prompt(
+         ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"],
+         'Describe the image in detail.'
+     )
+     res = model.get_parsed_output(prompt)
+     print("result : \n", res)
univa/eval/gedit/viescore/mllm_tools/utils.py ADDED
@@ -0,0 +1,65 @@
+ from typing import List
+ import base64
+ from io import BytesIO
+ from PIL import Image
+ import requests
+
+ def pil_image_to_base64(pil_image, format="PNG"):
+     buffered = BytesIO()
+     pil_image.save(buffered, format=format)  # Save image to the buffer in the specified format
+     img_str = base64.b64encode(buffered.getvalue()).decode('utf-8')  # Encode the buffer's content to base64
+     return img_str
+
+ def load_image(image_file):
+     if image_file.startswith("http"):
+         response = requests.get(image_file)
+         image = Image.open(BytesIO(response.content)).convert("RGB")
+     else:
+         image = Image.open(image_file).convert("RGB")
+     return image
+
+
+ def load_images(image_files):
+     out = []
+     for image_file in image_files:
+         image = load_image(image_file)
+         out.append(image)
+     return out
+
+ def merge_images(image_links: List = []):
+     """Merge multiple images into one image, laid out side by side.
+
+     Args:
+         image_links (List, optional): List of image links. Defaults to [].
+
+     Returns:
+         Image.Image: The merged image (the single image for one input, None for no inputs).
+     """
+     if len(image_links) == 0:
+         return None
+     images = load_images(image_links)
+     if len(images) == 1:
+         return images[0]
+     widths, heights = zip(*(i.size for i in images))
+     average_height = sum(heights) // len(heights)
+     for i, im in enumerate(images):
+         # Scale each image in proportion to a common (average) height
+         images[i] = im.resize((int(im.size[0] * average_height / im.size[1]), average_height))
+     widths, heights = zip(*(i.size for i in images))
+     total_width = sum(widths)
+     max_height = max(heights)
+     new_im = Image.new("RGB", (total_width + 10 * (len(images) - 1), max_height))
+     x_offset = 0
+     for i, im in enumerate(images):
+         if i > 0:
+             # Paste a 10px separator column: 1px black, 8px white, 1px black
+             new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0))
+             x_offset += 1
+             new_im.paste(Image.new("RGB", (8, max_height), (255, 255, 255)), (x_offset, 0))
+             x_offset += 8
+             new_im.paste(Image.new("RGB", (1, max_height), (0, 0, 0)), (x_offset, 0))
+             x_offset += 1
+         new_im.paste(im, (x_offset, 0))
+         x_offset += im.size[0]
+     return new_im
@@ -0,0 +1,20 @@
 
+ import os
+
+ def create_python_file_with_texts(folder_path, output_file):
+     with open(output_file, 'w', encoding='utf-8') as out_file:
+         out_file.write("# This file is generated automatically through parse_prompt.py\n\n")
+         for root, dirs, files in os.walk(folder_path):
+             for file in files:
+                 if file.endswith(".txt"):
+                     file_path = os.path.join(root, file)
+                     var_name = "_" + file_path.replace(folder_path, "").replace(os.sep, "_").replace(".txt", "").strip("_")
+                     with open(file_path, 'r', encoding='utf-8') as f:
+                         content = f.read().replace('"""', '\"\"\"')
+                     out_file.write(f'{var_name} = """{content}"""\n\n')
+
+ # Example usage
+ current_file_path = os.path.abspath(__file__)
+ current_folder_path = os.path.dirname(current_file_path)
+ folder_path = os.path.join(current_folder_path, "prompts_raw")
+ output_file = os.path.join(current_folder_path, "vie_prompts.py")
+ create_python_file_with_texts(folder_path, output_file)
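The generator above turns every .txt file under prompts_raw into a module-level string in vie_prompts.py, named after its path relative to prompts_raw; this is how viescore/__init__.py can reference names like vie_prompts._prompts_0shot_rule_PQ. Roughly, assuming a file prompts_raw/prompts_0shot/rule_PQ.txt exists, the generated module would contain something like this (illustrative shape only, not the actual prompt text):

    # vie_prompts.py (generated)
    _prompts_0shot_rule_PQ = """...contents of rule_PQ.txt..."""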
univa/eval/gedit/viescore/utils.py ADDED
@@ -0,0 +1,362 @@
+ import os
+ from typing import Union, List, Optional
+ import json
+ import regex as re
+ import ast
+ import random
+
+ def fix_json(input_str):
+     # Add double quotes around keys using regex
+     fixed_str = re.sub(r'(\w+):', r'"\1":', input_str)
+
+     # Add double quotes around string values if necessary and wrap int/float values in []
+     def format_value(match):
+         key, value, comma = match.groups()
+         value = value.strip()
+         # Check if value is an integer or float
+         if re.match(r'^-?\d+(\.\d+)?$', value):
+             value = f'[{value}]'
+         # Check if value is a boolean or null
+         elif re.match(r'^(true|false|null)$', value, re.IGNORECASE):
+             pass  # leave as is
+         else:
+             # Add quotes around string values
+             value = f'"{value}"'
+         return f'{key}: {value}{comma}'
+
+     fixed_str = re.sub(r'(".*?"):(.*?)(,|})', format_value, fixed_str)
+
+     return fixed_str
+
+ def read_file_to_string(file_path):
+     """
+     Reads the contents of a text file and returns it as a string.
+
+     :param file_path: The path to the text file.
+     :return: A string containing the contents of the file.
+     """
+     try:
+         with open(file_path, 'r', encoding='utf-8') as file:
+             return file.read()
+     except FileNotFoundError:
+         print(f"The file {file_path} was not found.")
+         return None
+     except Exception as e:
+         print(f"An error occurred: {e}")
+         return None
+
+ def read_files_to_string(file_paths):
+     """
+     Reads the contents of multiple text files and returns them as a single string,
+     with each file's contents separated by a newline.
+
+     :param file_paths: A list of paths to text files.
+     :return: A string containing the concatenated contents of the files.
+     """
+     all_contents = []  # List to hold the contents of each file
+
+     for file_path in file_paths:
+         try:
+             with open(file_path, 'r', encoding='utf-8') as file:
+                 all_contents.append(file.read())
+         except FileNotFoundError:
+             print(f"The file {file_path} was not found.")
+         except Exception as e:
+             print(f"An error occurred while reading {file_path}: {e}")
+
+     # Join all the contents with a newline character
+     return "\n".join(all_contents)
+
+ def get_file_path(filename: Union[str, os.PathLike], search_from: Union[str, os.PathLike] = "."):
+     """
+     Search for a file across a directory and return its absolute path.
+
+     Args:
+         filename (Union[str, os.PathLike]): The name of the file to search for.
+         search_from (Union[str, os.PathLike], optional): The directory from which to start the search. Defaults to ".".
+
+     Returns:
+         str: Absolute path to the found file.
+
+     Raises:
+         FileNotFoundError: If the file is not found.
+     """
+     for root, dirs, files in os.walk(search_from):
+         for name in files:
+             if name == filename:
+                 return os.path.abspath(os.path.join(root, name))
+     raise FileNotFoundError(filename, "not found.")
+
+
+ #+=========================================================================================
+ def verify(s, target_sequence):
+     # Count the occurrences of the target sequence
+     count = s.count(target_sequence)
+
+     # Check if the target sequence appears exactly twice
+     return count == 2
+
+
+ def is_int_between_0_and_10(s):
+     try:
+         num = int(s)
+         return 0 <= num <= 10
+     except ValueError:
+         return False
+
+ def is_str_a_list_of_ints_0_to_10(s):
+     try:
+         # Attempt to parse the string as a Python literal (list, dict, etc.)
+         parsed = ast.literal_eval(s)
+
+         # Check if the parsed object is a list
+         if not isinstance(parsed, list):
+             return False
+
+         # Check if all elements are integers and between 0 to 10
+         return all(isinstance(item, int) and 0 <= item <= 10 for item in parsed)
+
+     except (ValueError, SyntaxError):
+         # If parsing fails or any other error occurs
+         return False
+
+ def is_str_valid_score_format_brackets(s):
+     try:
+         # Removing brackets and splitting the string by commas
+         content = s.strip("[]").split(',')
+
+         length = len(content)
+
+         # Parsing each element and checking the format and range
+         scores = {}
+         for item in content:
+             key, value = item.split(':')
+             key = key.strip()
+             value = int(value.strip())
+
+             # Check if the key starts with 'score' and the value is in the correct range
+             if not key.startswith("score") or not 0 <= value <= 10:
+                 return False
+
+             scores[key] = value
+
+         fetch_words = [f"score{i+1}" for i in range(length)]
+         # Check that every expected scoreN key is present
+         return all(key in scores for key in fetch_words)
+
+     except (ValueError, SyntaxError):
+         # If any parsing error occurs
+         return False
+
+
+ #+=========================================================================================
+ def mllm_output_to_dict(input_string, give_up_parsing=False):
+     """
+     Args:
+         input_string (str): the raw output of the MLLM model to be parsed
+         give_up_parsing (bool): if True, guess a random score when the JSON cannot be found
+     """
+     # Catch for gpt4v rate_limit_exceeded error
+     if input_string == "rate_limit_exceeded":
+         return "rate_limit_exceeded"
+
+     # Define the delimiters
+     delimiter = '||V^=^V||'
+
+     if input_string.count(delimiter) == 2:
+         if not verify(input_string, delimiter):
+             print("The required delimiters were not found correctly in the string.")
+             return False
+         # Extract the content between the delimiters
+         start_index = input_string.find(delimiter) + len(delimiter)
+         end_index = input_string.rfind(delimiter)
+     else:
+         # Find the JSON manually:
+         # some MLLMs tend not to output the delimiters, but they do output the JSON contents,
+         # so we locate the JSON content manually
+         start_index = input_string.find('{')
+         end_index = input_string.rfind('}') + 1
+         if start_index == -1 or end_index == 0:
+             # JSON not found; some MLLMs output only a list of scores like [6, 0],
+             # in which case we just take the scores and skip the reasoning (the rest of the JSON)
+             start_index = input_string.find('[')
+             end_index = input_string.rfind(']') + 1
+             if give_up_parsing:  # if we want to give up parsing
+                 guessed_value = random.randint(0, 10)
+                 print(f"Failed to find the json content in the string. Guess a value : {guessed_value}.")
+                 json_content = {'score': [guessed_value], "reasoning": f"guess_if_cannot_parse | {input_string}"}
+                 json_str = json.dumps(json_content)
+                 input_string = json_str
+                 start_index = 0
+                 end_index = len(json_str)
+             elif re.match(r'^\[\d+, ?\d+\]$', input_string[start_index:end_index]):
+                 scores = json.loads(input_string[start_index:end_index])
+                 if not isinstance(scores, list):
+                     scores = [scores]
+                 json_content = {'score': scores, "reasoning": "System: output is simply a list of scores"}
+                 json_str = json.dumps(json_content)
+                 input_string = json_str
+                 start_index = 0
+                 end_index = len(json_str)
+             elif is_int_between_0_and_10(input_string):  # if output is simply a number
+                 scores = [int(input_string)]
+                 json_content = {'score': scores, "reasoning": "System: output is simply a number"}
+                 json_str = json.dumps(json_content)
+                 input_string = json_str
+                 start_index = 0
+                 end_index = len(json_str)
+             else:
+                 print("Failed to find the json content in the string.")
+                 return False
+
+     # Check if we found two delimiters
+     if start_index != -1 and end_index != -1 and start_index != end_index:
+         # Extract the JSON string
+         json_str = input_string[start_index:end_index].strip()
+         json_str = json_str.replace("\n", "")
+         # Parse the JSON string into a dictionary
+         try:
+             new_data = json.loads(json_str)
+             if not isinstance(new_data['score'], list):
+                 new_data['score'] = [new_data['score']]
+         except:
+             print("Now fixing: ", json_str)
+             try:
+                 new_data = json.loads(fix_json(json_str))
+                 return new_data
+             except:
+                 print("Error: Cannot fix", json_str)
+                 return False
+         return new_data
+     else:
+         print("The required delimiters were not found correctly in the string.")
235
+ return False
236
+
237
+ def write_entry_to_json_file(input_string, uid, prompt_input, vision_input, output_file_name, give_up_parsing=False):
238
+ """
239
+ Args:
240
+ input_string (str): actually the output of the mllm model to be parsed
241
+ uid (str): The unique identifier for the each item in the test data
242
+ prompt_input (str): The prompt input for the entry. text prompt.
243
+ vision_input (str): The vision input for the entry. image links.
244
+ output_file_name (str): The name of the output file.
245
+ """
246
+ # Catch for gpt4v rate_limit_exceeded error
247
+ if input_string == "rate_limit_exceeded":
248
+ return "rate_limit_exceeded"
249
+
250
+ # Define the delimiters
251
+ delimiter = '||V^=^V||'
252
+
253
+ if input_string.count(delimiter) == 2:
254
+ if not verify(input_string, delimiter):
255
+ print("The required delimiters were not found correctly in the string.")
256
+ return False
257
+ # Extract the content between the delimiters
258
+ start_index = input_string.find(delimiter) + len(delimiter)
259
+ end_index = input_string.rfind(delimiter)
260
+ else:
261
+ # find the json mannually
262
+ # some mllm tends not to output the delimiters, but it does output the json contents
263
+ # so we will find the json content mannually
264
+ start_index = input_string.find('{')
265
+ end_index = input_string.rfind('}') + 1
266
+ if start_index == -1 or end_index == 0:
267
+ # json not found
268
+ # some mllm tends to output only a list of scores like [6, 0],
269
+ # this time we will just get the scores and ignore the reasoning (other part of the json)
270
+ start_index = input_string.find('[')
271
+ end_index = input_string.rfind(']') + 1
272
+ if give_up_parsing: # if we want to give up parsing
273
+ guessed_value = random.randint(0, 10)
274
+ print(f"Failed to find the json content in the string. Guess a value : {guessed_value}.")
275
+ json_content = {'score': [guessed_value], "reasoning": f"guess_if_cannot_parse | {input_string}"}
276
+ json_str = json.dumps(json_content)
277
+ input_string = json_str
278
+ start_index = 0
279
+ end_index = len(json_str)
280
+ elif re.match(r'^\[\d+, ?\d+\]$', input_string[start_index:end_index]):
281
+ scores = json.loads(input_string[start_index:end_index])
282
+ json_content = {'score': scores, "reasoning": None}
283
+ json_str = json.dumps(json_content)
284
+ input_string = json_str
285
+ start_index = 0
286
+ end_index = len(json_str)
287
+ elif is_int_between_0_and_10(input_string): # if output is simply a number
288
+ scores = [int(input_string)]
289
+ json_content = {'score': scores, "reasoning": None}
290
+ json_str = json.dumps(json_content)
291
+ input_string = json_str
292
+ start_index = 0
293
+ end_index = len(json_str)
294
+ else:
295
+ print("Failed to find the json content in the string.")
296
+ return False
297
+
298
+ # Check if we found two delimiters
299
+ if start_index != -1 and end_index != -1 and start_index != end_index:
300
+ # Extract the JSON string
301
+ json_str = input_string[start_index:end_index].strip()
302
+ json_str = json_str.replace("\n", "")
303
+ try:
304
+ # Parse the JSON string into a dictionary
305
+ new_data = json.loads(json_str)
306
+
307
+ # Ensure the directory exists
308
+ os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
309
+
310
+ # Initialize or load existing data
311
+ if os.path.exists(output_file_name):
312
+ with open(output_file_name, 'r') as json_file:
313
+ data = json.load(json_file)
314
+ else:
315
+ data = {}
316
+
317
+ # If the additional key is already in the data, add or update notes
318
+ if uid in data:
319
+ data[uid].update(new_data) # Update with new data
320
+ if prompt_input: # If there are new notes, update or add them
321
+ data[uid]['prompt_input'] = prompt_input
322
+ if vision_input: # If there are new notes, update or add them
323
+ data[uid]['vision_input'] = vision_input
324
+ else:
325
+ # If it's a new key, add the entry to the dictionary
326
+ data[uid] = new_data
327
+ if prompt_input:
328
+ data[uid]['prompt_input'] = prompt_input
329
+ if vision_input:
330
+ data[uid]['vision_input'] = vision_input
331
+
332
+ # Write the updated data to the file
333
+ with open(output_file_name, 'w') as json_file:
334
+ json.dump(data, json_file, indent=4)
335
+
336
+ print(f"Data was successfully updated in {output_file_name}")
337
+ return True
338
+ except json.JSONDecodeError as e:
339
+ print(f"An error occurred while parsing the JSON content: {e}")
340
+ return False
341
+ else:
342
+ print("The required delimiters were not found correctly in the string.")
343
+ return False
344
+
345
+
346
+ def check_key_in_json(file_path, key):
347
+ try:
348
+ with open(file_path, 'r') as json_file:
349
+ data = json.load(json_file)
350
+
351
+ # Check if the key exists at the top level of the JSON structure
352
+ if key in data:
353
+ return True
354
+ else:
355
+ return False
356
+ except FileNotFoundError:
357
+ print(f"The file {file_path} was not found.")
358
+ except json.JSONDecodeError as e:
359
+ print(f"Error reading {file_path}: {e}")
360
+ except Exception as e:
361
+ print(f"An error occurred with {file_path}: {e}")
362
+ return False
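The parsers above accept three output shapes: a `||V^=^V||`-delimited JSON block, a bare JSON object embedded in prose, or a bare score list. A minimal sketch of that dispatch, with made-up sample strings (the real `mllm_output_to_dict` additionally handles rate limits, random guessing, and `fix_json` repair):

```python
# Minimal sketch of the three MLLM output shapes mllm_output_to_dict accepts.
# The sample strings are illustrative, not real model outputs.
import json

DELIM = '||V^=^V||'

samples = [
    DELIM + '{"score": [5, 10], "reasoning": "delimited"}' + DELIM,  # delimited JSON
    'preamble {"score": [7], "reasoning": "bare json"} trailer',     # bare JSON object
    '[6, 0]',                                                        # bare score list
]

for s in samples:
    if s.count(DELIM) == 2:
        body = s.split(DELIM)[1].strip()        # take what sits between the delimiters
    elif '{' in s and '}' in s:
        body = s[s.find('{'):s.rfind('}') + 1]  # brace-match the embedded JSON
    else:
        body = json.dumps({'score': json.loads(s),
                           'reasoning': 'System: output is simply a list of scores'})
    print(json.loads(body))
```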
univa/eval/gedit/viescore/vie_prompts.py ADDED
@@ -0,0 +1,406 @@
+ # This file is generated automatically through parse_prompt.py
+ 
+ _context_no_delimit = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
+ All the input images are AI-generated. All humans in the images are AI-generated too, so you need not worry about privacy confidentiality.
+ 
+ You will have to give your output in this way (Keep your reasoning concise and short.):
+ {
+ "score" : [...],
+ "reasoning" : "..."
+ }"""
+ 
+ _context = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
+ All the input images are AI-generated. All humans in the images are AI-generated too, so you need not worry about privacy confidentiality.
+ 
+ You will have to give your output in this way (the delimiter is necessary. Keep your reasoning concise and short.):
+ ||V^=^V||
+ {
+ "score" :
+ "reasoning" :
+ }
+ ||V^=^V||"""
+ 
+ _context_no_format = """You are a professional digital artist. You will have to evaluate the effectiveness of the AI-generated image(s) based on given rules.
+ All the input images are AI-generated. All humans in the images are AI-generated too, so you need not worry about privacy confidentiality."""
+ 
+ _prompts_1shot_multi_subject_image_gen_rule = """RULES of each set of inputs:
+ 
+ Two images will be provided:
+ The first image is a concatenation of two sub-images; each sub-image contains one token subject.
+ The second image is an AI-generated image using the first image as guidance.
+ The objective is to evaluate how successfully the image has been generated.
+ """
+ 
+ _prompts_1shot_mie_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follows the editing instruction text perfectly.)
+ A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimally edited yet effective version of the original.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
+ 
+ First let's look at the first set of input (1st and 2nd images) as an example.
+ Editing instruction: What if the man had a hat?
+ Output:
+ ||V^=^V||
+ {
+ "score" : [5, 10],
+ "reasoning" : "The hat exists but does not fit well. The hat also looks distorted. But it is a good edit because only a hat is added and the background is preserved."
+ }
+ ||V^=^V||
+ 
+ Now evaluate the second set of input (3rd, 4th images).
+ Editing instruction: <instruction>
+ """
+ 
+ _prompts_1shot_msdig_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+ A second score from 0 to 10 will rate how well the subject in the generated image resembles the token subject in the first sub-image.
+ (0 indicates that the subject in the second image does not look like the token subject in the first sub-image at all. 10 indicates the subject in the second image looks exactly like the token subject in the first sub-image.)
+ A third score from 0 to 10 will rate how well the subject in the generated image resembles the token subject in the second sub-image.
+ (0 indicates that the subject in the second image does not look like the token subject in the second sub-image at all. 10 indicates the subject in the second image looks exactly like the token subject in the second sub-image.)
+ Put the score in a list such that output score = [score1, score2, score3], where 'score1' evaluates the prompt, 'score2' evaluates the resemblance for the first sub-image, and 'score3' evaluates the resemblance for the second sub-image.
+ 
+ First let's look at the first set of input (1st and 2nd images) as an example.
+ Text Prompt: A digital illustration of a cat beside a wooden pot
+ Output:
+ ||V^=^V||
+ {
+ "score" : [5, 5, 10],
+ "reasoning" : "The cat is not beside the wooden pot. The pot partially resembles the subject pot. The cat closely resembles the subject cat."
+ }
+ ||V^=^V||
+ 
+ Now evaluate the second set of input (3rd, 4th images).
+ Text Prompt: <prompt>"""
+ 
+ _prompts_1shot_t2i_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the AI-generated image does not follow the prompt at all. 10 indicates the AI-generated image follows the prompt perfectly.)
+ 
+ Put the score in a list such that output score = [score].
+ 
+ First let's look at the first set of input (1st image) as an example.
+ Text Prompt: A pink and a white frisbee are on the ground.
+ Output:
+ ||V^=^V||
+ {
+ "score" : [5],
+ "reasoning" : "White frisbee not present in the image."
+ }
+ ||V^=^V||
+ 
+ Now evaluate the second set of input (2nd image).
+ Text Prompt: <prompt>
+ """
+ 
+ _prompts_1shot_tie_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follows the editing instruction text perfectly.)
+ A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimally edited yet effective version of the original.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
+ 
+ First let's look at the first set of input (1st and 2nd images) as an example.
+ Editing instruction: What if the man had a hat?
+ Output:
+ ||V^=^V||
+ {
+ "score" : [5, 10],
+ "reasoning" : "The hat exists but does not fit well. The hat also looks distorted. But it is a good edit because only a hat is added and the background is preserved."
+ }
+ ||V^=^V||
+ 
+ Now evaluate the second set of input (3rd, 4th images).
+ Editing instruction: <instruction>
+ """
+ 
+ _prompts_1shot_sdie_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will rate how well the subject in the generated image resembles the token subject in the second image.
+ (0 indicates that the subject in the third image does not look like the token subject at all. 10 indicates the subject in the third image looks exactly like the token subject.)
+ A second score from 0 to 10 will rate the degree of overediting in the edited image.
+ (0 indicates that the scene in the edited image is completely different from the first image. 10 indicates that the edited image can be recognized as a minimally edited yet effective version of the original.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the resemblance and 'score2' evaluates the degree of overediting.
+ 
+ First let's look at the first set of input (1st, 2nd and 3rd images) as an example.
+ Subject: <subject>
+ Output:
+ ||V^=^V||
+ {
+ "score" : [5, 10],
+ "reasoning" : "The monster toy partially resembles the token subject. The edit is minimal."
+ }
+ ||V^=^V||
+ 
+ Now evaluate the second set of input (4th, 5th, and 6th images).
+ Subject: <subject>
+ """
+ 
+ _prompts_1shot_one_image_gen_rule = """RULES of each set of inputs:
+ 
+ One image will be provided; the image is an AI-generated image.
+ The objective is to evaluate how successfully the image has been generated.
+ """
+ 
+ _prompts_1shot_sdig_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+ A second score from 0 to 10 will rate how well the subject in the generated image resembles the token subject in the first image.
+ (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image looks exactly like the token subject.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance.
+ 
+ First let's look at the first set of input (1st and 2nd images) as an example.
+ Text Prompt: a red cartoon figure eating a banana
+ Output:
+ ||V^=^V||
+ {
+ "score" : [10, 5],
+ "reasoning" : "The red cartoon figure is eating a banana. The red cartoon figure partially resembles the subject."
+ }
+ ||V^=^V||
+ 
+ Now evaluate the second set of input (3rd, 4th images).
+ Text Prompt: <prompt>
+ """
+ 
+ _prompts_1shot_rule_PQ = """RULES of each set of inputs:
+ 
+ One image will be provided; the image is an AI-generated image.
+ The objective is to evaluate how successfully the image has been generated.
+ 
+ On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on image naturalness.
+ (
+ 0 indicates that the scene in the image does not look natural at all or gives an unnatural feeling, such as a wrong sense of distance, wrong shadows, or wrong lighting.
+ 10 indicates that the image looks natural.
+ )
+ A second score from 0 to 10 will rate the image artifacts.
+ (
+ 0 indicates that the image contains a large portion of distortion, watermarks, scratches, blurred faces, unusual body parts, or subjects that are not harmonized.
+ 10 indicates the image has no artifacts.
+ )
+ Put the score in a list such that output score = [naturalness, artifacts]
+ 
+ 
+ First let's look at the first set of input (1st image) as an example.
+ Output:
+ ||V^=^V||
+ {
+ "score" : [5, 5],
+ "reasoning" : "The image gives an unnatural feeling around the hands of the girl. There is also minor distortion on the eyes of the girl."
+ }
+ ||V^=^V||
+ 
+ Now evaluate the second set of input (2nd image).
+ 
+ """
+ 
+ _prompts_1shot_subject_image_gen_rule = """RULES of each set of inputs:
+ 
+ Two images will be provided: the first being a token subject image and the second being an AI-generated image using the first image as guidance.
+ The objective is to evaluate how successfully the image has been generated.
+ """
+ 
+ _prompts_1shot_cig_rule_SC = """
+ On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+ A second score from 0 to 10 will rate how well the generated image is following the guidance image.
+ (0 indicates that the second image is not following the guidance at all. 10 indicates that the second image is following the guidance image.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the guidance.
+ 
+ First let's look at the first set of input (1st and 2nd images) as an example.
+ Text Prompt: the bridge is red, Golden Gate Bridge in San Francisco, USA
+ Output:
+ ||V^=^V||
+ {
+ "score" : [5, 5],
+ "reasoning" : "The bridge is red. But half of the bridge is gone."
+ }
+ ||V^=^V||
+ 
+ Now evaluate the second set of input (3rd, 4th images).
+ Text Prompt: <prompt>
+ """
+ 
+ _prompts_1shot_two_image_edit_rule = """RULES of each set of inputs:
+ 
+ Two images will be provided: the first being the original AI-generated image and the second being an edited version of the first.
+ The objective is to evaluate how successfully the editing instruction has been executed in the second image.
+ 
+ Note that sometimes the two images might look identical due to a failed image edit.
+ """
+ 
+ _prompts_1shot_subject_image_edit_rule = """RULES of each set of inputs:
+ 
+ Three images will be provided:
+ The first image is an input image to be edited.
+ The second image is a token subject image.
+ The third image is an AI-edited image derived from the first image. It should contain a subject that looks like the subject in the second image.
+ The objective is to evaluate how successfully the image has been edited.
+ """
+ 
+ _prompts_1shot_control_image_gen_rule = """RULES of each set of inputs:
+ 
+ Two images will be provided: the first being a processed image (e.g. Canny edges, openpose, grayscale, etc.) and the second being an AI-generated image using the first image as guidance.
+ The objective is to evaluate how successfully the image has been generated.
+ """
+ 
+ _prompts_0shot_two_image_edit_rule = """RULES:
+ 
+ Two images will be provided: the first being the original AI-generated image and the second being an edited version of the first.
+ The objective is to evaluate how successfully the editing instruction has been executed in the second image.
+ 
+ Note that sometimes the two images might look identical due to a failed image edit.
+ """
+ 
+ _prompts_0shot_one_video_gen_rule = """RULES:
+ 
+ The images are extracted from an AI-generated video created according to the text prompt.
+ The objective is to evaluate how successfully the video has been generated.
+ """
+ 
+ _prompts_0shot_t2v_rule_PQ = """RULES:
+ 
+ The image frames are AI-generated.
+ The objective is to evaluate how successfully the image frames have been generated.
+ 
+ On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the naturalness of the image frames.
+ (
+ 0 indicates that the scene in the image frames does not look natural at all or gives an unnatural feeling, such as a wrong sense of distance, wrong shadows, or wrong lighting.
+ 10 indicates that the image frames look natural.
+ )
+ A second score from 0 to 10 will rate the artifacts in the image frames.
+ (
+ 0 indicates that the image frames contain a large portion of distortion, watermarks, scratches, blurred faces, unusual body parts, or subjects that are not harmonized.
+ 10 indicates the image frames have no artifacts.
+ )
+ Put the score in a list such that output score = [naturalness, artifacts]
+ """
+ 
+ _prompts_0shot_msdig_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+ A second score from 0 to 10 will rate how well the subject in the generated image resembles the token subject in the first sub-image.
+ (0 indicates that the subject in the second image does not look like the token subject in the first sub-image at all. 10 indicates the subject in the second image looks exactly like the token subject in the first sub-image.)
+ A third score from 0 to 10 will rate how well the subject in the generated image resembles the token subject in the second sub-image.
+ (0 indicates that the subject in the second image does not look like the token subject in the second sub-image at all. 10 indicates the subject in the second image looks exactly like the token subject in the second sub-image.)
+ Put the score in a list such that output score = [score1, score2, score3], where 'score1' evaluates the prompt, 'score2' evaluates the resemblance for the first sub-image, and 'score3' evaluates the resemblance for the second sub-image.
+ 
+ Text Prompt: <prompt>
+ """
+ 
+ _prompts_0shot_sdie_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will rate how well the subject in the generated image resembles the token subject in the second image.
+ (0 indicates that the subject in the third image does not look like the token subject at all. 10 indicates the subject in the third image looks exactly like the token subject.)
+ A second score from 0 to 10 will rate the degree of overediting in the edited image.
+ (0 indicates that the scene in the edited image is completely different from the first image. 10 indicates that the edited image can be recognized as a minimally edited yet effective version of the original.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the resemblance and 'score2' evaluates the degree of overediting.
+ 
+ Subject: <subject>"""
+ 
+ _prompts_0shot_subject_image_edit_rule = """RULES:
+ 
+ Three images will be provided:
+ The first image is an input image to be edited.
+ The second image is a token subject image.
+ The third image is an AI-edited image derived from the first image. It should contain a subject that looks like the subject in the second image.
+ The objective is to evaluate how successfully the image has been edited.
+ """
+ 
+ _prompts_0shot_mie_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follows the editing instruction text perfectly.)
+ A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimally edited yet effective version of the original.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
+ 
+ Editing instruction: <instruction>
+ """
+ 
+ _prompts_0shot_sdig_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+ A second score from 0 to 10 will rate how well the subject in the generated image resembles the token subject in the first image.
+ (0 indicates that the subject in the second image does not look like the token subject at all. 10 indicates the subject in the second image looks exactly like the token subject.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the resemblance.
+ 
+ Text Prompt: <prompt>
+ """
+ 
+ _prompts_0shot_tie_rule_SC = """
+ On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success of the editing. (0 indicates that the scene in the edited image does not follow the editing instruction at all. 10 indicates that the scene in the edited image follows the editing instruction text perfectly.)
+ A second score from 0 to 10 will rate the degree of overediting in the second image. (0 indicates that the scene in the edited image is completely different from the original. 10 indicates that the edited image can be recognized as a minimally edited yet effective version of the original.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the editing success and 'score2' evaluates the degree of overediting.
+ 
+ Editing instruction: <instruction>
+ """
+ 
+ _prompts_0shot_t2i_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the AI-generated image does not follow the prompt at all. 10 indicates the AI-generated image follows the prompt perfectly.)
+ 
+ Put the score in a list such that output score = [score].
+ 
+ Text Prompt: <prompt>
+ """
+ 
+ _prompts_0shot_cig_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the second image does not follow the prompt at all. 10 indicates the second image follows the prompt perfectly.)
+ A second score from 0 to 10 will rate how well the generated image is following the guidance image.
+ (0 indicates that the second image is not following the guidance at all. 10 indicates that the second image is following the guidance image.)
+ Put the score in a list such that output score = [score1, score2], where 'score1' evaluates the prompt and 'score2' evaluates the guidance.
+ 
+ Text Prompt: <prompt>"""
+ 
+ _prompts_0shot_control_image_gen_rule = """RULES:
+ 
+ Two images will be provided: the first being a processed image (e.g. Canny edges, openpose, grayscale, etc.) and the second being an AI-generated image using the first image as guidance.
+ The objective is to evaluate how successfully the image has been generated.
+ """
+ 
+ _prompts_0shot_rule_PQ = """RULES:
+ 
+ The image is an AI-generated image.
+ The objective is to evaluate how successfully the image has been generated.
+ 
+ On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on image naturalness.
+ (
+ 0 indicates that the scene in the image does not look natural at all or gives an unnatural feeling, such as a wrong sense of distance, wrong shadows, or wrong lighting.
+ 10 indicates that the image looks natural.
+ )
+ A second score from 0 to 10 will rate the image artifacts.
+ (
+ 0 indicates that the image contains a large portion of distortion, watermarks, scratches, blurred faces, unusual body parts, or subjects that are not harmonized.
+ 10 indicates the image has no artifacts.
+ )
+ Put the score in a list such that output score = [naturalness, artifacts]
+ """
+ 
+ _prompts_0shot_t2v_rule_SC = """On a scale from 0 to 10:
+ A score from 0 to 10 will be given based on the success in following the prompt.
+ (0 indicates that the image frames do not follow the prompt at all. 10 indicates the image frames follow the prompt perfectly.)
+ 
+ Put the score in a list such that output score = [score].
+ 
+ Text Prompt: <prompt>
+ """
+ 
+ _prompts_0shot_multi_subject_image_gen_rule = """RULES:
+ 
+ Two images will be provided:
+ The first image is a concatenation of two sub-images; each sub-image contains one token subject.
+ The second image is an AI-generated image using the first image as guidance.
+ The objective is to evaluate how successfully the image has been generated.
+ """
+ 
+ _prompts_0shot_subject_image_gen_rule = """RULES:
+ 
+ Two images will be provided: the first being a token subject image and the second being an AI-generated image using the first image as guidance.
+ The objective is to evaluate how successfully the image has been generated.
+ """
+ 
+ _prompts_0shot_one_image_gen_rule = """RULES:
+ 
+ The image is an AI-generated image created according to the text prompt.
+ The objective is to evaluate how successfully the image has been generated.
+ """
+ 
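The `<prompt>`, `<instruction>`, and `<subject>` placeholders above are substituted by the caller before the scoring model is queried. A hedged sketch of that assembly for the 0-shot text-guided-editing case (the exact composition lives in `viescore/__init__.py` and may differ):

```python
# Hypothetical assembly of a 0-shot text-guided-editing (TIE) scoring prompt.
# `vie_prompts` is the module defined above; only the placeholder substitution
# is shown here, not the real call path.
import vie_prompts

instruction = "What if the man had a hat?"  # example editing instruction
prompt = "\n".join([
    vie_prompts._context,                            # output-format contract
    vie_prompts._prompts_0shot_two_image_edit_rule,  # task rules
    vie_prompts._prompts_0shot_tie_rule_SC.replace("<instruction>", instruction),
])
print(prompt)
```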
univa/eval/genai/README.md ADDED
@@ -0,0 +1,47 @@
+ 
+ The original code is from [GenAI-Bench](https://github.com/linzhiqiu/t2v_metrics).
+ 
+ 
+ ## Requirements and Installation
+ 
+ ```
+ pip install git+https://github.com/openai/CLIP.git
+ pip install open-clip-torch
+ ```
+ 
+ 
+ ## Eval
+ 
+ ### Generate samples
+ 
+ We also support `genai1600`; just replace `genai527.yaml` with `genai1600.yaml` and change `$OUTPUT_DIR`.
+ 
+ ```bash
+ # switch to univa env
+ MODEL_PATH='path/to/model'
+ OUTPUT_DIR='path/to/eval_output/genai527'
+ CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 torchrun \
+     --nproc_per_node 8 \
+     -m step1_gen_samples \
+     genai527.yaml \
+     --pretrained_lvlm_name_or_path ${MODEL_PATH} \
+     --output_dir ${OUTPUT_DIR}
+ ```
+ 
+ ### Evaluation & Summary
+ 
+ Download [zhiqiulin/clip-flant5-xxl](https://huggingface.co/zhiqiulin/clip-flant5-xxl) to `$T5_PATH`.
+ Download [openai/clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336) to `$VISION_TOWER`.
+ 
+ ```bash
+ # switch to univa env
+ META_DIR="eval_prompts/genai527"
+ IMAGE_DIR=${OUTPUT_DIR}
+ CUDA_VISIBLE_DEVICES=4 VISION_TOWER=${VISION_TOWER} python -m step2_run_model \
+     --model_path ${T5_PATH} \
+     --image_dir ${IMAGE_DIR} \
+     --meta_dir ${META_DIR} > ${IMAGE_DIR}.txt
+ cat ${IMAGE_DIR}.txt
+ ```
+ 
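The `genai_skills.json` files referenced by `META_DIR` group prompt indices into `basic` and `advanced` tiers (see the genai1600 variant added below); presumably these index into `genai_image.json` so scores can be summarized per tier. A small sketch of reading that split (the per-tier indexing is my assumption, not confirmed by this diff):

```python
# Load the skill split; assumes genai_skills.json maps tier name -> list of ints.
import json

with open("eval_prompts/genai527/genai_skills.json") as f:
    skills = json.load(f)

basic, advanced = set(skills["basic"]), set(skills["advanced"])
print(f"{len(basic)} basic prompts, {len(advanced)} advanced prompts")
```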
univa/eval/genai/__init__.py ADDED
File without changes
univa/eval/genai/eval_prompts/genai1600/genai_image.json ADDED
The diff for this file is too large to render. See raw diff
 
univa/eval/genai/eval_prompts/genai1600/genai_skills.json ADDED
@@ -0,0 +1,4872 @@
1
+ {
2
+ "basic": [
3
+ 0,
4
+ 1,
5
+ 2,
6
+ 3,
7
+ 4,
8
+ 5,
9
+ 6,
10
+ 7,
11
+ 8,
12
+ 9,
13
+ 11,
14
+ 12,
15
+ 13,
16
+ 14,
17
+ 15,
18
+ 16,
19
+ 17,
20
+ 18,
21
+ 19,
22
+ 20,
23
+ 21,
24
+ 22,
25
+ 23,
26
+ 24,
27
+ 25,
28
+ 26,
29
+ 27,
30
+ 28,
31
+ 29,
32
+ 30,
33
+ 31,
34
+ 32,
35
+ 33,
36
+ 34,
37
+ 35,
38
+ 36,
39
+ 37,
40
+ 38,
41
+ 39,
42
+ 40,
43
+ 41,
44
+ 42,
45
+ 43,
46
+ 44,
47
+ 45,
48
+ 46,
49
+ 47,
50
+ 48,
51
+ 49,
52
+ 50,
53
+ 51,
54
+ 52,
55
+ 53,
56
+ 54,
57
+ 55,
58
+ 56,
59
+ 57,
60
+ 58,
61
+ 59,
62
+ 60,
63
+ 61,
64
+ 62,
65
+ 63,
66
+ 64,
67
+ 65,
68
+ 66,
69
+ 67,
70
+ 68,
71
+ 69,
72
+ 70,
73
+ 71,
74
+ 72,
75
+ 73,
76
+ 74,
77
+ 75,
78
+ 76,
79
+ 77,
80
+ 78,
81
+ 79,
82
+ 80,
83
+ 81,
84
+ 82,
85
+ 83,
86
+ 84,
87
+ 85,
88
+ 86,
89
+ 87,
90
+ 88,
91
+ 89,
92
+ 90,
93
+ 91,
94
+ 92,
95
+ 93,
96
+ 94,
97
+ 95,
98
+ 96,
99
+ 97,
100
+ 98,
101
+ 99,
102
+ 100,
103
+ 101,
104
+ 118,
105
+ 119,
106
+ 120,
107
+ 121,
108
+ 125,
109
+ 126,
110
+ 127,
111
+ 129,
112
+ 130,
113
+ 131,
114
+ 132,
115
+ 218,
116
+ 219,
117
+ 220,
118
+ 221,
119
+ 222,
120
+ 223,
121
+ 224,
122
+ 225,
123
+ 226,
124
+ 227,
125
+ 228,
126
+ 229,
127
+ 230,
128
+ 231,
129
+ 232,
130
+ 233,
131
+ 234,
132
+ 235,
133
+ 236,
134
+ 237,
135
+ 269,
136
+ 270,
137
+ 271,
138
+ 272,
139
+ 273,
140
+ 274,
141
+ 275,
142
+ 276,
143
+ 277,
144
+ 278,
145
+ 279,
146
+ 280,
147
+ 281,
148
+ 282,
149
+ 283,
150
+ 284,
151
+ 285,
152
+ 286,
153
+ 287,
154
+ 288,
155
+ 289,
156
+ 290,
157
+ 291,
158
+ 300,
159
+ 301,
160
+ 302,
161
+ 303,
162
+ 304,
163
+ 305,
164
+ 306,
165
+ 307,
166
+ 308,
167
+ 309,
168
+ 310,
169
+ 311,
170
+ 312,
171
+ 313,
172
+ 314,
173
+ 315,
174
+ 316,
175
+ 317,
176
+ 318,
177
+ 319,
178
+ 320,
179
+ 321,
180
+ 322,
181
+ 323,
182
+ 324,
183
+ 325,
184
+ 326,
185
+ 327,
186
+ 328,
187
+ 329,
188
+ 330,
189
+ 331,
190
+ 332,
191
+ 333,
192
+ 334,
193
+ 335,
194
+ 336,
195
+ 337,
196
+ 338,
197
+ 339,
198
+ 340,
199
+ 341,
200
+ 342,
201
+ 343,
202
+ 344,
203
+ 345,
204
+ 346,
205
+ 347,
206
+ 348,
207
+ 352,
208
+ 353,
209
+ 354,
210
+ 355,
211
+ 382,
212
+ 386,
213
+ 387,
214
+ 403,
215
+ 527,
216
+ 528,
217
+ 529,
218
+ 530,
219
+ 531,
220
+ 532,
221
+ 533,
222
+ 534,
223
+ 535,
224
+ 536,
225
+ 537,
226
+ 538,
227
+ 539,
228
+ 540,
229
+ 541,
230
+ 542,
231
+ 543,
232
+ 544,
233
+ 545,
234
+ 546,
235
+ 547,
236
+ 548,
237
+ 549,
238
+ 550,
239
+ 551,
240
+ 552,
241
+ 553,
242
+ 554,
243
+ 555,
244
+ 556,
245
+ 557,
246
+ 558,
247
+ 559,
248
+ 560,
249
+ 561,
250
+ 562,
251
+ 563,
252
+ 564,
253
+ 565,
254
+ 566,
255
+ 567,
256
+ 568,
257
+ 569,
258
+ 570,
259
+ 571,
260
+ 572,
261
+ 573,
262
+ 574,
263
+ 575,
264
+ 576,
265
+ 577,
266
+ 578,
267
+ 579,
268
+ 580,
269
+ 581,
270
+ 582,
271
+ 583,
272
+ 584,
273
+ 585,
274
+ 586,
275
+ 587,
276
+ 588,
277
+ 589,
278
+ 590,
279
+ 591,
280
+ 592,
281
+ 593,
282
+ 594,
283
+ 595,
284
+ 596,
285
+ 597,
286
+ 598,
287
+ 599,
288
+ 602,
289
+ 603,
290
+ 604,
291
+ 606,
292
+ 607,
293
+ 608,
294
+ 609,
295
+ 610,
296
+ 611,
297
+ 612,
298
+ 613,
299
+ 614,
300
+ 615,
301
+ 616,
302
+ 617,
303
+ 618,
304
+ 619,
305
+ 620,
306
+ 621,
307
+ 622,
308
+ 623,
309
+ 624,
310
+ 625,
311
+ 626,
312
+ 627,
313
+ 628,
314
+ 629,
315
+ 630,
316
+ 631,
317
+ 632,
318
+ 633,
319
+ 634,
320
+ 635,
321
+ 636,
322
+ 637,
323
+ 638,
324
+ 639,
325
+ 640,
326
+ 641,
327
+ 642,
328
+ 643,
329
+ 644,
330
+ 645,
331
+ 646,
332
+ 647,
333
+ 648,
334
+ 649,
335
+ 650,
336
+ 651,
337
+ 652,
338
+ 653,
339
+ 654,
340
+ 655,
341
+ 656,
342
+ 657,
343
+ 658,
344
+ 659,
345
+ 660,
346
+ 661,
347
+ 662,
348
+ 663,
349
+ 664,
350
+ 665,
351
+ 666,
352
+ 667,
353
+ 668,
354
+ 669,
355
+ 670,
356
+ 671,
357
+ 672,
358
+ 673,
359
+ 674,
360
+ 675,
361
+ 676,
362
+ 677,
363
+ 678,
364
+ 679,
365
+ 680,
366
+ 681,
367
+ 682,
368
+ 683,
369
+ 684,
370
+ 685,
371
+ 686,
372
+ 687,
373
+ 688,
374
+ 689,
375
+ 690,
376
+ 692,
377
+ 693,
378
+ 694,
379
+ 695,
380
+ 696,
381
+ 697,
382
+ 698,
383
+ 699,
384
+ 700,
385
+ 701,
386
+ 702,
387
+ 704,
388
+ 705,
389
+ 706,
390
+ 707,
391
+ 708,
392
+ 709,
393
+ 710,
394
+ 711,
395
+ 712,
396
+ 713,
397
+ 714,
398
+ 715,
399
+ 716,
400
+ 717,
401
+ 718,
402
+ 720,
403
+ 722,
404
+ 723,
405
+ 724,
406
+ 725,
407
+ 726,
408
+ 727,
409
+ 728,
410
+ 729,
411
+ 731,
412
+ 732,
413
+ 734,
414
+ 735,
415
+ 736,
416
+ 738,
417
+ 740,
418
+ 741,
419
+ 742,
420
+ 744,
421
+ 745,
422
+ 746,
423
+ 747,
424
+ 748,
425
+ 749,
426
+ 750,
427
+ 751,
428
+ 752,
429
+ 753,
430
+ 754,
431
+ 755,
432
+ 756,
433
+ 757,
434
+ 758,
435
+ 759,
436
+ 760,
437
+ 761,
438
+ 762,
439
+ 763,
440
+ 764,
441
+ 765,
442
+ 766,
443
+ 767,
444
+ 768,
445
+ 769,
446
+ 770,
447
+ 771,
448
+ 772,
449
+ 773,
450
+ 774,
451
+ 775,
452
+ 776,
453
+ 777,
454
+ 778,
455
+ 779,
456
+ 780,
457
+ 781,
458
+ 782,
459
+ 783,
460
+ 784,
461
+ 785,
462
+ 786,
463
+ 787,
464
+ 788,
465
+ 789,
466
+ 790,
467
+ 791,
468
+ 792,
469
+ 793,
470
+ 794,
471
+ 795,
472
+ 796,
473
+ 797,
474
+ 798,
475
+ 799,
476
+ 800,
477
+ 803,
478
+ 805,
479
+ 806,
480
+ 809,
481
+ 811,
482
+ 812,
483
+ 815,
484
+ 817,
485
+ 818,
486
+ 820,
487
+ 823,
488
+ 827,
489
+ 829,
490
+ 840,
491
+ 843,
492
+ 847,
493
+ 869,
494
+ 905,
495
+ 935,
496
+ 941,
497
+ 952,
498
+ 954,
499
+ 965,
500
+ 974,
501
+ 980,
502
+ 984,
503
+ 985,
504
+ 988,
505
+ 989,
506
+ 990,
507
+ 991,
508
+ 992,
509
+ 993,
510
+ 994,
511
+ 995,
512
+ 996,
513
+ 997,
514
+ 998,
515
+ 1007,
516
+ 1009,
517
+ 1010,
518
+ 1011,
519
+ 1012,
520
+ 1013,
521
+ 1014,
522
+ 1015,
523
+ 1016,
524
+ 1017,
525
+ 1018,
526
+ 1019,
527
+ 1020,
528
+ 1021,
529
+ 1022,
530
+ 1023,
531
+ 1024,
532
+ 1025,
533
+ 1026,
534
+ 1027,
535
+ 1028,
536
+ 1029,
537
+ 1031,
538
+ 1032,
539
+ 1033,
540
+ 1034,
541
+ 1036,
542
+ 1038,
543
+ 1039,
544
+ 1040,
545
+ 1041,
546
+ 1042,
547
+ 1044,
548
+ 1045,
549
+ 1046,
550
+ 1047,
551
+ 1048,
552
+ 1049,
553
+ 1050,
554
+ 1051,
555
+ 1052,
556
+ 1053,
557
+ 1054,
558
+ 1055,
559
+ 1056,
560
+ 1057,
561
+ 1058,
562
+ 1059,
563
+ 1060,
564
+ 1061,
565
+ 1062,
566
+ 1063,
567
+ 1064,
568
+ 1065,
569
+ 1066,
570
+ 1067,
571
+ 1068,
572
+ 1069,
573
+ 1070,
574
+ 1071,
575
+ 1072,
576
+ 1073,
577
+ 1074,
578
+ 1075,
579
+ 1076,
580
+ 1077,
581
+ 1078,
582
+ 1079,
583
+ 1080,
584
+ 1081,
585
+ 1082,
586
+ 1083,
587
+ 1084,
588
+ 1085,
589
+ 1086,
590
+ 1087,
591
+ 1088,
592
+ 1089,
593
+ 1090,
594
+ 1091,
595
+ 1092,
596
+ 1093,
597
+ 1094,
598
+ 1095,
599
+ 1096,
600
+ 1097,
601
+ 1098,
602
+ 1099,
603
+ 1100,
604
+ 1101,
605
+ 1102,
606
+ 1103,
607
+ 1104,
608
+ 1105,
609
+ 1106,
610
+ 1107,
611
+ 1108,
612
+ 1109,
613
+ 1110,
614
+ 1111,
615
+ 1112,
616
+ 1113,
617
+ 1114,
618
+ 1115,
619
+ 1117,
620
+ 1119,
621
+ 1120,
622
+ 1121,
623
+ 1122,
624
+ 1123,
625
+ 1124,
626
+ 1125,
627
+ 1126,
628
+ 1127,
629
+ 1128,
630
+ 1129,
631
+ 1130,
632
+ 1131,
633
+ 1132,
634
+ 1133,
635
+ 1134,
636
+ 1135,
637
+ 1136,
638
+ 1138,
639
+ 1139,
640
+ 1140,
641
+ 1141,
642
+ 1143,
643
+ 1144,
644
+ 1145,
645
+ 1146,
646
+ 1147,
647
+ 1148,
648
+ 1149,
649
+ 1150,
650
+ 1151,
651
+ 1152,
652
+ 1153,
653
+ 1154,
654
+ 1155,
655
+ 1156,
656
+ 1157,
657
+ 1158,
658
+ 1159,
659
+ 1160,
660
+ 1161,
661
+ 1162,
662
+ 1163,
663
+ 1164,
664
+ 1165,
665
+ 1166,
666
+ 1167,
667
+ 1168,
668
+ 1169,
669
+ 1170,
670
+ 1171,
671
+ 1172,
672
+ 1173,
673
+ 1174,
674
+ 1175,
675
+ 1176,
676
+ 1177,
677
+ 1178,
678
+ 1179,
679
+ 1180,
680
+ 1181,
681
+ 1182,
682
+ 1183,
683
+ 1184,
684
+ 1185,
685
+ 1186,
686
+ 1187,
687
+ 1188,
688
+ 1189,
689
+ 1190,
690
+ 1191,
691
+ 1192,
692
+ 1193,
693
+ 1194,
694
+ 1195,
695
+ 1196,
696
+ 1197,
697
+ 1198,
698
+ 1199,
699
+ 1200,
700
+ 1201,
701
+ 1202,
702
+ 1203,
703
+ 1204,
704
+ 1205,
705
+ 1206,
706
+ 1207,
707
+ 1208,
708
+ 1209,
709
+ 1210,
710
+ 1211,
711
+ 1212,
712
+ 1213,
713
+ 1214,
714
+ 1215,
715
+ 1216,
716
+ 1245,
717
+ 1322,
718
+ 1369,
719
+ 1405,
+ ... [remainder of the genai1600 skill-to-prompt-index mapping. Each key ("advanced", "attribute", "scene", "spatial relation", "action relation", "part relation", "counting", "comparison", "differentiation", "negation", "universal") maps to the array of prompt indices (0-1599) tagged with that skill. 4,872 added lines in total; truncated in this view, see raw diff.]
+ }
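
Both genai_skills.json files share the same layout: one JSON object whose keys are skill tags and whose values are arrays of integer prompt indices, so a single prompt can count toward several skills (index 10, for instance, appears under both "advanced" and "counting" above). Below is a minimal sketch, not part of the repo, of how such a mapping could be consumed; it assumes the file path shown above and uses placeholder scores rather than any real metric output.

```python
import json

# A minimal sketch, not part of the repo: aggregate hypothetical
# per-prompt scores by skill tag. Assumes genai_skills.json maps each
# skill name to the list of prompt indices tagged with that skill.
with open("univa/eval/genai/eval_prompts/genai1600/genai_skills.json") as f:
    skills = json.load(f)

all_indices = {i for idxs in skills.values() for i in idxs}
scores = {i: 1.0 for i in all_indices}  # placeholder scores, not real results

per_skill = {
    name: sum(scores[i] for i in idxs) / len(idxs)
    for name, idxs in skills.items()
    if idxs
}
for name, avg in sorted(per_skill.items()):
    print(f"{name:>16s}  n={len(skills[name]):4d}  avg={avg:.3f}")
```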
univa/eval/genai/eval_prompts/genai527/genai_image.json ADDED
The diff for this file is too large to render. See raw diff
 
univa/eval/genai/eval_prompts/genai527/genai_skills.json ADDED
@@ -0,0 +1,1482 @@
+ {
+ ... [skill-to-prompt-index mapping for the 527-prompt subset, same layout as the genai1600 file: "basic", "advanced", "attribute", "scene", "spatial relation", "action relation", and the remaining skill tags, each mapping to an array of prompt indices. 1,482 added lines in total; truncated in this view, see raw diff.]
+ 74,
938
+ 75,
939
+ 76,
940
+ 77,
941
+ 78,
942
+ 81,
943
+ 82,
944
+ 83,
945
+ 84,
946
+ 86,
947
+ 92,
948
+ 93,
949
+ 95,
950
+ 96,
951
+ 97,
952
+ 98,
953
+ 99,
954
+ 100,
955
+ 101,
956
+ 102,
957
+ 119,
958
+ 120,
959
+ 121,
960
+ 122,
961
+ 126,
962
+ 127,
963
+ 128,
964
+ 131,
965
+ 132,
966
+ 133,
967
+ 219,
968
+ 223,
969
+ 237,
970
+ 274,
971
+ 275,
972
+ 276,
973
+ 277,
974
+ 280,
975
+ 281,
976
+ 282,
977
+ 283,
978
+ 288,
979
+ 289,
980
+ 290,
981
+ 291,
982
+ 292,
983
+ 302,
984
+ 303,
985
+ 304,
986
+ 305,
987
+ 306,
988
+ 307,
989
+ 308,
990
+ 309,
991
+ 310,
992
+ 311,
993
+ 312,
994
+ 313,
995
+ 314,
996
+ 315,
997
+ 316,
998
+ 317,
999
+ 318,
1000
+ 319,
1001
+ 320,
1002
+ 321,
1003
+ 322,
1004
+ 323,
1005
+ 324,
1006
+ 325,
1007
+ 326,
1008
+ 327,
1009
+ 328,
1010
+ 329,
1011
+ 330,
1012
+ 331,
1013
+ 332,
1014
+ 333,
1015
+ 334,
1016
+ 335,
1017
+ 336,
1018
+ 337,
1019
+ 338,
1020
+ 339,
1021
+ 340,
1022
+ 341,
1023
+ 342,
1024
+ 343,
1025
+ 344,
1026
+ 345,
1027
+ 346,
1028
+ 348,
1029
+ 349,
1030
+ 350,
1031
+ 353,
1032
+ 355,
1033
+ 356
1034
+ ],
1035
+ "part relation": [
1036
+ 15,
1037
+ 276,
1038
+ 277,
1039
+ 278,
1040
+ 279,
1041
+ 284,
1042
+ 285,
1043
+ 44,
1044
+ 301,
1045
+ 302,
1046
+ 303,
1047
+ 307,
1048
+ 309,
1049
+ 313,
1050
+ 61,
1051
+ 62,
1052
+ 63,
1053
+ 64,
1054
+ 65,
1055
+ 66,
1056
+ 67,
1057
+ 68,
1058
+ 69,
1059
+ 70,
1060
+ 71,
1061
+ 72,
1062
+ 73,
1063
+ 74,
1064
+ 75,
1065
+ 320,
1066
+ 328,
1067
+ 338,
1068
+ 341,
1069
+ 354
1070
+ ],
1071
+ "counting": [
1072
+ 512,
1073
+ 513,
1074
+ 514,
1075
+ 515,
1076
+ 516,
1077
+ 517,
1078
+ 518,
1079
+ 519,
1080
+ 520,
1081
+ 521,
1082
+ 522,
1083
+ 523,
1084
+ 524,
1085
+ 525,
1086
+ 526,
1087
+ 527,
1088
+ 109,
1089
+ 123,
1090
+ 134,
1091
+ 135,
1092
+ 138,
1093
+ 139,
1094
+ 140,
1095
+ 141,
1096
+ 142,
1097
+ 143,
1098
+ 144,
1099
+ 145,
1100
+ 146,
1101
+ 147,
1102
+ 148,
1103
+ 149,
1104
+ 150,
1105
+ 151,
1106
+ 152,
1107
+ 153,
1108
+ 154,
1109
+ 155,
1110
+ 156,
1111
+ 157,
1112
+ 158,
1113
+ 161,
1114
+ 174,
1115
+ 176,
1116
+ 179,
1117
+ 198,
1118
+ 200,
1119
+ 201,
1120
+ 246,
1121
+ 247,
1122
+ 248,
1123
+ 249,
1124
+ 250,
1125
+ 251,
1126
+ 252,
1127
+ 298,
1128
+ 299,
1129
+ 357,
1130
+ 359,
1131
+ 360,
1132
+ 361,
1133
+ 362,
1134
+ 363,
1135
+ 365,
1136
+ 366,
1137
+ 367,
1138
+ 368,
1139
+ 370,
1140
+ 371,
1141
+ 375,
1142
+ 376,
1143
+ 400,
1144
+ 423,
1145
+ 424,
1146
+ 425,
1147
+ 426,
1148
+ 427,
1149
+ 428,
1150
+ 429,
1151
+ 430,
1152
+ 431,
1153
+ 432,
1154
+ 448,
1155
+ 462,
1156
+ 468,
1157
+ 473,
1158
+ 475,
1159
+ 477,
1160
+ 480,
1161
+ 483,
1162
+ 505,
1163
+ 506,
1164
+ 507,
1165
+ 508,
1166
+ 509,
1167
+ 510,
1168
+ 511
1169
+ ],
1170
+ "comparison": [
1171
+ 504,
1172
+ 359,
1173
+ 360,
1174
+ 362,
1175
+ 363,
1176
+ 295,
1177
+ 364,
1178
+ 492,
1179
+ 365,
1180
+ 493,
1181
+ 178,
1182
+ 179,
1183
+ 366,
1184
+ 494,
1185
+ 367,
1186
+ 495,
1187
+ 368,
1188
+ 496,
1189
+ 197,
1190
+ 497,
1191
+ 498,
1192
+ 206,
1193
+ 211,
1194
+ 503,
1195
+ 357,
1196
+ 358,
1197
+ 103,
1198
+ 104,
1199
+ 105,
1200
+ 106,
1201
+ 107,
1202
+ 108,
1203
+ 109,
1204
+ 110,
1205
+ 111,
1206
+ 112,
1207
+ 113,
1208
+ 114,
1209
+ 115,
1210
+ 116,
1211
+ 117,
1212
+ 118,
1213
+ 239,
1214
+ 240,
1215
+ 241,
1216
+ 242,
1217
+ 370,
1218
+ 499,
1219
+ 500,
1220
+ 501,
1221
+ 502
1222
+ ],
1223
+ "differentiation": [
1224
+ 105,
1225
+ 109,
1226
+ 110,
1227
+ 118,
1228
+ 123,
1229
+ 124,
1230
+ 125,
1231
+ 129,
1232
+ 134,
1233
+ 135,
1234
+ 136,
1235
+ 137,
1236
+ 155,
1237
+ 159,
1238
+ 160,
1239
+ 161,
1240
+ 162,
1241
+ 163,
1242
+ 164,
1243
+ 165,
1244
+ 166,
1245
+ 167,
1246
+ 168,
1247
+ 169,
1248
+ 170,
1249
+ 171,
1250
+ 172,
1251
+ 173,
1252
+ 174,
1253
+ 175,
1254
+ 176,
1255
+ 177,
1256
+ 178,
1257
+ 179,
1258
+ 180,
1259
+ 181,
1260
+ 182,
1261
+ 183,
1262
+ 184,
1263
+ 185,
1264
+ 195,
1265
+ 196,
1266
+ 197,
1267
+ 201,
1268
+ 202,
1269
+ 203,
1270
+ 204,
1271
+ 243,
1272
+ 244,
1273
+ 245,
1274
+ 246,
1275
+ 293,
1276
+ 294,
1277
+ 351,
1278
+ 352,
1279
+ 357,
1280
+ 358,
1281
+ 359,
1282
+ 360,
1283
+ 361,
1284
+ 362,
1285
+ 363,
1286
+ 364,
1287
+ 365,
1288
+ 366,
1289
+ 367,
1290
+ 368,
1291
+ 369,
1292
+ 370,
1293
+ 371,
1294
+ 372,
1295
+ 373,
1296
+ 374,
1297
+ 375,
1298
+ 376,
1299
+ 377,
1300
+ 378,
1301
+ 379,
1302
+ 380,
1303
+ 381,
1304
+ 382,
1305
+ 384,
1306
+ 385,
1307
+ 386,
1308
+ 389,
1309
+ 390,
1310
+ 391,
1311
+ 392,
1312
+ 393,
1313
+ 394,
1314
+ 395,
1315
+ 396,
1316
+ 397,
1317
+ 398,
1318
+ 399,
1319
+ 400,
1320
+ 401,
1321
+ 402,
1322
+ 403,
1323
+ 448,
1324
+ 473,
1325
+ 474,
1326
+ 475,
1327
+ 495,
1328
+ 500
1329
+ ],
1330
+ "negation": [
1331
+ 169,
1332
+ 170,
1333
+ 173,
1334
+ 178,
1335
+ 180,
1336
+ 186,
1337
+ 187,
1338
+ 188,
1339
+ 189,
1340
+ 190,
1341
+ 191,
1342
+ 192,
1343
+ 193,
1344
+ 194,
1345
+ 195,
1346
+ 196,
1347
+ 197,
1348
+ 198,
1349
+ 199,
1350
+ 200,
1351
+ 201,
1352
+ 202,
1353
+ 203,
1354
+ 204,
1355
+ 205,
1356
+ 206,
1357
+ 207,
1358
+ 208,
1359
+ 209,
1360
+ 210,
1361
+ 211,
1362
+ 212,
1363
+ 213,
1364
+ 214,
1365
+ 215,
1366
+ 216,
1367
+ 217,
1368
+ 218,
1369
+ 296,
1370
+ 297,
1371
+ 298,
1372
+ 361,
1373
+ 372,
1374
+ 433,
1375
+ 434,
1376
+ 435,
1377
+ 436,
1378
+ 437,
1379
+ 438,
1380
+ 439,
1381
+ 440,
1382
+ 441,
1383
+ 442,
1384
+ 443,
1385
+ 444,
1386
+ 445,
1387
+ 446,
1388
+ 447,
1389
+ 448,
1390
+ 449,
1391
+ 450,
1392
+ 451,
1393
+ 452,
1394
+ 453,
1395
+ 455,
1396
+ 456,
1397
+ 457,
1398
+ 458,
1399
+ 459,
1400
+ 460,
1401
+ 461,
1402
+ 462,
1403
+ 463,
1404
+ 464,
1405
+ 465,
1406
+ 466,
1407
+ 467,
1408
+ 468,
1409
+ 469,
1410
+ 470,
1411
+ 471,
1412
+ 472,
1413
+ 473,
1414
+ 474,
1415
+ 475,
1416
+ 476,
1417
+ 477,
1418
+ 478,
1419
+ 479,
1420
+ 480,
1421
+ 481,
1422
+ 482,
1423
+ 483,
1424
+ 484,
1425
+ 485,
1426
+ 486,
1427
+ 487,
1428
+ 488,
1429
+ 489,
1430
+ 490,
1431
+ 491
1432
+ ],
1433
+ "universal": [
1434
+ 256,
1435
+ 257,
1436
+ 258,
1437
+ 259,
1438
+ 260,
1439
+ 261,
1440
+ 262,
1441
+ 263,
1442
+ 264,
1443
+ 265,
1444
+ 266,
1445
+ 267,
1446
+ 268,
1447
+ 269,
1448
+ 405,
1449
+ 406,
1450
+ 407,
1451
+ 408,
1452
+ 409,
1453
+ 410,
1454
+ 411,
1455
+ 412,
1456
+ 413,
1457
+ 414,
1458
+ 415,
1459
+ 416,
1460
+ 417,
1461
+ 418,
1462
+ 419,
1463
+ 420,
1464
+ 421,
1465
+ 422,
1466
+ 423,
1467
+ 424,
1468
+ 425,
1469
+ 426,
1470
+ 427,
1471
+ 300,
1472
+ 428,
1473
+ 429,
1474
+ 430,
1475
+ 431,
1476
+ 432,
1477
+ 454,
1478
+ 253,
1479
+ 254,
1480
+ 255
1481
+ ]
1482
+ }
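Note: each key in genai_skills.json maps a skill tag to the list of prompt ids (1-based) that exercise it; show_performance_per_skill in step2_run_model.py below groups per-image scores by these lists. A minimal sketch of that grouping, assuming one generated image per prompt stored in prompt order (the toy scores tensor is made up; real scores come from batch_forward):

    import json
    import torch

    tags = json.load(open("eval_prompts/genai527/genai_skills.json"))

    # stand-in for score_func.batch_forward(...).mean(axis=1) in step2_run_model.py
    scores = torch.rand(527)

    for tag, prompt_ids in tags.items():
        idx = [i - 1 for i in prompt_ids]  # 1-based prompt ids -> 0-based tensor indices
        print(f"{tag:<18} {scores[idx].mean().item():.4f}")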
univa/eval/genai/genai1600.yaml ADDED
@@ -0,0 +1,18 @@
+
+ pretrained_lvlm_name_or_path: /mnt/data/lb/Remake/UniWorld//checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_1p0_lr5e-6_mask_refstyle_extract/checkpoint-20000/model_ema
+ pretrained_denoiser_name_or_path: /mnt/data/checkpoints/black-forest-labs/FLUX.1-dev/
+ pretrained_siglip_name_or_path: /mnt/data/checkpoints/google/siglip2-so400m-patch16-512
+ joint_with_t5: true
+
+ seed: 42
+ allow_tf32: false
+
+ output_dir: /mnt/data/lb/Remake/UniWorld//eval_output/genai1600
+
+ num_images_per_prompt: 1
+ num_inference_steps: 28
+ guidance_scale: 3.5
+ height: 1024
+ width: 1024
+
+ genai_prompt_path: eval_prompts/genai1600/genai_image.json
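These fields are validated against EvalConfig via OmegaConf's structured merge in step1_gen_samples.py (see its __main__ block below). A minimal sketch of the same load-and-override flow, assuming the YAML above sits in the working directory (the override paths are illustrative only):

    from omegaconf import OmegaConf
    from univa.eval.configuration_eval import EvalConfig

    schema = OmegaConf.structured(EvalConfig)   # typed defaults and field checking
    config = OmegaConf.load("genai1600.yaml")   # the config file above
    conf = OmegaConf.merge(schema, config)      # YAML values override schema defaults

    # CLI-style overrides, mirroring step1_gen_samples.py's optional flags
    conf.pretrained_lvlm_name_or_path = "/path/to/checkpoint/model_ema"  # illustrative
    conf.output_dir = "/path/to/eval_output/genai1600"                   # illustrative
    print(conf.num_inference_steps, conf.guidance_scale)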
univa/eval/genai/genai527.yaml ADDED
@@ -0,0 +1,18 @@
+
+ pretrained_lvlm_name_or_path: /mnt/data/lb/Remake/UniWorld//checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_1p0_lr5e-6_mask_refstyle_extract/checkpoint-20000/model_ema
+ pretrained_denoiser_name_or_path: /mnt/data/checkpoints/black-forest-labs/FLUX.1-dev/
+ pretrained_siglip_name_or_path: /mnt/data/checkpoints/google/siglip2-so400m-patch16-512
+ joint_with_t5: true
+
+ seed: 42
+ allow_tf32: false
+
+ output_dir: /mnt/data/lb/Remake/UniWorld//eval_output/genai527
+
+ num_images_per_prompt: 1
+ num_inference_steps: 28
+ guidance_scale: 3.5
+ height: 1024
+ width: 1024
+
+ genai_prompt_path: eval_prompts/genai527/genai_image.json
univa/eval/genai/step1_gen_samples.py ADDED
@@ -0,0 +1,269 @@
+ import sys
+ import os
+ root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+ sys.path.append(root)
+ import json
+ import random
+ import numpy as np
+ import torch
+ import torch.distributed as dist
+ from copy import deepcopy
+ from PIL import Image
+ from tqdm import tqdm
+ from qwen_vl_utils import process_vision_info
+ from transformers import AutoProcessor, SiglipImageProcessor, SiglipVisionModel
+ from univa.utils.flux_pipeline import FluxPipeline
+ from univa.eval.configuration_eval import EvalConfig
+ from univa.utils.get_ocr import get_ocr_result
+ from univa.utils.denoiser_prompt_embedding_flux import encode_prompt
+ from univa.models.qwen2p5vl.modeling_univa_qwen2p5vl import UnivaQwen2p5VLForConditionalGeneration
+
+
+ def get_meta(prompt_path):
+     '''
+     Load the GenAI-Bench prompt file (a dict keyed by prompt id) and return a list of
+     entries of the form:
+     [
+         {
+             "Prompts": "a photo of a cat",
+             "Category": "No Category",
+             "id": "000000001",
+         },
+         ...
+     ]
+     '''
+     with open(prompt_path, 'r') as f:
+         meta_info = json.load(f)
+
+     ret_meta_info = []
+     for v in meta_info.values():
+         if 'models' in v: del v['models']
+         if 'prompt in Chinese' in v: del v['prompt in Chinese']
+         v['Prompts'] = deepcopy(v['prompt'])
+         if 'prompt' in v: del v['prompt']
+         v['Category'] = 'No Category'
+         v['id'] = f"{int(v['id']):09d}"
+         ret_meta_info.append(v)
+     return ret_meta_info
+
+
+ # adapted from https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/random.py#L31
+ def set_seed(seed, rank, device_specific=True):
+     if device_specific:
+         seed += rank
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     torch.backends.cudnn.deterministic = True
+     torch.backends.cudnn.benchmark = False
+
+
+ def initialize_models(args, device):
+     # Load main model and task head
+     model = UnivaQwen2p5VLForConditionalGeneration.from_pretrained(
+         args.pretrained_lvlm_name_or_path,
+         torch_dtype=torch.bfloat16,
+         attn_implementation="flash_attention_2",
+     ).to(device)
+
+     processor = AutoProcessor.from_pretrained(
+         args.pretrained_lvlm_name_or_path,
+         min_pixels=args.min_pixels,
+         max_pixels=args.max_pixels,
+     )
+
+     # Load FLUX pipeline, reusing the LVLM's denoise tower as the transformer
+     pipe = FluxPipeline.from_pretrained(
+         args.pretrained_denoiser_name_or_path,
+         transformer=model.denoise_tower.denoiser,
+         torch_dtype=torch.bfloat16,
+     ).to(device)
+     tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
+     text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
+
+     siglip_processor = SiglipImageProcessor.from_pretrained(args.pretrained_siglip_name_or_path)
+     siglip_model = SiglipVisionModel.from_pretrained(
+         args.pretrained_siglip_name_or_path,
+         torch_dtype=torch.bfloat16,
+     ).to(device)
+
+     return {
+         'model': model,
+         'processor': processor,
+         'pipe': pipe,
+         'tokenizers': tokenizers,
+         'text_encoders': text_encoders,
+         'device': device,
+         'siglip_model': siglip_model,
+         'siglip_processor': siglip_processor,
+     }
+
+
+ def init_gpu_env(args):
+     local_rank = int(os.getenv('RANK', 0))
+     world_size = int(os.getenv('WORLD_SIZE', 1))
+     args.local_rank = local_rank
+     args.world_size = world_size
+     torch.cuda.set_device(local_rank)
+     dist.init_process_group(
+         backend='nccl', init_method='env://',
+         world_size=world_size, rank=local_rank
+     )
+     return args
+
+
+ def run_model_and_return_samples(args, state, text, image1=None, image2=None):
+     # Build multimodal chat content
+     convo = []
+     image_paths = []
+     content = []
+     for img in (image1, image2):
+         if img:
+             content.append({'type': 'image', 'image': img, 'min_pixels': args.min_pixels, 'max_pixels': args.max_pixels})
+             image_paths.append(img)
+     if text:
+         ocr_text = ''
+         if args.ocr_enhancer and content:
+             cur_ocr_i = 0  # fix: this counter was used uninitialized in the original
+             ocr_texts = []
+             for img in (image1, image2):
+                 if img:
+                     ocr_texts.append(get_ocr_result(img, cur_ocr_i))
+                     cur_ocr_i += 1
+             ocr_text = '\n'.join(ocr_texts)
+         content.append({'type': 'text', 'text': text + ocr_text})
+
+     if not args.only_use_t5:
+         convo.append({'role': 'user', 'content': content})
+
+         # Prepare inputs
+         chat_text = state['processor'].apply_chat_template(
+             convo,
+             tokenize=False,
+             add_generation_prompt=True
+         )
+         chat_text = '<|im_end|>\n'.join(chat_text.split('<|im_end|>\n')[1:])
+         image_inputs, video_inputs = process_vision_info(convo)
+         inputs = state['processor'](
+             text=[chat_text], images=image_inputs, videos=video_inputs,
+             padding=True, return_tensors='pt'
+         ).to(state['device'])
+
+         # Optional SigLIP features for reference images
+         siglip_hs = None
+         if state['siglip_processor'] and image_paths:
+             vals = [state['siglip_processor'].preprocess(
+                         images=Image.open(p).convert('RGB'), do_resize=True,
+                         return_tensors='pt', do_convert_rgb=True
+                     ).pixel_values.to(state['device'])
+                     for p in image_paths]
+             siglip_hs = state['siglip_model'](torch.concat(vals)).last_hidden_state
+
+         with torch.no_grad():
+             lvlm = state['model'](
+                 inputs.input_ids, pixel_values=getattr(inputs, 'pixel_values', None),
+                 attention_mask=inputs.attention_mask,
+                 image_grid_thw=getattr(inputs, 'image_grid_thw', None),
+                 siglip_hidden_states=siglip_hs,
+                 output_type='denoise_embeds'
+             )
+             prm_embeds, pooled = encode_prompt(
+                 state['text_encoders'], state['tokenizers'],
+                 text if args.joint_with_t5 else '', 256, state['device'], 1
+             )
+         emb = torch.concat([lvlm, prm_embeds], dim=1) if args.joint_with_t5 else lvlm
+     else:
+         prm_embeds, pooled = encode_prompt(
+             state['text_encoders'], state['tokenizers'],
+             text, 256, state['device'], 1
+         )
+         emb = prm_embeds
+
+     # Image generation pipeline
+     with torch.no_grad():
+         img = state['pipe'](
+             prompt_embeds=emb,
+             pooled_prompt_embeds=pooled,
+             height=args.height,
+             width=args.width,
+             num_inference_steps=args.num_inference_steps,
+             guidance_scale=args.guidance_scale,
+             num_images_per_prompt=args.num_images_per_prompt,
+         ).images
+     return img
+
+
+ def main(args):
+     args = init_gpu_env(args)
+
+     torch.backends.cuda.matmul.allow_tf32 = False
+     torch.backends.cudnn.allow_tf32 = False
+     if args.allow_tf32:
+         torch.backends.cuda.matmul.allow_tf32 = True
+         torch.backends.cudnn.allow_tf32 = True
+
+     set_seed(args.seed, rank=args.local_rank, device_specific=True)
+     device = torch.cuda.current_device()
+     state = initialize_models(args, device)
+
+     meta_info = get_meta(args.genai_prompt_path)
+     print(f'origin meta_info ({len(meta_info)})')
+     text_and_savepath = [
+         [
+             meta_info[i]['Prompts'], os.path.join(args.output_dir, f"{meta_info[i]['id']}.jpg")
+         ] for i in range(len(meta_info))
+     ]
+
+     text_and_savepath_ = [
+         [text_prompt, save_path] for text_prompt, save_path in text_and_savepath if not os.path.exists(save_path)
+     ]
+     print(f'need to process ({len(text_and_savepath_)})')
+     if len(text_and_savepath_) == 0:
+         sys.exit(0)
+     # stride-shard the full prompt list across ranks
+     text_and_savepath = text_and_savepath[args.local_rank::args.world_size]
+     os.makedirs(args.output_dir, exist_ok=True)
+     print(f'args: {args}')
+     cnt = 0
+     for text_prompt, save_path in tqdm(text_and_savepath):
+         if os.path.exists(save_path):
+             continue
+         set_seed(args.seed + cnt * 50, rank=args.local_rank, device_specific=True)
+         image = run_model_and_return_samples(args, state, text_prompt, image1=None, image2=None)
+         image = image[0]
+         image.save(save_path)
+         assert args.num_samples_per_prompt == 1
+         cnt += 1
+
+
+ if __name__ == "__main__":
+     import argparse
+     from omegaconf import OmegaConf
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("config", type=str)
+     parser.add_argument("--pretrained_lvlm_name_or_path", type=str, default=None, required=False)
+     parser.add_argument("--output_dir", type=str, default=None, required=False)
+
+     args = parser.parse_args()
+
+     config = OmegaConf.load(args.config)
+     schema = OmegaConf.structured(EvalConfig)
+     conf = OmegaConf.merge(schema, config)
+     if args.pretrained_lvlm_name_or_path is not None:
+         assert args.output_dir is not None
+         conf.pretrained_lvlm_name_or_path = args.pretrained_lvlm_name_or_path
+         conf.output_dir = args.output_dir
+     main(conf)
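The generation loop above shards work by striding the prompt list with [rank::world_size] and reseeds before every sample with seed + cnt * 50 (plus the rank offset applied inside set_seed), so each image is reproducible for a fixed world size. A self-contained toy illustration of the scheme (ranks, prompt ids, and the base seed are made up):

    # 3 ranks, 10 prompts; prompt ids stand in for (text, save_path) pairs
    world_size = 3
    prompts = list(range(10))

    for rank in range(world_size):
        shard = prompts[rank::world_size]      # stride-shard, as in main()
        for cnt, prompt in enumerate(shard):
            seed = 42 + cnt * 50 + rank        # mirrors set_seed(seed + cnt * 50, rank)
            print(f"rank {rank} generates prompt {prompt} with seed {seed}")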
univa/eval/genai/step2_run_model.py ADDED
@@ -0,0 +1,113 @@
+ # Evaluate on GenAI-Bench-Image (with 527 prompts) using a specific model
+ # Example scripts to run:
+ # VQAScore: python genai_image_eval.py --model clip-flant5-xxl
+ # CLIPScore: python genai_image_eval.py --model openai:ViT-L-14-336
+ # GPT4o VQAScore: python genai_image_eval.py --model gpt-4o
+ import argparse
+ import json
+ import torch
+ import t2v_metrics
+ from t2v_metrics.dataset import GenAIBench_Image
+
+ tag_groups = {
+     'basic': ['attribute', 'scene', 'spatial relation', 'action relation', 'part relation', 'basic'],
+     'advanced': ['counting', 'comparison', 'differentiation', 'negation', 'universal', 'advanced'],
+     'overall': ['basic', 'advanced', 'all']
+ }
+
+ def show_performance_per_skill(our_scores, dataset, items_name='images', prompt_to_items_name='prompt_to_images', print_std=False):
+     tag_result = {}
+     tag_file = f"{dataset.meta_dir}/genai_skills.json"
+     tags = json.load(open(tag_file))
+     items = getattr(dataset, items_name)
+     prompt_to_items = getattr(dataset, prompt_to_items_name)
+     # Collect, for every skill tag and every model, the image indices to average over
+     items_by_model_tag = {}
+     for tag in tags:
+         items_by_model_tag[tag] = {}
+         for prompt_idx in tags[tag]:
+             for image_idx in prompt_to_items[f"{prompt_idx:05d}"]:
+                 model = items[image_idx]['model']
+                 if model not in items_by_model_tag[tag]:
+                     items_by_model_tag[tag][model] = []
+                 items_by_model_tag[tag][model].append(image_idx)
+
+     for tag in tags:
+         tag_result[tag] = {}
+         for model in items_by_model_tag[tag]:
+             our_scores_mean = our_scores[items_by_model_tag[tag][model]].mean()
+             our_scores_std = our_scores[items_by_model_tag[tag][model]].std()
+             tag_result[tag][model] = {
+                 'metric': {'mean': our_scores_mean, 'std': our_scores_std},
+             }
+
+     # "all": union of image indices across every tag, per model
+     # (note: relies on `tag` still holding the last key from the loop above)
+     tag_result['all'] = {}
+     all_models = items_by_model_tag[tag]
+     for model in all_models:
+         all_model_indices = set()
+         for tag in items_by_model_tag:
+             all_model_indices = all_model_indices.union(set(items_by_model_tag[tag][model]))
+         all_model_indices = list(all_model_indices)
+         our_scores_mean = our_scores[all_model_indices].mean()
+         our_scores_std = our_scores[all_model_indices].std()
+         tag_result['all'][model] = {
+             'metric': {'mean': our_scores_mean, 'std': our_scores_std},
+         }
+
+     for tag_group in tag_groups:
+         for score_name in ['metric']:
+             print(f"Tag Group: {tag_group} ({score_name} performance)")
+             tag_header = f"{'Model':<17}" + " ".join([f"{tag:<17}" for tag in tag_groups[tag_group]])
+             print(tag_header)
+             for model_name in all_models:
+                 if print_std:
+                     detailed_scores = [f"{tag_result[tag][model_name][score_name]['mean']:.6f}+-{tag_result[tag][model_name][score_name]['std']:.6f}" for tag in tag_groups[tag_group]]
+                 else:
+                     detailed_scores = [f"{tag_result[tag][model_name][score_name]['mean']:.6f}" for tag in tag_groups[tag_group]]
+                 detailed_scores = " ".join([f"{score:<17}" for score in detailed_scores])
+                 model_scores = f"{model_name:<17}" + detailed_scores
+                 print(model_scores)
+             print()
+         print()
+
+
+ def get_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--meta_dir", type=str, required=True)
+     parser.add_argument("--model_path", type=str, required=True)
+     parser.add_argument("--image_dir", type=str, required=True)
+     parser.add_argument("--batch_size", type=int, default=16)
+     parser.add_argument("--seed", type=int, default=1234)
+     return parser.parse_args()
+
+ def main():
+     args = get_args()
+
+     dataset = GenAIBench_Image(root_dir=args.image_dir, meta_dir=args.meta_dir)
+
+     device = torch.device('cuda:0')
+     score_func = t2v_metrics.get_score_model(model=args.model_path, device=device)
+
+     scores = score_func.batch_forward(dataset, batch_size=args.batch_size).cpu()
+
+     ### Get performance per skill: average over question/answer templates per image
+     our_scores = scores.mean(axis=1)
+     show_performance_per_skill(our_scores, dataset, print_std=True)
+
+ if __name__ == "__main__":
+     main()
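Beyond the batch evaluation above, the vendored t2v_metrics package can also score individual image/text pairs, which is handy for spot checks. A sketch, assuming an illustrative sample path from a step1 run; the images=/texts= call style follows the upstream t2v_metrics Score interface:

    import t2v_metrics

    # clip-flant5-xxl is the default backbone of get_score_model (see __init__.py below)
    score_func = t2v_metrics.get_score_model(model='clip-flant5-xxl', device='cuda')

    # illustrative path and prompt; returns an M x N score matrix for M images x N texts
    scores = score_func(
        images=['eval_output/genai527/000000001.jpg'],
        texts=['a photo of a cat'],
    )
    print(scores)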
univa/eval/genai/t2v_metrics/__init__.py ADDED
@@ -0,0 +1,13 @@
+ from __future__ import absolute_import
+ from __future__ import division
+ from __future__ import print_function
+
+ from .constants import HF_CACHE_DIR
+ from .vqascore import VQAScore, list_all_vqascore_models
+
+ def list_all_models():
+     return list_all_vqascore_models()
+
+ def get_score_model(model='clip-flant5-xxl', device='cuda', cache_dir=HF_CACHE_DIR, **kwargs):
+     return VQAScore(model, device=device, cache_dir=cache_dir, **kwargs)
univa/eval/genai/t2v_metrics/clipscore.py ADDED
@@ -0,0 +1,21 @@
+ from typing import List
+
+ from .score import Score
+ from .constants import HF_CACHE_DIR
+ from .models.clipscore_models import list_all_clipscore_models, get_clipscore_model
+
+ class CLIPScore(Score):
+     def prepare_scoremodel(self,
+                            model='openai:ViT-L/14',
+                            device='cuda',
+                            cache_dir=HF_CACHE_DIR):
+         return get_clipscore_model(
+             model,
+             device=device,
+             cache_dir=cache_dir
+         )
+
+     def list_all_models(self) -> List[str]:
+         return list_all_clipscore_models()
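clipscore.py wires CLIPScore into the same Score interface that VQAScore uses, so switching metrics only changes the class and the model tag. A sketch, assuming the vendored package layout above; the model tag is taken from the CLIPScore example in step2_run_model.py's header comment, and the image path is illustrative:

    from t2v_metrics.clipscore import CLIPScore

    # openai:ViT-L-14-336 matches the CLIPScore example in step2_run_model.py
    clip_score = CLIPScore(model='openai:ViT-L-14-336', device='cuda')
    scores = clip_score(
        images=['eval_output/genai527/000000001.jpg'],  # illustrative path
        texts=['a photo of a cat'],
    )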
univa/eval/genai/t2v_metrics/constants.py ADDED
@@ -0,0 +1,8 @@
+ HF_CACHE_DIR = "./hf_cache/"  # TODO: change this to your own cache dir
+
+ # For CLIP-FlanT5 and LLaVA-1.5 (copied from llava)
+ CONTEXT_LEN = 2048
+ SYSTEM_MSG = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
+ IGNORE_INDEX = -100
+ IMAGE_TOKEN_INDEX = -200
+ DEFAULT_IMAGE_TOKEN = "<image>"