---
license: cc-by-nc-4.0
base_model: OpenGVLab/InternVL2_5-1B-MPO
datasets:
- Salesforce/CogAlign
language:
- multilingual
model-index:
- name: cogalign-internvl2.5-mpo-1b
results: []
---
# Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding
Vision Language Models (VLMs) have achieved remarkable progress in multimodal tasks, yet they often struggle with visual arithmetic: seemingly simple capabilities such as object counting and length comparison that are essential for complex tasks like chart understanding and geometric reasoning. In this work, we first investigate the root causes of this deficiency through a suite of probing tasks focused on basic visual arithmetic. Our analysis reveals that while pre-trained vision encoders typically capture sufficient information, the text decoder often fails to decode it correctly for arithmetic reasoning. To address this, we propose CogAlign, a novel post-training strategy inspired by Piaget's theory of cognitive development. CogAlign trains VLMs to recognize invariant properties under visual transformations. We demonstrate that this approach significantly improves the performance of three diverse VLMs on our proposed probing tasks. Furthermore, CogAlign enhances performance by an average of 4.6% on CHOCOLATE and 2.9% on MATH-VISION, outperforming or matching supervised fine-tuning methods while requiring 60% less training data. These results highlight the effectiveness and generalizability of CogAlign in improving fundamental visual arithmetic capabilities and their transfer to downstream tasks.
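To build intuition for the invariance idea, here is a minimal toy sketch of what a transformation-invariant training pair could look like. This is our own illustrative construction, not the actual CogAlign data pipeline (the real training data is in [Salesforce/CogAlign](https://huggingface.co/datasets/Salesforce/CogAlign)):

```python
# Illustrative toy example of an "invariance under transformation" pair.
# NOT the official CogAlign pipeline; it only sketches the idea that a
# property such as object count stays constant when the image is rotated.
from PIL import Image, ImageDraw
import random

def draw_dots(num_dots, size=224, seed=0):
    """Render `num_dots` circles at random positions on a blank canvas."""
    rng = random.Random(seed)
    img = Image.new("RGB", (size, size), "white")
    draw = ImageDraw.Draw(img)
    for _ in range(num_dots):
        x, y = rng.randint(10, size - 10), rng.randint(10, size - 10)
        draw.ellipse((x - 5, y - 5, x + 5, y + 5), fill="black")
    return img

num_dots = 7
original = draw_dots(num_dots)
transformed = original.rotate(90)  # the count is invariant under rotation

# Both views share the same target answer, encouraging the model to decode
# the invariant property rather than surface features of a single view.
pair = [
    {"image": original, "question": "How many dots are in the image?", "answer": str(num_dots)},
    {"image": transformed, "question": "How many dots are in the image?", "answer": str(num_dots)},
]
```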
### Quick start
Loading the model is straightforward with `transformers`:
```python
import torch
from transformers import AutoTokenizer, AutoModel

path = "Salesforce/cogalign-internvl2_5-mpo-1b"
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
```
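If your environment lacks FlashAttention or a CUDA GPU, a possible fallback (our own suggestion, not part of the official instructions) is to disable flash attention and keep the model on CPU:

```python
# Fallback loading for environments without FlashAttention or a GPU
# (an assumption on our part, not from the official instructions).
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    low_cpu_mem_usage=True,
    use_flash_attn=False,
    trust_remote_code=True).eval()
if torch.cuda.is_available():
    model = model.cuda()
```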
Then, we define some helper functions for image preprocessing and inference:
```python
# Adapted from https://huggingface.co/OpenGVLab/InternVL2_5-1B-MPO
import pandas as pd
from datasets import load_dataset
import requests
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode

# Taken from InternVL's code
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the InternVL preprocessing pipeline: RGB conversion, resize, ImageNet normalization."""
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the tiling grid whose aspect ratio best matches the input image."""
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            # On ties, prefer the grid with more tiles when the image is large enough.
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split the image into up to `max_num` tiles of size `image_size`, preserving aspect ratio."""
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height
    # enumerate candidate tiling grids (i columns x j rows)
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12, is_url=False):
    """Load an image from a local path or URL and return the stacked tile tensors."""
    if is_url:
        image = Image.open(requests.get(image_file, stream=True).raw).convert('RGB')
    else:
        image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values
```
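As a quick sanity check, you can run `load_image` on any local chart image (the file name below is a placeholder):

```python
# Quick sanity check on a local file (the path is a placeholder).
pixel_values = load_image("my_chart.png", max_num=12).to(torch.bfloat16).cuda()
print(pixel_values.shape)  # (num_tiles, 3, 448, 448)
```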
After that, we can try an example from the CHOCOLATE dataset:
```python
chocolate = load_dataset("khhuang/CHOCOLATE")["test"]
chocolate_df = pd.DataFrame(chocolate)
chocolate_df_lvlm = chocolate_df.loc[chocolate_df.split == "LVLM", :]
instance = chocolate_df_lvlm.iloc[2]
caption = ' '.join(instance.sentences)
url = instance.image_path
pixel_values = load_image(url, max_num=12, is_url=True).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

prompt = f"""
You are given a chart and a caption. You are tasked to detect whether the caption is factually consistent with the chart. A caption is factually consistent with the chart if it describes the data points within the chart without factual errors (e.g., wrong labels, values, or trends).
[Start of Caption]
{caption}
[End of Caption]
For the above caption, respond 'Answer: Yes' if it is factually consistent with the chart. Otherwise, respond 'Answer: No'. Do not provide explanations or anything else.
"""
question = f'<image>\n{prompt}'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
```
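To score outputs programmatically, a simple post-processing step is to normalize the response to a boolean. The parsing rule below is our own assumption based on the 'Answer: Yes'/'Answer: No' format requested in the prompt, not an official CHOCOLATE evaluation script:

```python
# Minimal answer parsing (our own assumption based on the prompt format,
# not an official CHOCOLATE evaluation script).
def parse_answer(response: str) -> bool:
    """Return True if the model judged the caption factually consistent."""
    return "yes" in response.lower().split("answer:")[-1]

is_consistent = parse_answer(response)
print(f"Caption judged consistent: {is_consistent}")
```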
### License information
This release is for research purposes only in support of an academic paper. This repository is licensed under the noncommercial license [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/).
### Citation
If you find CogAlign useful in your research, please consider citing:
```
@misc{huang-etal-2025-cogalign,
  title = "Why Vision Language Models Struggle with Visual Arithmetic? Towards Enhanced Chart and Geometry Understanding",
  author = "Huang, Kung-Hsiang and
    Qin, Can and
    Qiu, Haoyi and
    Laban, Philippe and
    Joty, Shafiq and
    Xiong, Caiming and
    Wu, Chien-Sheng",
  year = "2025",
  eprint = "2502.11492",
  archivePrefix = "arXiv",
  primaryClass = "cs.AI"
}
``` |