Qwen2.5-VL-7B-Instruct

Running on Zero

App Files Files Community

developer0hye committed 15 days ago

Commit

5e746b8

verified ·

1 Parent(s): fbbbccb

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -139

app.py CHANGED Viewed

@@ -1,154 +1,87 @@
 import gradio as gr
 import spaces
 import torch
-import math
-import numpy as np
 import os
-from PIL import Image
-import torchvision.transforms as T
-from torchvision.transforms.functional import InterpolationMode
-from transformers import AutoModel, AutoTokenizer, AutoConfig
-# =============================================================================
-# InternVL‑3 preprocessing utilities (image‑only version)
-# =============================================================================
-IMAGENET_MEAN = (0.485, 0.456, 0.406)
-IMAGENET_STD = (0.229, 0.224, 0.225)
-def build_transform(input_size: int = 448):
-    """Return torchvision transform matching InternVL pre‑training."""
-    return T.Compose(
-        [
-            T.Lambda(lambda img: img.convert("RGB") if img.mode != "RGB" else img),
-            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
-            T.ToTensor(),
-            T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
-        ]
-    )
-def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
-    best_ratio_diff = float("inf")
-    best_ratio = (1, 1)
-    area = width * height
-    for ratio in target_ratios:
-        tgt_ar = ratio[0] / ratio[1]
-        diff = abs(aspect_ratio - tgt_ar)
-        if diff < best_ratio_diff or (diff == best_ratio_diff and area > 0.5 * image_size * image_size * ratio[0] * ratio[1]):
-            best_ratio_diff = diff
-            best_ratio = ratio
-    return best_ratio
-def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
-    """Split arbitrarily‑sized image into ≤12 tiles sized 448×448 (InternVL spec)."""
-    ow, oh = image.size
-    aspect_ratio = ow / oh
-    target_ratios = sorted(
-        {(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if min_num <= i * j <= max_num},
-        key=lambda x: x[0] * x[1],
-    )
-    ratio = find_closest_aspect_ratio(aspect_ratio, target_ratios, ow, oh, image_size)
-    tw, th = image_size * ratio[0], image_size * ratio[1]
-    blocks = ratio[0] * ratio[1]
-    resized = image.resize((tw, th))
-    tiles = [
-        resized.crop(
-            (
-                (idx % (tw // image_size)) * image_size,
-                (idx // (tw // image_size)) * image_size,
-                ((idx % (tw // image_size)) + 1) * image_size,
-                ((idx // (tw // image_size)) + 1) * image_size,
-            )
-        )
-        for idx in range(blocks)
-    ]
-    if use_thumbnail and blocks != 1:
-        tiles.append(image.resize((image_size, image_size)))
-    return tiles
-def load_image(path: str, input_size: int = 448, max_num: int = 12):
-    """Return tensor of shape (N, 3, H, W) ready for InternVL."""
-    img = Image.open(path).convert("RGB")
-    transform = build_transform(input_size)
-    tiles = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
-    return torch.stack([transform(t) for t in tiles])
 # =============================================================================
-# InternVL‑3‑8B model loading (multi‑GPU aware)
 # =============================================================================
-MODEL_ID = "OpenGVLab/InternVL3-8B"
-def split_model(model_name: str):
-    """Distribute LLM layers across GPUs, keeping vision encoder on GPU 0."""
-    n_gpu = torch.cuda.device_count()
-    if n_gpu < 2:
-        return "auto"  # let transformers decide
-    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-    n_layers = cfg.llm_config.num_hidden_layers  # type: ignore[attr-defined]
-    # GPU0 does vision + some text layers => treat as 0.5 GPU
-    per_gpu = math.ceil(n_layers / (n_gpu - 0.5))
-    alloc = [per_gpu] * n_gpu
-    alloc[0] = math.ceil(alloc[0] * 0.5)
-    dmap = {
-        "vision_model": 0,
-        "mlp1": 0,
-        "language_model.model.tok_embeddings": 0,
-        "language_model.model.embed_tokens": 0,
-        "language_model.output": 0,
-        "language_model.model.norm": 0,
-        "language_model.model.rotary_emb": 0,
-        "language_model.lm_head": 0,
-    }
-    layer_idx = 0
-    for gpu, n in enumerate(alloc):
-        for _ in range(n):
-            if layer_idx >= n_layers:
-                break
-            dmap[f"language_model.model.layers.{layer_idx}"] = 0 if layer_idx == n_layers - 1 else gpu
-            layer_idx += 1
-    return dmap
-device_map = split_model(MODEL_ID)
-model = AutoModel.from_pretrained(
     MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    low_cpu_mem_usage=True,
-    use_flash_attn=True,
-    trust_remote_code=True,
-    device_map=device_map,
-).eval()
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)
 # =============================================================================
-# Inference function (image‑only)
 # =============================================================================
 @spaces.GPU
-def internvl_inference(image_path: str | None, text_input: str | None = None):
     if image_path is None:
         return "Please upload an image first."
-    pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).cuda()
-    prompt = f"<image>\n{text_input}" if text_input else "<image>\n"
-    gen_cfg = dict(max_new_tokens=1024, do_sample=True)
-    return model.chat(tokenizer, pixel_values, prompt, gen_cfg)
 # =============================================================================
-# Gradio UI (image‑only, Gradio 5 compatible)
 # =============================================================================
 DESCRIPTION = (
-    "[InternVL 3‑8B demo](https://huggingface.co/OpenGVLab/InternVL3-8B) — "
     "upload an image and ask anything about it."
 )
@@ -164,26 +97,23 @@ with gr.Blocks(css=css, theme="origin") as demo:
     gr.Markdown(DESCRIPTION)
     with gr.Row():
-        # Left column: image, question, submit button (stacked vertically)
         with gr.Column(scale=1):
             input_image = gr.Image(label="Upload Image", type="filepath")
             text_input = gr.Textbox(label="Question")
             submit_btn = gr.Button("Submit")
-        # Right column: model output
         with gr.Column(scale=1):
             output_text = gr.Textbox(label="Model Output", elem_id="output_text")
-    # 🔽 Add an example
     gr.Examples(
-        examples=[["example.webp", "explain this image"]],
         inputs=[input_image, text_input],
         outputs=output_text,
-        fn=internvl_inference,     # set this so clicking the example runs inference immediately
-        cache_examples=True,       # cache the example's result (optional)
-        label="Try an example"     # display label (optional)
     )
-    submit_btn.click(internvl_inference, [input_image, text_input], [output_text])
 if __name__ == "__main__":
     demo.launch()

 import gradio as gr
 import spaces
 import torch
 import os
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from qwen_vl_utils import process_vision_info  # pip install qwen-vl-utils[decord]==0.0.8
 # =============================================================================
+# Qwen2.5-VL-7B-Instruct: model & processor
 # =============================================================================
+MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
+# Recommended: use flash-attn 2 (uncomment if your environment supports it)
+# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+#     MODEL_ID,
+#     torch_dtype=torch.bfloat16,
+#     attn_implementation="flash_attention_2",
+#     device_map="auto",
+# )
+# Default load
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID,
+    torch_dtype="auto",
+    device_map="auto",
+)
+model.eval()
+# Resolution is adjusted automatically (defaults are used). If needed, min_pixels/max_pixels can be set to control token cost.
+processor = AutoProcessor.from_pretrained(MODEL_ID)
+# e.g. min_pixels = 256*28*28; max_pixels = 1280*28*28
+# processor = AutoProcessor.from_pretrained(MODEL_ID, min_pixels=min_pixels, max_pixels=max_pixels)
 # =============================================================================
+# Inference (image-only UI; the text prompt is optional)
 # =============================================================================
 @spaces.GPU
+def qwen_vl_inference(image_path: str | None, text_input: str | None = None):
     if image_path is None:
         return "Please upload an image first."
+    # Qwen's official examples pass local files as file:// URIs, so wrap the absolute path accordingly
+    file_uri = f"file://{os.path.abspath(image_path)}"
+    user_text = text_input.strip() if text_input else "Describe this image."
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": file_uri},
+                {"type": "text", "text": user_text},
+            ],
+        }
+    ]
+    # Text/vision preprocessing: render the chat template to a string and extract the image/video inputs
+    chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = processor(
+        text=[chat_text],
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    # Move tensor inputs to the model's device (intended to be safe under device_map="auto" as well)
+    inputs = {k: (v.to(model.device) if isinstance(v, torch.Tensor) else v) for k, v in inputs.items()}
+    # Generate up to 512 new tokens
+    gen_ids = model.generate(**inputs, max_new_tokens=512)
+    # Strip the input (prompt) tokens from each output sequence, then decode and return the first result
+    trimmed = [out[len(inp):] for inp, out in zip(inputs["input_ids"], gen_ids)]
+    output = processor.batch_decode(trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    return output
 # =============================================================================
+# Gradio UI (Gradio 5)
 # =============================================================================
 DESCRIPTION = (
+    "[Qwen2.5-VL-7B-Instruct demo](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) — "
     "upload an image and ask anything about it."
 )
     gr.Markdown(DESCRIPTION)
     with gr.Row():
         with gr.Column(scale=1):
             input_image = gr.Image(label="Upload Image", type="filepath")
             text_input = gr.Textbox(label="Question")
             submit_btn = gr.Button("Submit")
         with gr.Column(scale=1):
             output_text = gr.Textbox(label="Model Output", elem_id="output_text")
     gr.Examples(
+        examples=[["example.webp", "Explain this image"]],
         inputs=[input_image, text_input],
         outputs=output_text,
+        fn=qwen_vl_inference,
+        cache_examples=True,
+        label="Try an example"
     )
+    submit_btn.click(qwen_vl_inference, [input_image, text_input], [output_text])
 if __name__ == "__main__":
     demo.launch()