Vintern-1B-v3_5-Demo

Running on Zero

@@ -7,14 +7,14 @@ import os
 import time
 import hashlib
 import re
 import gradio as gr
 import requests
 import random
 from filelock import FileLock
 from io import BytesIO
 from PIL import Image, ImageDraw, ImageFont
 from constants import LOGDIR
 from utils import (
     build_logger,
@@ -25,7 +25,8 @@ from utils import (
     get_log_filename,
 )
 from threading import Thread
-import torch
 from conversation import Conversation
 from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer
@@ -166,6 +167,7 @@ def add_text(state, message, system_prompt, request: gr.Request):
     ) * 5
 model_name = "5CD-AI/Vintern-1B-v3_5"
 model = AutoModel.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
@@ -196,7 +198,6 @@ def http_bot(
         ) + (no_change_btn,) * 5
         return
-    # No available worker
     if model is None:
         # state.messages[-1][-1] = server_error_msg
         state.update_message(Conversation.ASSISTANT, server_error_msg)
@@ -225,16 +226,33 @@ def http_bot(
     try:
         # Stream output
         # response = requests.post(worker_addr, json=pload, headers=headers, stream=True, timeout=300)
-        streamer = TextIteratorStreamer(
-            tokenizer, skip_prompt=True, skip_special_tokens=True
-        )
-        generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
         buffer = ""
-        for new_text in streamer:
             buffer += new_text
             # Remove <|im_end|> or similar tokens from the output
             buffer = buffer.replace("<|im_end|>", "")
@@ -247,6 +265,8 @@ def http_bot(
             ) + (disable_btn,) * 5
     except Exception as e:
         state.update_message(Conversation.ASSISTANT, server_error_msg, None)
         yield (
             state,
@@ -289,20 +309,19 @@ def http_bot(
 # <h1 style="font-size: 28px; font-weight: bold;">Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling</h1>
 title_html = """
-<img src="https://internvl.opengvlab.com/assets/logo-47b364d3.jpg" style="width: 280px; height: 70px;">
-<p>Vintern-1B: Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling</p>
-<a href="https://internvl.github.io/blog/2024-12-05-InternVL-2.5/">[🆕 InternVL Blog]</a>
-<a href="https://huggingface.co/papers/2412.05271">[📖 InternVL Paper]</a>
-<a href="https://github.com/OpenGVLab/InternVL">[🌟 Github]</a><br>
-<a href="https://internvl.readthedocs.io/en/latest/">[📜 Document]</a>
-<a href="https://internvl.opengvlab.com/">[🗨️ Official Demo]</a>
 """
 tos_markdown = """
 ### Terms of use
 By using this service, users are required to agree to the following terms:
-The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
 Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
 For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
 """
@@ -332,45 +351,45 @@ block_css = """
 }
 """
-js = """
-function createWaveAnimation() {
-    const text = document.getElementById('text');
-    var i = 0;
-    setInterval(function() {
-        const colors = [
-            'red, orange, yellow, green, blue, indigo, violet, purple',
-            'orange, yellow, green, blue, indigo, violet, purple, red',
-            'yellow, green, blue, indigo, violet, purple, red, orange',
-            'green, blue, indigo, violet, purple, red, orange, yellow',
-            'blue, indigo, violet, purple, red, orange, yellow, green',
-            'indigo, violet, purple, red, orange, yellow, green, blue',
-            'violet, purple, red, orange, yellow, green, blue, indigo',
-            'purple, red, orange, yellow, green, blue, indigo, violet',
-        ];
-        const angle = 45;
-        const colorIndex = i % colors.length;
-        text.style.background = `linear-gradient(${angle}deg, ${colors[colorIndex]})`;
-        text.style.webkitBackgroundClip = 'text';
-        text.style.backgroundClip = 'text';
-        text.style.color = 'transparent';
-        text.style.fontSize = '28px';
-        text.style.width = 'auto';
-        text.textContent = 'Vintern-1B';
-        text.style.fontWeight = 'bold';
-        i += 1;
-    }, 200);
-    const params = new URLSearchParams(window.location.search);
-    url_params = Object.fromEntries(params);
-    // console.log(url_params);
-    // console.log('hello world...');
-    // console.log(window.location.search);
-    // console.log('hello world...');
-    // alert(window.location.search)
-    // alert(url_params);
-    return url_params;
-}
-"""
 def build_demo():
@@ -472,7 +491,7 @@ def build_demo():
             with gr.Column(scale=8):
                 chatbot = gr.Chatbot(
                     elem_id="chatbot",
-                    label="InternVL",
                     height=580,
                     show_copy_button=True,
                     show_share_button=True,

 import time
 import hashlib
 import re
+import torch
 import gradio as gr
 import requests
 import random
 from filelock import FileLock
 from io import BytesIO
 from PIL import Image, ImageDraw, ImageFont
+from models import load_image
 from constants import LOGDIR
 from utils import (
     build_logger,
     get_log_filename,
 )
 from threading import Thread
+import traceback
+# import torch
 from conversation import Conversation
 from transformers import AutoModel, AutoTokenizer, TextIteratorStreamer
     ) * 5
 model_name = "5CD-AI/Vintern-1B-v3_5"
+model = None
 model = AutoModel.from_pretrained(
     model_name,
     torch_dtype=torch.bfloat16,
         ) + (no_change_btn,) * 5
         return
     if model is None:
         # state.messages[-1][-1] = server_error_msg
         state.update_message(Conversation.ASSISTANT, server_error_msg)
     try:
         # Stream output
         # response = requests.post(worker_addr, json=pload, headers=headers, stream=True, timeout=300)
+        print(f"all_image_paths: {all_image_paths}")
+        pixel_values = load_image(all_image_paths[0], max_num=6).to(torch.bfloat16)
+        print(f"pixel_values: {pixel_values}")
+        generation_config = dict(max_new_tokens= 700, do_sample=False, num_beams = 3, repetition_penalty=2.5)
+        message = state.get_user_message(source=state.USER)
+        print(f"######################")
+        print(f"message: {message}")
+        if pixel_values is not None:
+            question = '<image>\n'+message
+        else:
+            question = message
+        response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+        print(f"AI response: {response}")
+        # streamer = TextIteratorStreamer(
+        #     tokenizer, skip_prompt=True, skip_special_tokens=True
+        # )
+        # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+        # thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        # thread.start()
+        # response = "This is a test response"
         buffer = ""
+        for new_text in response:
             buffer += new_text
             # Remove <|im_end|> or similar tokens from the output
             buffer = buffer.replace("<|im_end|>", "")
             ) + (disable_btn,) * 5
     except Exception as e:
+        logger.error(f"Error in http_bot: {e}")
+        traceback.print_exc()
         state.update_message(Conversation.ASSISTANT, server_error_msg, None)
         yield (
             state,
 # <h1 style="font-size: 28px; font-weight: bold;">Expanding Performance Boundaries of Open-Source Multimodal Models with Model, Data, and Test-Time Scaling</h1>
 title_html = """
+<div style="text-align: center;">
+    <img src="https://lh3.googleusercontent.com/pw/AP1GczMmW-aFQ4dNaR_LCAllh4UZLLx9fTZ1ITHeGVMWx-1bwlIWz4VsWJSGb3_9C7CQfvboqJH41y2Sbc5ToC9ZmKeV4-buf_DEevIMU0HtaLWgHAPOqBiIbG6LaE8CvDqniLZzvB9UX8TR_-YgvYzPFt2z=w1472-h832-s-no-gm?authuser=0" style="height: 100; width: 100%;">
+    <p>Vintern-1B: An Efficient Multimodal Large Language Model for Vietnamese</p>
+    <a href="https://huggingface.co/papers/2408.12480">[📖 Vintern Paper]</a>
+    <a href="https://huggingface.co/5CD-AI">[🤗 5CD-AI Huggingface]</a>
+</div>
 """
 tos_markdown = """
 ### Terms of use
 By using this service, users are required to agree to the following terms:
+It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
 Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
 For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
 """
 }
 """
+# js = """
+# function createWaveAnimation() {
+#     const text = document.getElementById('text');
+#     var i = 0;
+#     setInterval(function() {
+#         const colors = [
+#             'red, orange, yellow, green, blue, indigo, violet, purple',
+#             'orange, yellow, green, blue, indigo, violet, purple, red',
+#             'yellow, green, blue, indigo, violet, purple, red, orange',
+#             'green, blue, indigo, violet, purple, red, orange, yellow',
+#             'blue, indigo, violet, purple, red, orange, yellow, green',
+#             'indigo, violet, purple, red, orange, yellow, green, blue',
+#             'violet, purple, red, orange, yellow, green, blue, indigo',
+#             'purple, red, orange, yellow, green, blue, indigo, violet',
+#         ];
+#         const angle = 45;
+#         const colorIndex = i % colors.length;
+#         text.style.background = `linear-gradient(${angle}deg, ${colors[colorIndex]})`;
+#         text.style.webkitBackgroundClip = 'text';
+#         text.style.backgroundClip = 'text';
+#         text.style.color = 'transparent';
+#         text.style.fontSize = '28px';
+#         text.style.width = 'auto';
+#         text.textContent = 'Vintern-1B';
+#         text.style.fontWeight = 'bold';
+#         i += 1;
+#     }, 200);
+#     const params = new URLSearchParams(window.location.search);
+#     url_params = Object.fromEntries(params);
+#     // console.log(url_params);
+#     // console.log('hello world...');
+#     // console.log(window.location.search);
+#     // console.log('hello world...');
+#     // alert(window.location.search)
+#     // alert(url_params);
+#     return url_params;
+# }
+# """
 def build_demo():
             with gr.Column(scale=8):
                 chatbot = gr.Chatbot(
                     elem_id="chatbot",
+                    label="Vintern",
                     height=580,
                     show_copy_button=True,
                     show_share_button=True,

conversation.py CHANGED Viewed

@@ -173,6 +173,15 @@ class Conversation:
                 images.append(image)
         return images
     def to_gradio_chatbot(self):
         ret = []
@@ -231,12 +240,14 @@ class Conversation:
     def update_message(self, role, content, image=None, idx=-1):
         assert len(self.messages) > 0, "No message in the conversation."
         idx = (idx + len(self.messages)) % len(self.messages)
-        assert (
-            self.messages[idx]["role"] == role
-        ), f"Role mismatch: {role} vs {self.messages[idx]['role']}"
         self.messages[idx]["content"] = content
         if image is not None:
@@ -245,6 +256,8 @@ class Conversation:
             if not isinstance(image, list):
                 image = [image]
             self.messages[idx]["image"].extend(image)
     def return_last_message(self):
         return self.messages[-1]["content"]

                 images.append(image)
         return images
+    def get_user_message(self, source: Union[str, None] = None):
+        """Return the content of the earliest message written by ``source``.
+
+        Args:
+            source: ``self.USER``, ``self.ASSISTANT`` or ``None``. ``None``
+                (or any falsy value) means ``self.USER``.
+
+        Returns:
+            The ``"content"`` value of the first entry in ``self.messages``
+            whose ``"role"`` matches, or ``None`` when no entry matches.
+
+        Raises:
+            AssertionError: if the conversation is empty or ``source`` is not
+                one of the accepted values.
+        """
+        # Raise explicitly instead of using `assert`, so the validation is
+        # still performed when Python runs with -O.
+        if len(self.messages) == 0:
+            raise AssertionError("No message in the conversation.")
+        if source not in [self.USER, self.ASSISTANT, None]:
+            raise AssertionError(f"Invalid source: {source}")
+        # Previously an ASSISTANT request could never match, because the loop
+        # only ever returned messages whose role was USER.
+        wanted_role = source if source else self.USER
+        # NOTE(review): this returns the earliest match, not the latest; a
+        # multi-turn caller that wants the current question should confirm
+        # that this is the intended behaviour.
+        for msg in self.messages:
+            if msg["role"] == wanted_role:
+                return msg["content"]
+        return None
     def to_gradio_chatbot(self):
         ret = []
     def update_message(self, role, content, image=None, idx=-1):
         assert len(self.messages) > 0, "No message in the conversation."
+        print(f"Messsage: {self.messages}")
         idx = (idx + len(self.messages)) % len(self.messages)
+        # assert (
+        #     self.messages[idx]["role"] == role
+        # ), f"Role mismatch: {role} vs {self.messages[idx]['role']}"
+        if role != Conversation.ASSISTANT and self.messages[idx]["role"] != role:
+            raise AssertionError(f"Role mismatch: {role} vs {self.messages[idx]['role']}")
         self.messages[idx]["content"] = content
         if image is not None:
             if not isinstance(image, list):
                 image = [image]
             self.messages[idx]["image"].extend(image)
+        print(f"Updated message: {self.messages}")
     def return_last_message(self):
         """Return the ``"content"`` value of the final entry in ``self.messages``."""
         # NOTE(review): indexing with [-1] presumably fails when there are no
         # messages yet; confirm callers only use this on a non-empty conversation.
         return self.messages[-1]["content"]

logs/2025-01-15-conv.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{"tstamp": 1736901847.9869, "like": false, "index": [0, 1], "model": "Vintern-1B-v3", "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"}]}, "ip": "127.0.0.1"}
+{"tstamp": 1736901849.31, "like": false, "index": [0, 1], "model": "Vintern-1B-v3", "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"}]}, "ip": "127.0.0.1"}
+{"tstamp": 1736901850.7274, "like": false, "index": [0, 1], "model": "Vintern-1B-v3", "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"}]}, "ip": "127.0.0.1"}
+{"tstamp": 1736901851.5865, "like": true, "index": [0, 1], "model": "Vintern-1B-v3", "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"}]}, "ip": "127.0.0.1"}
+{"tstamp": 1736901852.2976, "like": false, "index": [0, 1], "model": "Vintern-1B-v3", "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"}]}, "ip": "127.0.0.1"}
+{"tstamp": 1736901853.9731, "like": false, "index": [0, 1], "model": "Vintern-1B-v3", "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"}]}, "ip": "127.0.0.1"}
+{"tstamp": 1736901854.5329, "like": false, "index": [0, 1], "model": "Vintern-1B-v3", "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"}]}, "ip": "127.0.0.1"}
+{"tstamp": 1736901854.6853, "like": false, "index": [0, 1], "model": "Vintern-1B-v3", "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"}]}, "ip": "127.0.0.1"}
+{"tstamp": 1736903025.7072, "like": null, "model": "5CD-AI/Vintern-1B-v3_5", "start": 1736903024.824, "finish": 1736903024.824, "state": {"mandatory_system_message": "\u6211\u662f\u4e66\u751f\u00b7\u4e07\u8c61\uff0c\u82f1\u6587\u540d\u662fInternVL\uff0c\u662f\u7531\u4e0a\u6d77\u4eba\u5de5\u667a\u80fd\u5b9e\u9a8c\u5ba4\u3001\u6e05\u534e\u5927\u5b66\u53ca\u591a\u5bb6\u5408\u4f5c\u5355\u4f4d\u8054\u5408\u5f00\u53d1\u7684\u591a\u6a21\u6001\u5927\u8bed\u8a00\u6a21\u578b\u3002", "system_message": "\u8bf7\u5c3d\u53ef\u80fd\u8be6\u7ec6\u5730\u56de\u7b54\u7528\u6237\u7684\u95ee\u9898\u3002", "roles": ["system", "user", "assistant"], "messages": [{"role": "user", "content": "Please help me analyze this picture.", "image": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"]}, {"role": "assistant", "content": "This is a test response"}]}, "images": ["logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg"], "ip": "127.0.0.1"}

logs/2025-01-15-conv.json.lock ADDED Viewed

File without changes

logs/gradio_web_server.log ADDED Viewed

The diff for this file is too large to render. See raw diff

logs/serve_images/2025-01-15/d7ea81988546544ca773fc48dc9da837.jpg ADDED Viewed

models.py ADDED Viewed

	@@ -0,0 +1,112 @@

+import torch
+import numpy as np
+import torch
+import torchvision.transforms as T
+from torchvision.transforms.functional import InterpolationMode
+from PIL import Image
+from PIL import Image, ExifTags
+# Per-channel mean and standard deviation handed to T.Normalize in build_transform.
+# NOTE(review): presumably the usual ImageNet RGB statistics, as the names suggest; confirm.
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+def build_transform(input_size):
+    """Build the preprocessing pipeline applied to each image tile.
+
+    The returned torchvision transform converts the image to RGB when it is
+    in another mode, resizes it to ``input_size`` x ``input_size`` with
+    bicubic interpolation, turns it into a tensor and normalises it with
+    ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
+    """
+    def _ensure_rgb(img):
+        # Leave images that are already RGB untouched.
+        return img if img.mode == 'RGB' else img.convert('RGB')
+
+    steps = [
+        T.Lambda(_ensure_rgb),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
+    ]
+    return T.Compose(steps)
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    """Pick the (columns, rows) grid whose aspect ratio is nearest to the image's.
+
+    Candidates are visited in the order given. A candidate with a strictly
+    smaller absolute ratio difference always wins. A candidate that ties with
+    the current best replaces it only when the image area (``width * height``)
+    exceeds half the pixel area of that candidate's grid of
+    ``image_size`` x ``image_size`` tiles.
+
+    Returns ``(1, 1)`` when ``target_ratios`` is empty.
+    """
+    image_area = width * height
+    chosen = (1, 1)
+    smallest_gap = float('inf')
+    for candidate in target_ratios:
+        cols, rows = candidate[0], candidate[1]
+        gap = abs(aspect_ratio - cols / rows)
+        if gap < smallest_gap:
+            smallest_gap, chosen = gap, candidate
+            continue
+        # Tie: prefer the later grid only if the image is large enough for it.
+        if gap == smallest_gap and image_area > 0.5 * image_size * image_size * cols * rows:
+            chosen = candidate
+    return chosen
+def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+    """Resize ``image`` to a grid of square tiles and return the tiles.
+
+    A (columns, rows) grid with between ``min_num`` and ``max_num`` tiles is
+    chosen by ``find_closest_aspect_ratio``. The image is resized to exactly
+    fill that grid and cut row by row into ``image_size`` x ``image_size``
+    crops. When ``use_thumbnail`` is true and more than one tile was produced,
+    a copy of the whole image resized to ``image_size`` x ``image_size`` is
+    appended as the last element.
+    """
+    src_w, src_h = image.size
+    src_ratio = src_w / src_h
+
+    # Every (columns, rows) pair whose tile count lies in [min_num, max_num],
+    # ordered by tile count.
+    grid_options = set(
+        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+        i * j <= max_num and i * j >= min_num)
+    grid_options = sorted(grid_options, key=lambda g: g[0] * g[1])
+
+    grid = find_closest_aspect_ratio(
+        src_ratio, grid_options, src_w, src_h, image_size)
+
+    target_width = image_size * grid[0]
+    target_height = image_size * grid[1]
+    blocks = grid[0] * grid[1]
+
+    resized = image.resize((target_width, target_height))
+    tiles_per_row = target_width // image_size
+
+    tiles = []
+    for idx in range(blocks):
+        # Walk the grid left to right, top to bottom.
+        row, col = divmod(idx, tiles_per_row)
+        left = col * image_size
+        top = row * image_size
+        tiles.append(resized.crop((left, top, left + image_size, top + image_size)))
+    assert len(tiles) == blocks
+
+    if use_thumbnail and len(tiles) != 1:
+        tiles.append(image.resize((image_size, image_size)))
+    return tiles
+def correct_image_orientation(image_path):
+    """Open an image and rotate it according to its EXIF Orientation tag.
+
+    Args:
+        image_path: path (or file object) accepted by ``Image.open``.
+
+    Returns:
+        The opened PIL image, rotated by 180, -90 or 90 degrees when the
+        Orientation value is 3, 6 or 8 respectively; otherwise the image as
+        opened. Errors while reading EXIF are reported with ``print`` and the
+        unrotated image is returned.
+    """
+    # Open the image.
+    image = Image.open(image_path)
+    # Tag id that ExifTags.TAGS names "Orientation".
+    orientation_tag = 0x0112
+    # Rotation in degrees to apply for each handled Orientation value.
+    rotation_by_orientation = {3: 180, 6: -90, 8: 90}
+    # Check the EXIF data, if any.
+    try:
+        # getexif() is the public accessor available on every PIL image. The
+        # private _getexif() hook used before is not defined for every image
+        # format, so those images hit the except path and logged a spurious
+        # error even though nothing was wrong.
+        orientation = image.getexif().get(orientation_tag)
+        rotation = rotation_by_orientation.get(orientation)
+        if rotation is not None:
+            # Fix the rotation indicated by Orientation.
+            image = image.rotate(rotation, expand=True)
+    except Exception as e:
+        print("Không thể xử lý Exif:", e)
+    return image
+def load_image(image_file, input_size=448, max_num=12):
+    """Load an image file and turn it into a stacked tensor of tiles.
+
+    The image is opened with ``correct_image_orientation``, converted to RGB,
+    split by ``dynamic_preprocess`` (thumbnail enabled, at most ``max_num``
+    grid tiles) and each tile is passed through ``build_transform``.
+
+    Returns:
+        The result of ``torch.stack`` over the transformed tiles, or ``None``
+        if any step raises; the error is printed rather than propagated.
+    """
+    try:
+        print("Loading image:", image_file)
+        rgb_image = correct_image_orientation(image_file).convert('RGB')
+        print("Image size:", rgb_image.size)
+        to_tensor = build_transform(input_size=input_size)
+        tiles = dynamic_preprocess(rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+        print("Number of images:", len(tiles))
+        batch = torch.stack([to_tensor(tile) for tile in tiles])
+        print("Image loaded successfully.")
+        return batch
+    except Exception as e:
+        # Best effort: report the failure and signal it with None.
+        print("Error loading image:", e)
+        return None

utils.py CHANGED Viewed

@@ -12,7 +12,7 @@ from constants import LOGDIR
 import datetime
 server_error_msg = (
-    "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
 )
 moderation_msg = (
     "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."

 import datetime
 server_error_msg = (
+    "**COULD NOT LOAD MODEL. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
 )
 moderation_msg = (
     "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."