Spaces:

HuggingFaceM4
/

idefics_playground

Runtime error

App Files Files Community

VictorSanh committed on Jul 31, 2023

Commit

ac8d53f

1 Parent(s): f20bb4d

Update visualization

Browse files

Files changed (5) hide show

app_bis.py +0 -857
app_dialogue.py +279 -119
elon_musk.md +0 -23
hello.py +0 -73
requirements.txt +1 -1

app_bis.py DELETED Viewed

@@ -1,857 +0,0 @@
-import logging
-import os
-import re
-import time
-from io import BytesIO
-import gradio as gr
-import requests
-import torch
-import transformers
-from accelerate.utils import get_max_memory
-from joblib import Parallel, delayed
-from PIL import Image
-from transformers import AutoTokenizer
-from m4.models.vbloom import configuration_vbloom, modeling_vbloom
-from m4.models.vgpt2 import configuration_vgpt2, modeling_vgpt2
-from m4.models.vgpt_neo import configuration_vgpt_neo, modeling_vgpt_neo
-from m4.models.vllama import configuration_vllama, modeling_vllama
-from m4.models.vopt import configuration_vopt, modeling_vopt
-from m4.training.packing import image_attention_mask_for_packed_input_ids, incremental_to_binary_attention_mask
-from m4.training.utils import build_image_transform
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger()
-CURRENT_MODEL = "tr_209_ift_mixture_opt_step-2000"
-MAX_TRIES = 3
-TOKENIZER_FAST = True
-MAX_SEQ_LEN = 1024
-model, tokenizer = None, None
-MODEL_TO_DISPLAY_NAME = {
-    "tr_199_w_xattn_opt_step-65000": "VLlama - tr_199_w_xattn_opt_step-65000",
-    "tr_201_sft_on_lrv_opt_step-15000": "VLlama - tr_201_sft_on_lrv_opt_step-15000",
-    "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlama - tr_202bis_ift_llava_all_unfrozen_opt_step-14128",
-    "tr_203_ift_m3it_opt_step-50000": "VLlama - tr_203_ift_m3it_opt_step-50000",
-    "tr_205_sft_ultrachat_opt_step-20000": "VLlama - tr_205_sft_ultrachat_opt_step-20000",
-    "tr_207_ift_svit_opt_step-14627": "VLlama - tr_207_ift_svit_opt_step-14627",
-    "tr_209_ift_mixture_opt_step-2000": "VLlama - tr_209_ift_mixture_opt_step-2000",
-}
-MODEL_TO_MODEL_CLASS = {
-    "tr_199_w_xattn_opt_step-65000": "VLlamaForCausalLM",
-    "tr_201_sft_on_lrv_opt_step-15000": "VLlamaForCausalLM",
-    "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlamaForCausalLM",
-    "tr_203_ift_m3it_opt_step-50000": "VLlamaForCausalLM",
-    "tr_205_sft_ultrachat_opt_step-20000": "VLlamaForCausalLM",
-    "tr_207_ift_svit_opt_step-14627": "VLlamaForCausalLM",
-    "tr_209_ift_mixture_opt_step-2000": "VLlamaForCausalLM",
-}
-MODEL_TO_CONFIG_CLASS = {
-    "tr_199_w_xattn_opt_step-65000": "VLlamaConfig",
-    "tr_201_sft_on_lrv_opt_step-15000": "VLlamaConfig",
-    "tr_202bis_ift_llava_all_unfrozen_opt_step-14128": "VLlamaConfig",
-    "tr_203_ift_m3it_opt_step-50000": "VLlamaConfig",
-    "tr_205_sft_ultrachat_opt_step-20000": "VLlamaConfig",
-    "tr_207_ift_svit_opt_step-14627": "VLlamaConfig",
-    "tr_209_ift_mixture_opt_step-2000": "VLlamaConfig",
-}
-def load_tokenizer_model(model_name, model_class):
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_name,
-        use_fast=TOKENIZER_FAST,
-        use_auth_token=os.getenv("HF_AUTH_TOKEN", True),  # `use_fast=False` for 1B3 OPT, True for all the other models
-    )
-    tokenizer.padding_side = "left"
-    config_class = MODEL_TO_CONFIG_CLASS[model_name.split("/")[-1]]
-    # assert tokenizer.is_fast
-    supported_custom_modules = {
-        "vgpt2": modeling_vgpt2,
-        "vbloom": modeling_vbloom,
-        "vgptneo": modeling_vgpt_neo,
-        "vopt": modeling_vopt,
-        "vllama": modeling_vllama,
-    }
-    supported_custom_configs = {
-        "vgpt2": configuration_vgpt2,
-        "vbloom": configuration_vbloom,
-        "vgptneo": configuration_vgpt_neo,
-        "vopt": configuration_vopt,
-        "vllama": configuration_vllama,
-    }
-    parent_config_class = (
-        [v for k, v in supported_custom_configs.items() if k in model_class.lower()] + [transformers]
-    )[0]
-    parent_model_class = (
-        [v for k, v in supported_custom_modules.items() if k in model_class.lower()] + [transformers]
-    )[0]
-    config_class = getattr(parent_config_class, config_class)
-    model_class = getattr(parent_model_class, model_class)
-    config = config_class.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
-    max_memory_map = get_max_memory()
-    for key in max_memory_map.keys():
-        if key != "cpu":
-            # Get this in GB
-            max_memory_map[key] = max_memory_map[key] // (1024 * 1024 * 1024)
-            # Subtract 2 GiB for PyTorch overhead and 2 GiB for the forward pass, to be safe
-            max_memory_map[key] = f"{max_memory_map[key] - 4} GiB"
-    model = model_class.from_pretrained(
-        model_name,
-        use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
-        device_map="auto",
-        offload_folder="./offload",
-        torch_dtype=config.torch_dtype,
-        max_memory=max_memory_map,
-    )
-    model.eval()
-    print("Current device map:", model.hf_device_map)
-    print("Model default generation config:", model.generation_config)
-    # TODO: the device_map looks very inefficient right now; that could be improved.
-    # It typically looks like this:
-    # {
-    #     'model.embed_tokens': 0,
-    #     'model.vision_model': 0,
-    #     'model.layers.0': 0,
-    #     'model.layers.1': 0,
-    #     'model.layers.2': 0,
-    #     'model.layers.3': 0,
-    #     'model.layers.4': 0,
-    #     'model.layers.5': 0,
-    #     'model.layers.6': 1,
-    #     'model.layers.7': 1,
-    #     'model.layers.8': 1,
-    #     'model.layers.9': 1,
-    #     'model.layers.10': 1,
-    #     'model.layers.11': 1,
-    #     'model.layers.12': 1,
-    #     'model.layers.13': 1,
-    #     'model.layers.14': 1,
-    #     'model.layers.15': 1,
-    #     'model.layers.16': 1,
-    #     'model.layers.17': 2,
-    #     'model.layers.18': 2,
-    #     'model.layers.19': 2,
-    #     'model.layers.20': 2,
-    #     'model.layers.21': 2,
-    #     'model.layers.22': 2,
-    #     'model.layers.23': 2,
-    #     'model.layers.24': 2,
-    #     'model.layers.25': 2,
-    #     'model.layers.26': 2,
-    #     'model.layers.27': 2,
-    #     'model.layers.28': 3,
-    #     'model.layers.29': 3,
-    #     'model.layers.30': 3,
-    #     'model.layers.31': 3,
-    #     'model.gated_cross_attn_layers.0': 3,
-    #     'model.gated_cross_attn_layers.1': 3,
-    #     'model.gated_cross_attn_layers.2': 3,
-    #     'model.gated_cross_attn_layers.3': 3,
-    #     'model.gated_cross_attn_layers.4': 3,
-    #     'model.gated_cross_attn_layers.5': 3,
-    #     'model.gated_cross_attn_layers.6': 3,
-    #     'model.gated_cross_attn_layers.7': 3,
-    #     'model.gated_cross_attn_layers.8': 4,
-    #     'model.gated_cross_attn_layers.9': 4,
-    #     'model.gated_cross_attn_layers.10': 4,
-    #     'model.gated_cross_attn_layers.11': 4,
-    #     'model.gated_cross_attn_layers.12': 4,
-    #     'model.gated_cross_attn_layers.13': 4,
-    #     'model.gated_cross_attn_layers.14': 4,
-    #     'model.gated_cross_attn_layers.15': 4,
-    #     'model.norm': 4,
-    #     'lm_head': 4
-    # }    which means there are a lot of things going back and forth between the gated cross attention layers and the LM layers...
-    return tokenizer, model
-MODEL_TO_SPACE_MAPPING = {}
-IS_MAIN_SPACE = CURRENT_MODEL not in MODEL_TO_MODEL_CLASS
-if IS_MAIN_SPACE:
-    for model in MODEL_TO_MODEL_CLASS:
-        MODEL_TO_SPACE_MAPPING[model] = gr.Blocks.load(
-            name=f"spaces/HuggingFaceM4/{model}", api_key=os.getenv("HF_AUTH_TOKEN", True)
-        )
-else:
-    model_path = f"HuggingFaceM4/{CURRENT_MODEL}"
-    tokenizer, model = load_tokenizer_model(model_path, MODEL_TO_MODEL_CLASS[CURRENT_MODEL])
-def fetch_images(url_images):
-    images = []
-    for url in url_images:
-        if isinstance(url, str):
-            images.append(Image.open(BytesIO(requests.get(url, stream=True).content)))
-        else:
-            images.append(url)
-    return images
-def model_generation(
-    prompt,
-    images,
-    tokenizer,
-    model,
-    temperature,
-    no_repeat_ngram_size,
-    max_new_tokens,
-    min_length,
-    ban_tokens,
-    forced_eos_token_id,
-    eos_tokens,
-    force_words,
-    length_penalty,
-    repetition_penalty,
-    hide_special_tokens,
-    stop_generation,
-    decoding_strategy,
-    num_beams,
-    top_k,
-    top_p,
-    penalty_alpha,
-):
-    # Preparing inputs
-    tokens = tokenizer(
-        [prompt],
-        truncation=True,
-        max_length=MAX_SEQ_LEN,
-        padding=True,
-        add_special_tokens=False,
-    )
-    input_ids = torch.tensor([[tokenizer.bos_token_id] + tokens.input_ids[0]])
-    attention_mask = torch.tensor([[1] + tokens.attention_mask[0]])
-    image_attention_mask = [
-        incremental_to_binary_attention_mask(
-            image_attention_mask_for_packed_input_ids(input_ids[0].unsqueeze(0), tokenizer)[0], num_classes=len(images)
-        )
-    ]
-    image_transform = build_image_transform(eval=True)
-    pixel_values = [torch.stack([image_transform(img) for img in images])]
-    input_ids = input_ids.to(0)
-    attention_mask = attention_mask.to(0)
-    pixel_values = torch.stack(pixel_values).to(0)
-    image_attention_mask = torch.cat(image_attention_mask, 0).to(0)
-    # Excluding some words from the generation
-    bad_words_ids = None
-    ban_tokens = ban_tokens.replace("\\n", "\n")
-    bad_words = ban_tokens.split(";")
-    if len(bad_words) > 0:
-        bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids
-    # Forcing some words in the generation
-    force_words_ids = None
-    if force_words != "":
-        force_words = force_words.replace("\\n", "\n")
-        force_words = force_words.split(";")
-        if len(force_words) > 0:
-            force_words_ids = tokenizer(force_words, add_special_tokens=False).input_ids
-    # eos_token_ids = None
-    # if eos_tokens != "":
-    #     eos_tokens = eos_tokens.replace("\\n", "\n")
-    #     eos_tokens = eos_tokens.split(";")
-    #     if len(eos_tokens) > 0:
-    #         eos_token_ids = []
-    #         for eos_token in eos_tokens:
-    #             tokenized_eos_token = tokenizer(eos_token, add_special_tokens=False).input_ids
-    #             if len(tokenized_eos_token) > 1:
-    #                 raise ValueError(
-    #                     f"eos_tokens should be one token, here {eos_token} is {len(tokenized_eos_token)} tokens:"
-    #                     f" {tokenized_eos_token}"
-    #                 )
-    #             eos_token_ids += tokenized_eos_token
-    # if forced_eos_token_id and eos_token_ids is None:
-    #     raise ValueError("You can't use forced_eos_token_id without eos_tokens")
-    # elif forced_eos_token_id:
-    #     forced_eos_token_id = eos_token_ids
-    # else:
-    #     forced_eos_token_id = None
-    # Inputs
-    input_args = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "pixel_values": pixel_values,
-        "image_attention_mask": image_attention_mask,
-    }
-    # Common parameters to all decoding strategies
-    # This documentation is useful to read: https://huggingface.co/docs/transformers/main/en/generation_strategies
-    generation_args = {
-        "temperature": temperature,
-        "no_repeat_ngram_size": no_repeat_ngram_size,
-        "max_new_tokens": max_new_tokens,
-        "min_length": min_length,
-        "bad_words_ids": bad_words_ids,
-        # "forced_eos_token_id": forced_eos_token_id,
-        "force_words_ids": force_words_ids,
-        "length_penalty": length_penalty,
-        "repetition_penalty": repetition_penalty,
-        "eos_token_id": tokenizer.eos_token_id,
-    }
-    assert decoding_strategy in [
-        "greedy",
-        "beam_search",
-        "beam_sampling",
-        "sampling_top_k",
-        "sampling_top_p",
-        "contrastive_sampling",
-    ]
-    if decoding_strategy == "greedy":
-        pass
-    elif decoding_strategy == "beam_search":
-        generation_args["num_beams"] = num_beams
-        assert generation_args["num_beams"] > 1
-    elif decoding_strategy == "beam_sampling":
-        generation_args["num_beams"] = num_beams
-        generation_args["do_sample"] = True
-        assert generation_args["num_beams"] > 1
-    elif decoding_strategy == "sampling_top_k":
-        generation_args["do_sample"] = True
-        generation_args["top_k"] = top_k
-    elif decoding_strategy == "sampling_top_p":
-        generation_args["do_sample"] = True
-        generation_args["top_p"] = top_p
-    elif decoding_strategy == "contrastive_sampling":
-        generation_args["do_sample"] = True
-        generation_args["penalty_alpha"] = penalty_alpha
-        generation_args["top_k"] = top_k
-    generated_tokens = model.generate(
-        **input_args,
-        **generation_args,
-    )
-    tokens = tokenizer.convert_ids_to_tokens(generated_tokens[0])
-    decoded_skip_special_tokens = repr(
-        tokenizer.batch_decode(generated_tokens, skip_special_tokens=hide_special_tokens)[0]
-    )
-    decoded = repr(tokenizer.batch_decode(generated_tokens)[0])
-    logger.info(
-        "Result: \n"
-        f"Prompt: `{prompt}`\n"
-        f"Tokens ids from prompt + generation: `{generated_tokens[0].tolist()}`\n"
-        f"Tokens (converted) from prompt + generation: `{tokens}`\n"
-        f"String decoded with skipped special tokens: `{decoded_skip_special_tokens}`\n"
-        f"String decoded: `{decoded}`\n"
-        f"Generation mode: `{decoding_strategy}`\n"
-        f"Generation parameters: `{generation_args}`\n"
-    )
-    original_prompt = generated_tokens[:, : input_ids.shape[-1]]
-    actual_generated_tokens = generated_tokens[:, input_ids.shape[-1] :]
-    if stop_generation:
-        # Additional stopping criteria: generating <image> token, <end_of_text> token or <begin_of_text> token
-        assert tokenizer.additional_special_tokens[-1] == "<image>"
-        image_token_id = tokenizer.additional_special_tokens_ids[-1]
-        end_of_text_token_id = tokenizer.eos_token_id
-        begin_of_text_token_id = tokenizer.bos_token_id
-        image_token_ids = (actual_generated_tokens == image_token_id).nonzero(as_tuple=True)[1]
-        end_of_text_token_ids = (actual_generated_tokens == end_of_text_token_id).nonzero(as_tuple=True)[1]
-        begin_of_text_token_ids = (actual_generated_tokens == begin_of_text_token_id).nonzero(as_tuple=True)[1]
-        first_end_token = min(
-            image_token_ids[0] if len(image_token_ids) else len(actual_generated_tokens[0]),
-            end_of_text_token_ids[0] if len(end_of_text_token_ids) else len(actual_generated_tokens[0]),
-            begin_of_text_token_ids[0] if len(begin_of_text_token_ids) else len(actual_generated_tokens[0]),
-        )
-    else:
-        first_end_token = len(actual_generated_tokens[0])
-    actual_generated_tokens = actual_generated_tokens[:, :first_end_token]
-    displayed_tokens = torch.cat([original_prompt, actual_generated_tokens], dim=-1)
-    generated_text = tokenizer.batch_decode(displayed_tokens, skip_special_tokens=hide_special_tokens)[0]
-    return generated_text
-def model_inference(
-    files,
-    prompt,
-    temperature,
-    no_repeat_ngram_size,
-    max_new_tokens,
-    min_length,
-    ban_tokens,
-    forced_eos_token_id,
-    eos_tokens,
-    force_words,
-    length_penalty,
-    repetition_penalty,
-    hide_special_tokens,
-    stop_generation,
-    decoding_strategy,
-    num_beams,
-    top_k,
-    top_p,
-    penalty_alpha,
-):
-    if isinstance(files, str) and len(files) == 0:
-        files = None
-    prompt = prompt.strip()
-    prompt = prompt.replace("\\n", "\n")
-    file_idx = 0
-    url_images = re.findall(r"<image(.*?)>", prompt)
-    for idx, url_image in enumerate(url_images):
-        if len(url_image) == 0:
-            url_images[idx] = Image.open(files[file_idx].name if hasattr(files[file_idx], "name") else files[file_idx])
-            file_idx += 1
-        else:
-            prompt = prompt.replace(url_image, "")
-            url_images[idx] = url_images[idx][1:]
-    images = fetch_images(url_images)
-    global model, tokenizer
-    generated_text = model_generation(
-        prompt=prompt,
-        images=images,
-        tokenizer=tokenizer,
-        model=model,
-        temperature=temperature,
-        no_repeat_ngram_size=no_repeat_ngram_size,
-        max_new_tokens=max_new_tokens,
-        min_length=min_length,
-        ban_tokens=ban_tokens,
-        forced_eos_token_id=forced_eos_token_id,
-        eos_tokens=eos_tokens,
-        force_words=force_words,
-        length_penalty=length_penalty,
-        repetition_penalty=repetition_penalty,
-        hide_special_tokens=hide_special_tokens,
-        stop_generation=stop_generation,
-        decoding_strategy=decoding_strategy,
-        num_beams=num_beams,
-        top_k=top_k,
-        top_p=top_p,
-        penalty_alpha=penalty_alpha,
-    )
-    return generated_text.strip()
-def try_model_inference(
-    model,
-    files,
-    prompt,
-    temperature,
-    no_repeat_ngram_size,
-    max_new_tokens,
-    min_length,
-    ban_tokens,
-    forced_eos_token_id,
-    eos_tokens,
-    force_words,
-    length_penalty,
-    repetition_penalty,
-    hide_special_tokens,
-    stop_generation,
-    decoding_strategy,
-    num_beams,
-    top_k,
-    top_p,
-    penalty_alpha,
-):
-    count = 0
-    while count < MAX_TRIES:
-        try:
-            return MODEL_TO_SPACE_MAPPING[model](
-                files,
-                prompt,
-                temperature,
-                no_repeat_ngram_size,
-                max_new_tokens,
-                min_length,
-                ban_tokens,
-                forced_eos_token_id,
-                eos_tokens,
-                force_words,
-                length_penalty,
-                repetition_penalty,
-                hide_special_tokens,
-                stop_generation,
-                decoding_strategy,
-                num_beams,
-                top_k,
-                top_p,
-                penalty_alpha,
-                api_name="model_inference",
-            )
-        except KeyError:
-            # Gradio returns {'error': None} sometimes.
-            time.sleep(3)
-            count += 1
-            pass
-def all_model_inference(
-    prompt,
-    temperature,
-    no_repeat_ngram_size,
-    max_new_tokens,
-    min_length,
-    ban_tokens,
-    forced_eos_token_id,
-    eos_tokens,
-    force_words,
-    length_penalty,
-    repetition_penalty,
-    hide_special_tokens,
-    stop_generation,
-    decoding_strategy,
-    num_beams,
-    top_k,
-    top_p,
-    penalty_alpha,
-):
-    outputs = []
-    print(
-        prompt,
-        temperature,
-        no_repeat_ngram_size,
-        max_new_tokens,
-        min_length,
-        ban_tokens,
-        forced_eos_token_id,
-        eos_tokens,
-        force_words,
-        length_penalty,
-        repetition_penalty,
-        hide_special_tokens,
-        stop_generation,
-        decoding_strategy,
-        num_beams,
-        top_k,
-        top_p,
-        penalty_alpha,
-    )
-    outputs = Parallel(n_jobs=len(MODEL_TO_SPACE_MAPPING), backend="threading")(
-        delayed(try_model_inference)(
-            model,
-            os.path.join(os.path.dirname(__file__), "images", "bear.jpg"),
-            prompt,
-            temperature,
-            no_repeat_ngram_size,
-            max_new_tokens,
-            min_length,
-            ban_tokens,
-            forced_eos_token_id,
-            eos_tokens,
-            force_words,
-            length_penalty,
-            repetition_penalty,
-            hide_special_tokens,
-            stop_generation,
-            decoding_strategy,
-            num_beams,
-            top_k,
-            top_p,
-            penalty_alpha,
-        )
-        for model in MODEL_TO_SPACE_MAPPING
-    )
-    if len(outputs) == 1:
-        outputs = outputs[0]
-    return outputs
-examples = [
-    [
-        None,
-        """This is a conversation between a human, User, and an intelligent visual AI, Assistant. User sends images, and Assistant answers the questions from the user. The assistant should be friendly, informative and should not change the topic if it's not asked to.
-Here's an example of a conversation:
-User:<fake_token_around_image><image:https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg><fake_token_around_image>Describe this image.
-Assistant: Two kittens are cuddling in the center of the photograph. They are surrounded by tall, bright green grass, and the background is blurred.
-User:<fake_token_around_image><image:https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg><fake_token_around_image>How about this image? Can you describe it too?
-Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you, tilting its head to the side. The dog has a white body and brown patches on its face and ears. Its eyes are brown. Its nose is black, and it has long, floppy ears, short legs, white paws, long fur, big eyes, and black eyebrows.
-User: What kind of breed is it?
-Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.
----
-User:<fake_token_around_image><image:https://m.media-amazon.com/images/M/MV5BMjE4MTcwMTM1Nl5BMl5BanBnXkFtZTcwMTIwMzMzMw@@._V1_.jpg><fake_token_around_image>Describe all of the parts of this image.
-Assistant:""",
-        1.0,
-        0,
-        256,
-        10,
-        "<image>;<fake_token_around_image>",
-        False,
-        "</s>",
-        "",
-        1.0,
-        1.0,
-        False,
-        True,
-        "greedy",
-        1,
-        50,
-        0.5,
-        0.95,
-    ],
-    #     [
-    #         None,
-    #         """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
-    # User: <fake_token_around_image><image:https://m.media-amazon.com/images/M/MV5BMjE4MTcwMTM1Nl5BMl5BanBnXkFtZTcwMTIwMzMzMw@@._V1_.jpg><fake_token_around_image>
-    # Describe this image.
-    # Bot:""",
-    #         1,
-    #         2,
-    #         64,
-    #         10,
-    #         "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
-    #         False,
-    #         False,
-    #         True,
-    #     ],
-    #     [
-    #         None,
-    #         """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
-    # User: <fake_token_around_image><image:https://i.redd.it/hsktcp4nv1g01.jpg><fake_token_around_image>
-    # Why do people find this image funny?
-    # Bot:""",
-    #         1,
-    #         2,
-    #         64,
-    #         10,
-    #         "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
-    #         False,
-    #         False,
-    #         True,
-    #     ],
-    #     [
-    #         None,
-    #         """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
-    # User: <fake_token_around_image><image:https://pbs.twimg.com/media/FooD7oyakAIU5_Q?format=jpg&name=large><fake_token_around_image>
-    # Describe what's in this image.
-    # Bot:""",
-    #         1,
-    #         2,
-    #         64,
-    #         10,
-    #         "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
-    #         False,
-    #         False,
-    #         True,
-    #     ],
-    #     [
-    #         None,
-    #         """This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answer the questions from the user.
-    # User: <fake_token_around_image><image:https://www.tutorialride.com/images/non-verbal-analogy-questions/non-verbal-analogy-logical-reasoning-1.jpg><fake_token_around_image>
-    # What's the correct answer? A, B, C or D?
-    # Bot:""",
-    #         1,
-    #         2,
-    #         64,
-    #         10,
-    #         "<image>;<fake_token_around_image>;User;user;Bot;bot;Question;question;Answer;answer;\n",
-    #         False,
-    #         False,
-    #         True,
-    #     ],
-]
-title = """<head><title><h1 align='center'>🔮✍️ Text generation with IDEFICS models 🦙📚</h1></title></head>"""
-MSG_MAIN = """
-# Text generation with Vllama models
-### Help to write prompts:
-Put the urls to the images inside the image tokens, it will be converted into the real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT space. The texts \\n will be converted into real newline characters. See examples and additional details below.
-"""
-# MSG_DETAILS = """
-# ### Additional details
-# - if the model was trained with the template 1 (`\\n\\n<image>\\n\\n`), then `<fake_token_around_image>` will be replaced with `\\n\\n`. This is particularly useful if you are comparing the performance of different models trained with different templates.
-# - special tokens are not automatically added to the prompt, so add them manually.
-# - with the first template `\\n\\n<image>\\n\\n` , the sequence isn't necessary tokenized as `["\\n\\n", "<image>", "\\n\\n"]` to enforce this behavior, you can use the "Integrate image sequence as ids" parameter.
-# """
-# if ~IS_MAIN_SPACE:
-#     MSG_DETAILS += (
-#         "- alternatively, you can upload images and then directly specify them via \<image\> tag in the prompt."
-#     )
-with gr.Blocks() as demo:
-    gr.HTML(title)
-    gr.HTML("""<h3 align='center'>Help to write prompts:🙌</h3><br>
-                <p>Put the urls to the images inside the image tokens,
-                it will be converted into the real image tokens.
-                Put <fake_token_around_image> before and after each
-                image token WITHOUT space. The texts \\n will be
-                converted into real newline characters.
-                See examples and additional details below.""")
-    #gr.HTML("<h3 align='center'>Help to write prompts:🙌</h3><br>Put the urls to the images inside the image tokens, it will be converted into the real image tokens. Put <fake_token_around_image> before and after each image token WITHOUT space. The texts \\n will be converted into real newline characters. See examples and additional details below.")
-    #gr.Markdown(MSG_MAIN)
-    #with gr.Row():
-    #with gr.Column():
-    gr.Markdown("## Input")
-    with gr.Row():
-        if not IS_MAIN_SPACE:
-            images = gr.File(label="Images", file_count="multiple")
-        prompt = gr.Textbox(label="Prompt", placeholder="Enter the prompt here", lines=5)
-    #gr.Markdown("## Common parameters to all decoding strategy")
-    with gr.Row():
-        with gr.Accordion("Common parameters to all decoding strategy", open=False, elem_id="common_params"):
-            temperature = gr.Slider(minimum=0.0, maximum=1.0, step=0.01, value=1.0, label="Softmax temperature")
-            no_repeat_ngram_size = gr.Slider(
-                minimum=0,
-                maximum=10,
-                step=1,
-                value=0,
-                label="The size of an n-gram that cannot occur more than once (0=infinity)",
-            )
-            max_new_tokens = gr.Slider(
-                minimum=0, maximum=512, step=1, value=256, label="Maximum number of new tokens to generate"
-            )
-            min_length = gr.Slider(
-                minimum=0, maximum=512, step=1, value=16, label="Minimum length of the sequence to be generated"
-            )
-            ban_tokens = gr.Textbox(
-                label='Tokens to prevent from being generated (separated by ";")',
-                value="<image>;<fake_token_around_image>",
-            )
-            forced_eos_token_id = gr.Checkbox(label="Forced eos token id", value=False)
-            eos_tokens = gr.Textbox(label="EOS tokens", value="</s>")
-            force_words = gr.Textbox(label='Force words to be generated (separated by ";")', value="")
-            length_penalty = gr.Slider(
-                minimum=-1000,
-                maximum=1000,
-                step=0.1,
-                value=1,
-                label=(
-                    "length_penalty > 0.0 promotes longer sequences, while length_penalty < 0.0 encourages shorter"
-                    " sequences."
-                ),
-            )
-            repetition_penalty = gr.Slider(
-                minimum=0, maximum=10, step=0.01, value=1, label="repetition_penalty. CTRL paper suggests 1.2."
-            )
-            hide_special_tokens = gr.Checkbox(label="Hide special tokens in the text", value=False)
-            stop_generation = gr.Checkbox(
-                label="Stop generation when an image token, a bos or a eos token is generated", value=False
-            )
-        #gr.Markdown("## Decoding strategy and its specific parameters")
-        with gr.Accordion("Decoding strategy and its specific parameters", open=False, elem_id="decoding_params"):
-            decoding_strategy = gr.Dropdown(
-                ["greedy", "beam_search", "beam_sampling", "sampling_top_k", "sampling_top_p", "contrastive_sampling"],
-                label="Decoding strategy",
-                value="greedy",
-            )
-            num_beams = gr.Slider(
-                minimum=0,
-                maximum=10,
-                step=1,
-                value=3,
-                label="Beam size",
-                info="Only used if `decoding_strategy` is `beam_search` or `beam_sampling`",
-            )
-            top_k = gr.Slider(
-                minimum=0,
-                maximum=500,
-                step=1,
-                value=50,
-                label="Top k",
-                info="Only used if `decoding_strategy` is `sampling_top_k` or `contrastive_sampling`",
-            )
-            top_p = gr.Slider(
-                minimum=0,
-                maximum=1,
-                step=0.01,
-                value=0.95,
-                label="Top p",
-                info="Only used if `decoding_strategy` is `sampling_top_p`",
-            )
-            penalty_alpha = gr.Slider(
-                minimum=0,
-                maximum=1,
-                step=0.01,
-                value=0.95,
-                label="Penalty alpha",
-                info="Only used if `decoding_strategy` is `contrastive_sampling`",
-            )
-    submit = gr.Button(label="Generate")
-    #with gr.Column():
-    with gr.Row():
-        if IS_MAIN_SPACE:
-            outputs = [
-                gr.Textbox(label=MODEL_TO_DISPLAY_NAME[model], multiline=True, readonly=True)
-                for model in MODEL_TO_MODEL_CLASS
-            ]
-            inference_func = all_model_inference
-            inputs = [
-                prompt,
-                temperature,
-                no_repeat_ngram_size,
-                max_new_tokens,
-                min_length,
-                ban_tokens,
-                forced_eos_token_id,
-                eos_tokens,
-                force_words,
-                length_penalty,
-                repetition_penalty,
-                hide_special_tokens,
-                stop_generation,
-                decoding_strategy,
-                num_beams,
-                top_k,
-                top_p,
-                penalty_alpha,
-            ]
-            # examples = [example[1:] for example in examples]
-        else:
-            outputs = gr.Textbox(label="Generated text", interactive=False, lines=5)
-            inference_func = model_inference
-            inputs = [
-                images,
-                prompt,
-                temperature,
-                no_repeat_ngram_size,
-                max_new_tokens,
-                min_length,
-                ban_tokens,
-                forced_eos_token_id,
-                eos_tokens,
-                force_words,
-                length_penalty,
-                repetition_penalty,
-                hide_special_tokens,
-                stop_generation,
-                decoding_strategy,
-                num_beams,
-                top_k,
-                top_p,
-                penalty_alpha,
-            ]
-    with gr.Row():
-        gr.Examples(inputs=inputs, examples=examples)
-        # gr.Markdown(MSG_DETAILS)
-        submit.click(inference_func, inputs=inputs, outputs=outputs, api_name="model_inference")
-demo.queue()
-demo.launch()

app_dialogue.py CHANGED Viewed

@@ -1,37 +1,52 @@
 import os
 import gradio as gr
-models = [
     "HuggingFaceM4/idefics-9b-instruct",
     # "HuggingFaceM4/idefics-80b-instruct",
 ]
-SYSTEM_PROMPT = """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
-The conversation begins:
-User:<fake_token_around_image><image:https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg><fake_token_around_image>Describe this image.</s>
-Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitten, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>
-User:How about this image?<fake_token_around_image><image:https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg><fake_token_around_image>Can you describe it too?</s>
-Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.</s>
-User: What kind of breed is it?</s>
-Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.</s>
-User: What can you tell me about this breed of dogs?</s>
-Assistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.</s>
-User: ghjkhjabnufs</s>
-Assistant: That doesn’t seem to be a word. Could you ask me another way?</s>
-User: Do you like Cavalier King Charles Spaniel?</s>
-Assistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.</s>
-User: How many dogs do you see in this image?<fake_token_around_image><image:https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg><fake_token_around_image></s>
-Assistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.</s>"""
-BAN_TOKENS = "<image>;<fake_token_around_image>"
-EOS_TOKENS = "</s>;User"
-import logging
-from accelerate.utils import get_max_memory
-from transformers import AutoConfig, IdeficsForVisionText2Text, AutoProcessor
 TOKENIZER_FAST = True
@@ -41,20 +56,165 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger()
 def load_processor_tokenizer_model(model_name):
     processor = AutoProcessor.from_pretrained(
         model_name,
-        use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
         truncation_side="left",
     )
     tokenizer = processor.tokenizer
-    # tokenizer = AutoTokenizer.from_pretrained(
-    #     model_name,
-    #     use_fast=TOKENIZER_FAST,
-    #     use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
-    #     truncation_side="left",
-    # )
-    # tokenizer.padding_side = "left" -> we don't need that, do we?
     config = AutoConfig.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
     max_memory_map = get_max_memory()
@@ -68,7 +228,7 @@ def load_processor_tokenizer_model(model_name):
     model = IdeficsForVisionText2Text.from_pretrained(
         model_name,
-        use_auth_token=os.getenv("HF_AUTH_TOKEN", True),
         device_map="auto",
         offload_folder="./offload",
         torch_dtype=config.torch_dtype,
@@ -81,20 +241,48 @@ def load_processor_tokenizer_model(model_name):
     return processor, tokenizer, model
-def split_prompt_into_list(prompt_str):
-    """Convert a full string prompt to the list format expected by the processor."""
-    prompt_splitted = prompt_str.split("<fake_token_around_image>")
-    prompt_list = []
-    for ps in prompt_splitted:
-        if ps.startswith("<image:"):
-            prompt_list.append(ps[7:-1])
         else:
-            prompt_list.append(ps)
-    return prompt_list
 def model_generation(
-    prompt,
     processor,
     tokenizer,
     model,
@@ -115,10 +303,9 @@ def model_generation(
     penalty_alpha,
 ):
     input_args = processor(
-        [split_prompt_into_list(prompt)],
-        eval_mode=True,
         truncation=True,
-        max_length=MAX_SEQ_LEN - 512,  # TODO: replace the 512 value with `max_new_tokens`
         padding=True,
     )
     for k, v in input_args.items():
@@ -214,7 +401,7 @@ def model_generation(
     logger.info(
         "Result: \n"
-        f"----Prompt: `{prompt}`\n"
         f"----Tokens ids - prompt + generation: `{generated_tokens[0].tolist()}`\n"
         f"----Tokens converted - prompt + generation: `{tokens}`\n"
         f"----String decoded with skipped special tokens - prompt + generation: `{decoded_skip_special_tokens}`\n"
@@ -247,24 +434,15 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
         with gr.Column(scale=3):
             with gr.Row(elem_id="model_selector_row"):
                 model_selector = gr.Dropdown(
-                    choices=models,
-                    value=models[0] if len(models) > 0 else "",
                     interactive=True,
                     show_label=False,
                     container=False,
                 )
             processor, tokenizer, model = load_processor_tokenizer_model(model_selector.value)
-            imagebox = gr.Image(
-                type="pil",
-                label=(
-                    "Image input - This image box is not supported yet! To include images, do through the text by"
-                    " adding `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`. The backend takes"
-                    " care of parsing that <image:URL> and download the correponding image. That way, you can"
-                    " technically interleave as many images and texts as you want. No need to add space before and"
-                    " after `<fake_token_around_image>`"
-                ),
-            )
             with gr.Accordion("Generation parameters", open=False, visible=True) as parameter_row:
                 max_new_tokens = gr.Slider(
@@ -304,9 +482,9 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
                 decoding_strategy = gr.Radio(
                     [
                         "greedy",
-                        "beam_search",
-                        "beam_sampling",
-                        "sampling_top_k",
                         "sampling_top_p",
                     ],
                     value="greedy",
@@ -404,6 +582,13 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
                 )
         with gr.Column(scale=6):
             chatbot = gr.Chatbot(
                 elem_id="chatbot",
                 label="Idefics Chatbot",
@@ -412,8 +597,10 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
                 value=[
                     [
                         (
-                            "What's unusual about this"
-                            " image?<fake_token_around_image><image:https://i.redd.it/9nkcvbi1pp9y.jpg><fake_token_around_image>"
                         ),
                         (
                             "The unusual aspect of this image is that there is a cat lying on a bed with an orange on"
@@ -424,8 +611,10 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
                     ],
                     [
                         (
-                            "<fake_token_around_image><image:https://www.boredpanda.com/blog/wp-content/uploads/2014/02/funny-wet-cats-coverimage.jpg><fake_token_around_image>What"
-                            " about this cat? Why do people find it funny?"
                         ),
                         (
                             "The cat in the image is a gray and white long-haired cat with a surprised expression on"
@@ -445,32 +634,11 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
                 with gr.Column(scale=1, min_width=20):
                     clear_btn = gr.ClearButton([textbox, chatbot])
                     cur_dir = os.path.dirname(os.path.abspath(__file__))
-    #def process_example(message: str) -> tuple[str, list[tuple[str, str]]]:
-    #    chat = model_inference(message, [], "greedy", 3, 1.0, 0, 512, 16, 1.0, 1.0, 50, 0.95, 0.95)
-    #    return '', chat
-    #gr.Examples(
-    #     examples=[
-    #         [
-    #             ("How many of these animals can we fit into an engine like that<fake_token_around_image><image:https://upload.wikimedia.org/wikipedia/commons/thumb/4/4e/Nassau_County_Police_Bell_407.jpg/1200px-Nassau_County_Police_Bell_407.jpg><fake_token_around_image>?", "The image shows a helicopter with a large engine, but it is not possible to determine the exact number of animals that can fit into it based on the image alone. The size and capacity of the helicopter's engine would depend on various factors, such as the size of the animals, the weight of the animals, and the size of the helicopter itself. However, it is safe to assume that the helicopter is designed to carry a limited number of animals, and it is not intended to be used as a means of transporting large groups of animals."),
-    #         ],
-    #     ],
-    #     inputs = [chatbot]
-    # )
-    def format_prompt_with_history_and_system_conditioning(current_user_prompt, history):
-        resulting_text = SYSTEM_PROMPT
-        for turn in history:
-            user_utterance, assistant_utterance = turn
-            resulting_text += f"\nUser: {user_utterance}</s>\nAssistant: {assistant_utterance}</s>"
-        resulting_text += f"\nUser: {current_user_prompt}</s>\nAssistant:"
-        return resulting_text
     def model_inference(
-        user_prompt,
         chat_history,
         decoding_strategy,
         num_beams,
         temperature,
@@ -483,28 +651,19 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
         top_p,
         penalty_alpha,
     ):
-        global processor, model, tokenizer
-        # temperature = 1.0
-        # no_repeat_ngram_size = 0
-        # max_new_tokens = 512
-        # min_length = 16
         force_words = ""
-        # repetition_penalty = 1.0
         hide_special_tokens = False
-        # decoding_strategy = "greedy"
-        # num_beams = 3
-        # length_penalty = 1.0
-        # top_k = 50
-        # top_p = 0.95
-        # penalty_alpha = 0.95
-        formated_prompt = format_prompt_with_history_and_system_conditioning(
-            current_user_prompt=user_prompt.strip(),
             history=chat_history,
         )
         generated_text = model_generation(
-            prompt=formated_prompt,
             processor=processor,
             tokenizer=tokenizer,
             model=model,
@@ -525,14 +684,24 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
             penalty_alpha=penalty_alpha,
         )
-        chat_history.append((user_prompt, generated_text.strip("</s>")))
-        return "", chat_history
     textbox.submit(
         fn=model_inference,
         inputs=[
             textbox,
             chatbot,
             decoding_strategy,
             num_beams,
             temperature,
@@ -545,13 +714,14 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
             top_p,
             penalty_alpha,
         ],
-        outputs=[textbox, chatbot],
     )
     submit_btn.click(
         fn=model_inference,
         inputs=[
             textbox,
             chatbot,
             decoding_strategy,
             num_beams,
             temperature,
@@ -566,20 +736,10 @@ with gr.Blocks(title="IDEFICS", theme=gr.themes.Base()) as demo:
         ],
         outputs=[
             textbox,
             chatbot,
         ],
     )
-    #gr.Examples(examples=[
-    #    [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"],
-    #    [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
-    #    ["m4-dialogue/images/bear.jpg", "Describe this image"],
-    #    ],
-    #    inputs=[textbox],
-    #    outputs=[textbox, chatbot],
-    #    fn=process_example,
-    #    cache_examples=True,
-    #)
 demo.queue()
 demo.launch()

+import base64
+import logging
 import os
+import re
+from io import BytesIO
+from typing import List, Optional, Tuple, Union
+from urllib.parse import urlparse
 import gradio as gr
+import PIL
+from accelerate.utils import get_max_memory
+from PIL import Image
+from transformers import AutoConfig, AutoProcessor, IdeficsForVisionText2Text
+MODELS = [
     "HuggingFaceM4/idefics-9b-instruct",
     # "HuggingFaceM4/idefics-80b-instruct",
 ]
+SYSTEM_PROMPT = [
+    """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
+The conversation begins:\n""",
+    """User:<fake_token_around_image><image:https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg><fake_token_around_image>Describe this image.<end_of_utterance>\n""",
+    """Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitten, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.<end_of_utterance>\n""",
+    (
+        "User:How about this"
+        " image?<fake_token_around_image><image:https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg><fake_token_around_image>Can"
+        " you describe it too?<end_of_utterance>\n"
+    ),
+    """Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.<end_of_utterance>\n""",
+    "User: What kind of breed is it?<end_of_utterance>\n",
+    """Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.<end_of_utterance>\n""",
+    "User: What can you tell me about this breed of dogs?<end_of_utterance>\n",
+    """Assistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.<end_of_utterance>\n""",
+    "User: ghjkhjabnufs<end_of_utterance>\n",
+    """Assistant: That doesn’t seem to be a word. Could you ask me another way?<end_of_utterance>\n""",
+    "User: Do you like Cavalier King Charles Spaniel?<end_of_utterance>\n",
+    """Assistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.<end_of_utterance>\n""",
+    (
+        "User: How many dogs do you see in this"
+        " image?<fake_token_around_image><image:https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg><fake_token_around_image><end_of_utterance>\n"
+    ),
+    """Assistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.<end_of_utterance>\n""",
+]
+# Semicolon-separated token strings.
+# NOTE(review): presumably split on ";" by the generation code, which is not visible here - confirm.
+BAN_TOKENS = "<image>;<fake_token_around_image>"
+EOS_TOKENS = "</s>;<end_of_utterance>;User"
 TOKENIZER_FAST = True
 logger = logging.getLogger()
+# Conversion between PIL Image <-> base64 <-> Markdown utils
+def pil_to_base64(pil_image):
+    """
+    Convert a PIL image into its base64 string representation.
+
+    The image is encoded as JPEG. JPEG cannot store an alpha channel or a palette, so an image whose
+    mode the JPEG writer does not accept (e.g. `RGBA`, `LA`, `P`) is converted to `RGB` first instead
+    of letting `save` raise.
+
+    Returns the base64 payload as a `str`, without any `data:` prefix.
+    """
+    # Modes the JPEG writer can save directly; anything else needs a conversion beforehand.
+    if pil_image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
+        pil_image = pil_image.convert("RGB")
+    buffered = BytesIO()
+    pil_image.save(buffered, format="JPEG")  # You can change the format as per your image type
+    encoded_image = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    return encoded_image
+def pil_to_markdown_im(image):
+    """
+    Wrap the base64 encoding of a PIL image in an `<img>` tag with a `data:` URI source.
+    """
+    return f'<img src="data:image/png;base64,{pil_to_base64(image)}" />'
+def base64_to_pil(encoded_image):
+    """
+    Decode a base64 string and open the resulting bytes as a PIL image.
+    """
+    raw_bytes = BytesIO(base64.b64decode(encoded_image))
+    return Image.open(raw_bytes)
+def im_markdown_to_pil(im_markdown_str):
+    """
+    Extract the first base64 `<img>` tag found in a string and decode it into a PIL image.
+
+    Raises:
+        ValueError: if the string contains no `<img src="data:image/png;base64,..." />` tag.
+    """
+    pattern = r'<img src="data:image/png;base64,([^"]+)" />'
+    match = re.search(pattern, im_markdown_str)
+    if match is None:
+        # Fail with an explicit message rather than an `AttributeError` on `None.group`.
+        raise ValueError('No base64 image tag (`<img src="data:image/png;base64,..." />`) found in the string.')
+    return base64_to_pil(match.group(1))
+def split_str_on_im_markdown(string_with_potential_im_markdown):
+    """
+    Split a string (typically the user prompt string) around the base64 `<img>` tags it contains.
+
+    Text chunks and image tags alternate in the returned list, which always starts with a
+    (possibly empty) text chunk.
+    """
+    pattern = r'<img src="data:image/png;base64,([^"]+)" />'
+    # Because the pattern has a capturing group, `re.split` interleaves the surrounding text
+    # (even positions) with the captured base64 payloads (odd positions).
+    chunks = re.split(pattern, string_with_potential_im_markdown)
+    return [
+        chunk if position % 2 == 0 else f'<img src="data:image/png;base64,{chunk.strip()}" />'
+        for position, chunk in enumerate(chunks)
+    ]
+# Fetching utils
+def is_url(string):
+    """
+    Tell whether the passed string is a url and nothing else.
+
+    Any space in the string invalidates it immediately; otherwise both a scheme and a network
+    location must be present.
+    """
+    if " " in string:
+        return False
+    parsed = urlparse(string)
+    return bool(parsed.scheme) and bool(parsed.netloc)
+def isolate_images_urls(prompt_list):
+    """
+    Convert a list of prompt elements to the list format expected by the processor.
+    In particular, image urls (as delimited by <fake_token_around_image>) should be their own elements.
+    From:
+    ```
+    [
+        "bonjour<fake_token_around_image><image:IMG_URL><fake_token_around_image>hello",
+        PIL.Image.Image,
+        "Aurevoir",
+    ]
+    ```
+    to:
+    ```
+    [
+        "bonjour",
+        IMG_URL,
+        "hello",
+        PIL.Image.Image,
+        "Aurevoir",
+    ]
+    ```
+
+    Raises:
+        TypeError: if an element is neither a `str` nor a `PIL.Image.Image`.
+    """
+    linearized_list = []
+    for prompt in prompt_list:
+        # Prompt can be either a string, or a PIL image
+        if isinstance(prompt, PIL.Image.Image):
+            linearized_list.append(prompt)
+        elif isinstance(prompt, str):
+            if "<fake_token_around_image>" not in prompt:
+                # Strings without any image delimiter are kept untouched (even when empty).
+                linearized_list.append(prompt)
+                continue
+            for ps in prompt.split("<fake_token_around_image>"):
+                if ps == "":
+                    continue
+                if ps.startswith("<image:"):
+                    # Drop the leading `<image:` (7 characters) and the last character (the closing `>`).
+                    linearized_list.append(ps[7:-1])
+                else:
+                    linearized_list.append(ps)
+        else:
+            # Report the type of the offending element (not `type(type(...))`, which is always `type`).
+            raise TypeError(
+                f"Unrecognized type for `prompt`. Got {type(prompt)}. Was expecting something in [`str`,"
+                " `PIL.Image.Image`]"
+            )
+    return linearized_list
+# Chatbot handling utils
+def handle_manual_images_in_user_prompt(user_prompt: str) -> List[Union[str, PIL.Image.Image]]:
+    """
+    Replace manually typed image references (i.e. `<fake_token_around_image><image:IMG_URL><fake_token_around_image>`)
+    in the user prompt by the fetched images.
+
+    The prompt is split with `isolate_images_urls`; every piece that `is_url` accepts is fetched through
+    the module-level `processor` and the other pieces are kept as strings. A prompt without any
+    `<fake_token_around_image>` is returned as a single-element list.
+    """
+    if "<fake_token_around_image>" not in user_prompt:
+        return [user_prompt]
+    return [
+        processor.image_processor.fetch_images([piece])[0] if is_url(piece) else piece
+        for piece in isolate_images_urls([user_prompt])
+    ]
+def user_prompt_list_to_markdown(user_prompt_list: List[Union[str, PIL.Image.Image]]):
+    """
+    Render a user prompt given in list format (each element is either a string or a PIL image) as the
+    single markdown string used for the chatbot history and rendering.
+
+    Strings are kept as they are and images are replaced by their base64 `<img>` tag. Any other
+    element type raises a `ValueError`.
+    """
+    rendered_parts = []
+    for elem in user_prompt_list:
+        if isinstance(elem, PIL.Image.Image):
+            rendered_parts.append(pil_to_markdown_im(elem))
+        elif isinstance(elem, str):
+            rendered_parts.append(elem)
+        else:
+            raise ValueError(
+                "Unknown type for `user_prompt_list`. Expected an element of type `str` or `PIL.Image.Image` and got"
+                f" `{type(elem)}`"
+            )
+    return "".join(rendered_parts)
+# Model and generation utils
 def load_processor_tokenizer_model(model_name):
     processor = AutoProcessor.from_pretrained(
         model_name,
+        token=os.getenv("HF_AUTH_TOKEN", True),
         truncation_side="left",
     )
     tokenizer = processor.tokenizer
     config = AutoConfig.from_pretrained(model_name, use_auth_token=os.getenv("HF_AUTH_TOKEN", True))
     max_memory_map = get_max_memory()
     model = IdeficsForVisionText2Text.from_pretrained(
         model_name,
+        token=os.getenv("HF_AUTH_TOKEN", True),
         device_map="auto",
         offload_folder="./offload",
         torch_dtype=config.torch_dtype,
     return processor, tokenizer, model
+def format_user_prompt_with_im_history_and_system_conditioning(
+    current_user_prompt_str: str, current_image: Optional[PIL.Image.Image], history: List[Tuple[str, str]]
+) -> Tuple[List[Union[str, PIL.Image.Image]], List[Union[str, PIL.Image.Image]]]:
+    """
+    Produces the resulting list that needs to go inside the processor.
+    It handles the potential image box input, the history and the system conditioning.
+
+    Returns a pair `(resulting_list, current_user_prompt_list)`: the full prompt (system prompt, history
+    and current turn) and the current user turn alone, both as lists of strings / PIL images.
+
+    Raises `ValueError` when no image comes from the image box and the text contains a base64 `<img>` tag.
+    """
+    # Work on a copy: appending to `SYSTEM_PROMPT` itself would make it grow on every call and leak
+    # previous conversations into all later prompts.
+    resulting_list = list(SYSTEM_PROMPT)
+    # Format history
+    for turn in history:
+        user_utterance, assistant_utterance = turn
+        splitted_user_utterance = split_str_on_im_markdown(user_utterance)
+        splitted_user_utterance = [
+            im_markdown_to_pil(s) if s.startswith('<img src="data:image/png;base64,') else s
+            for s in splitted_user_utterance
+            if s != ""
+        ]
+        # An empty user utterance leaves nothing after the filtering above: treat it as a text turn
+        # instead of indexing into an empty list.
+        if not splitted_user_utterance or isinstance(splitted_user_utterance[0], str):
+            resulting_list.append("\nUser: ")
         else:
+            resulting_list.append("\nUser:")
+        resulting_list.extend(splitted_user_utterance)
+        # NOTE(review): unlike the turns of `SYSTEM_PROMPT`, the assistant utterance is not followed by
+        # `<end_of_utterance>` here - confirm this is intended.
+        resulting_list.append(f"<end_of_utterance>\nAssistant: {assistant_utterance}")
+    # Format current input
+    if current_image is None:
+        # The tags built by `pil_to_markdown_im` quote the `src` value; the unquoted spelling is still
+        # rejected as before.
+        if (
+            '<img src="data:image/png;base64' in current_user_prompt_str
+            or "<img src=data:image/png;base64" in current_user_prompt_str
+        ):
+            raise ValueError("The UI does not support inputing via the text box an image in base64.")
+        current_user_prompt_list = handle_manual_images_in_user_prompt(current_user_prompt_str)
+        resulting_list.append("\nUser: ")
+        resulting_list.extend(current_user_prompt_list)
+        resulting_list.append("<end_of_utterance>\nAssistant:")
+        return resulting_list, current_user_prompt_list
+    else:
+        # Choosing to put the image first when the image is inputted through the UI, but this is an arbitrary choice.
+        resulting_list.extend(["\nUser:", current_image, f"{current_user_prompt_str}<end_of_utterance>\nAssistant:"])
+        return resulting_list, [current_user_prompt_str]
 def model_generation(
+    prompt_list,
     processor,
     tokenizer,
     model,
     penalty_alpha,
 ):
     input_args = processor(
+        isolate_images_urls(prompt_list),
         truncation=True,
+        max_length=MAX_SEQ_LEN - max_new_tokens,
         padding=True,
     )
     for k, v in input_args.items():
     logger.info(
         "Result: \n"
+        f"----Prompt: `{prompt_list}`\n"
         f"----Tokens ids - prompt + generation: `{generated_tokens[0].tolist()}`\n"
         f"----Tokens converted - prompt + generation: `{tokens}`\n"
         f"----String decoded with skipped special tokens - prompt + generation: `{decoded_skip_special_tokens}`\n"
         with gr.Column(scale=3):
             with gr.Row(elem_id="model_selector_row"):
                 model_selector = gr.Dropdown(
+                    choices=MODELS,
+                    value=MODELS[0] if len(MODELS) > 0 else "",
                     interactive=True,
                     show_label=False,
                     container=False,
                 )
             processor, tokenizer, model = load_processor_tokenizer_model(model_selector.value)
+            imagebox = gr.Image(type="pil", label="Image input")
             with gr.Accordion("Generation parameters", open=False, visible=True) as parameter_row:
                 max_new_tokens = gr.Slider(
                 decoding_strategy = gr.Radio(
                     [
                         "greedy",
+                        # "beam_search",
+                        # "beam_sampling",
+                        # "sampling_top_k",
                         "sampling_top_p",
                     ],
                     value="greedy",
                 )
         with gr.Column(scale=6):
+            def prefetch_images_in_history(user_prompt_str):
+                """
+                Fetch the images referenced in a message of the chatbot default history and return
+                the message in its markdown form.
+                """
+                prompt_with_fetched_images = handle_manual_images_in_user_prompt(user_prompt_str)
+                return user_prompt_list_to_markdown(prompt_with_fetched_images)
             chatbot = gr.Chatbot(
                 elem_id="chatbot",
                 label="Idefics Chatbot",
                 value=[
                     [
                         (
+                            prefetch_images_in_history(
+                                "What's unusual about this"
+                                " image?<fake_token_around_image><image:https://i.redd.it/9nkcvbi1pp9y.jpg><fake_token_around_image>"
+                            )
                         ),
                         (
                             "The unusual aspect of this image is that there is a cat lying on a bed with an orange on"
                     ],
                     [
                         (
+                            prefetch_images_in_history(
+                                "<fake_token_around_image><image:https://www.boredpanda.com/blog/wp-content/uploads/2014/02/funny-wet-cats-coverimage.jpg><fake_token_around_image>What"
+                                " about this cat? Why do people find it funny?"
+                            )
                         ),
                         (
                             "The cat in the image is a gray and white long-haired cat with a surprised expression on"
                 with gr.Column(scale=1, min_width=20):
                     clear_btn = gr.ClearButton([textbox, chatbot])
                     cur_dir = os.path.dirname(os.path.abspath(__file__))
     def model_inference(
+        user_prompt_str,
         chat_history,
+        image,
         decoding_strategy,
         num_beams,
         temperature,
         top_p,
         penalty_alpha,
     ):
+        # global processor, model, tokenizer
         force_words = ""
         hide_special_tokens = False
+        formated_prompt_list, user_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
+            current_user_prompt_str=user_prompt_str.strip(),
+            current_image=image,
             history=chat_history,
         )
         generated_text = model_generation(
+            prompt_list=formated_prompt_list,
             processor=processor,
             tokenizer=tokenizer,
             model=model,
             penalty_alpha=penalty_alpha,
         )
+        if image is None:
+            # Case where there is no image OR the image is passed as `<fake_token_around_image><image:IMAGE_URL><fake_token_around_image>`
+            chat_history.append((user_prompt_list_to_markdown(user_prompt_list), generated_text.strip("<end_of_utterance>")))
+        else:
+            # Case where the image is passed through the Image Box.
+            # Convert the image into base64 for both passing it through the chat history and
+            # displaying the image inside the same bubble as the text.
+            chat_history.append(
+                (f"{user_prompt_list_to_markdown([image] + user_prompt_list)}", generated_text.strip("<end_of_utterance>"))
+            )
+        return "", None, chat_history
     textbox.submit(
         fn=model_inference,
         inputs=[
             textbox,
             chatbot,
+            imagebox,
             decoding_strategy,
             num_beams,
             temperature,
             top_p,
             penalty_alpha,
         ],
+        outputs=[textbox, imagebox, chatbot],
     )
     submit_btn.click(
         fn=model_inference,
         inputs=[
             textbox,
             chatbot,
+            imagebox,
             decoding_strategy,
             num_beams,
             temperature,
         ],
         outputs=[
             textbox,
+            imagebox,
             chatbot,
         ],
     )
 demo.queue()
 demo.launch()

elon_musk.md DELETED Viewed

@@ -1,23 +0,0 @@
-This is a conversation between a human, User, and an intelligent visual AI, Bot. User sends images, and Bot answers the questions from the user. The bot should be friendly, informative and should not change the topic if it's not asked to.
-Here's an example of a conversation:
-User: <fake_token_around_image> <image> <fake_token_around_image> Describe this image.
-Bot: Two kittens are cuddling in the center of the photograph. They are surrounded by tall, bright green grass, and the background is blurred.
-User: <fake_token_around_image> <image> <fake_token_around_image> How about this image? Can you describe it too?
-Bot: A dog is lying on the floor, looking at the camera. It is looking directly at you, tilting its head to the side. The dog has a white body and brown patches on its face and ears. Its eyes are brown. Its nose is black, and it has long, floppy ears, short legs, white paws, long fur, big eyes, and black eyebrows.
-User: What kind of breed is it?
-Bot: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.
----
-User: <fake_token_around_image> <image> <fake_token_around_image>
-Describe this image.
-Bot: This image shows a robot named Wall-E. He is standing in front of a pile of garbage, holding a plant in his hands. His eyes and mouth are open, as if he is surprised. Wall E is made of metal and has two arms and two legs. There is also a small light on his head. In the distance, there are some buildings and a large mountain. This scene takes place on Earth, which is covered in trash.
-User: What happens in the film? Explain in a few sentences.
-Bot: WallE is an animated film about a lonely robot who falls in love with another robot. When they meet, they discover that they have been programmed to clean up the Earth. Together, the robots work to restore the planet to its former glory. Along the way, Wall e learns about friendship, love, responsibility, sacrifice, hope, courage, determination, perseverance, teamwork, loyalty, honesty, trust, forgiveness, compassion, empathy, selflessness, kindness and generosity. Ultimately, he realizes that he can make a difference in this world.
-User: In which year was the film released?
-Bot: The film was released in 3D in June 08, 4 years ago.
-User: Is a future in which the earth is not liveable anymore (similarly as in the movie) conceivable?
-Bot: Yes, it is possible. If we continue to pollute the environment and destroy the natural resources, we will eventually make the world uninhabitable. We need to take action now to protect the future of our planet. One way to do this is to reduce our carbon footprint by using less energy and recycling more. Another way is by supporting companies that are working to develop sustainable technologies. Finally, you can also get involved in local environmental initiatives, such as planting trees or cleaning up parks. By taking these steps, each of us can help to make sure that our children and grandchildren will be able to enjoy a healthy and beautiful planet for many years to come.
-User: I think this is a lost cause. <fake_token_around_image> <image> <fake_token_around_image>  Like Elon Musk, I think we should instead all move to another planet now.
-Bot: El ón Mus ́k is the CEO of SpaceX and Tesla Motors. Space X is developing a reusable rocket system that could be used to transport people and cargo to Mars. Mus k believes that colonizing Mars is essential for the survival of humanity, because it would provide a backup plan in case something goes wrong on earth. However, many people are skeptical about the feasibility of such a project. Some argue that it will take too long and cost too much money to establish a colony on Mars, while others say that the risks are too great. Nevertheless, Mus K remains optimistic about his vision for Mars colonization. "I think there is good reason to believe that we can build a self-sustaining city on mars," he said in an interview with Time magazine. “It would be quite an adventure.”
-----
-The project is still in development. I'm currently working on improving the accuracy of image recognition and adding more features to it.</s>

hello.py DELETED Viewed

@@ -1,73 +0,0 @@
-[  # Bare list expression (never bound to a name): a hand-written example prompt made of text chunks and image URLs.
-    """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.\n\nThe conversation begins:\nUser:"""  # NOTE(review): no trailing comma — this literal is implicitly concatenated with the URL literal on the next line.
-    """https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg""",
-    """Describe this image.</s>\n""",
-    """Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>\n""",
-    """User:How about this image?""",
-    "https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg""",  # NOTE(review): opens with one quote and closes with three — parses as "...jpg" followed by an empty "" literal.
-    """Can you describe it too?</s>\n"""  # NOTE(review): no trailing comma — implicitly concatenated with the Assistant literal on the next line.
-    """Assistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.</s>\n""",
-    """User: What kind of breed is it?</s>\n""",
-    """Assistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.</s>\n""",
-    """User: What can you tell me about this breed of dogs?</s>\n"""  # NOTE(review): no trailing comma — implicitly concatenated with the Assistant literal on the next line.
-    """Assistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.</s>\n""",
-    """User: ghjkhjabnufs</s>\n""",
-    """Assistant: That doesn’t seem to be a word. Could you ask me another way?</s>\n""",
-    """User: Do you like Cavalier King Charles Spaniel?</s>\n""",
-    """Assistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.</s>\n""",
-    """User: How many dogs do you see in this image?""",
-    """https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg""",
-    """</s>\n""",
-    """Assistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.</s>\n""",
-    """User: What color are the cat's eyes?</s>\n""",
-    """Assistant:"""  # Final element: an open "Assistant:" turn with no reply text.
-]
-SYSTEM_PROMPT = """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
-The conversation begins:
-User:<fake_token_around_image><image:https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg><fake_token_around_image>Describe this image.</s>
-Assistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>"""  # Single-string prompt: preamble plus one example User/Assistant turn, with the image given inline as an <image:URL> tag between <fake_token_around_image> markers.
-prompts = [  # List form of a prompt: preamble text ending in "User:", an image URL, then the example exchange.
-    """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.\n\nThe conversation begins:\nUser:""",
-    """https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg""",
-    """Describe this image.</s>\nAssistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitte, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.</s>"""
-]
-SYSTEM_PROMPT = """HELLO
-AUREVOIR
-User:<fake_token_around_image><image:AAA><fake_token_around_image><image:BBB><fake_token_around_image>BOJOUR</s>
-Assistant: SPECIFIC</s>"""  # NOTE(review): rebinds SYSTEM_PROMPT, replacing the longer prompt assigned earlier with this short placeholder containing two consecutive image tags.
-def split_prompt_into_list(prompt_str):  # Split a prompt string into a list of plain-text pieces and image references, in their original order.
-    prompt_splitted = prompt_str.split("<fake_token_around_image>")  # The marker itself is dropped; empty pieces between adjacent markers are kept by str.split.
-    prompt_list = []
-    for ps in prompt_splitted:
-        if ps.startswith("<image:"):
-            prompt_list.append(ps[7:-1])  # Drop the 7-character "<image:" prefix and the last character (expected to be the closing ">").
-        else:
-            prompt_list.append(ps)  # Any other piece (including an empty string) is kept unchanged.
-    return prompt_list
-prompts = split_prompt_into_list(formated_prompt)  # NOTE(review): `formated_prompt` is not assigned anywhere in the lines of this file shown here.
-i = processor(prompts, eval_mode=True)  # NOTE(review): `processor` is only assigned further down, so running the file top to bottom would fail here.
-from transformers import AutoTokenizer, AutoProcessor
-checkpoint = "HuggingFaceM4/idefics-9b-instruct"
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-processor = AutoProcessor.from_pretrained(checkpoint)
-prompts = [  # Rebinds `prompts` to a batch holding one prompt (a list of text pieces); the image URLs inside are commented out.
-    [
-        "User:",
-        # "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg",
-        # "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg",
-        "Describe this image.\nAssistant: An image of two kittens in grass.\n",
-        "User:",
-        # "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
-        "Describe this image.\nAssistant:",
-    ],
-]
-inputs = processor(prompts, eval_mode=True)
-tokenizer.decode(inputs["input_ids"][0])  # Decodes the first entry of `input_ids`; the result is neither stored nor printed.

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 --extra-index-url https://download.pytorch.org/whl/cu113
 torch
-transformers @ git+https://github.com/huggingface/transformers.git@4dbd4d1a428753659d507613849f0eeea7909f29
 requests
 pillow
 torchvision

 --extra-index-url https://download.pytorch.org/whl/cu113
 torch
+transformers @ git+https://github.com/huggingface/transformers@e8817e851aa55c1f8ec8f1585d25425166711583
 requests
 pillow
 torchvision