Spaces:

rp-yu
/

apiprompting

Runtime error

File size: 7,608 Bytes

c64fb9f

import argparse
import torch

from llava.constants import (
    IMAGE_TOKEN_INDEX,
    DEFAULT_IMAGE_TOKEN,
    DEFAULT_IM_START_TOKEN,
    DEFAULT_IM_END_TOKEN,
    IMAGE_PLACEHOLDER,
)
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import (
    process_images,
    tokenizer_image_token,
    get_model_name_from_path,
    KeywordsStoppingCriteria,
)
from llava.transformers.generation.stopping_criteria import MaxNewTokensCriteria

from PIL import Image

import requests
from PIL import Image
from io import BytesIO
import re

def image_parser(args):
    out = args.image_file.split(args.sep)
    return out

def load_image(image_file):
    if image_file.startswith("http") or image_file.startswith("https"):
        response = requests.get(image_file)
        image = Image.open(BytesIO(response.content)).convert("RGB")
    else:
        image = Image.open(image_file).convert("RGB")
    return image


def load_images(image_files):
    out = []
    for image_file in image_files:
        image = load_image(image_file)
        out.append(image)
    return out

def get_preanswer(model, model_name, hl, tokenizer, image_processor, context_len, query, image):
    sep = ","
    temperature = 0
    top_p = None
    num_beams = 1
    max_new_tokens = 1024
    conv_mode = None
    
    disable_torch_init()

    tokenizer, model, image_processor, context_len = tokenizer, model, image_processor, context_len

    hl = hl

    hl.reinit()

    qs = query
    image_token_se = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    if IMAGE_PLACEHOLDER in qs:
        if model.config.mm_use_im_start_end:
            qs = re.sub(IMAGE_PLACEHOLDER, image_token_se, qs)
        else:
            qs = re.sub(IMAGE_PLACEHOLDER, DEFAULT_IMAGE_TOKEN, qs)
    else:
        if model.config.mm_use_im_start_end:
            qs = image_token_se + "\n" + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    if "llama-2" in model_name.lower():
        conv_mode = "llava_llama_2"
    elif "v1" in model_name.lower():
        conv_mode = "llava_v1"
    elif "mpt" in model_name.lower():
        conv_mode = "mpt"
    else:
        conv_mode = "llava_v0"

    if conv_mode is not None and conv_mode != conv_mode:
        print(
            "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
                conv_mode, conv_mode, conv_mode
            )
        )
    else:
        conv_mode = conv_mode

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    images = [image]
    images = [image.convert('RGB') if image.mode != 'RGB' else image for image in images]
        
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    input_ids = (
        tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(model.device)
    )

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = [
        KeywordsStoppingCriteria(keywords, tokenizer, input_ids),
        MaxNewTokensCriteria(input_ids.shape[1], max_new_tokens)
    ]

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            do_sample=True if temperature > 0 else False,
            temperature=temperature,
            top_p=top_p,
            num_beams=num_beams,
            # max_new_tokens=max_new_tokens,
            use_cache=True,
            stopping_criteria=stopping_criteria,
        )

    attention_output = hl.finalize()
    attention_output = attention_output.view(attention_output.shape[0],24,24)
    attention_output = attention_output.detach()

    input_token_len = input_ids.shape[1]
    n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
    if n_diff_input_output > 0:
        print(
            f"[Warning] {n_diff_input_output} output_ids are not the same as the input_ids"
        )

    # outputs = tokenizer.batch_decode(
    #     output_ids[:, input_token_len:].cpu(), skip_special_tokens=True
    # )[0]
    # outputs = outputs.strip()
    # if outputs.endswith(stop_str):
    #     outputs = outputs[: -len(stop_str)]
    # outputs = outputs.strip()
    output = tokenizer.decode(output_ids[:, input_token_len:].cpu()[0])

    token_mapping = get_token_mapping(tokenizer, output, output_ids[:, input_token_len:].cpu()[0])

    return output, {"llava_attentions":attention_output.detach(), "llava_token_mapping":token_mapping}

def clean_text(text):
    cleaned_text = re.sub(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$', '', text)
    return cleaned_text

def get_token_mapping(tokenizer, outputs, output_ids):
    tokens = tokenizer.tokenize(outputs)[1:]
    assert len(tokens) == len(output_ids) 
    current_position = 0
    offsets = []

    for token in tokens:
        cleaned_token = clean_text(token)
        try:
            token_start = outputs.find(cleaned_token, current_position)
        except:
            print(outputs, cleaned_token)
            continue
        token_end = token_start + len(cleaned_token)
        offsets.append((token_start, token_end))
        current_position = token_end

    return offsets

def from_preanswer_to_mask(highlight_text, query, cache_dict):
    if highlight_text.strip() == query.strip() or highlight_text.strip() == "":
        token_start_index = 0
        token_end_index = len(cache_dict["llava_token_mapping"]) - 1
    else:
        text_start_index = query.find(highlight_text)
        text_end_index = text_start_index + len(highlight_text)

        for token_index, (token_text_mapping_st, token_text_mapping_end) in enumerate(cache_dict["llava_token_mapping"]):
            if token_text_mapping_st <= text_start_index:
                token_start_index = token_index
            if token_text_mapping_end >= text_end_index:
                token_end_index = token_index
                break
    
    attentions = cache_dict["llava_attentions"]
    selected_attentions = attentions[token_start_index:token_end_index+1]
    mask = selected_attentions.mean(dim=0)
    return mask

def get_model(model_path = "llava-v1.5-7b", device = "cuda:0"):
    model_path = f"liuhaotian/{model_path}"
    model_path = model_path
    model_base = None
    model_name = get_model_name_from_path(model_path)

    tokenizer, model, image_processor, context_len = load_pretrained_model(
        model_path=model_path,
        model_base=model_base,
        model_name=model_name,
        device= device,
        # load_4bit = True,
    )
    return tokenizer, model, image_processor, context_len, model_name

if __name__ == "__main__":
    prompt = "What are the things I should be cautious about when I visit here?"
    image_file = "https://llava-vl.github.io/static/images/view.jpg"
    image = Image.open(BytesIO(requests.get(image_file).content)).convert("RGB")

    tokenizer, model, image_processor, context_len, model_name = get_model()
    
    from .hook import hook_logger
    hl = hook_logger(model, model.device, layer_index = 20)
    output, cache_dict = get_preanswer(model, model_name, hl, tokenizer, image_processor, context_len, prompt, image)

    mask = from_preanswer_to_mask(output[10:20], output, cache_dict)