"""Gradio demo: image variations with Qwen2-VL conditioning and a FLUX pipeline.

At import time this script downloads the ``Djrango/Qwen2vl-Flux`` snapshot
(when the local cache directory does not exist yet), loads every model
component and builds the Gradio ``demo``. Running it as a script launches the
web server on port 7860.
"""

import logging
import math
import os
import sys

import gradio as gr
import torch
import torch.nn as nn
from PIL import Image
from transformers import (
    CLIPTokenizer,
    CLIPTextModel,
    AutoProcessor,
    T5EncoderModel,
    T5TokenizerFast,
)
from diffusers import AutoencoderKL, FlowMatchEulerDiscreteScheduler
from huggingface_hub import snapshot_download
import spaces

from flux.transformer_flux import FluxTransformer2DModel
from flux.pipeline_flux_chameleon import FluxPipeline
from qwen2_vl.modeling_qwen2_vl import Qwen2VLSimplifiedModel

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger(__name__)

MODEL_ID = "Djrango/Qwen2vl-Flux"
MODEL_CACHE_DIR = "model_cache"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16


def _cache_path(relative):
    """Return the path of *relative* inside the local model cache directory."""
    return os.path.join(MODEL_CACHE_DIR, relative)


def _load_state_dict(path, strip=""):
    """Load a checkpoint from *path* on CPU and cast every tensor to ``dtype``.

    Args:
        path: File path of a checkpoint saved with ``torch.save``.
        strip: Substring removed from every key (e.g. ``"module."``); the
            empty string leaves the keys untouched.

    Returns:
        A dict mapping the (possibly renamed) keys to tensors in ``dtype``.
    """
    # SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
    # code. The checkpoint comes from the downloaded snapshot; consider passing
    # ``weights_only=True`` once the minimum supported torch version allows it.
    state = torch.load(path, map_location="cpu")
    return {
        (key.replace(strip, "") if strip else key): value.to(dtype)
        for key, value in state.items()
    }


# Pre-download the models (skipped when the cache directory already exists)
if not os.path.exists(MODEL_CACHE_DIR):
    logger.info("Starting model download...")
    try:
        snapshot_download(
            repo_id=MODEL_ID,
            local_dir=MODEL_CACHE_DIR,
            local_dir_use_symlinks=False,
        )
    except Exception as e:
        logger.error(f"Error downloading models: {str(e)}")
        raise
    else:
        logger.info("Model download completed successfully")

# Load the small models onto the GPU
logger.info("Loading small models to GPU...")
tokenizer = CLIPTokenizer.from_pretrained(_cache_path("flux/tokenizer"))
text_encoder = (
    CLIPTextModel.from_pretrained(_cache_path("flux/text_encoder"))
    .to(dtype)
    .to(device)
)
text_encoder_two = (
    T5EncoderModel.from_pretrained(_cache_path("flux/text_encoder_2"))
    .to(dtype)
    .to(device)
)
tokenizer_two = T5TokenizerFast.from_pretrained(_cache_path("flux/tokenizer_2"))

# Load the large models onto the CPU initially
logger.info("Loading large models to CPU...")
vae = AutoencoderKL.from_pretrained(_cache_path("flux/vae")).to(dtype).cpu()
transformer = (
    FluxTransformer2DModel.from_pretrained(_cache_path("flux/transformer"))
    .to(dtype)
    .cpu()
)
scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
    _cache_path("flux/scheduler"),
    shift=1,
)
qwen2vl = (
    Qwen2VLSimplifiedModel.from_pretrained(_cache_path("qwen2-vl"))
    .to(dtype)
    .cpu()
)
# NOTE: unlike the models above, the processor is resolved through MODEL_ID
# (with a subfolder) rather than through the local cache directory.
qwen2vl_processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    subfolder="qwen2-vl",
    min_pixels=256 * 28 * 28,
    max_pixels=256 * 28 * 28,
)


# Load the connector and the embedder onto the CPU
class Qwen2Connector(nn.Module):
    """Single linear projection from ``input_dim`` to ``output_dim`` features."""

    def __init__(self, input_dim=3584, output_dim=4096):
        super().__init__()
        self.linear = nn.Linear(input_dim, output_dim)

    def forward(self, x):
        """Apply the linear projection to *x*."""
        return self.linear(x)


connector = Qwen2Connector().to(dtype).cpu()
connector_path = _cache_path("qwen2-vl/connector.pt")
# Keys in the checkpoint may carry a "module." prefix; it is removed on load.
connector_state = _load_state_dict(connector_path, strip="module.")
connector.load_state_dict(connector_state)

t5_context_embedder = nn.Linear(4096, 3072).to(dtype).cpu()
t5_embedder_path = _cache_path("qwen2-vl/t5_embedder.pt")
t5_embedder_state = _load_state_dict(t5_embedder_path)
t5_context_embedder.load_state_dict(t5_embedder_state)

# Create the pipeline (initially with the models that live on the CPU)
pipeline = FluxPipeline(
    transformer=transformer,
    scheduler=scheduler,
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
)

# Put every model into eval mode and disable gradients
for model in [
    text_encoder,
    text_encoder_two,
    vae,
    transformer,
    qwen2vl,
    connector,
    t5_context_embedder,
]:
    model.requires_grad_(False)
    model.eval()

# Aspect ratio options: label -> (width, height) in pixels
ASPECT_RATIOS = {
    "1:1": (1024, 1024),
    "16:9": (1344, 768),
    "9:16": (768, 1344),
    "2.4:1": (1536, 640),
    "3:4": (896, 1152),
    "4:3": (1152, 896),
}


def process_image(image):
    """Encode *image* with Qwen2VL and project it through the connector.

    The Qwen2VL model and the connector are moved to ``device`` for the
    duration of the call and are always moved back to the CPU afterwards,
    including when an error occurs.

    Args:
        image: The input image; it is handed to ``qwen2vl_processor`` both
            inside the chat message and as the ``images`` argument.

    Returns:
        A tuple ``(image_hidden_state, image_grid_thw)``. The first element is
        the connector output for the image tokens, reshaped to
        ``(1, num_image_tokens, connector_output_dim)`` and moved to the CPU;
        the second is passed through unchanged from the Qwen2VL model.

    Raises:
        Exception: Any error raised while processing is logged and re-raised.
    """
    try:
        # Move the Qwen2VL-related models to the GPU
        logger.info("Moving Qwen2VL models to GPU...")
        qwen2vl.to(device)
        connector.to(device)

        message = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Describe this image."},
                ],
            }
        ]
        text = qwen2vl_processor.apply_chat_template(
            message,
            tokenize=False,
            add_generation_prompt=True,
        )

        with torch.no_grad():
            inputs = qwen2vl_processor(
                text=[text],
                images=[image],
                padding=True,
                return_tensors="pt",
            ).to(device)
            output_hidden_state, image_token_mask, image_grid_thw = qwen2vl(**inputs)
            # Keep only the hidden states at image-token positions.
            image_hidden_state = output_hidden_state[image_token_mask].view(
                1, -1, output_hidden_state.size(-1)
            )
            image_hidden_state = connector(image_hidden_state)

        # Keep the result on the CPU
        return image_hidden_state.cpu(), image_grid_thw
    except Exception as e:
        logger.error(f"Error in process_image: {str(e)}")
        raise
    finally:
        # Move the models back to the CPU and release GPU memory, even when
        # the forward pass failed, so a failed request does not pin memory.
        logger.info("Moving Qwen2VL models back to CPU...")
        qwen2vl.cpu()
        connector.cpu()
        torch.cuda.empty_cache()


def compute_t5_text_embeddings(prompt):
    """Compute projected T5 embeddings for a text prompt.

    Args:
        prompt: The prompt text. An empty (or ``None``) prompt yields ``None``.

    Returns:
        ``None`` for an empty prompt; otherwise the output of
        ``t5_context_embedder`` applied to the T5 encoder output, computed on
        ``device``.
    """
    if not prompt:
        return None

    with torch.no_grad():
        text_inputs = tokenizer_two(
            prompt,
            padding="max_length",
            max_length=256,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        prompt_embeds = text_encoder_two(text_inputs.input_ids)[0]

        # Move t5_context_embedder to the GPU
        t5_context_embedder.to(device)
        try:
            return t5_context_embedder(prompt_embeds)
        finally:
            # Move t5_context_embedder back to the CPU, also on failure
            t5_context_embedder.cpu()


def compute_text_embeddings(prompt=""):
    """Return the pooled CLIP text-encoder output for *prompt*.

    Args:
        prompt: The prompt text; it is padded/truncated to 77 tokens.

    Returns:
        The ``pooler_output`` of ``text_encoder`` for the tokenized prompt.
    """
    with torch.no_grad():
        text_inputs = tokenizer(
            prompt,
            padding="max_length",
            max_length=77,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        prompt_embeds = text_encoder(
            text_inputs.input_ids,
            output_hidden_states=False,
        )
        return prompt_embeds.pooler_output


@spaces.GPU(duration=120)  # Use the ZeroGPU decorator
def generate_images(input_image, prompt="", guidance_scale=3.5, num_inference_steps=28,
                    num_images=1, seed=None, aspect_ratio="1:1"):
    """Generate image variations of *input_image* with the FLUX pipeline.

    Args:
        input_image: The source image; required.
        prompt: Optional text guidance.
        guidance_scale: Guidance scale forwarded to the pipeline.
        num_inference_steps: Number of sampling steps.
        num_images: Number of images to generate; the prompt embeddings are
            repeated this many times along the batch dimension.
        seed: Optional random seed; when given, ``torch.manual_seed`` is called.
        aspect_ratio: A key of ``ASPECT_RATIOS`` selecting width and height.

    Returns:
        The ``images`` attribute of the pipeline output.

    Raises:
        gr.Error: On any failure (missing image, unknown aspect ratio, model
            error); the original exception is attached as the cause.
    """
    try:
        logger.info(f"Starting generation with prompt: {prompt}")

        if input_image is None:
            raise ValueError("No input image provided")

        # UI components may deliver None or float values; normalize them
        # before they reach the tokenizers, Tensor.repeat and the seeding call.
        prompt = prompt or ""
        num_images = int(num_images)
        num_inference_steps = int(num_inference_steps)

        if seed is not None:
            seed = int(seed)
            torch.manual_seed(seed)
            logger.info(f"Set random seed to: {seed}")

        # Process image with Qwen2VL
        qwen2_hidden_state, image_grid_thw = process_image(input_image)

        # Compute text embeddings
        pooled_prompt_embeds = compute_text_embeddings(prompt)
        t5_prompt_embeds = compute_t5_text_embeddings(prompt)

        # Get dimensions
        width, height = ASPECT_RATIOS[aspect_ratio]
        logger.info(f"Using dimensions: {width}x{height}")

        # Generate images
        try:
            logger.info("Starting image generation...")

            # Move the Transformer and the VAE to the GPU
            logger.info("Moving Transformer and VAE to GPU...")
            transformer.to(device)
            vae.to(device)

            # Refresh the model references held by the pipeline
            pipeline.transformer = transformer
            pipeline.vae = vae

            if t5_prompt_embeds is not None:
                t5_prompt_embeds = t5_prompt_embeds.repeat(num_images, 1, 1)

            output_images = pipeline(
                prompt_embeds=qwen2_hidden_state.to(device).repeat(num_images, 1, 1),
                pooled_prompt_embeds=pooled_prompt_embeds,
                t5_prompt_embeds=t5_prompt_embeds,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                height=height,
                width=width,
            ).images

            logger.info("Image generation completed")
            return output_images
        except Exception as e:
            raise RuntimeError(f"Error generating images: {str(e)}") from e
        finally:
            # The Transformer and the VAE are left on the GPU (moving them
            # back to the CPU is disabled); only cached memory is released.
            logger.info("Releasing cached GPU memory...")
            torch.cuda.empty_cache()

    except Exception as e:
        logger.error(f"Error during generation: {str(e)}")
        raise gr.Error(f"Generation failed: {str(e)}") from e


# Custom CSS for the page layout
CUSTOM_CSS = """
.container {
    max-width: 1200px;
    margin: auto;
    padding: 0 20px;
}
.header {
    text-align: center;
    margin: 20px 0 40px 0;
    padding: 20px;
    background: #f7f7f7;
    border-radius: 12px;
}
.param-row {
    padding: 10px 0;
}
footer {
    margin-top: 40px;
    padding: 20px;
    border-top: 1px solid #eee;
}
"""

# Page header: a title followed by a one-line tagline as a separate paragraph
HEADER_MARKDOWN = """
# 🎨 Qwen2vl-Flux Image Variation Demo

Generate creative variations of your images with optional text guidance
"""

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
    with gr.Column(elem_classes="container"):
        gr.Markdown(HEADER_MARKDOWN)

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    label="Upload Your Image",
                    type="pil",
                    height=384,
                    sources=["upload", "clipboard"],
                )

                with gr.Accordion("Advanced Settings", open=False):
                    with gr.Group():
                        prompt = gr.Textbox(
                            label="Text Prompt (Optional)",
                            placeholder="As Long As Possible...",
                            lines=3,
                        )

                        with gr.Row(elem_classes="param-row"):
                            guidance = gr.Slider(
                                minimum=1,
                                maximum=10,
                                value=3.5,
                                step=0.5,
                                label="Guidance Scale",
                            )
                            steps = gr.Slider(
                                minimum=1,
                                maximum=30,
                                value=28,
                                step=1,
                                label="Sampling Steps",
                            )

                        with gr.Row(elem_classes="param-row"):
                            num_images = gr.Slider(
                                minimum=1,
                                maximum=2,
                                value=1,  # default changed to 1
                                step=1,
                                label="Number of Images",
                            )
                            seed = gr.Number(
                                label="Random Seed",
                                value=None,
                                precision=0,
                            )
                            aspect_ratio = gr.Radio(
                                label="Aspect Ratio",
                                # Derived from ASPECT_RATIOS so the UI cannot
                                # offer a ratio the generator does not know.
                                choices=list(ASPECT_RATIOS),
                                value="1:1",
                            )

                submit_btn = gr.Button("🎨 Generate", variant="primary", size="lg")

            with gr.Column(scale=1):
                output_gallery = gr.Gallery(
                    label="Generated Variations",
                    columns=2,
                    rows=2,
                    height=700,
                    object_fit="contain",
                    show_label=True,
                    allow_preview=True,
                )

    submit_btn.click(
        fn=generate_images,
        inputs=[
            input_image,
            prompt,
            guidance,
            steps,
            num_images,
            seed,
            aspect_ratio,
        ],
        outputs=[output_gallery],
        show_progress=True,
    )

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )