Spaces:

LanguageBind
/

UniWorld-V1

Runtime error

App Files Files Community

UniWorld-V1 / app.py

LanguageBind

Update app.py

5445c3d verified 5 days ago

raw

history blame

33 kB

	import spaces
	import os
	import gradio as gr
	import sys
	sys.path.append("..")
	from transformers import AutoProcessor, SiglipImageProcessor, SiglipVisionModel, T5EncoderModel, BitsAndBytesConfig
	from univa.models.qwen2p5vl.modeling_univa_qwen2p5vl import UnivaQwen2p5VLForConditionalGeneration
	from univa.utils.flux_pipeline import FluxPipeline
	from univa.utils.get_ocr import get_ocr_result
	from univa.utils.denoiser_prompt_embedding_flux import encode_prompt
	from qwen_vl_utils import process_vision_info
	from univa.utils.anyres_util import dynamic_resize, concat_images_adaptive
	import torch
	from torch import nn
	import uuid
	import base64
	from typing import Dict
	from PIL import Image, ImageDraw, ImageFont
	import argparse
	import gc


	def parse_args():
	parser = argparse.ArgumentParser(description="Model and component paths")

	parser.add_argument("--model_path", type=str, default="LanguageBind/UniWorld-V1", help="UniWorld-V1模型路径")
	parser.add_argument("--flux_path", type=str, default="black-forest-labs/FLUX.1-dev", help="FLUX.1-dev模型路径")
	parser.add_argument("--siglip_path", type=str, default="google/siglip2-so400m-patch16-512", help="siglip2模型路径")
	parser.add_argument("--server_name", type=str, default="127.0.0.1", help="IP地址")
	parser.add_argument("--server_port", type=int, default=6812, help="端口号")
	parser.add_argument("--share", action="store_true", help="是否公开分享")
	parser.add_argument("--nf4", action="store_true", help="是否NF4量化")
	parser.add_argument("--zh", action="store_true", help="是否使用中文")
	parser.add_argument("--offload", action="store_true", help="是否开启顺序卸载")

	return parser.parse_args()


	def add_plain_text_watermark(
	img: Image.Image,
	text: str,
	margin: int = 50,
	font_size: int = 30,
	):
	if img.mode != "RGB":
	img = img.convert("RGB")

	draw = ImageDraw.Draw(img)
	font = ImageFont.truetype("DejaVuSans.ttf", font_size)
	bbox = draw.textbbox((0, 0), text)
	text_width = bbox[2] - bbox[0]
	text_height = bbox[3] - bbox[1]

	x = img.width - text_width - int(3.3 * margin)
	y = img.height - text_height - margin

	draw.text((x, y), text, font=font, fill=(255, 255, 255))
	return img


	css = """
	.table-wrap table tr td:nth-child(3) > div {
	max-height: 150px; /* 最多 100px 高度，按需修改 */
	overflow-y: auto; /* 超出部分显示竖向滚动条 */
	white-space: pre-wrap; /* 自动换行 */
	word-break: break-all; /* 长单词内部分行 */
	}
	.table-wrap table tr td:nth-child(2) > div {
	max-width: 150px;
	white-space: pre-wrap;
	word-break: break-all;
	overflow-x: auto;
	}
	.table-wrap table tr th:nth-child(2) {
	max-width: 150px;
	white-space: normal;
	word-break: keep-all;
	overflow-x: auto;
	}
	.table-wrap table tr td:nth-last-child(-n+8) > div {
	max-width: 130px;
	white-space: pre-wrap;
	word-break: break-all;
	overflow-x: auto;
	}
	.table-wrap table tr th:nth-last-child(-n+8) {
	max-width: 130px;
	white-space: normal;
	word-break: keep-all;
	overflow-x: auto;
	}
	"""


	def img2b64(image_path):
	with open(image_path, "rb") as f:
	b64 = base64.b64encode(f.read()).decode()
	data_uri = f"data:image/jpeg;base64,{b64}"
	return data_uri

	def initialize_models(args):
	os.makedirs("tmp", exist_ok=True)
	# Paths

	quantization_config = BitsAndBytesConfig(
	load_in_4bit=True,
	bnb_4bit_compute_dtype=torch.bfloat16,
	bnb_4bit_quant_type="nf4",
	)

	# Load main model and task head
	model = UnivaQwen2p5VLForConditionalGeneration.from_pretrained(
	args.model_path,
	torch_dtype=torch.float16,
	attn_implementation="sdpa",
	quantization_config=quantization_config if args.nf4 else None,
	)
	task_head = nn.Sequential(
	nn.Linear(3584, 10240),
	nn.SiLU(),
	nn.Dropout(0.3),
	nn.Linear(10240, 2)
	)
	# task_head.load_state_dict(torch.load(os.path.join(args.model_path, 'task_head_final.pt')))
	task_head.eval()

	processor = AutoProcessor.from_pretrained(
	args.model_path,
	min_pixels=448*448,
	max_pixels=448*448,
	)
	if args.nf4:
	text_encoder_2 = T5EncoderModel.from_pretrained(
	args.flux_path,
	subfolder="text_encoder_2",
	quantization_config=quantization_config,
	torch_dtype=torch.float16,
	)
	pipe = FluxPipeline.from_pretrained(
	args.flux_path,
	transformer=model.denoise_tower.denoiser,
	text_encoder_2=text_encoder_2,
	torch_dtype=torch.float16,
	)
	else:
	pipe = FluxPipeline.from_pretrained(
	args.flux_path,
	transformer=model.denoise_tower.denoiser,
	torch_dtype=torch.float16,
	)
	if args.offload:
	pipe.enable_model_cpu_offload()
	pipe.enable_vae_slicing()
	tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
	text_encoders = [pipe.text_encoder, pipe.text_encoder_2]

	# Optional SigLIP
	siglip_processor, siglip_model = None, None
	siglip_processor = SiglipImageProcessor.from_pretrained(args.siglip_path)
	siglip_model = SiglipVisionModel.from_pretrained(
	args.siglip_path,
	torch_dtype=torch.float16,
	)

	return {
	'model': model,
	'task_head': task_head,
	'processor': processor,
	'pipe': pipe,
	'tokenizers': tokenizers,
	'text_encoders': text_encoders,
	'siglip_processor': siglip_processor,
	'siglip_model': siglip_model,

	}

	@spaces.GPU(duration=600)
	def to_device(state):
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	state['model'] = state['model'].to(device, dtype=torch.bfloat16)
	state['task_head'] = state['task_head'].to(device, dtype=torch.bfloat16)
	state['pipe'] = state['pipe'].to(device, dtype=torch.bfloat16)
	state['text_encoders'] = state['text_encoders'].to(device, dtype=torch.bfloat16)
	state['siglip_model'] = state['siglip_model'].to(device, dtype=torch.bfloat16)
	state['device'] = device
	return state

	args = parse_args()
	state = initialize_models(args)
	state = to_device(state)

	@spaces.GPU
	def process_large_image(raw_img):
	if raw_img is None:
	return raw_img
	img = Image.open(raw_img).convert("RGB")

	max_side = max(img.width, img.height)
	if max_side > 1024:
	scale = 1024 / max_side
	new_w = int(img.width * scale)
	new_h = int(img.height * scale)
	print(f'resize img {img.size} to {(new_w, new_h)}')
	img = img.resize((new_w, new_h), resample=Image.LANCZOS)
	save_path = f"tmp/{uuid.uuid4().hex}.png"
	img.save(save_path)
	return save_path
	else:
	return raw_img

	@spaces.GPU(duration=200)
	def chat_step(image1, image2, text, height, width, steps, guidance,
	ocr_enhancer, joint_with_t5, enhance_generation, enhance_understanding,
	seed, num_imgs, history_state, progress=gr.Progress()):

	try:
	convo = history_state['conversation']
	image_paths = history_state['history_image_paths']
	cur_ocr_i = history_state['cur_ocr_i']
	cur_genimg_i = history_state['cur_genimg_i']

	# image1 = process_large_image(image1)
	# image2 = process_large_image(image2)
	# Build content
	content = []
	if text:
	ocr_text = ''
	if ocr_enhancer and content:
	ocr_texts = []
	for img in (image1, image2):
	if img:
	ocr_texts.append(get_ocr_result(img, cur_ocr_i))
	cur_ocr_i += 1
	ocr_text = '\n'.join(ocr_texts)
	content.append({'type':'text','text': text + ocr_text})
	for img in (image1, image2):
	if img:
	content.append({'type':'image','image':img,'min_pixels':448448,'max_pixels':448448})
	image_paths.append(img)

	convo.append({'role':'user','content':content})

	# Prepare inputs
	chat_text = state['processor'].apply_chat_template(convo,
	tokenize=False, add_generation_prompt=True)
	chat_text = '<\|im_end\|>\n'.join(chat_text.split('<\|im_end\|>\n')[1:])
	image_inputs, video_inputs = process_vision_info(convo)
	inputs = state['processor'](
	text=[chat_text], images=image_inputs, videos=video_inputs,
	padding=True, return_tensors='pt'
	).to(state['device'])

	# Model forward & task head
	with torch.no_grad():
	outputs = state['model'](**inputs, return_dict=True, output_hidden_states=True)
	hidden = outputs.hidden_states[-1]
	mask = inputs.input_ids == 77091
	vecs = hidden[mask][-1:]
	task_res = state['task_head'](vecs.float())[0]
	print(task_res)
	# Branch decision
	if enhance_generation:
	do_image = True
	elif enhance_understanding:
	do_image = False
	else:
	do_image = (task_res[0] < task_res[1])

	seed = int(seed)
	if seed == -1:
	seed = torch.Generator(device="cpu").seed()
	torch.manual_seed(seed)
	# Generate
	if True:
	# image generation pipeline
	siglip_hs = None
	if state['siglip_processor'] and image_paths:
	vals = [state['siglip_processor'].preprocess(
	images=Image.open(p).convert('RGB'), do_resize=True,
	return_tensors='pt', do_convert_rgb=True
	).pixel_values.to(state['device'])
	for p in image_paths]
	siglip_hs = state['siglip_model'](torch.concat(vals)).last_hidden_state

	with torch.no_grad():
	lvlm = state['model'](
	inputs.input_ids, pixel_values=getattr(inputs,'pixel_values',None),
	attention_mask=inputs.attention_mask,
	image_grid_thw=getattr(inputs,'image_grid_thw',None),
	siglip_hidden_states=siglip_hs,
	output_type='denoise_embeds'
	)
	prm_embeds, pooled = encode_prompt(
	state['text_encoders'], state['tokenizers'],
	text if joint_with_t5 else '', 256, state['device'], 1
	)
	emb = torch.concat([lvlm, prm_embeds], dim=1) if joint_with_t5 else lvlm


	def diffusion_to_gradio_callback(_pipeline, step_idx: int, timestep: int, tensor_dict: Dict):
	# 1）更新 Gradio 进度条
	frac = (step_idx + 1) / float(steps)
	progress(frac)

	return tensor_dict

	with torch.no_grad():
	img = state['pipe'](
	prompt_embeds=emb, pooled_prompt_embeds=pooled,
	height=height, width=width,
	num_inference_steps=steps,
	guidance_scale=guidance,
	generator=torch.Generator(device='cuda').manual_seed(seed),
	num_images_per_prompt=num_imgs,
	callback_on_step_end=diffusion_to_gradio_callback,
	# callback_on_step_end_tensor_inputs=["latents", "prompt_embeds"],
	).images
	# img = [add_plain_text_watermark(im, 'Open-Sora Plan 2.0 Generated') for im in img]
	img = concat_images_adaptive(img)
	save_path = f"tmp/{uuid.uuid4().hex}.png"
	img.save(save_path)
	convo.append({'role':'assistant','content':[{'type':'image','image':save_path}]})
	cur_genimg_i += 1
	progress(1.0)
	bot_msg = (None, save_path)
	else:
	# text generation
	gen_ids = state['model'].generate(**inputs, max_new_tokens=128)
	out = state['processor'].batch_decode(
	[g[len(inputs.input_ids[0]):] for g in gen_ids], skip_special_tokens=True
	)[0]
	convo.append({'role':'assistant','content':[{'type':'text','text':out}]})
	bot_msg = (None, out)


	chat_pairs = []
	# print(convo)
	# print()
	# print()
	for msg in convo:
	# print(msg)
	if msg['role']=='user':
	parts = []
	for c in msg['content']:
	if c['type']=='text': parts.append(c['text'])
	if c['type']=='image': parts.append(f"![user image]({img2b64(c['image'])})")
	chat_pairs.append(("\n".join(parts), None))
	else:
	parts = []
	for c in msg['content']:
	if c['type']=='text': parts.append(c['text'])
	if c['type']=='image': parts.append(f"![assistant image]({img2b64(c['image'])})")
	if msg['content'][-1]['type']=='text':
	chat_pairs[-1] = (chat_pairs[-1][0], parts[-1])
	else:
	chat_pairs[-1] = (chat_pairs[-1][0], parts[-1])
	# print()
	# print(chat_pairs)

	# Update state
	history_state.update({
	'conversation': convo,
	'history_image_paths': image_paths,
	'cur_ocr_i': cur_ocr_i,
	'cur_genimg_i': cur_genimg_i
	})
	return chat_pairs, history_state, seed
	except Exception as e:
	# 捕捉所有异常，返回错误提示，建议用户清理历史后重试
	error_msg = f"发生错误：{e}. 请点击 \"Clear History\" 清理对话历史后再试一次。"
	chat_pairs = [(None, error_msg)]
	# 不修改 history_state，让用户自行清理
	return chat_pairs, history_state, seed

	def copy_seed_for_user(real_seed):
	# 这个函数会把隐藏的 seed_holder 值，传给真正要显示的 seed Textbox
	return real_seed

	def clear_inputs():
	# img1 和 img2 用 None 来清空；text_in 用空字符串清空；seed 同理清空
	return None, None, "", ""
	@spaces.GPU
	def clear_history():
	gc.collect()
	if torch.cuda.is_available():
	torch.cuda.empty_cache()
	torch.cuda.ipc_collect()
	# 默认 prompt 和 seed
	default_prompt = "Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement."
	default_seed = "-1"

	# 1. chatbot 要用 gr.update(value=[]) 清空
	# 2. state 直接给回初始 dict
	# 3. prompt 和 seed 同样用 gr.update()
	return (
	gr.update(value=[]), # 清空聊天框
	{'conversation':[], # 重置 state
	'history_image_paths':[],
	'cur_ocr_i':0,
	'cur_genimg_i':0},
	gr.update(value=None), # 重置 image1
	gr.update(value=None), # 重置 image2
	gr.update(value=default_prompt), # 重置 prompt 文本框
	gr.update(value=default_seed), # 重置 seed 文本框
	)


	if __name__ == '__main__':
	# Gradio UI
	with gr.Blocks(
	theme=gr.themes.Soft(),
	css=css
	) as demo:

	gr.Markdown(
	"""
	<div style="text-align:center;">

	# 🎉 UniWorld-V1 Chat Interface 🎉

	### Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding

	Usage Guide:
	- It is recommended to perform inference on four images concurrently to offer varied selections.
	- Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.
	</div>
	""",
	elem_classes="header-text",
	)
	with gr.Row():
	with gr.Column():
	chatbot = gr.Chatbot(
	max_height=100000, min_height=700,
	height=None,
	resizable=True,
	show_copy_button=True
	)
	text_in = gr.Textbox(label="Instruction", value="Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.")
	with gr.Column():
	with gr.Row():
	img1 = gr.Image(type='filepath', label="Image 1", height=256, width=256)
	img2 = gr.Image(type='filepath', label="Image 2 (Optional reference)", height=256, width=256, visible=True)
	seed = gr.Textbox(label="Seed (-1 for random)", value="-1")
	seed_holder = gr.Textbox(visible=False)
	with gr.Row():
	num_imgs = gr.Slider(1, 4, 4, step=1, label="Num Images")
	with gr.Row():
	height = gr.Slider(256, 2048, 1024, step=64, label="Height")
	width = gr.Slider(256, 2048, 1024, step=64, label="Width")
	with gr.Row():
	steps = gr.Slider(8, 50, 30, step=1, label="Inference steps")
	guidance = gr.Slider(1.0, 10.0, 4.0, step=0.1, label="Guidance scale")
	with gr.Accordion("Advanced Options", open=True, visible=True):
	with gr.Row():
	enhance_gen_box = gr.Checkbox(value=False, label="Enhance Generation")
	enhance_und_box = gr.Checkbox(value=False, label="Enhance Understanding")
	with gr.Row():
	ocr_box = gr.Checkbox(value=False, label="Enhance Text Rendering")
	t5_box = gr.Checkbox(value=True, label="Enhance Current Turn")
	with gr.Row():
	submit = gr.Button("Send", variant="primary")
	clear = gr.Button("Clear History", variant="primary")
	with gr.Row():
	with gr.Column(1, min_width=0):
	gr.Markdown(
	"""
	🖼️ Visual Perception & Feature Extraction
	- Canny Edge Detection
	- Mini-Line Segment Detection
	- Normal Map Generation
	- Sketch Generation
	- Holistically-Nested Edge Detection
	- Depth Estimation
	- Human Pose Estimation
	- Object Detection (Boxes)
	- Semantic Segmentation (Masks)
	"""
	)
	with gr.Column(1, min_width=0):
	gr.Markdown(
	"""
	✂️ Image Editing & Manipulation
	- Add Elements
	- Adjust Attributes
	- Change Background
	- Remove Objects
	- Replace Regions
	- Perform Actions
	- Restyle
	- Compose Scenes
	"""
	)
	with gr.Column(1, min_width=0):
	gr.Markdown(
	"""
	🔄 Cross-Modal Synthesis & Transformation
	- Text→Image Synthesis
	- Image‑to‑Image Translation
	- Multi‑Image Combination
	- Extract IP Features
	- IP Feature Composition
	"""
	)
	with gr.Column(1, min_width=0):
	gr.Markdown(
	"""
	🤖 Visual & Textual QA
	- Image‑Text QA
	- Text‑Text QA
	"""
	)
	anchor_pixels = 1024*1024
	# Dynamic resize callback
	def update_size(i1, i2):
	shapes = []
	for p in (i1, i2):
	if p:
	im = Image.open(p)
	w, h = im.size
	shapes.append((w, h))
	if not shapes:
	return gr.update(), gr.update()
	if len(shapes) == 1:
	w, h = shapes[0]
	else:
	w = sum(s[0] for s in shapes) / len(shapes)
	h = sum(s[1] for s in shapes) / len(shapes)
	new_h, new_w = dynamic_resize(int(h), int(w), 'any_11ratio', anchor_pixels=anchor_pixels)
	return gr.update(value=new_h), gr.update(value=new_w)
	img1.change(fn=update_size, inputs=[img1, img2], outputs=[height, width])
	img2.change(fn=update_size, inputs=[img1, img2], outputs=[height, width])

	# Mutual exclusivity
	enhance_gen_box.change(
	lambda g: gr.update(value=False) if g else gr.update(),
	inputs=[enhance_gen_box], outputs=[enhance_und_box]
	)
	enhance_und_box.change(
	lambda u: gr.update(value=False) if u else gr.update(),
	inputs=[enhance_und_box], outputs=[enhance_gen_box]
	)
	state_ = gr.State({'conversation':[], 'history_image_paths':[], 'cur_ocr_i':0, 'cur_genimg_i':0})

	progress_bar = gr.Progress()
	gr.on(
	triggers=[submit.click, text_in.submit],
	fn=chat_step,
	inputs=[img1, img2, text_in, height, width, steps, guidance,
	ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs, state_,
	],
	outputs=[chatbot, state_, seed_holder],
	scroll_to_output=True
	).then(
	fn=copy_seed_for_user,
	inputs=[seed_holder], # 输入是隐藏的 seed_holder
	outputs=[seed] # 输出到真正要显示的 seed Textbox
	)

	clear.click(
	fn=clear_history,
	inputs=[],
	outputs=[chatbot, state_, img1, img2, text_in, seed]
	)

	# ========== 添加 Validation Examples ==========
	example_height, example_width = 1024, 1024
	gr.Examples(
	examples_per_page=100,
	examples=[
	# text-to-image
	[None, None,
	"Generate an adorable golden retriever puppy playing in a sunny park, "
	"with fluffy fur, big round eyes, and a happy expression. "
	"The background should have green grass, some flowers, and a blue sky with white clouds.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],


	# NIKE color swap
	["assets/nike_src.jpg", None,
	"Switch the product's color from black, black to white, white, making sure the transition is crisp and clear.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# style transfer (Ghibli)
	["assets/gradio/origin.png", None,
	"Translate this photo into a Studio Ghibli-style illustration, holding true to the original composition and movement.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	["assets/gradio/origin.png", None,
	"Remove the bicycle located in the lower center region of the image.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# blur
	["assets/gradio/blur.jpg", None,
	"Remove blur, make it clear.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	#
	["assets/gradio/00004614_tgt.jpg", None,
	"Add the ingrid fair isle cashmere turtleneck sweater to the person.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
	#
	["assets/gradio/00006581_tgt.jpg", None,
	"Place the belvoir broderie anglaise linen tank on the person in a way that complements their appearance and style.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
	#
	["assets/gradio/00008153_tgt.jpg", None,
	"Integrate may cashmere tank on body.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
	#
	["assets/gradio/00002315_src.jpg", None,
	"Strip away all context and distractions, leaving the pointelle-trimmed cashmere t-shirt floating on a neutral background.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],
	#
	["assets/gradio/00002985_src.jpg", None,
	"Generate an image containing only the henry shearling jacket, free from any other visual elements.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	["assets/gradio/origin.png", None,
	"Add a cat in the center of image.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# image+image-to-image (compose)
	["assets/00182555_target.jpg",
	"assets/00182555_InstantStyle_ref_1.jpg",
	"Adapt Image1's content to fit the aesthetic of Image2.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# replace object
	["assets/replace_src.png", None,
	"replace motorcycle located in the lower center region of the image with a black bicycle",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# segmentation
	["assets/seg_src.jpg", None,
	"Segment the giraffe from the background.\n",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# detection
	["assets/det_src.jpg", None,
	"Please depict the vase accurately",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# image-to-canny
	["assets/canny_image.jpg", None,
	"Generate a Canny edge map for this image.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# image-to-mlsd
	["assets/mlsd_image.jpg", None,
	"Render an MLSD detection overlay for this input image.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# image-to-normal
	["assets/normal_image.jpg", None,
	"Convert the input texture into a tangent-space normal map.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# image-to-sketch
	["assets/sketch_image.jpg", None,
	"Transform this image into a hand-drawn charcoal sketch.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# image-to-hed
	["assets/hed_image.jpg", None,
	"Produce a holistically-nested boundary probability map of this image.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# image-to-depth
	["assets/depth_image.jpg", None,
	"Estimate depth with a focus on background structure.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	# image-to-image (reconstruction)
	["assets/rec.jpg", None,
	"Simply reconstruct the original image with no enhancements.",
	example_height, example_width, 30, 4.0, False, False, False, False, "-1", 4],

	],
	inputs=[img1, img2, text_in, height, width, steps, guidance,
	ocr_box, t5_box, enhance_gen_box, enhance_und_box, seed, num_imgs],
	)
	# ==============================================

	UI_TRANSLATIONS = {
	"🎉 UniWorld-V1 Chat Interface 🎉":"🎉 UniWorld-V1 聊天界面 🎉",
	"Unlock Cutting‑Edge Visual Perception, Feature Extraction, Editing, Synthesis, and Understanding":
	'解锁尖端视觉感知，特征提取，编辑，合成和理解',
	"Usage Guide:":"使用指南：",
	"It is recommended to perform inference on four images concurrently to offer varied selections.":"建议同时进行四张图像的推理，以提供多选。",
	"Uploaded images are automatically resized; manually specifying resolutions that differ substantially from the original is not advised.":"已上传的图像将自动调整大小，但手动指定与原始图像差异太大的分辨率并不建议。",
	"🖼️ Visual Perception & Feature Extraction":"🖼️ 视觉感知与特征提取",
	"Canny Edge Detection":"Canny边缘检测 ",
	"Mini-Line Segment Detection":"微型行段检测",
	"Normal Map Generation":"生成法线图",
	"Sketch Generation":"手绘生成",
	"Holistically-Nested Edge Detection":"整体嵌套边缘检测",
	"Depth Estimation":"深度估计",
	"Human Pose Estimation":"人体姿势估计",
	"Object Detection (Boxes)":"对象检测（框）",
	"Semantic Segmentation (Masks)":"语义分割（蒙版）",
	"✂️ Image Editing & Manipulation":"✂️ 图像编辑与操作",
	"Add Elements":"添加元素",
	"Adjust Attributes":"调整属性",
	"Change Background":"更改背景",
	"Remove Objects":"删除对象",
	"Replace Regions":"替换区域",
	"Perform Actions":"执行操作",
	"Restyle":"重绘风格",
	"Compose Scenes":"组合场景",
	"🔄 Cross-Modal Synthesis & Transformation":"🔄 跨模态综合与转换",
	"Text→Image Synthesis":"文本→图像综合",
	"Image‑to‑Image Translation":"图像-图像转换",
	"Multi‑Image Combination":"多图像组合",
	"Extract IP Features":"提取IP特征",
	"IP Feature Composition":"IP特征组合",
	"🤖 Visual & Textual QA":"🤖 视觉和文字质量检查",
	"Image‑Text QA":"图像-文本质量检查",
	"Text‑Text QA":"文本-文本质量检查",
	"Image 1":"图像 1",
	"Image 2 (Optional reference)":"图像 2 (可选参考)",
	"Instruction":"指令",
	"Seed (-1 for random)":"种子 (-1为随机)",
	"Num Images":"图像数量",
	"Height":"高度",
	"Width":"宽度",
	"Inference steps":"推理步数",
	"Guidance scale":"引导缩放",
	"Advanced Options":"高级选项",
	"Enhance Generation":"增强生成",
	"Enhance Understanding":"增强理解",
	"Enhance Text Rendering":"增强文本渲染",
	"Enhance Current Turn":"增强当前轮次",
	"Send":"发送",
	"Clear History":"清除历史记录",
	}


	def apply_localization(block):
	def process_component(component):
	if not component:
	return

	for attr in ['label', 'info', 'placeholder']:
	if hasattr(component, attr):
	text = getattr(component, attr)
	if text in UI_TRANSLATIONS:
	setattr(component, attr, UI_TRANSLATIONS[text])

	if hasattr(component, 'value'):
	value = component.value
	if isinstance(value, str) and value in UI_TRANSLATIONS:
	component.value = UI_TRANSLATIONS[value]

	if isinstance(component, gr.Markdown):
	for en, zh in UI_TRANSLATIONS.items():
	component.value = component.value.replace(en, zh)

	if hasattr(component, 'children'):
	for child in component.children:
	process_component(child)

	process_component(block)
	return block


	if __name__ == "__main__":
	if args.zh:
	demo = apply_localization(demo)
	demo.title = "UniWorld-V1"
	demo.launch(
	allowed_paths=["/"],
	server_name=args.server_name,
	server_port=args.server_port,
	share=args.share,
	inbrowser=True,
	)


	'''
	MODEL_PATH="/mnt/data/lb/Remake/FlowWorld/checkpoints/flux_qwen2p5vl_7b_vlm_mlp_siglip_stage2_ts_1024_bs42x8x1_fa_any_11ratio_ema999_ocr_adamw_t5_0p4_lr1e-5_mask_refstyle_extract_resume_run3/checkpoint-12000/model_ema"
	FLUX_PATH="/mnt/data/checkpoints/black-forest-labs/FLUX.1-dev"
	SIGLIP_PATH="/mnt/data/checkpoints/google/siglip2-so400m-patch16-512"
	CUDA_VISIBLE_DEVICES=2 python app.py \
	--model_path ${MODEL_PATH} \
	--flux_path ${FLUX_PATH} \
	--siglip_path ${SIGLIP_PATH}
	'''