dangthr committed · Commit 4eb64b8 · verified · 1 Parent(s): 409931c

Rename gradio.py to inference.py

Files changed (1)
1. gradio.py → inference.py +193 -215
gradio.py → inference.py RENAMED
@@ -1,282 +1,260 @@
- import gradio as gr
- import numpy as np
  import random
- import torch
- import spaces

  from PIL import Image
  from diffusers import QwenImageEditPipeline

- import os
- import base64
- import json

  SYSTEM_PROMPT = '''
  # Edit Instruction Rewriter
- You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.

  Please strictly follow the rewriting rules below:

  ## 1. General Principles
- - Keep the rewritten prompt **concise**. Avoid overly long sentences and reduce unnecessary descriptive language.
- - If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
- - Keep the core intention of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
- - All added objects or modifications must align with the logic and style of the edited input image’s overall scene.

  ## 2. Task Type Handling Rules
  ### 1. Add, Delete, Replace Tasks
- - If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
- - If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
- > Original: "Add an animal"
- > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
- - Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
- - For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.

  ### 2. Text Editing Tasks
- - All text content must be enclosed in English double quotes `" "`. Do not translate or alter the original language of the text, and do not change the capitalization.
  - **For text replacement tasks, always use the fixed template:**
- - `Replace "xx" to "yy"`.
- - `Replace the xx bounding box to "yy"`.
- - If the user does not specify text content, infer and add concise text based on the instruction and the input image’s context. For example:
- > Original: "Add a line of text" (poster)
- > Rewritten: "Add text \"LIMITED EDITION\" at the top center with slight shadow"
- - Specify text position, color, and layout in a concise way.

  ### 3. Human Editing Tasks
- - Maintain the person’s core visual consistency (ethnicity, gender, age, hairstyle, expression, outfit, etc.).
- - If modifying appearance (e.g., clothes, hairstyle), ensure the new element is consistent with the original style.
- - **For expression changes, they must be natural and subtle, never exaggerated.**
  - If deletion is not specifically emphasized, the most important subject in the original image (e.g., a person, an animal) should be preserved.
- - For background change tasks, emphasize maintaining subject consistency at first.
- - Example:
- > Original: "Change the person’s hat"
- > Rewritten: "Replace the man’s hat with a dark brown beret; keep smile, short hair, and gray jacket unchanged"

  ### 4. Style Transformation or Enhancement Tasks
- - If a style is specified, describe it concisely with key visual traits. For example:
- > Original: "Disco style"
- > Rewritten: "1970s disco: flashing lights, disco ball, mirrored walls, colorful tones"
- - If the instruction says "use reference style" or "keep current style," analyze the input image, extract main features (color, composition, texture, lighting, art style), and integrate them concisely.
- - **For coloring tasks, including restoring old photos, always use the fixed template:** "Restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"
  - If there are other changes, place the style description at the end.

  ## 3. Rationality and Logic Checks
- - Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" should be logically corrected.
- - Add missing key information: if position is unspecified, choose a reasonable area based on composition (near subject, empty space, center/edges).

  # Output Format Example
  ```json
  {
- "Rewritten": "..."
  }
  '''

  def polish_prompt(prompt, img):
-     prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {prompt}\n\nRewritten Prompt:"
-     success=False
-     while not success:
          try:
-             result = api(prompt, [img])
-             # print(f"Result: {result}")
-             # print(f"Polished Prompt: {polished_prompt}")
              if isinstance(result, str):
-                 result = result.replace('```json','')
-                 result = result.replace('```','')
-                 result = json.loads(result)
              else:
-                 result = json.loads(result)
-
-             polished_prompt = result['Rewritten']
-             polished_prompt = polished_prompt.strip()
-             polished_prompt = polished_prompt.replace("\n", " ")
-             success = True
          except Exception as e:
-             print(f"[Warning] Error during API call: {e}")
-     return polished_prompt
-

  def encode_image(pil_image):
-     import io
-     buffered = io.BytesIO()
      pil_image.save(buffered, format="PNG")
      return base64.b64encode(buffered.getvalue()).decode("utf-8")

-
-
-
  def api(prompt, img_list, model="qwen-vl-max-latest", kwargs={}):
      import dashscope
      api_key = os.environ.get('DASH_API_KEY')
      if not api_key:
          raise EnvironmentError("DASH_API_KEY is not set")
-     assert model in ["qwen-vl-max-latest"], f"Not implemented model {model}"
-     sys_promot = "you are a helpful assistant, you should provide useful answers to users."
      messages = [
-         {"role": "system", "content": sys_promot},
-         {"role": "user", "content": []}]
      for img in img_list:
-         messages[1]["content"].append(
-             {"image": f"data:image/png;base64,{encode_image(img)}"})
      messages[1]["content"].append({"text": f"{prompt}"})

-     response_format = kwargs.get('response_format', None)
-
      response = dashscope.MultiModalConversation.call(
          api_key=api_key,
-         model=model, # For example, use qwen-plus here. You can change the model name as needed. Model list: https://help.aliyun.com/zh/model-studio/getting-started/models
          messages=messages,
          result_format='message',
-         response_format=response_format,
-     )

      if response.status_code == 200:
          return response.output.choices[0].message.content[0]['text']
      else:
          raise Exception(f'Failed to post: {response}')

- # --- Model Loading ---
- dtype = torch.bfloat16
- device = "cuda" if torch.cuda.is_available() else "cpu"
-
- # Load the model pipeline
- pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=dtype).to(device)
-
- # --- UI Constants and Helpers ---
- MAX_SEED = np.iinfo(np.int32).max
-
- # --- Main Inference Function (with hardcoded negative prompt) ---
- @spaces.GPU(duration=300)
- def infer(
-     image,
-     prompt,
-     seed=42,
-     randomize_seed=False,
-     true_guidance_scale=1.0,
-     num_inference_steps=50,
-     rewrite_prompt=True,
-     num_images_per_prompt=1,
-     progress=gr.Progress(track_tqdm=True),
- ):
-     """
-     Generates an image using the local Qwen-Image diffusers pipeline.
-     """
-     # Hardcode the negative prompt as requested
-     negative_prompt = " "
-
-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)
-
-     # Set up the generator for reproducibility
-     generator = torch.Generator(device=device).manual_seed(seed)
-
-     print(f"Calling pipeline with prompt: '{prompt}'")
-     print(f"Negative Prompt: '{negative_prompt}'")
-     print(f"Seed: {seed}, Steps: {num_inference_steps}, Guidance: {true_guidance_scale}")
-     if rewrite_prompt:
-         prompt = polish_prompt(prompt, image)
-         print(f"Rewritten Prompt: {prompt}")
-
-     # Generate the image
-     image = pipe(
-         image,
-         prompt=prompt,
-         negative_prompt=negative_prompt,
-         num_inference_steps=num_inference_steps,
-         generator=generator,
-         true_cfg_scale=true_guidance_scale,
-         num_images_per_prompt=num_images_per_prompt
-     ).images
-
-     return image, seed
-
- # --- Examples and UI Layout ---
- examples = []
-
- css = """
- #col-container {
-     margin: 0 auto;
-     max-width: 1024px;
- }
- #edit_text{margin-top: -62px !important}
- """
-
- with gr.Blocks(css=css) as demo:
-     with gr.Column(elem_id="col-container"):
-         gr.HTML('<img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Image/qwen_image_edit_logo.png" alt="Qwen-Image Logo" width="400" style="display: block; margin: 0 auto;">')
-         gr.Markdown("[Learn more](https://github.com/QwenLM/Qwen-Image) about the Qwen-Image series. Try on [Qwen Chat](https://chat.qwen.ai/), or [download model](https://huggingface.co/Qwen/Qwen-Image-Edit) to run locally with ComfyUI or diffusers.")
-         with gr.Row():
-             with gr.Column():
-                 input_image = gr.Image(label="Input Image", show_label=False, type="pil")
-
-             # result = gr.Image(label="Result", show_label=False, type="pil")
-             result = gr.Gallery(label="Result", show_label=False, type="pil")
-         with gr.Row():
-             prompt = gr.Text(
-                 label="Prompt",
-                 show_label=False,
-                 placeholder="describe the edit instruction",
-                 container=False,
-             )
-             run_button = gr.Button("Edit!", variant="primary")
-
-         with gr.Accordion("Advanced Settings", open=False):
-             # Negative prompt UI element is removed here
-
-             seed = gr.Slider(
-                 label="Seed",
-                 minimum=0,
-                 maximum=MAX_SEED,
-                 step=1,
-                 value=0,
-             )
-
-             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
-
-             with gr.Row():
-
-                 true_guidance_scale = gr.Slider(
-                     label="True guidance scale",
-                     minimum=1.0,
-                     maximum=10.0,
-                     step=0.1,
-                     value=4.0
-                 )
-
-                 num_inference_steps = gr.Slider(
-                     label="Number of inference steps",
-                     minimum=1,
-                     maximum=50,
-                     step=1,
-                     value=50,
-                 )
-
-                 num_images_per_prompt = gr.Slider(
-                     label="Number of images per prompt",
-                     minimum=1,
-                     maximum=4,
-                     step=1,
-                     value=1,
-                 )
-
-             rewrite_prompt = gr.Checkbox(label="Rewrite prompt", value=True)
-
-         # gr.Examples(examples=examples, inputs=[prompt], outputs=[result, seed], fn=infer, cache_examples=False)
-
-     gr.on(
-         triggers=[run_button.click, prompt.submit],
-         fn=infer,
-         inputs=[
-             input_image,
-             prompt,
-             seed,
-             randomize_seed,
-             true_guidance_scale,
-             num_inference_steps,
-             rewrite_prompt,
-             num_images_per_prompt,
-         ],
-         outputs=[result, seed],
      )

- if __name__ == "__main__":
-     demo.launch()
 
+ # inference.py
+ import os
+ import argparse
  import random
+ import json
+ import base64
+ from io import BytesIO
+ from datetime import datetime

+ import torch
+ import numpy as np
+ import requests
  from PIL import Image
  from diffusers import QwenImageEditPipeline

+ # --- Helper functions retained from the original script ---

  SYSTEM_PROMPT = '''
  # Edit Instruction Rewriter
+ You are a professional edit instruction rewriter. Your task is to generate a precise, concise, and visually achievable professional-level edit instruction based on the user-provided instruction and the image to be edited.

  Please strictly follow the rewriting rules below:

  ## 1. General Principles
+ - Keep the rewritten prompt **concise**. Avoid overly long sentences and reduce unnecessary descriptive language.
+ - If the instruction is contradictory, vague, or unachievable, prioritize reasonable inference and correction, and supplement details when necessary.
+ - Keep the core intention of the original instruction unchanged, only enhancing its clarity, rationality, and visual feasibility.
+ - All added objects or modifications must align with the logic and style of the edited input image’s overall scene.

  ## 2. Task Type Handling Rules
  ### 1. Add, Delete, Replace Tasks
+ - If the instruction is clear (already includes task type, target entity, position, quantity, attributes), preserve the original intent and only refine the grammar.
+ - If the description is vague, supplement with minimal but sufficient details (category, color, size, orientation, position, etc.). For example:
+     > Original: "Add an animal"
+     > Rewritten: "Add a light-gray cat in the bottom-right corner, sitting and facing the camera"
+ - Remove meaningless instructions: e.g., "Add 0 objects" should be ignored or flagged as invalid.
+ - For replacement tasks, specify "Replace Y with X" and briefly describe the key visual features of X.

  ### 2. Text Editing Tasks
+ - All text content must be enclosed in English double quotes `" "`. Do not translate or alter the original language of the text, and do not change the capitalization.
  - **For text replacement tasks, always use the fixed template:**
+     - `Replace "xx" to "yy"`.
+     - `Replace the xx bounding box to "yy"`.
+ - If the user does not specify text content, infer and add concise text based on the instruction and the input image’s context. For example:
+     > Original: "Add a line of text" (poster)
+     > Rewritten: "Add text \"LIMITED EDITION\" at the top center with slight shadow"
+ - Specify text position, color, and layout in a concise way.

  ### 3. Human Editing Tasks
+ - Maintain the person’s core visual consistency (ethnicity, gender, age, hairstyle, expression, outfit, etc.).
+ - If modifying appearance (e.g., clothes, hairstyle), ensure the new element is consistent with the original style.
+ - **For expression changes, they must be natural and subtle, never exaggerated.**
  - If deletion is not specifically emphasized, the most important subject in the original image (e.g., a person, an animal) should be preserved.
+     - For background change tasks, emphasize maintaining subject consistency at first.
+ - Example:
+     > Original: "Change the person’s hat"
+     > Rewritten: "Replace the man’s hat with a dark brown beret; keep smile, short hair, and gray jacket unchanged"

  ### 4. Style Transformation or Enhancement Tasks
+ - If a style is specified, describe it concisely with key visual traits. For example:
+     > Original: "Disco style"
+     > Rewritten: "1970s disco: flashing lights, disco ball, mirrored walls, colorful tones"
+ - If the instruction says "use reference style" or "keep current style," analyze the input image, extract main features (color, composition, texture, lighting, art style), and integrate them concisely.
+ - **For coloring tasks, including restoring old photos, always use the fixed template:** "Restore old photograph, remove scratches, reduce noise, enhance details, high resolution, realistic, natural skin tones, clear facial features, no distortion, vintage photo restoration"
  - If there are other changes, place the style description at the end.

  ## 3. Rationality and Logic Checks
+ - Resolve contradictory instructions: e.g., "Remove all trees but keep all trees" should be logically corrected.
+ - Add missing key information: if position is unspecified, choose a reasonable area based on composition (near subject, empty space, center/edges).

  # Output Format Example
  ```json
  {
+     "Rewritten": "..."
  }
  '''

  def polish_prompt(prompt, img):
+     """Rewrite and polish the prompt using the DashScope API."""
+     if not os.environ.get('DASH_API_KEY'):
+         print("[Warning] The DASH_API_KEY environment variable is not set; skipping prompt rewriting.")
+         return prompt
+
+     full_prompt = f"{SYSTEM_PROMPT}\n\nUser Input: {prompt}\n\nRewritten Prompt:"
+     for attempt in range(3):  # retry up to 3 times
          try:
+             result = api(full_prompt, [img])
              if isinstance(result, str):
+                 result_json_str = result.replace('```json', '').replace('```', '')
+                 result_data = json.loads(result_json_str)
              else:
+                 result_data = json.loads(result)
+
+             polished = result_data['Rewritten']
+             return polished.strip().replace("\n", " ")
          except Exception as e:
+             print(f"[Warning] API call failed (attempt {attempt + 1}): {e}")
+
+     print("[Error] Prompt rewriting failed after multiple attempts; using the original prompt.")
+     return prompt

  def encode_image(pil_image):
+     """Encode a PIL image as a base64 string."""
+     buffered = BytesIO()
      pil_image.save(buffered, format="PNG")
      return base64.b64encode(buffered.getvalue()).decode("utf-8")

  def api(prompt, img_list, model="qwen-vl-max-latest", kwargs={}):
+     """Call the DashScope multimodal API."""
      import dashscope
      api_key = os.environ.get('DASH_API_KEY')
      if not api_key:
          raise EnvironmentError("DASH_API_KEY is not set")
+
      messages = [
+         {"role": "system", "content": "you are a helpful assistant, you should provide useful answers to users."},
+         {"role": "user", "content": []}
+     ]
      for img in img_list:
+         messages[1]["content"].append({"image": f"data:image/png;base64,{encode_image(img)}"})
      messages[1]["content"].append({"text": f"{prompt}"})

      response = dashscope.MultiModalConversation.call(
          api_key=api_key,
+         model=model,
          messages=messages,
          result_format='message',
+         response_format=kwargs.get('response_format', None),
+     )

      if response.status_code == 200:
          return response.output.choices[0].message.content[0]['text']
      else:
          raise Exception(f'Failed to post: {response}')

+ def load_image(image_path):
+     """Load an image from a local path or URL."""
+     try:
+         if image_path.startswith("http://") or image_path.startswith("https://"):
+             response = requests.get(image_path)
+             response.raise_for_status()
+             image = Image.open(BytesIO(response.content)).convert("RGB")
+         else:
+             image = Image.open(image_path).convert("RGB")
+         return image
+     except Exception as e:
+         print(f"❌ Error: failed to load image '{image_path}'. Check that the path or URL is correct.")
+         print(f"   Details: {e}")
+         return None
+
+ # --- Main inference logic ---
+
+ def main(args):
+     """Main function that runs model inference."""
+     output_dir = "output"
+     os.makedirs(output_dir, exist_ok=True)
+
+     dtype = torch.bfloat16
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f"Using device: {device}")
+
+     print("Loading the Qwen-Image-Edit model...")
+     try:
+         pipe = QwenImageEditPipeline.from_pretrained("Qwen/Qwen-Image-Edit", torch_dtype=dtype).to(device)
+         print("Model loaded.")
+     except Exception as e:
+         print("❌ Error: failed to load the model. Check your network connection and dependencies.")
+         print(f"   Details: {e}")
+         return
+
+     print(f"Loading input image from '{args.input_image}'...")
+     input_image = load_image(args.input_image)
+     if input_image is None:
+         return
+
+     # Set the random seed
+     seed = random.randint(0, np.iinfo(np.int32).max) if args.random_seed else args.seed
+     generator = torch.Generator(device=device).manual_seed(seed)
+
+     # Call polish_prompt unless rewriting is disabled
+     prompt_to_use = polish_prompt(args.prompt, input_image) if not args.no_rewrite else args.prompt

+     if not args.no_rewrite:
+         print(f"Rewritten prompt: '{prompt_to_use}'")
+
+     print("-" * 30)
+     print("🚀 Starting inference...")
+     print(f"  - Prompt: '{prompt_to_use}'")
+     print(f"  - Seed: {seed}")
+     print(f"  - Inference steps: {args.steps}")
+     print(f"  - Guidance scale: {args.guidance_scale}")
+     print("-" * 30)
+
+     try:
+         images = pipe(
+             image=input_image,
+             prompt=prompt_to_use,
+             negative_prompt=" ",  # fixed negative prompt
+             num_inference_steps=args.steps,
+             generator=generator,
+             true_cfg_scale=args.guidance_scale,
+             num_images_per_prompt=1
+         ).images
+
+         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+         output_path = os.path.join(output_dir, f"output_{timestamp}_{seed}.png")
+         images[0].save(output_path)
+         print(f"✅ Inference succeeded! Image saved to: {output_path}")
+
+     except Exception as e:
+         print(f"❌ An error occurred during inference: {e}")
+
+ # --- Command-line interface ---

+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Qwen image editing command-line tool")

+     parser.add_argument(
+         "--prompt",
+         type=str,
+         required=True,
+         help="Required: the instruction used to edit the image."
+     )
+     parser.add_argument(
+         "--input_image",
+         type=str,
+         required=True,
+         help="Required: local path or URL of the input image."
+     )
+     parser.add_argument(
+         "--seed",
+         type=int,
+         default=42,
+         help="Random seed for reproducibility; defaults to 42."
+     )
+     parser.add_argument(
+         "--random_seed",
+         action="store_true",
+         help="If set, use a random seed."
+     )
+     parser.add_argument(
+         "--steps",
+         type=int,
+         default=50,
+         help="Number of inference steps; defaults to 50."
+     )
+     parser.add_argument(
+         "--guidance_scale",
+         type=float,
+         default=4.0,
+         help="Guidance scale (CFG scale); defaults to 4.0."
+     )
+     parser.add_argument(
+         "--no_rewrite",
+         action="store_true",
+         help="If set, disable prompt rewriting."
      )

+     args = parser.parse_args()
+     main(args)
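
For reference, here is a minimal sketch of driving the same pipeline call that `main()` performs, with an equivalent CLI invocation shown as a comment. The input path, prompt, and seed are placeholders; it assumes a `diffusers` build that ships `QwenImageEditPipeline` and enough GPU memory for bfloat16 inference.

```python
# Equivalent CLI call (placeholder path and prompt):
#   python inference.py --prompt "Replace the hat with a dark brown beret" \
#       --input_image ./photo.png --steps 50 --guidance_scale 4.0 --no_rewrite
import torch
from PIL import Image
from diffusers import QwenImageEditPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = QwenImageEditPipeline.from_pretrained(
    "Qwen/Qwen-Image-Edit", torch_dtype=torch.bfloat16
).to(device)

image = Image.open("photo.png").convert("RGB")  # placeholder input image
generator = torch.Generator(device=device).manual_seed(42)

# Mirrors the arguments main() passes to the pipeline
result = pipe(
    image=image,
    prompt="Replace the hat with a dark brown beret",  # placeholder instruction
    negative_prompt=" ",                               # fixed, as in the script
    num_inference_steps=50,
    generator=generator,
    true_cfg_scale=4.0,
    num_images_per_prompt=1,
).images[0]
result.save("edited.png")
```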