akhaliq HF Staff committed on
Commit
ebd4771
·
1 Parent(s): c5c1881

add image to video gen

Browse files
Files changed (1) hide show
  1. app.py +413 -20
app.py CHANGED
@@ -28,6 +28,8 @@ from huggingface_hub import HfApi
28
  import tempfile
29
  from openai import OpenAI
30
  from mistralai import Mistral
 
 
31
 
32
  # Gradio supported languages for syntax highlighting
33
  GRADIO_SUPPORTED_LANGUAGES = [
@@ -86,6 +88,64 @@ Structural requirements:
86
  Return ONLY the code inside a single ```html ... ``` code block. No additional text before or after.
87
  """
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  TRANSFORMERS_JS_SYSTEM_PROMPT = """You are an expert web developer creating a transformers.js application. You will generate THREE separate files: index.html, index.js, and style.css.
90
 
91
  IMPORTANT: You MUST output ALL THREE files in the following format:
@@ -1236,6 +1296,129 @@ def generate_image_to_image(input_image_data, prompt: str) -> str:
1236
  print(f"Image-to-image generation error: {str(e)}")
1237
  return f"Error generating image (image-to-image): {str(e)}"
1238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1239
  def extract_image_prompts_from_text(text: str, num_images_needed: int = 1) -> list:
1240
  """Extract image generation prompts from the full text based on number of images needed"""
1241
  # Use the entire text as the base prompt for image generation
@@ -1308,7 +1491,8 @@ def create_image_replacement_blocks(html_content: str, user_prompt: str) -> str:
1308
  # If no placeholder images found, look for any img tags
1309
  if not placeholder_images:
1310
  img_pattern = r'<img[^>]*>'
1311
- placeholder_images = re.findall(img_pattern, html_content)
 
1312
 
1313
  # Also look for div elements that might be image placeholders
1314
  div_placeholder_patterns = [
@@ -1543,17 +1727,127 @@ def create_image_replacement_blocks_from_input_image(html_content: str, user_pro
1543
 
1544
  return '\n\n'.join(replacement_blocks)
1545
 
1546
- def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_text_to_image: bool, enable_image_to_image: bool, input_image_data, image_to_image_prompt: str | None = None, text_to_image_prompt: str | None = None) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1547
  """Apply text-to-image and/or image-to-image replacements to HTML content.
1548
 
1549
  If both toggles are enabled, text-to-image replacements run first, then image-to-image.
1550
  """
1551
  result = html_content
1552
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1553
  # If an input image is provided and image-to-image is enabled, we only replace one image
1554
  # and skip text-to-image to satisfy the requirement to replace exactly the number of uploaded images.
1555
  if enable_image_to_image and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
1556
- # Prefer the dedicated image-to-image prompt if provided
1557
  i2i_prompt = (image_to_image_prompt or user_prompt or "").strip()
1558
  blocks2 = create_image_replacement_blocks_from_input_image(result, i2i_prompt, input_image_data, max_images=1)
1559
  if blocks2:
@@ -1562,11 +1856,16 @@ def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_t
1562
 
1563
  if enable_text_to_image and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
1564
  t2i_prompt = (text_to_image_prompt or user_prompt or "").strip()
 
1565
  # Single-image flow for text-to-image
1566
  blocks = create_image_replacement_blocks_text_to_image_single(result, t2i_prompt)
1567
  if blocks:
 
1568
  result = apply_search_replace_changes(result, blocks)
1569
  except Exception:
 
 
 
1570
  return html_content
1571
  return result
1572
 
@@ -1856,6 +2155,39 @@ Please use the search results above to help create the requested application wit
1856
  def send_to_sandbox(code):
1857
  """Render HTML in a sandboxed iframe. Assumes full HTML is provided by prompts."""
1858
  html_doc = (code or "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1859
  encoded_html = base64.b64encode(html_doc.encode('utf-8')).decode('utf-8')
1860
  data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
1861
  iframe = f'<iframe src="{data_uri}" width="100%" height="920px" sandbox="allow-scripts allow-same-origin allow-forms allow-popups allow-modals allow-presentation" allow="display-capture"></iframe>'
@@ -2361,7 +2693,7 @@ The HTML code above contains the complete original website structure with all im
2361
  stop_generation = False
2362
 
2363
 
2364
- def generation_code(query: Optional[str], image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto", enable_image_generation: bool = False, enable_image_to_image: bool = False, image_to_image_prompt: Optional[str] = None, text_to_image_prompt: Optional[str] = None):
2365
  if query is None:
2366
  query = ''
2367
  if _history is None:
@@ -2389,6 +2721,22 @@ def generation_code(query: Optional[str], image: Optional[gr.Image], file: Optio
2389
  '=== src/App.svelte ===' in last_assistant_msg):
2390
  has_existing_content = True
2391
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2392
  # Choose system prompt based on context
2393
  if has_existing_content:
2394
  # Use follow-up prompt for modifying existing content
@@ -2444,8 +2792,8 @@ This will help me create a better design for you."""
2444
 
2445
  # Check if this is GLM-4.5 model and handle with simple HuggingFace InferenceClient
2446
  if _current_model["id"] == "zai-org/GLM-4.5":
2447
- if image is not None:
2448
- messages.append(create_multimodal_message(enhanced_query, image))
2449
  else:
2450
  messages.append({'role': 'user', 'content': enhanced_query})
2451
 
@@ -2486,13 +2834,17 @@ This will help me create a better design for you."""
2486
  clean_code = remove_code_block(content)
2487
 
2488
  # Apply image generation (text→image and/or image→image)
 
2489
  final_content = apply_generated_images_to_html(
2490
  content,
2491
  query,
2492
  enable_text_to_image=enable_image_generation,
2493
  enable_image_to_image=enable_image_to_image,
2494
- input_image_data=image,
2495
  image_to_image_prompt=image_to_image_prompt,
 
 
 
2496
  )
2497
 
2498
  _history.append([query, final_content])
@@ -2647,13 +2999,17 @@ This will help me create a better design for you."""
2647
  clean_content = remove_code_block(modified_content)
2648
 
2649
  # Apply image generation (text→image and/or image→image)
 
2650
  clean_content = apply_generated_images_to_html(
2651
  clean_content,
2652
  query,
2653
  enable_text_to_image=enable_image_generation,
2654
  enable_image_to_image=enable_image_to_image,
2655
- input_image_data=image,
2656
  image_to_image_prompt=image_to_image_prompt,
 
 
 
2657
  )
2658
 
2659
  yield {
@@ -2664,14 +3020,18 @@ This will help me create a better design for you."""
2664
  }
2665
  else:
2666
  # Apply image generation (text→image and/or image→image)
 
2667
  final_content = apply_generated_images_to_html(
2668
  clean_code,
2669
  query,
2670
  enable_text_to_image=enable_image_generation,
2671
  enable_image_to_image=enable_image_to_image,
2672
- input_image_data=image,
2673
  image_to_image_prompt=image_to_image_prompt,
2674
  text_to_image_prompt=text_to_image_prompt,
 
 
 
2675
  )
2676
 
2677
  preview_val = None
@@ -2693,7 +3053,7 @@ This will help me create a better design for you."""
2693
  structured = [
2694
  {"role": "system", "content": GLM45V_HTML_SYSTEM_PROMPT}
2695
  ]
2696
- if image is not None:
2697
  user_msg = {
2698
  "role": "user",
2699
  "content": [
@@ -2704,10 +3064,10 @@ This will help me create a better design for you."""
2704
  import io, base64
2705
  from PIL import Image
2706
  import numpy as np
2707
- if isinstance(image, np.ndarray):
2708
- image = Image.fromarray(image)
2709
  buf = io.BytesIO()
2710
- image.save(buf, format="PNG")
2711
  b64 = base64.b64encode(buf.getvalue()).decode()
2712
  user_msg["content"].append({
2713
  "type": "image_url",
@@ -2775,8 +3135,8 @@ This will help me create a better design for you."""
2775
  # Use dynamic client based on selected model (for non-GLM-4.5 models)
2776
  client = get_inference_client(_current_model["id"], provider)
2777
 
2778
- if image is not None:
2779
- messages.append(create_multimodal_message(enhanced_query, image))
2780
  else:
2781
  messages.append({'role': 'user', 'content': enhanced_query})
2782
  try:
@@ -3060,13 +3420,17 @@ This will help me create a better design for you."""
3060
  clean_content = remove_code_block(modified_content)
3061
 
3062
  # Apply image generation (text→image and/or image→image)
 
3063
  clean_content = apply_generated_images_to_html(
3064
  clean_content,
3065
  query,
3066
  enable_text_to_image=enable_image_generation,
3067
  enable_image_to_image=enable_image_to_image,
3068
- input_image_data=image,
3069
  image_to_image_prompt=image_to_image_prompt,
 
 
 
3070
  text_to_image_prompt=text_to_image_prompt,
3071
  )
3072
 
@@ -3083,14 +3447,18 @@ This will help me create a better design for you."""
3083
  final_content = remove_code_block(content)
3084
 
3085
  # Apply image generation (text→image and/or image→image)
 
3086
  final_content = apply_generated_images_to_html(
3087
  final_content,
3088
  query,
3089
  enable_text_to_image=enable_image_generation,
3090
  enable_image_to_image=enable_image_to_image,
3091
- input_image_data=image,
3092
  image_to_image_prompt=image_to_image_prompt,
3093
  text_to_image_prompt=text_to_image_prompt,
 
 
 
3094
  )
3095
 
3096
  _history.append([query, final_content])
@@ -4138,6 +4506,11 @@ with gr.Blocks(
4138
  label="UI design image",
4139
  visible=False
4140
  )
 
 
 
 
 
4141
  image_to_image_prompt = gr.Textbox(
4142
  label="Image-to-Image Prompt",
4143
  placeholder="Describe how to transform the uploaded image (e.g., 'Turn the cat into a tiger.')",
@@ -4194,9 +4567,21 @@ with gr.Blocks(
4194
  visible=True,
4195
  info="Transform your uploaded image using FLUX.1-Kontext-dev"
4196
  )
 
 
 
 
 
 
 
 
 
 
 
 
4197
 
4198
  def on_image_to_image_toggle(toggled):
4199
- # Show image input and its prompt when image-to-image is enabled
4200
  return gr.update(visible=bool(toggled)), gr.update(visible=bool(toggled))
4201
 
4202
  def on_text_to_image_toggle(toggled):
@@ -4205,7 +4590,15 @@ with gr.Blocks(
4205
  image_to_image_toggle.change(
4206
  on_image_to_image_toggle,
4207
  inputs=[image_to_image_toggle],
4208
- outputs=[image_input, image_to_image_prompt]
 
 
 
 
 
 
 
 
4209
  )
4210
  image_generation_toggle.change(
4211
  on_text_to_image_toggle,
@@ -4462,7 +4855,7 @@ with gr.Blocks(
4462
  show_progress="hidden",
4463
  ).then(
4464
  generation_code,
4465
- inputs=[input, image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt],
4466
  outputs=[code_output, history, sandbox, history_output]
4467
  ).then(
4468
  end_generation_ui,
 
28
  import tempfile
29
  from openai import OpenAI
30
  from mistralai import Mistral
31
+ import uuid
32
+ import threading
33
 
34
  # Gradio supported languages for syntax highlighting
35
  GRADIO_SUPPORTED_LANGUAGES = [
 
88
  Return ONLY the code inside a single ```html ... ``` code block. No additional text before or after.
89
  """
90
 
91
# ---------------------------------------------------------------------------
# Video temp-file management (per-session tracking and cleanup)
# ---------------------------------------------------------------------------
# Shared directory for generated videos; one flat namespace for all sessions.
VIDEO_TEMP_DIR = os.path.join(tempfile.gettempdir(), "anycoder_videos")
VIDEO_FILE_TTL_SECONDS = 6 * 60 * 60  # 6 hours
# Maps session id -> list of video file paths created during that session.
_SESSION_VIDEO_FILES: Dict[str, List[str]] = {}
# Guards _SESSION_VIDEO_FILES against concurrent generation requests.
_VIDEO_FILES_LOCK = threading.Lock()


def _ensure_video_dir_exists() -> None:
    """Create the shared video temp directory if it is missing (best-effort)."""
    try:
        os.makedirs(VIDEO_TEMP_DIR, exist_ok=True)
    except Exception:
        pass


def _register_video_for_session(session_id: Optional[str], file_path: str) -> None:
    """Record *file_path* under *session_id* so cleanup_session_videos can remove it later."""
    if not session_id or not file_path:
        return
    with _VIDEO_FILES_LOCK:
        _SESSION_VIDEO_FILES.setdefault(session_id, []).append(file_path)


def cleanup_session_videos(session_id: Optional[str]) -> None:
    """Delete every video file previously registered for *session_id* (best-effort)."""
    if not session_id:
        return
    with _VIDEO_FILES_LOCK:
        tracked_paths = _SESSION_VIDEO_FILES.pop(session_id, [])
    for tracked in tracked_paths:
        try:
            if tracked and os.path.exists(tracked):
                os.unlink(tracked)
        except Exception:
            # Best-effort cleanup
            pass


def reap_old_videos(ttl_seconds: int = VIDEO_FILE_TTL_SECONDS) -> None:
    """Delete old video files in the temp directory based on modification time."""
    # NOTE(review): relies on the module-level `time` import — not visible in
    # this chunk of the file; confirm it is imported at the top of app.py.
    try:
        _ensure_video_dir_exists()
        cutoff = time.time() - ttl_seconds
        for entry in os.listdir(VIDEO_TEMP_DIR):
            candidate = os.path.join(VIDEO_TEMP_DIR, entry)
            try:
                # Only regular files strictly older than the TTL are removed.
                if os.path.isfile(candidate) and time.time() - os.path.getmtime(candidate) > ttl_seconds:
                    os.unlink(candidate)
            except Exception:
                pass
    except Exception:
        # Temp dir might not exist or be accessible; ignore
        pass
148
+
149
  TRANSFORMERS_JS_SYSTEM_PROMPT = """You are an expert web developer creating a transformers.js application. You will generate THREE separate files: index.html, index.js, and style.css.
150
 
151
  IMPORTANT: You MUST output ALL THREE files in the following format:
 
1296
  print(f"Image-to-image generation error: {str(e)}")
1297
  return f"Error generating image (image-to-image): {str(e)}"
1298
 
1299
def generate_video_from_image(input_image_data, prompt: str, session_id: Optional[str] = None) -> str:
    """Generate a video from an input image and prompt using Hugging Face InferenceClient.

    Returns an HTML <video> tag whose source points to a local file URL (file://...),
    or a human-readable "Error ..." string on failure.

    Args:
        input_image_data: Source image — a file-like object, a PIL Image, a
            numpy array, raw bytes/bytearray, or anything convertible to bytes.
        prompt: Text prompt describing the desired video.
        session_id: Optional session id; the saved temp file is registered
            under it so cleanup_session_videos() can delete it later.
    """
    try:
        print("[Image2Video] Starting video generation")
        if not os.getenv('HF_TOKEN'):
            print("[Image2Video] Missing HF_TOKEN")
            return "Error: HF_TOKEN environment variable is not set. Please set it to your Hugging Face API token."

        # Prepare client
        client = InferenceClient(
            provider="auto",
            api_key=os.getenv('HF_TOKEN'),
            bill_to="huggingface",
        )
        print("[Image2Video] InferenceClient initialized (provider=auto)")

        # Normalize input image to PNG bytes regardless of how it was supplied.
        import io
        from PIL import Image
        try:
            import numpy as np
        except Exception:
            np = None

        print(f"[Image2Video] Normalizing input image type={type(input_image_data)}")
        if hasattr(input_image_data, 'read'):
            # File-like object
            raw = input_image_data.read()
            pil_image = Image.open(io.BytesIO(raw))
        elif hasattr(input_image_data, 'mode') and hasattr(input_image_data, 'size'):
            # Duck-typed PIL Image (avoids isinstance on the lazy import)
            pil_image = input_image_data
        elif np is not None and isinstance(input_image_data, np.ndarray):
            pil_image = Image.fromarray(input_image_data)
        elif isinstance(input_image_data, (bytes, bytearray)):
            pil_image = Image.open(io.BytesIO(input_image_data))
        else:
            pil_image = Image.open(io.BytesIO(bytes(input_image_data)))

        if pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')
        try:
            print(f"[Image2Video] Input PIL image size={pil_image.size} mode={pil_image.mode}")
        except Exception:
            pass

        buf = io.BytesIO()
        pil_image.save(buf, format='PNG')
        input_bytes = buf.getvalue()

        # Call image-to-video; require method support
        model_id = "Lightricks/LTX-Video-0.9.8-13B-distilled"
        image_to_video_method = getattr(client, "image_to_video", None)
        if not callable(image_to_video_method):
            print("[Image2Video] InferenceClient.image_to_video not available in this huggingface_hub version")
            return (
                "Error generating video (image-to-video): Your installed huggingface_hub version "
                "does not expose InferenceClient.image_to_video. Please upgrade with "
                "`pip install -U huggingface_hub` and try again."
            )
        print(f"[Image2Video] Calling image_to_video with model={model_id}, prompt length={len(prompt or '')}")
        video_bytes = image_to_video_method(
            input_bytes,
            prompt=prompt,
            model=model_id,
        )
        print(f"[Image2Video] Received video bytes: {len(video_bytes) if hasattr(video_bytes, '__len__') else 'unknown length'}")

        # Save to temp file for this session (for cleanup on next Generate).
        # file_path is only bound after a successful write: previously a failed
        # write could leave the name bound and a broken file:// URL was emitted.
        file_path = None
        try:
            _ensure_video_dir_exists()
            candidate_path = os.path.join(VIDEO_TEMP_DIR, f"{uuid.uuid4()}.mp4")
            with open(candidate_path, "wb") as f:
                f.write(video_bytes)
            file_path = candidate_path
            _register_video_for_session(session_id, file_path)
            try:
                file_size = os.path.getsize(file_path)
            except Exception:
                file_size = -1
            print(f"[Image2Video] Saved video to temp file: {file_path} (size={file_size} bytes)")
        except Exception as save_exc:
            print(f"[Image2Video] Warning: could not persist temp video file: {save_exc}")

        # Always use a file URL for the video source; require the file to
        # actually exist on disk before advertising it.
        file_url = None
        if file_path and os.path.exists(file_path):
            try:
                # Build a proper file:// URL for absolute paths (e.g., file:///var/.../uuid.mp4)
                from pathlib import Path
                file_url = Path(file_path).as_uri()
            except Exception:
                # Fallback to manual construction; ensure three slashes
                # Note: this may not be fully standards-compliant on Windows
                if file_path.startswith('/'):
                    file_url = f"file:///{file_path.lstrip('/')}"  # file:///abs/path
                else:
                    file_url = f"file:///{file_path}"

        if not file_url:
            # If a file URL cannot be constructed, signal error to avoid embedding data URIs.
            return "Error generating video (image-to-video): Could not persist video to a local file."
        video_html = (
            f"<video controls style=\"max-width: 100%; height: auto; border-radius: 8px; margin: 10px 0;\">"
            f"<source src=\"{file_url}\" type=\"video/mp4\" />"
            f"Your browser does not support the video tag."
            f"</video>"
        )
        print("[Image2Video] Successfully generated video HTML tag")
        return video_html
    except Exception as e:
        import traceback
        print("[Image2Video] Exception during generation:")
        traceback.print_exc()
        print(f"Image-to-video generation error: {str(e)}")
        return f"Error generating video (image-to-video): {str(e)}"
1422
  def extract_image_prompts_from_text(text: str, num_images_needed: int = 1) -> list:
1423
  """Extract image generation prompts from the full text based on number of images needed"""
1424
  # Use the entire text as the base prompt for image generation
 
1491
  # If no placeholder images found, look for any img tags
1492
  if not placeholder_images:
1493
  img_pattern = r'<img[^>]*>'
1494
+ # Case-insensitive to catch <IMG> or mixed-case tags
1495
+ placeholder_images = re.findall(img_pattern, html_content, re.IGNORECASE)
1496
 
1497
  # Also look for div elements that might be image placeholders
1498
  div_placeholder_patterns = [
 
1727
 
1728
  return '\n\n'.join(replacement_blocks)
1729
 
1730
def create_video_replacement_blocks_from_input_image(html_content: str, user_prompt: str, input_image_data, session_id: Optional[str] = None) -> str:
    """Create search/replace blocks that replace the first <img> (or placeholder) with a generated <video>.

    Uses generate_video_from_image to produce a single video and swaps it in.
    Returns "" when no prompt is given or video generation fails.
    """
    if not user_prompt:
        return ""

    import re
    print("[Image2Video] Creating replacement blocks for video insertion")

    # Patterns matching obvious placeholder <img> tags: placeholder services,
    # data URIs, empty/blank sources, and alt/class/id "placeholder" markers.
    placeholder_patterns = [
        r'<img[^>]*src=["\'](?:placeholder|dummy|sample|example)[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']https?://via\.placeholder\.com[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']https?://picsum\.photos[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']https?://dummyimage\.com[^"\']*["\'][^>]*>',
        r'<img[^>]*alt=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
        r'<img[^>]*class=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
        r'<img[^>]*id=["\'][^"\']*placeholder[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']data:image[^"\']*["\'][^>]*>',
        r'<img[^>]*src=["\']#["\'][^>]*>',
        r'<img[^>]*src=["\']about:blank["\'][^>]*>',
    ]

    placeholder_images = []
    for pattern in placeholder_patterns:
        matches = re.findall(pattern, html_content, re.IGNORECASE)
        if matches:
            placeholder_images.extend(matches)

    # Fall back to any <img> tag when no recognized placeholder is present.
    if not placeholder_images:
        img_pattern = r'<img[^>]*>'
        placeholder_images = re.findall(img_pattern, html_content)
    print(f"[Image2Video] Found {len(placeholder_images)} candidate <img> elements")

    video_html = generate_video_from_image(input_image_data, user_prompt, session_id=session_id)
    try:
        has_file_src = 'src="' in video_html and video_html.count('src="') >= 1 and 'data:video/mp4;base64' not in video_html.split('src="', 1)[1]
        print(f"[Image2Video] Generated video HTML length={len(video_html)}; has_file_src={has_file_src}")
    except Exception:
        pass
    if video_html.startswith("Error"):
        print("[Image2Video] Video generation returned error; aborting replacement")
        return ""

    if placeholder_images:
        placeholder = placeholder_images[0]
        placeholder_clean = re.sub(r'\s+', ' ', placeholder.strip())
        print("[Image2Video] Replacing first image placeholder with video")
        # Try several textual variations of the placeholder to maximize the
        # chance that one matches the HTML exactly. The previous list included
        # identity transforms (whitespace re-collapse of an already-collapsed
        # string, and replacing a space with a space), which only produced
        # duplicate blocks; duplicates are now dropped while preserving order.
        candidate_variations = [
            # Try the exact string first to maximize replacement success
            placeholder,
            placeholder_clean,
            placeholder_clean.replace('"', "'"),
            placeholder_clean.replace("'", '"'),
        ]
        seen = set()
        blocks = []
        for variation in candidate_variations:
            if variation in seen:
                continue
            seen.add(variation)
            blocks.append(f"""{SEARCH_START}
{variation}
{DIVIDER}
{video_html}
{REPLACE_END}""")
        return '\n\n'.join(blocks)

    if '<body' in html_content:
        body_start = html_content.find('<body')
        body_end = html_content.find('>', body_start) + 1
        opening_body_tag = html_content[body_start:body_end]
        print("[Image2Video] No <img> found; inserting video right after the opening <body> tag")
        print(f"[Image2Video] Opening <body> tag snippet: {opening_body_tag[:120]}")
        return f"""{SEARCH_START}
{opening_body_tag}
{DIVIDER}
{opening_body_tag}
{video_html}
{REPLACE_END}"""

    print("[Image2Video] No <body> tag; appending video via replacement block")
    return f"{SEARCH_START}\n\n{DIVIDER}\n{video_html}\n{REPLACE_END}"
1813
+ def apply_generated_images_to_html(html_content: str, user_prompt: str, enable_text_to_image: bool, enable_image_to_image: bool, input_image_data, image_to_image_prompt: str | None = None, text_to_image_prompt: str | None = None, enable_image_to_video: bool = False, image_to_video_prompt: str | None = None, session_id: Optional[str] = None) -> str:
1814
  """Apply text-to-image and/or image-to-image replacements to HTML content.
1815
 
1816
  If both toggles are enabled, text-to-image replacements run first, then image-to-image.
1817
  """
1818
  result = html_content
1819
  try:
1820
+ print(
1821
+ f"[MediaApply] enable_i2v={enable_image_to_video}, enable_i2i={enable_image_to_image}, "
1822
+ f"enable_t2i={enable_text_to_image}, has_image={input_image_data is not None}"
1823
+ )
1824
+ # If image-to-video is enabled, replace the first image with a generated video and return.
1825
+ if enable_image_to_video and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
1826
+ i2v_prompt = (image_to_video_prompt or user_prompt or "").strip()
1827
+ print(f"[MediaApply] Running image-to-video with prompt len={len(i2v_prompt)}")
1828
+ blocks_v = create_video_replacement_blocks_from_input_image(result, i2v_prompt, input_image_data, session_id=session_id)
1829
+ if blocks_v:
1830
+ print("[MediaApply] Applying image-to-video replacement blocks")
1831
+ before_len = len(result)
1832
+ result_after = apply_search_replace_changes(result, blocks_v)
1833
+ after_len = len(result_after)
1834
+ changed = (result_after != result)
1835
+ print(f"[MediaApply] i2v blocks length={len(blocks_v)}; html before={before_len}, after={after_len}, changed={changed}")
1836
+ if not changed:
1837
+ print("[MediaApply] DEBUG: Replacement did not change content. Dumping first block:")
1838
+ try:
1839
+ first_block = blocks_v.split(REPLACE_END)[0][:1000]
1840
+ print(first_block)
1841
+ except Exception:
1842
+ pass
1843
+ result = result_after
1844
+ else:
1845
+ print("[MediaApply] No i2v replacement blocks generated")
1846
+ return result
1847
+
1848
  # If an input image is provided and image-to-image is enabled, we only replace one image
1849
  # and skip text-to-image to satisfy the requirement to replace exactly the number of uploaded images.
1850
  if enable_image_to_image and input_image_data is not None and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
 
1851
  i2i_prompt = (image_to_image_prompt or user_prompt or "").strip()
1852
  blocks2 = create_image_replacement_blocks_from_input_image(result, i2i_prompt, input_image_data, max_images=1)
1853
  if blocks2:
 
1856
 
1857
  if enable_text_to_image and (result.strip().startswith('<!DOCTYPE html>') or result.strip().startswith('<html')):
1858
  t2i_prompt = (text_to_image_prompt or user_prompt or "").strip()
1859
+ print(f"[MediaApply] Running text-to-image with prompt len={len(t2i_prompt)}")
1860
  # Single-image flow for text-to-image
1861
  blocks = create_image_replacement_blocks_text_to_image_single(result, t2i_prompt)
1862
  if blocks:
1863
+ print("[MediaApply] Applying text-to-image replacement blocks")
1864
  result = apply_search_replace_changes(result, blocks)
1865
  except Exception:
1866
+ import traceback
1867
+ print("[MediaApply] Exception during media application:")
1868
+ traceback.print_exc()
1869
  return html_content
1870
  return result
1871
 
 
2155
  def send_to_sandbox(code):
2156
  """Render HTML in a sandboxed iframe. Assumes full HTML is provided by prompts."""
2157
  html_doc = (code or "").strip()
2158
+ # For preview only: inline local file URLs (e.g., file:///.../video.mp4) as data URIs so the
2159
+ # data: iframe can load them. The original code (shown to the user) still contains file URLs.
2160
+ try:
2161
+ import re
2162
+ import base64 as _b64
2163
+ import mimetypes as _mtypes
2164
+ import urllib.parse as _uparse
2165
+ def _file_url_to_data_uri(file_url: str) -> str | None:
2166
+ try:
2167
+ parsed = _uparse.urlparse(file_url)
2168
+ path = _uparse.unquote(parsed.path)
2169
+ if not path:
2170
+ return None
2171
+ with open(path, 'rb') as _f:
2172
+ raw = _f.read()
2173
+ mime = _mtypes.guess_type(path)[0] or 'application/octet-stream'
2174
+ b64 = _b64.b64encode(raw).decode()
2175
+ return f"data:{mime};base64,{b64}"
2176
+ except Exception:
2177
+ return None
2178
+ def _repl_double(m):
2179
+ url = m.group(1)
2180
+ data_uri = _file_url_to_data_uri(url)
2181
+ return f'src="{data_uri}"' if data_uri else m.group(0)
2182
+ def _repl_single(m):
2183
+ url = m.group(1)
2184
+ data_uri = _file_url_to_data_uri(url)
2185
+ return f"src='{data_uri}'" if data_uri else m.group(0)
2186
+ html_doc = re.sub(r'src="(file:[^"]+)"', _repl_double, html_doc)
2187
+ html_doc = re.sub(r"src='(file:[^']+)'", _repl_single, html_doc)
2188
+ except Exception:
2189
+ # Best-effort; continue without inlining
2190
+ pass
2191
  encoded_html = base64.b64encode(html_doc.encode('utf-8')).decode('utf-8')
2192
  data_uri = f"data:text/html;charset=utf-8;base64,{encoded_html}"
2193
  iframe = f'<iframe src="{data_uri}" width="100%" height="920px" sandbox="allow-scripts allow-same-origin allow-forms allow-popups allow-modals allow-presentation" allow="display-capture"></iframe>'
 
2693
  stop_generation = False
2694
 
2695
 
2696
+ def generation_code(query: Optional[str], vlm_image: Optional[gr.Image], gen_image: Optional[gr.Image], file: Optional[str], website_url: Optional[str], _setting: Dict[str, str], _history: Optional[History], _current_model: Dict, enable_search: bool = False, language: str = "html", provider: str = "auto", enable_image_generation: bool = False, enable_image_to_image: bool = False, image_to_image_prompt: Optional[str] = None, text_to_image_prompt: Optional[str] = None, enable_image_to_video: bool = False, image_to_video_prompt: Optional[str] = None):
2697
  if query is None:
2698
  query = ''
2699
  if _history is None:
 
2721
  '=== src/App.svelte ===' in last_assistant_msg):
2722
  has_existing_content = True
2723
 
2724
+ # Create/lookup a session id for temp-file tracking and cleanup
2725
+ if _setting is not None and isinstance(_setting, dict):
2726
+ session_id = _setting.get("__session_id__")
2727
+ if not session_id:
2728
+ session_id = str(uuid.uuid4())
2729
+ _setting["__session_id__"] = session_id
2730
+ else:
2731
+ session_id = str(uuid.uuid4())
2732
+
2733
+ # On each generate, reap old global files and cleanup previous session files
2734
+ try:
2735
+ cleanup_session_videos(session_id)
2736
+ reap_old_videos()
2737
+ except Exception:
2738
+ pass
2739
+
2740
  # Choose system prompt based on context
2741
  if has_existing_content:
2742
  # Use follow-up prompt for modifying existing content
 
2792
 
2793
  # Check if this is GLM-4.5 model and handle with simple HuggingFace InferenceClient
2794
  if _current_model["id"] == "zai-org/GLM-4.5":
2795
+ if vlm_image is not None:
2796
+ messages.append(create_multimodal_message(enhanced_query, vlm_image))
2797
  else:
2798
  messages.append({'role': 'user', 'content': enhanced_query})
2799
 
 
2834
  clean_code = remove_code_block(content)
2835
 
2836
  # Apply image generation (text→image and/or image→image)
2837
+ print("[Generate] Applying post-generation media to GLM-4.5 HTML output")
2838
  final_content = apply_generated_images_to_html(
2839
  content,
2840
  query,
2841
  enable_text_to_image=enable_image_generation,
2842
  enable_image_to_image=enable_image_to_image,
2843
+ input_image_data=gen_image,
2844
  image_to_image_prompt=image_to_image_prompt,
2845
+ enable_image_to_video=enable_image_to_video,
2846
+ image_to_video_prompt=image_to_video_prompt,
2847
+ session_id=session_id,
2848
  )
2849
 
2850
  _history.append([query, final_content])
 
2999
  clean_content = remove_code_block(modified_content)
3000
 
3001
  # Apply image generation (text→image and/or image→image)
3002
+ print("[Generate] Applying post-generation media to modified HTML content")
3003
  clean_content = apply_generated_images_to_html(
3004
  clean_content,
3005
  query,
3006
  enable_text_to_image=enable_image_generation,
3007
  enable_image_to_image=enable_image_to_image,
3008
+ input_image_data=gen_image,
3009
  image_to_image_prompt=image_to_image_prompt,
3010
+ enable_image_to_video=enable_image_to_video,
3011
+ image_to_video_prompt=image_to_video_prompt,
3012
+ session_id=session_id,
3013
  )
3014
 
3015
  yield {
 
3020
  }
3021
  else:
3022
  # Apply image generation (text→image and/or image→image)
3023
+ print("[Generate] Applying post-generation media to new HTML content")
3024
  final_content = apply_generated_images_to_html(
3025
  clean_code,
3026
  query,
3027
  enable_text_to_image=enable_image_generation,
3028
  enable_image_to_image=enable_image_to_image,
3029
+ input_image_data=gen_image,
3030
  image_to_image_prompt=image_to_image_prompt,
3031
  text_to_image_prompt=text_to_image_prompt,
3032
+ enable_image_to_video=enable_image_to_video,
3033
+ image_to_video_prompt=image_to_video_prompt,
3034
+ session_id=session_id,
3035
  )
3036
 
3037
  preview_val = None
 
3053
  structured = [
3054
  {"role": "system", "content": GLM45V_HTML_SYSTEM_PROMPT}
3055
  ]
3056
+ if vlm_image is not None:
3057
  user_msg = {
3058
  "role": "user",
3059
  "content": [
 
3064
  import io, base64
3065
  from PIL import Image
3066
  import numpy as np
3067
+ if isinstance(vlm_image, np.ndarray):
3068
+ vlm_image = Image.fromarray(vlm_image)
3069
  buf = io.BytesIO()
3070
+ vlm_image.save(buf, format="PNG")
3071
  b64 = base64.b64encode(buf.getvalue()).decode()
3072
  user_msg["content"].append({
3073
  "type": "image_url",
 
3135
  # Use dynamic client based on selected model (for non-GLM-4.5 models)
3136
  client = get_inference_client(_current_model["id"], provider)
3137
 
3138
+ if vlm_image is not None:
3139
+ messages.append(create_multimodal_message(enhanced_query, vlm_image))
3140
  else:
3141
  messages.append({'role': 'user', 'content': enhanced_query})
3142
  try:
 
3420
  clean_content = remove_code_block(modified_content)
3421
 
3422
  # Apply image generation (text→image and/or image→image)
3423
+ print("[Generate] Applying post-generation media to follow-up HTML content")
3424
  clean_content = apply_generated_images_to_html(
3425
  clean_content,
3426
  query,
3427
  enable_text_to_image=enable_image_generation,
3428
  enable_image_to_image=enable_image_to_image,
3429
+ input_image_data=gen_image,
3430
  image_to_image_prompt=image_to_image_prompt,
3431
+ enable_image_to_video=enable_image_to_video,
3432
+ image_to_video_prompt=image_to_video_prompt,
3433
+ session_id=session_id,
3434
  text_to_image_prompt=text_to_image_prompt,
3435
  )
3436
 
 
3447
  final_content = remove_code_block(content)
3448
 
3449
  # Apply image generation (text→image and/or image→image)
3450
+ print("[Generate] Applying post-generation media to final HTML content")
3451
  final_content = apply_generated_images_to_html(
3452
  final_content,
3453
  query,
3454
  enable_text_to_image=enable_image_generation,
3455
  enable_image_to_image=enable_image_to_image,
3456
+ input_image_data=gen_image,
3457
  image_to_image_prompt=image_to_image_prompt,
3458
  text_to_image_prompt=text_to_image_prompt,
3459
+ enable_image_to_video=enable_image_to_video,
3460
+ image_to_video_prompt=image_to_video_prompt,
3461
+ session_id=session_id,
3462
  )
3463
 
3464
  _history.append([query, final_content])
 
4506
  label="UI design image",
4507
  visible=False
4508
  )
4509
+ # New hidden image input used for VLMs, image-to-image, and image-to-video
4510
+ generation_image_input = gr.Image(
4511
+ label="image for generation",
4512
+ visible=False
4513
+ )
4514
  image_to_image_prompt = gr.Textbox(
4515
  label="Image-to-Image Prompt",
4516
  placeholder="Describe how to transform the uploaded image (e.g., 'Turn the cat into a tiger.')",
 
4567
  visible=True,
4568
  info="Transform your uploaded image using FLUX.1-Kontext-dev"
4569
  )
4570
+ image_to_video_toggle = gr.Checkbox(
4571
+ label="🎞️ Image to Video (uses input image)",
4572
+ value=False,
4573
+ visible=True,
4574
+ info="Generate a short video from your uploaded image using Lightricks LTX-Video"
4575
+ )
4576
+ image_to_video_prompt = gr.Textbox(
4577
+ label="Image-to-Video Prompt",
4578
+ placeholder="Describe the motion (e.g., 'The cat starts to dance')",
4579
+ lines=2,
4580
+ visible=False
4581
+ )
4582
 
4583
  def on_image_to_image_toggle(toggled):
4584
+ # Show generation image input and its prompt when image-to-image is enabled
4585
  return gr.update(visible=bool(toggled)), gr.update(visible=bool(toggled))
4586
 
4587
  def on_text_to_image_toggle(toggled):
 
4590
  image_to_image_toggle.change(
4591
  on_image_to_image_toggle,
4592
  inputs=[image_to_image_toggle],
4593
+ outputs=[generation_image_input, image_to_image_prompt]
4594
+ )
4595
+ def on_image_to_video_toggle(toggled):
4596
+ return gr.update(visible=bool(toggled)), gr.update(visible=bool(toggled))
4597
+
4598
+ image_to_video_toggle.change(
4599
+ on_image_to_video_toggle,
4600
+ inputs=[image_to_video_toggle],
4601
+ outputs=[generation_image_input, image_to_video_prompt]
4602
  )
4603
  image_generation_toggle.change(
4604
  on_text_to_image_toggle,
 
4855
  show_progress="hidden",
4856
  ).then(
4857
  generation_code,
4858
+ inputs=[input, image_input, generation_image_input, file_input, website_url_input, setting, history, current_model, search_toggle, language_dropdown, provider_state, image_generation_toggle, image_to_image_toggle, image_to_image_prompt, text_to_image_prompt, image_to_video_toggle, image_to_video_prompt],
4859
  outputs=[code_output, history, sandbox, history_output]
4860
  ).then(
4861
  end_generation_ui,