linoyts (HF Staff) committed (verified)
Commit e1d5bb5 · 1 Parent(s): bd83817

Update app.py

Files changed (1): app.py (+47, -38)
app.py CHANGED
@@ -1,10 +1,10 @@
 import gradio as gr
 import spaces
 import torch
-from pipeline_ltx_condition import LTXVideoCondition, LTXConditionPipeline
-from diffusers import LTXLatentUpsamplePipeline
-#from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
-#from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
+# from pipeline_ltx_condition import LTXVideoCondition, LTXConditionPipeline
+# from diffusers import LTXLatentUpsamplePipeline
+from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
+from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
 from diffusers.utils import export_to_video, load_video
 import numpy as np
 
@@ -32,10 +32,15 @@ def generate(prompt,
              num_frames,
              seed,
              randomize_seed,
-             t2v, progress=gr.Progress(track_tqdm=True)):
+             t2v, improve_texture=False, progress=gr.Progress(track_tqdm=True)):
 
+
+    # Part 1. Generate video at smaller resolution
+    # Text-only conditioning is also supported without the need to pass `conditions`
     expected_height, expected_width = 768, 1152
     downscale_factor = 2 / 3
+    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
+    downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
 
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
@@ -46,33 +51,30 @@
             conditions=condition1,
             prompt=prompt,
             negative_prompt=negative_prompt,
-            # width=downscaled_width,
-            # height=downscaled_height,
+            width=downscaled_width,
+            height=downscaled_height,
             num_frames=num_frames,
             num_inference_steps=steps,
             decode_timestep = 0.05,
             decode_noise_scale = 0.025,
             generator=torch.Generator(device="cuda").manual_seed(seed),
-            #output_type="latent",
+            output_type="latent",
         ).frames
     else:
         latents = pipe(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            # width=downscaled_width,
-            # height=downscaled_height,
+            width=downscaled_width,
+            height=downscaled_height,
             num_frames=num_frames,
             num_inference_steps=steps,
             decode_timestep = 0.05,
             decode_noise_scale = 0.025,
             generator=torch.Generator(device="cuda").manual_seed(seed),
-            #output_type="latent",
+            output_type="latent",
         ).frames
 
-    # Part 1. Generate video at smaller resolution
-    # Text-only conditioning is also supported without the need to pass `conditions`
-    downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
-    downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
+
 
     # latents = pipe(
     #     conditions=condition1,
@@ -90,32 +92,39 @@
 
     # Part 2. Upscale generated video using latent upsampler with fewer inference steps
    # The available latent upsampler upscales the height/width by 2x
-    # upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
-    # upscaled_latents = pipe_upsample(
-    #     latents=latents,
-    #     output_type="latent"
-    # ).frames
+    if improve_texture:
+        upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+        upscaled_latents = pipe_upsample(
+            latents=latents,
+            output_type="latent"
+        ).frames
+
+        # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
 
-    # # Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
-    # video = pipe(
-    #     conditions=condition1,
-    #     prompt=prompt,
-    #     negative_prompt=negative_prompt,
-    #     width=upscaled_width,
-    #     height=upscaled_height,
-    #     num_frames=num_frames,
-    #     denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
-    #     num_inference_steps=10,
-    #     latents=upscaled_latents,
-    #     decode_timestep=0.05,
-    #     image_cond_noise_scale=0.025,
-    #     generator=torch.Generator().manual_seed(seed),
-    #     output_type="pil",
-    # ).frames[0]
+        video = pipe(
+            conditions=condition1,
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            width=upscaled_width,
+            height=upscaled_height,
+            num_frames=num_frames,
+            denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
+            num_inference_steps=10,
+            latents=upscaled_latents,
+            decode_timestep=0.05,
+            image_cond_noise_scale=0.025,
+            generator=torch.Generator().manual_seed(seed),
+            output_type="pil",
+        ).frames[0]
+    else:
+        upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
+        video = pipe_upsample(
+            latents=latents,
+            # output_type="latent"
+        ).frames[0]
 
     # Part 4. Downscale the video to the expected resolution
-    #video = [frame.resize((expected_width, expected_height)) for frame in video]
-    video = [frame.resize((expected_width, expected_height)) for frame in latents[0]]
+    video = [frame.resize((expected_width, expected_height)) for frame in video]
     export_to_video(video, "output.mp4", fps=24)
     return "output.mp4"
 
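
For reference, a minimal end-to-end sketch of the multi-scale flow this commit wires up (generate at 2/3 resolution, 2x latent upsample, optional short denoise pass, resize). The checkpoint ids, the round_to_nearest_resolution_acceptable_by_vae helper, and the bfloat16 setup are assumptions taken from the diffusers LTX-Video docs, not from this diff; the Space's actual loading code may differ.

# Hedged sketch, not the Space's actual setup: model ids and the VAE-rounding
# helper follow the diffusers LTX-Video docs and are assumptions here.
import torch
from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
from diffusers.utils import export_to_video

pipe = LTXConditionPipeline.from_pretrained(
    "Lightricks/LTX-Video-0.9.7-dev", torch_dtype=torch.bfloat16
).to("cuda")
pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained(
    "Lightricks/ltxv-spatial-upscaler-0.9.7", vae=pipe.vae, torch_dtype=torch.bfloat16
).to("cuda")

def round_to_nearest_resolution_acceptable_by_vae(height, width):
    # Round down to a multiple of the VAE's spatial compression ratio.
    height = height - (height % pipe.vae_spatial_compression_ratio)
    width = width - (width % pipe.vae_spatial_compression_ratio)
    return height, width

prompt = "a cat playing piano"  # placeholder prompt
expected_height, expected_width = 768, 1152

# Part 1: generate latents at 2/3 of the target resolution.
h, w = round_to_nearest_resolution_acceptable_by_vae(
    int(expected_height * 2 / 3), int(expected_width * 2 / 3)
)
latents = pipe(
    prompt=prompt, width=w, height=h, num_frames=161,
    num_inference_steps=30, output_type="latent",
    generator=torch.Generator(device="cuda").manual_seed(0),
).frames

# Part 2: the latent upsampler doubles height/width.
upscaled = pipe_upsample(latents=latents, output_type="latent").frames

# Part 3 (the improve_texture=True path): a short denoise pass over the
# upsampled latents; 0.4 * 10 steps = 4 effective refinement steps.
video = pipe(
    prompt=prompt, width=w * 2, height=h * 2, num_frames=161,
    denoise_strength=0.4, num_inference_steps=10, latents=upscaled,
    decode_timestep=0.05, image_cond_noise_scale=0.025, output_type="pil",
    generator=torch.Generator(device="cuda").manual_seed(0),
).frames[0]

# Part 4: resize frames to the expected resolution and export.
video = [frame.resize((expected_width, expected_height)) for frame in video]
export_to_video(video, "output.mp4", fps=24)

With improve_texture=False, the new code skips Part 3 and decodes the upsampled latents directly, which is faster but gives softer textures.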