Update app.py
app.py CHANGED
@@ -33,7 +33,6 @@ if hf_token:
 else:
     print("Warning: HF_TOKEN not found in environment variables. You may encounter authentication issues.")
 
-
 def download_model():
     REPO_ID = 'Doubiiu/DynamiCrafter_1024'
     filename_list = ['model.ckpt']
@@ -45,11 +44,11 @@ def download_model():
         hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/dynamicrafter_1024_v1/', force_download=True)
 
 download_model()
-ckpt_path='checkpoints/dynamicrafter_1024_v1/model.ckpt'
-config_file='configs/inference_1024_v1.0.yaml'
+ckpt_path = 'checkpoints/dynamicrafter_1024_v1/model.ckpt'
+config_file = 'configs/inference_1024_v1.0.yaml'
 config = OmegaConf.load(config_file)
 model_config = config.pop("model", OmegaConf.create())
-model_config['params']['unet_config']['params']['use_checkpoint']=False
+model_config['params']['unet_config']['params']['use_checkpoint'] = False
 model = instantiate_from_config(model_config)
 assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!"
 model = load_model_checkpoint(model, ckpt_path)
@@ -67,11 +66,18 @@ flux_pipe = FluxPipeline.from_pretrained(
 )
 flux_pipe.enable_model_cpu_offload()
 
+def translate_prompt(prompt):
+    # Detect Korean input and translate it to English
+    if any('\u3131' <= char <= '\u318E' or '\uAC00' <= char <= '\uD7A3' for char in prompt):
+        translated = translator(prompt, max_length=512)[0]['translation_text']
+        return translated
+    return prompt
 
 def generate_image_from_text(prompt, seed=0):
+    translated_prompt = translate_prompt(prompt)
     generator = torch.Generator("cpu").manual_seed(seed)
     image = flux_pipe(
-
+        translated_prompt,
         height=576,
         width=1024,
         guidance_scale=3.5,
@@ -83,158 +89,96 @@ def generate_image_from_text(prompt, seed=0):
 
 @spaces.GPU(duration=600)
 def infer(image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, video_length=2):
-
-
-    translated = translator(prompt, max_length=512)[0]['translation_text']
-    prompt = translated
-    print(f"Translated prompt: {prompt}")
-
+    translated_prompt = translate_prompt(prompt)
+    print(f"Translated prompt: {translated_prompt}")
     resolution = (576, 1024)
     save_fps = 8
     seed_everything(seed)
     transform = transforms.Compose([
-        transforms.Resize(min(resolution)),
+        transforms.Resize(min(resolution), antialias=True),
         transforms.CenterCrop(resolution),
-
+    ])
     torch.cuda.empty_cache()
-    print('Start:',
+    print('Start:', translated_prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
     start = time.time()
     if steps > 60:
         steps = 60
-
     batch_size = 1
     channels = model.model.diffusion_model.out_channels
-    frames = int(video_length * save_fps)
+    frames = int(video_length * save_fps)
     h, w = resolution[0] // 8, resolution[1] // 8
     noise_shape = [batch_size, channels, frames, h, w]
-
-    # Set up the text conditioning
     with torch.no_grad(), torch.cuda.amp.autocast():
-        text_emb = model.get_learned_conditioning([
+        text_emb = model.get_learned_conditioning([translated_prompt])
         img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
         img_tensor = (img_tensor / 255. - 0.5) * 2
         image_tensor_resized = transform(img_tensor).unsqueeze(0) # bchw
-
         z = get_latent_z(model, image_tensor_resized.unsqueeze(2)) #bc,1,hw
         img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)
         cond_images = model.embedder(img_tensor.unsqueeze(0)) # blc
         img_emb = model.image_proj_model(cond_images)
         imtext_cond = torch.cat([text_emb, img_emb], dim=1)
-
         fs = torch.tensor([fs], dtype=torch.long, device=model.device)
         cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]}
-
-        # Run inference
         batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
-
     video_path = './output.mp4'
     save_videos(batch_samples, './', filenames=['output'], fps=save_fps)
     return video_path
 
-
-@spaces.GPU(duration=300)
-def infer_t2v(prompt, video_prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, video_length=2):
-    # Generate the image
-    image = generate_image_from_text(prompt, seed)
-
-    # Convert the image to a numpy array
-    image_np = np.array(image)
-
-    # Call the existing infer function to generate the video
-    return infer(image_np, video_prompt, steps, cfg_scale, eta, fs, seed, video_length)
-
-i2v_examples = [
-    ['prompts/1024/astronaut04.png', 'a man in an astronaut suit playing a guitar', 30, 7.5, 1.0, 6, 123, 2],
-]
-
 css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
 
-def generate_only_image(prompt, seed=123):
-    # Generate the image
-    image = generate_image_from_text(prompt, seed)
-
-    # Convert to a PIL image and return it
-    return Image.fromarray(np.array(image))
-
 with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
     gr.Markdown("kAI 무비 스튜디오")
-
-
-    with gr.Tab(label='Image(+Text) Generation'):
+    with gr.Tab(label='Image Generation'):
         with gr.Column():
             with gr.Row():
-
-
-
-
-
-                img_output_image = gr.Image(label="Generated Image")
-
+                img_input_text = gr.Text(label='Image Generation Prompt')
+                img_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
+                img_generate_btn = gr.Button("Generate Image")
+            with gr.Row():
+                img_output_image = gr.Image(label="Generated Image")
             img_generate_btn.click(
                 inputs=[img_input_text, img_seed],
                 outputs=[img_output_image],
-                fn=
-
-
-
+                fn=generate_image_from_text
+            )
     with gr.Tab(label='Image to Video Generation'):
         with gr.Column():
             with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            with gr.Row():
-                i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
-
-            gr.Examples(examples=i2v_examples,
-                inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_video_length],
-                outputs=[i2v_output_video],
-                fn = infer,
-                cache_examples=True,
+                video_input_image = gr.Image(label="Input Image for Video", tool="input")
+                video_prompt = gr.Text(label='Video Generation Prompt')
+                video_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
+                video_steps = gr.Slider(label="Sampling steps", minimum=1, maximum=50, step=1, value=30)
+                video_cfg_scale = gr.Slider(label='CFG Scale', minimum=1.0, maximum=15.0, step=0.5, value=7.5)
+                video_eta = gr.Slider(label='ETA', minimum=0.0, maximum=1.0, step=0.1, value=1.0)
+                video_fs = gr.Slider(label='FS', minimum=1, maximum=10, step=1, value=3)
+                video_length = gr.Slider(label="Video Length (seconds)", minimum=2, maximum=8, step=1, value=2)
+                video_generate_btn = gr.Button("Generate Video")
+            with gr.Row():
+                video_output = gr.Video(label="Generated Video", autoplay=True, show_share_button=True)
+            video_generate_btn.click(
+                inputs=[video_input_image, video_prompt, video_seed, video_steps, video_cfg_scale, video_eta, video_fs, video_length],
+                outputs=[video_output],
+                fn=infer
             )
-
-                outputs=[i2v_output_video],
-                fn = infer
-            )
+
-
     with gr.Tab(label='Text to Video Generation'):
         with gr.Column():
             with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
-
-
-                t2v_end_btn = gr.Button("Generate")
-            with gr.Row():
-                t2v_output_video = gr.Video(label="Generated Video", autoplay=True, show_share_button=True)
-
-            t2v_end_btn.click(
-                inputs=[t2v_input_text, t2v_video_prompt, t2v_steps, t2v_cfg_scale, t2v_eta, t2v_motion, t2v_seed, t2v_video_length],
-                outputs=[t2v_output_video],
+                video_prompt = gr.Text(label='Video Generation Prompt')
+                video_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
+                video_steps = gr.Slider(label="Sampling steps", minimum=1, maximum=50, step=1, value=30)
+                video_cfg_scale = gr.Slider(label='CFG Scale', minimum=1.0, maximum=15.0, step=0.5, value=7.5)
+                video_eta = gr.Slider(label='ETA', minimum=0.0, maximum=1.0, step=0.1, value=1.0)
+                video_fs = gr.Slider(label='FS', minimum=1, maximum=10, step=1, value=3)
+                video_length = gr.Slider(label="Video Length (seconds)", minimum=2, maximum=8, step=1, value=2)
+                video_generate_btn = gr.Button("Generate Video")
+            with gr.Row():
+                video_output = gr.Video(label="Generated Video", autoplay=True, show_share_button=True)
+            video_generate_btn.click(
+                inputs=[video_prompt, video_seed, video_steps, video_cfg_scale, video_eta, video_fs, video_length],
+                outputs=[video_output],
                 fn=infer_t2v
-
-
-
+            )
 
-dynamicrafter_iface.
+dynamicrafter_iface.launch(show_api=True)
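Note that the Text to Video tab above still wires fn=infer_t2v, while this change deletes the infer_t2v definition (old lines 135-144), so that tab would raise a NameError at click time unless a definition remains in app.py. The sketch below is hypothetical and not part of the commit: it rebuilds the helper from the deleted lines, with the parameter order assumed to match the new click inputs [video_prompt, video_seed, video_steps, video_cfg_scale, video_eta, video_fs, video_length], and it assumes numpy is imported as np and that generate_image_from_text returns a PIL image, as the removed code did.

@spaces.GPU(duration=300)
def infer_t2v(prompt, seed=123, steps=30, cfg_scale=7.5, eta=1.0, fs=3, video_length=2):
    # Generate a still image from the text prompt first
    # (generate_image_from_text already routes it through translate_prompt).
    image = generate_image_from_text(prompt, seed)
    # infer() works on a numpy array rather than a PIL image.
    image_np = np.array(image)
    # Hand off to the image-to-video path; infer's signature is
    # (image, prompt, steps, cfg_scale, eta, fs, seed, video_length).
    return infer(image_np, prompt, steps, cfg_scale, eta, fs, seed, video_length)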