kaimoviestud

Sleeping

App Files Community

seawolf2357 committed on Aug 13, 2024

Commit

192dfa7

verified ·

1 Parent(s): 248f435

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -114

app.py CHANGED Viewed

@@ -21,17 +21,6 @@ from funcs import (
     save_videos
 )
 from transformers import pipeline
-from diffusers import FluxPipeline
-from PIL import Image
-import numpy as np
-from huggingface_hub import login
-# Hugging Face 토큰 설정 및 로그인
-hf_token = os.getenv("HF_TOKEN")
-if hf_token:
-    login(token=hf_token)
-else:
-    print("Warning: HF_TOKEN not found in environment variables. You may encounter authentication issues.")
 def download_model():
     REPO_ID = 'Doubiiu/DynamiCrafter_1024'
@@ -44,11 +33,11 @@ def download_model():
             hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/dynamicrafter_1024_v1/', force_download=True)
 download_model()
-ckpt_path = 'checkpoints/dynamicrafter_1024_v1/model.ckpt'
-config_file = 'configs/inference_1024_v1.0.yaml'
 config = OmegaConf.load(config_file)
 model_config = config.pop("model", OmegaConf.create())
-model_config['params']['unet_config']['params']['use_checkpoint'] = False
 model = instantiate_from_config(model_config)
 assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!"
 model = load_model_checkpoint(model, ckpt_path)
@@ -56,75 +45,70 @@ model.eval()
 model = model.cuda()
 # 번역 모델 초기화
-translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en", device=0)  # GPU 사용 설정
-# FLUX 파이프라인 초기화 부분 수정
-flux_pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    torch_dtype=torch.bfloat16,
-    use_auth_token=hf_token  # 토큰을 사용하여 인증
-)
-flux_pipe.enable_model_cpu_offload()
-def translate_prompt(prompt):
     # 한글 입력 감지 및 번역
     if any('\u3131' <= char <= '\u318E' or '\uAC00' <= char <= '\uD7A3' for char in prompt):
         translated = translator(prompt, max_length=512)[0]['translation_text']
-        return translated
-    return prompt
-def generate_image_from_text(prompt, seed=0):
-    translated_prompt = translate_prompt(prompt)
-    generator = torch.Generator("cpu").manual_seed(seed)
-    image = flux_pipe(
-        translated_prompt,
-        height=576,
-        width=1024,
-        guidance_scale=3.5,
-        num_inference_steps=50,
-        max_sequence_length=512,
-        generator=generator
-    ).images[0]
-    return image
-import torch
-def infer(image, prompt, steps=50, cfg_scale=7.5, eta=1.0, seed=123, video_length=2, fs=8):
-    translated_prompt = translate_prompt(prompt)
-    print(f"Translated prompt: {translated_prompt}")
     resolution = (576, 1024)
-    save_fs = torch.tensor(fs)  # fs를 tensor로 변환
     seed_everything(seed)
     transform = transforms.Compose([
-        transforms.Resize(min(resolution), antialias=True),
         transforms.CenterCrop(resolution),
-    ])
     torch.cuda.empty_cache()
-    print('Start:', translated_prompt, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
     start = time.time()
     if steps > 60:
-        steps = 60
-    batch_size = 1
     channels = model.model.diffusion_model.out_channels
-    frames = int(video_length * fs)
     h, w = resolution[0] // 8, resolution[1] // 8
     noise_shape = [batch_size, channels, frames, h, w]
     with torch.no_grad(), torch.cuda.amp.autocast():
-        text_emb = model.get_learned_conditioning([translated_prompt])
         img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
         img_tensor = (img_tensor / 255. - 0.5) * 2
-        image_tensor_resized = transform(img_tensor).unsqueeze(0)  # bchw
-        z = get_latent_z(model, image_tensor_resized.unsqueeze(2)) #bc,1,hw
         img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)
-        cond_images = model.embedder(img_tensor.unsqueeze(0)) # blc
         img_emb = model.image_proj_model(cond_images)
         imtext_cond = torch.cat([text_emb, img_emb], dim=1)
-        cond = {"c_crossattn": [imtext_cond], "c_concat": [img_tensor_repeat], "fs": save_fs}
         batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
         video_path = './output.mp4'
-        save_videos(batch_samples, './', filenames=['output'], fps=fs)
     return video_path
 css = """
 .tab-nav {
@@ -163,68 +147,39 @@ css = """
 .tab-nav button:nth-child(3) { border-top: 3px solid #f7b731; }
 """
-def infer_t2v(prompt, seed=123, steps=50, cfg_scale=7.5, eta=1.0, fs=8, video_length=2):
-    # 먼저 텍스트로부터 이미지를 생성합니다
-    initial_image = generate_image_from_text(prompt, seed)
-    # 그 다음 생성된 이미지를 사용하여 비디오를 생성합니다
-    return infer(initial_image, prompt, steps, cfg_scale, eta, seed, video_length, fs)
 with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
-    gr.Markdown("# 무비 스튜디오")
-    with gr.Tab(label='Image Generation'):
-        with gr.Column():
-            with gr.Row():
-                img_input_text = gr.Text(label='Image Generation Prompt')
-                img_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
-                img_generate_btn = gr.Button("Generate Image")
-            with gr.Row():
-                img_output_image = gr.Image(label="Generated Image")
-            img_generate_btn.click(
-                inputs=[img_input_text, img_seed],
-                outputs=[img_output_image],
-                fn=generate_image_from_text
-            )
-    with gr.Tab(label='Image to Video Generation'):
-        with gr.Column():
-            with gr.Row():
-                video_input_image = gr.Image(label="Input Image for Video")
-                video_prompt = gr.Text(label='Video Generation Prompt')
-                video_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
-                video_steps = gr.Slider(label="Sampling steps", minimum=1, maximum=50, step=1, value=30)
-                video_cfg_scale = gr.Slider(label='CFG Scale', minimum=1.0, maximum=15.0, step=0.5, value=7.5)
-                video_eta = gr.Slider(label='ETA', minimum=0.0, maximum=1.0, step=0.1, value=1.0)
-                video_fs = gr.Slider(label='FS', minimum=1, maximum=60, step=1, value=10)  # fps를 fs로 변경
-                video_length = gr.Slider(label="Video Length (seconds)", minimum=2, maximum=8, step=1, value=2)
-                video_generate_btn = gr.Button("Generate Video")
-            with gr.Row():
-                video_output = gr.Video(label="Generated Video", autoplay=True, show_share_button=True)
-            video_generate_btn.click(
-                inputs=[video_input_image, video_prompt, video_seed, video_steps, video_cfg_scale, video_eta, video_fs, video_length],
-                outputs=[video_output],
-                fn=infer
-            )
-    with gr.Tab(label='Text to Video Generation'):
         with gr.Column():
             with gr.Row():
                 with gr.Column():
-                    video_prompt = gr.Text(label='Video Generation Prompt')
-                    video_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
-                    video_steps = gr.Slider(label="Sampling steps", minimum=1, maximum=50, step=1, value=30)
-                    video_cfg_scale = gr.Slider(label='CFG Scale', minimum=1.0, maximum=15.0, step=0.5, value=7.5)
-                    video_eta = gr.Slider(label='ETA', minimum=0.0, maximum=1.0, step=0.1, value=1.0)
-                    video_fs = gr.Slider(label='FS', minimum=1, maximum=60, step=1, value=10)  # fps를 fs로 변경
-                    video_length = gr.Slider(label="Video Length (seconds)", minimum=2, maximum=8, step=1, value=2)
-                    video_generate_btn = gr.Button("Generate Video")
                 with gr.Row():
-                    video_output = gr.Video(label="Generated Video", autoplay=True, show_share_button=True)
-                video_generate_btn.click(
-                    inputs=[video_prompt, video_seed, video_steps, video_cfg_scale, video_eta, video_fs, video_length],
-                    outputs=[video_output],
-                    fn=infer_t2v
             )
-dynamicrafter_iface.launch(show_api=True)

     save_videos
 )
 from transformers import pipeline
 def download_model():
     REPO_ID = 'Doubiiu/DynamiCrafter_1024'
             hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir='./checkpoints/dynamicrafter_1024_v1/', force_download=True)
 download_model()
+ckpt_path='checkpoints/dynamicrafter_1024_v1/model.ckpt'
+config_file='configs/inference_1024_v1.0.yaml'
 config = OmegaConf.load(config_file)
 model_config = config.pop("model", OmegaConf.create())
+model_config['params']['unet_config']['params']['use_checkpoint']=False
 model = instantiate_from_config(model_config)
 assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!"
 model = load_model_checkpoint(model, ckpt_path)
 model = model.cuda()
 # 번역 모델 초기화
+translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
+@spaces.GPU(duration=300)
+def infer(image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123, video_length=2):
     # 한글 입력 감지 및 번역
     if any('\u3131' <= char <= '\u318E' or '\uAC00' <= char <= '\uD7A3' for char in prompt):
         translated = translator(prompt, max_length=512)[0]['translation_text']
+        prompt = translated
+        print(f"Translated prompt: {prompt}")
     resolution = (576, 1024)
+    save_fps = 8
     seed_everything(seed)
     transform = transforms.Compose([
+        transforms.Resize(min(resolution)),
         transforms.CenterCrop(resolution),
+        ])
     torch.cuda.empty_cache()
+    print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())))
     start = time.time()
     if steps > 60:
+        steps = 60
+    batch_size=1
     channels = model.model.diffusion_model.out_channels
+    frames = int(video_length * save_fps)  # 비디오 길이에 따른 프레임 수 계산
     h, w = resolution[0] // 8, resolution[1] // 8
     noise_shape = [batch_size, channels, frames, h, w]
+    # text cond
     with torch.no_grad(), torch.cuda.amp.autocast():
+        text_emb = model.get_learned_conditioning([prompt])
+        # img cond
         img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
         img_tensor = (img_tensor / 255. - 0.5) * 2
+        image_tensor_resized = transform(img_tensor) #3,256,256
+        videos = image_tensor_resized.unsqueeze(0) # bchw
+        z = get_latent_z(model, videos.unsqueeze(2)) #bc,1,hw
         img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)
+        cond_images = model.embedder(img_tensor.unsqueeze(0)) ## blc
         img_emb = model.image_proj_model(cond_images)
         imtext_cond = torch.cat([text_emb, img_emb], dim=1)
+        fs = torch.tensor([fs], dtype=torch.long, device=model.device)
+        cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]}
+        ## inference
         batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
+        ## b,samples,c,t,h,w
         video_path = './output.mp4'
+        save_videos(batch_samples, './', filenames=['output'], fps=save_fps)
     return video_path
+i2v_examples = [
+    ['prompts/1024/astronaut04.png', 'a man in an astronaut suit playing a guitar', 30, 7.5, 1.0, 6, 123, 2],
+]
 css = """
 .tab-nav {
 .tab-nav button:nth-child(3) { border-top: 3px solid #f7b731; }
 """
 with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:
+    gr.Markdown("이미지로 영상 생성 테스트 (한글 프롬프트 지원)")
+    with gr.Tab(label='ImageAnimation_576x1024'):
         with gr.Column():
             with gr.Row():
                 with gr.Column():
+                    with gr.Row():
+                        i2v_input_image = gr.Image(label="Input Image",elem_id="input_img")
+                    with gr.Row():
+                        i2v_input_text = gr.Text(label='Prompts')
+                    with gr.Row():
+                        i2v_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
+                        i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label='ETA', value=1.0, elem_id="i2v_eta")
+                        i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label='CFG Scale', value=7.5, elem_id="i2v_cfg_scale")
+                    with gr.Row():
+                        i2v_steps = gr.Slider(minimum=1, maximum=50, step=1, elem_id="i2v_steps", label="Sampling steps", value=30)
+                        i2v_motion = gr.Slider(minimum=5, maximum=20, step=1, elem_id="i2v_motion", label="FPS", value=8)
+                    with gr.Row():
+                        i2v_video_length = gr.Slider(minimum=2, maximum=8, step=1, elem_id="i2v_video_length", label="Video Length (seconds)", value=2)
+                    i2v_end_btn = gr.Button("Generate")
                 with gr.Row():
+                    i2v_output_video = gr.Video(label="Generated Video",elem_id="output_vid",autoplay=True,show_share_button=True)
+            gr.Examples(examples=i2v_examples,
+                        inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_video_length],
+                        outputs=[i2v_output_video],
+                        fn = infer,
+                        cache_examples=True,
             )
+        i2v_end_btn.click(inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed, i2v_video_length],
+                        outputs=[i2v_output_video],
+                        fn = infer
+        )
+dynamicrafter_iface.queue(max_size=12).launch(show_api=True)