amos1088 committed
Commit 38e6a4b · 1 Parent(s): ad1f200

test gradio
Files changed (1): app.py (+80 -11)
app.py CHANGED

@@ -7,9 +7,49 @@ from diffusers import AnimateDiffSparseControlNetPipeline
 from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
 from diffusers.schedulers import DPMSolverMultistepScheduler
 from diffusers.utils import export_to_gif, load_image
+from diffusers import AutoPipelineForText2Image
+import openai,json
+
+
 token = os.getenv("HF_TOKEN")
 login(token=token)
+openai_token = os.getenv("OPENAI_TOKEN")
+openai.api_key = openai_token
+openaiclient = openai.OpenAI(api_key=openai.api_key)
+
+def ask_gpt(massage_history,model="gpt-4o-mini",return_str=True,response_format={"type": "json_object"}):
+    response = openaiclient.chat.completions.create(
+        model=model,
+        messages=massage_history,
+        response_format=response_format,
+        max_tokens=4000, )
+
+    if return_str:
+        return response.choices[0].message.content
+    else:
+        return json.loads(response.choices[0].message.content)
+
+
+image_pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16).to("cuda")
+image_pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
+
+
+
+@spaces.GPU
+def generate_image(prompt, reference_image, controlnet_conditioning_scale):
+    style_images = [load_image(f.name) for f in reference_image]
+
+    image_pipeline.set_ip_adapter_scale(controlnet_conditioning_scale)
 
+    image = image_pipeline(
+        prompt=prompt,
+        ip_adapter_image=[style_images],
+        negative_prompt="",
+        guidance_scale=5,
+        num_inference_steps=30,
+    ).images[0]
+
+    return image
 
 model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
 motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
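
The `ask_gpt` helper added in this hunk wraps OpenAI's chat completions in JSON mode, which expects the word "JSON" to appear somewhere in the conversation and guarantees the reply parses; callers that pass `return_str=False` get the parsed dict back. A minimal standalone sketch of the same call pattern, assuming `OPENAI_TOKEN` is set; the example prompt and the `frames`/`frame_index`/`description` keys mirror the later hunks, everything else is illustrative:

```python
import json
import os

import openai

# Illustrative standalone use of the JSON-mode call pattern behind ask_gpt (not part of the commit).
client = openai.OpenAI(api_key=os.getenv("OPENAI_TOKEN"))

history = [
    {"role": "system", "content": "Return a JSON object with a 'frames' list of {frame_index, description} entries."},
    {"role": "user", "content": "give me the frames to generate a video with prompt : `a rocket launch at dawn`"},
]

response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=history,
    response_format={"type": "json_object"},  # JSON mode: the reply is guaranteed to be valid JSON
    max_tokens=4000,
)

frames = json.loads(response.choices[0].message.content)["frames"]
for frame in frames:
    print(frame["frame_index"], frame["description"])
```
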
@@ -28,7 +68,7 @@ scheduler = DPMSolverMultistepScheduler.from_pretrained(
     algorithm_type="dpmsolver++",
     use_karras_sigmas=True,
 )
-pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
+gif_pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
     model_id,
     motion_adapter=motion_adapter,
     controlnet=controlnet,
@@ -36,22 +76,51 @@ pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
     scheduler=scheduler,
     torch_dtype=torch.float16,
 ).to(device)
-pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
+gif_pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
+
+
 
 
 @spaces.GPU
-def generate_image(prompt, reference_image, controlnet_conditioning_scale,num_frames):
-    style_images = [load_image(f.name) for f in reference_image]
+def generate_gif(prompt, reference_image, controlnet_conditioning_scale,num_frames):
+    massage_history = [{"role": "system", "content": """
+You are a scene designer tasked with creating sparse frames of a video. You will be given a prompt describing the desired video, and your goal is to design only the key frames (sparse frames) that represent major changes in the scene. Do not include repetitive or similar scenes—only capture distinct moments.
+
+Expected Format:
+Return the response as a JSON object with the key "frames". The value should be a list of dictionaries, where each dictionary has:
+
+"frame_index": an integer indicating the frame's position in the sequence.
+"description": a brief description of the scene in this frame.
+Example:
+If given a prompt like "A sunset over a beach with waves crashing and a ship sailing by," your response should look like this:
+
+```json
+{
+"frames": [
+{"frame_index": 0, "description": "Sunset over an empty beach, sky turning orange and pink"},
+{"frame_index": 30, "description": "Waves gently crashing on the shore"},
+{"frame_index": 60, "description": "A ship appears on the horizon, silhouetted by the sunset"},
+{"frame_index": 90, "description": "Ship sailing closer, with waves becoming more dynamic"},
+{"frame_index": 120, "description": "Sun dipping below the horizon, casting a golden glow over the water"}
+]
+}
+```
+This way, each frame represents a distinct scene, and there’s no redundancy between them."""},
+                       {"role": "user", "content": f"give me the frames to generate a video with prompt : `{prompt}`"},]
+    frames = ask_gpt(massage_history,return_str=False)['frames']
+    conditioning_frames = []
+    controlnet_frame_indices =[]
+    for frame in frames:
+        conditioning_frames.append(generate_image(frame['description'], reference_image, controlnet_conditioning_scale))
+        controlnet_frame_indices.append(frame['frame_index'])
 
-    video = pipe(
+    video = gif_pipe(
         prompt=prompt,
         negative_prompt="low quality, worst quality",
         num_inference_steps=25,
-        num_frames=num_frames,
-        conditioning_frames=style_images,
-        controlnet_frame_indices=[0],
+        conditioning_frames=conditioning_frames,
+        controlnet_frame_indices=controlnet_frame_indices,
         controlnet_conditioning_scale=controlnet_conditioning_scale,
-        generator=torch.Generator().manual_seed(42),
     ).frames[0]
     export_to_gif(video, "output.gif")
 
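
One detail around the call above: `num_frames` is no longer passed to the pipeline, so it falls back to the AnimateDiff default of 16 frames, while the GPT-suggested `frame_index` values (0, 30, 60, 90, 120 in the embedded example) are fed directly into `controlnet_frame_indices`. Sparse-ControlNet conditioning indices have to fall inside the generated clip, so a remapping step along these lines may be needed; the helper below is a hypothetical illustration, not part of the commit:

```python
# Hypothetical helper (not in the commit): map GPT's sparse frame_index values
# onto the clip length actually generated by the AnimateDiff pipeline.
def remap_frame_indices(frames, num_frames=16):
    """Scale arbitrary frame_index values into [0, num_frames - 1], preserving their order."""
    indices = [frame["frame_index"] for frame in frames]
    max_index = max(indices) or 1  # avoid dividing by zero when every index is 0
    return [round(i * (num_frames - 1) / max_index) for i in indices]


frames = [
    {"frame_index": 0, "description": "Sunset over an empty beach"},
    {"frame_index": 60, "description": "A ship appears on the horizon"},
    {"frame_index": 120, "description": "Sun dipping below the horizon"},
]
print(remap_frame_indices(frames))  # [0, 8, 15] for the default 16-frame clip
```
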
@@ -59,11 +128,11 @@ def generate_image(prompt, reference_image, controlnet_conditioning_scale,num_frames):
 
 # Set up Gradio interface
 interface = gr.Interface(
-    fn=generate_image,
+    fn=generate_gif,
     inputs=[
         gr.Textbox(label="Prompt"),
         # gr.Image( type= "filepath",label="Reference Image (Style)"),
-        gr.File(type="file",file_count="multiple",label="Reference Image (Style)"),
+        gr.File(type="filepath",file_count="multiple",label="Reference Image (Style)"),
         gr.Slider(label="Control Net Conditioning Scale", minimum=0, maximum=1.0, step=0.1, value=1.0),
         gr.Slider(label="Number of frames", minimum=0, maximum=1.0, step=0.1, value=1.0),
 
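
The last hunk is cut off inside the `inputs` list, so the diff does not show the interface's outputs or the launch call. For orientation only, a sketch of how such an interface is commonly closed out; the output component, the title, and the assumption that `generate_gif` returns the `"output.gif"` path are illustrative guesses, not part of the commit:

```python
import gradio as gr

# Hypothetical completion of the truncated interface (not shown in the diff);
# generate_gif is the app's own function and is assumed here to return the "output.gif" path.
interface = gr.Interface(
    fn=generate_gif,
    inputs=[
        gr.Textbox(label="Prompt"),
        gr.File(type="filepath", file_count="multiple", label="Reference Image (Style)"),
        gr.Slider(label="Control Net Conditioning Scale", minimum=0, maximum=1.0, step=0.1, value=1.0),
        gr.Slider(label="Number of frames", minimum=0, maximum=1.0, step=0.1, value=1.0),
    ],
    outputs=gr.File(label="Generated GIF"),  # assumption: the exported GIF is returned as a file
    title="AnimateDiff SparseControlNet demo",  # assumption
)

if __name__ == "__main__":
    interface.launch()
```
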