Spaces:
Running
Running
makewong
committed on
Commit
•
10d74ba
1
Parent(s):
5fd5f33
Upload 4 files
Browse files- .gitattributes +2 -0
- app.py +67 -37
- ffmpeg +3 -0
- ffmpeg.exe +3 -0
- pic2song.py +50 -70
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
ffmpeg filter=lfs diff=lfs merge=lfs -text
|
37 |
+
ffmpeg.exe filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
@@ -1,37 +1,67 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
from transformers import (
|
3 |
-
AutoProcessor,
|
4 |
-
VisionEncoderDecoderModel,
|
5 |
-
ViTImageProcessor,
|
6 |
-
AutoTokenizer,
|
7 |
-
MusicgenForConditionalGeneration,
|
8 |
-
)
|
9 |
-
import torch
|
10 |
-
from PIL import Image
|
11 |
-
from openai import OpenAI
|
12 |
-
from scipy.io import wavfile
|
13 |
-
import numpy as np
|
14 |
-
import pic2song
|
15 |
-
import uuid
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from transformers import (
|
3 |
+
AutoProcessor,
|
4 |
+
VisionEncoderDecoderModel,
|
5 |
+
ViTImageProcessor,
|
6 |
+
AutoTokenizer,
|
7 |
+
MusicgenForConditionalGeneration,
|
8 |
+
)
|
9 |
+
import torch
|
10 |
+
from PIL import Image
|
11 |
+
from openai import OpenAI
|
12 |
+
from scipy.io import wavfile
|
13 |
+
import numpy as np
|
14 |
+
import pic2song
|
15 |
+
import uuid
|
16 |
+
|
17 |
+
|
18 |
+
def greet(image, image_ins):
    """Click handler for the UI: delegate to ``pic2song.pic2song``.

    Args:
        image: Value of the image input component.
        image_ins: Value of the generation-length slider.

    Returns:
        Whatever ``pic2song.pic2song(image, image_ins)`` returns, passed
        through unchanged to the output components.
    """
    return pic2song.pic2song(image, image_ins)
|
21 |
+
|
22 |
+
|
23 |
+
with gr.Blocks() as demo:
|
24 |
+
with gr.Column():
|
25 |
+
with gr.Row():
|
26 |
+
inp = gr.Image(type="pil", image_mode="RGB", height="500px")
|
27 |
+
with gr.Column():
|
28 |
+
image_ins = gr.Slider(
|
29 |
+
minimum=1,
|
30 |
+
maximum=60,
|
31 |
+
step=1,
|
32 |
+
value=5,
|
33 |
+
label="Generating length (seconds) 生成长度(秒)",
|
34 |
+
)
|
35 |
+
with gr.Row():
|
36 |
+
out1 = gr.Audio()
|
37 |
+
out2 = gr.Audio()
|
38 |
+
with gr.Row():
|
39 |
+
out3 = gr.Audio()
|
40 |
+
out4 = gr.Audio()
|
41 |
+
btn = gr.Button("Run")
|
42 |
+
with gr.Column():
|
43 |
+
with gr.Row():
|
44 |
+
video_out1 = gr.Video(interactive=False,height="300px",show_download_button=True)
|
45 |
+
|
46 |
+
video_out2 = gr.Video(interactive=False,height="300px",show_download_button=True)
|
47 |
+
|
48 |
+
video_out3 = gr.Video(interactive=False,height="300px",show_download_button=True)
|
49 |
+
|
50 |
+
video_out4 = gr.Video(interactive=False,height="300px",show_download_button=True)
|
51 |
+
|
52 |
+
btn.click(
|
53 |
+
fn=greet,
|
54 |
+
inputs=[inp, image_ins],
|
55 |
+
outputs=[
|
56 |
+
out1,
|
57 |
+
video_out1,
|
58 |
+
out2,
|
59 |
+
video_out2,
|
60 |
+
out3,
|
61 |
+
video_out3,
|
62 |
+
out4,
|
63 |
+
video_out4,
|
64 |
+
],
|
65 |
+
)
|
66 |
+
|
67 |
+
demo.launch()
|
ffmpeg
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f63b646b3ba2f936002476792ee6809db934581e0c19a1611c995804b7cf63b3
|
3 |
+
size 124581480
|
ffmpeg.exe
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:445335a651bdc74e9ed3a998fda1cbb0d1ca114732241f023421d8c4994a85b2
|
3 |
+
size 122695680
|
pic2song.py
CHANGED
@@ -12,6 +12,7 @@ from scipy.io import wavfile
|
|
12 |
import numpy as np
|
13 |
import uuid
|
14 |
import os
|
|
|
15 |
|
16 |
# 从预训练模型中加载VisionEncoderDecoderModel模型
|
17 |
model = VisionEncoderDecoderModel.from_pretrained(
|
@@ -29,11 +30,11 @@ feature_extractor = ViTImageProcessor.from_pretrained(
|
|
29 |
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
|
30 |
|
31 |
# 检测并设置设备为cuda或cpu
|
32 |
-
|
33 |
-
device = torch.device("cpu")
|
34 |
# 将模型移至所选设备
|
35 |
-
model.to(device)
|
36 |
-
Musicgenmodel.to(device)
|
37 |
sampling_rate = Musicgenmodel.config.audio_encoder.sampling_rate
|
38 |
|
39 |
# 设定生成文本的最大长度和beam搜索的数量
|
@@ -49,9 +50,11 @@ def predict_step(image_paths):
|
|
49 |
image_paths = image_paths.convert(mode="RGB")
|
50 |
|
51 |
# 使用feature_extractor处理器将图片转换为像素值张量
|
52 |
-
pixel_values = feature_extractor(
|
|
|
|
|
53 |
# 将像素值张量移至所选设备
|
54 |
-
pixel_values = pixel_values.to(device)
|
55 |
|
56 |
# 使用模型生成图片描述文本的索引序列
|
57 |
output_ids = model.generate(pixel_values, **gen_kwargs)
|
@@ -68,8 +71,8 @@ def pic2song(image_paths, image_ins):
|
|
68 |
in_max_new_tokens = int(image_ins * 50)
|
69 |
# 调用predict_step函数并输出预测的图片描述文本列表
|
70 |
print(image_paths)
|
71 |
-
|
72 |
-
print(
|
73 |
|
74 |
client = OpenAI(
|
75 |
api_key=os.environ.get("deepseekapi"),
|
@@ -82,7 +85,7 @@ def pic2song(image_paths, image_ins):
|
|
82 |
{"role": "system", "content": "You are a helpful assistant"},
|
83 |
{
|
84 |
"role": "user",
|
85 |
-
"content":
|
86 |
+ " 根据这个提示,编一个类似'a catchy beat for a podcast intro.'这样的描述,英文一句话,回复结果必须符合'a catchy beat for a podcast intro.'格式",
|
87 |
},
|
88 |
],
|
@@ -90,7 +93,6 @@ def pic2song(image_paths, image_ins):
|
|
90 |
)
|
91 |
|
92 |
music_message = response.choices[0].message.content
|
93 |
-
# music_message = 'a catchy beat for tiktok.'
|
94 |
print(music_message)
|
95 |
|
96 |
inputs = processor(
|
@@ -99,63 +101,41 @@ def pic2song(image_paths, image_ins):
|
|
99 |
return_tensors="pt",
|
100 |
)
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
guidance_scale=3,
|
141 |
-
max_new_tokens=in_max_new_tokens
|
142 |
-
)
|
143 |
-
|
144 |
-
output_path3 = "out/" + str(uuid.uuid4()) + ".wav"
|
145 |
-
wavfile.write(
|
146 |
-
output_path3, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()
|
147 |
-
)
|
148 |
-
|
149 |
-
audio_values = Musicgenmodel.generate(
|
150 |
-
**inputs.to(device),
|
151 |
-
do_sample=True,
|
152 |
-
guidance_scale=3,
|
153 |
-
max_new_tokens=in_max_new_tokens
|
154 |
-
)
|
155 |
-
|
156 |
-
output_path4 = "out/" + str(uuid.uuid4()) + ".wav"
|
157 |
-
wavfile.write(
|
158 |
-
output_path4, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()
|
159 |
-
)
|
160 |
-
|
161 |
-
return output_path1, output_path2, output_path3, output_path4
|
|
|
12 |
import numpy as np
|
13 |
import uuid
|
14 |
import os
|
15 |
+
import subprocess
|
16 |
|
17 |
# 从预训练模型中加载VisionEncoderDecoderModel模型
|
18 |
model = VisionEncoderDecoderModel.from_pretrained(
|
|
|
30 |
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
|
31 |
|
32 |
# 检测并设置设备为cuda或cpu
|
33 |
+
# Use the GPU for music generation when one is available; otherwise fall
# back to the CPU so the module still imports on CPU-only hosts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The captioning model stays on the CPU (its pixel values are moved to the
# CPU as well before generation).
model.to(torch.device("cpu"))
# MusicGen must live on the same device its inputs are later moved to
# (`inputs.to(device)`); hard-coding "cuda" here fails without a GPU.
Musicgenmodel.to(device)
|
38 |
sampling_rate = Musicgenmodel.config.audio_encoder.sampling_rate
|
39 |
|
40 |
# 设定生成文本的最大长度和beam搜索的数量
|
|
|
50 |
image_paths = image_paths.convert(mode="RGB")
|
51 |
|
52 |
# 使用feature_extractor处理器将图片转换为像素值张量
|
53 |
+
pixel_values = feature_extractor(
|
54 |
+
images=image_paths, return_tensors="pt"
|
55 |
+
).pixel_values
|
56 |
# 将像素值张量移至所选设备
|
57 |
+
pixel_values = pixel_values.to(torch.device("cpu"))
|
58 |
|
59 |
# 使用模型生成图片描述文本的索引序列
|
60 |
output_ids = model.generate(pixel_values, **gen_kwargs)
|
|
|
71 |
in_max_new_tokens = int(image_ins * 50)
|
72 |
# 调用predict_step函数并输出预测的图片描述文本列表
|
73 |
print(image_paths)
|
74 |
+
user_message = predict_step(image_paths)
|
75 |
+
print(user_message)
|
76 |
|
77 |
client = OpenAI(
|
78 |
api_key=os.environ.get("deepseekapi"),
|
|
|
85 |
{"role": "system", "content": "You are a helpful assistant"},
|
86 |
{
|
87 |
"role": "user",
|
88 |
+
"content": user_message
|
89 |
+ " 根据这个提示,编一个类似'a catchy beat for a podcast intro.'这样的描述,英文一句话,回复结果必须符合'a catchy beat for a podcast intro.'格式",
|
90 |
},
|
91 |
],
|
|
|
93 |
)
|
94 |
|
95 |
music_message = response.choices[0].message.content
|
|
|
96 |
print(music_message)
|
97 |
|
98 |
inputs = processor(
|
|
|
101 |
return_tensors="pt",
|
102 |
)
|
103 |
|
104 |
+
# 生成音频
|
105 |
+
output_paths = []
|
106 |
+
for i in range(4):
|
107 |
+
audio_values = Musicgenmodel.generate(
|
108 |
+
**inputs.to(device),
|
109 |
+
do_sample=True,
|
110 |
+
guidance_scale=3,
|
111 |
+
max_new_tokens=in_max_new_tokens,
|
112 |
+
)
|
113 |
+
|
114 |
+
output_path = f"out/{uuid.uuid4()}.wav"
|
115 |
+
wavfile.write(
|
116 |
+
output_path, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()
|
117 |
+
)
|
118 |
+
output_paths.append(output_path)
|
119 |
+
voutput_path = f"out/{uuid.uuid4()}.mp4"
|
120 |
+
temp_image_path = "temp_image.jpg"
|
121 |
+
image_paths.save(temp_image_path)
|
122 |
+
ffmpeg_cmd = [
|
123 |
+
'ffmpeg',
|
124 |
+
'-loop', '1',
|
125 |
+
'-i', temp_image_path,
|
126 |
+
'-i', output_path,
|
127 |
+
'-c:v', 'libx264',
|
128 |
+
'-preset', 'slow',
|
129 |
+
'-tune', 'stillimage',
|
130 |
+
'-c:a', 'aac',
|
131 |
+
'-b:a', '320k',
|
132 |
+
'-pix_fmt', 'yuv420p',
|
133 |
+
'-shortest',
|
134 |
+
voutput_path
|
135 |
+
]
|
136 |
+
subprocess.run(ffmpeg_cmd)
|
137 |
+
|
138 |
+
# 删除临时图像文件
|
139 |
+
os.remove(temp_image_path)
|
140 |
+
output_paths.append(voutput_path)
|
141 |
+
return output_paths
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|