makewong committed
Commit 10d74ba
1 Parent(s): 5fd5f33

Upload 4 files

Files changed (5)
  1. .gitattributes +2 -0
  2. app.py +67 -37
  3. ffmpeg +3 -0
  4. ffmpeg.exe +3 -0
  5. pic2song.py +50 -70
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ ffmpeg filter=lfs diff=lfs merge=lfs -text
+ ffmpeg.exe filter=lfs diff=lfs merge=lfs -text
app.py CHANGED
@@ -1,37 +1,67 @@
- import gradio as gr
- from transformers import (
-     AutoProcessor,
-     VisionEncoderDecoderModel,
-     ViTImageProcessor,
-     AutoTokenizer,
-     MusicgenForConditionalGeneration,
- )
- import torch
- from PIL import Image
- from openai import OpenAI
- from scipy.io import wavfile
- import numpy as np
- import pic2song
- import uuid
-
- def greet(image,image_ins):
-     file = pic2song.pic2song(image,image_ins)
-     return file
-
-
- with gr.Blocks() as demo:
-     with gr.Column():
-         with gr.Row():
-             inp = gr.Image(type="pil",image_mode="RGB", height="500px")
-             with gr.Column():
-                 image_ins=gr.Slider(minimum=1, maximum=60,step=1, value=10, label="Generating length (seconds) 生成长度(秒)")
-         with gr.Row():
-             out1 = gr.Audio()
-             out2 = gr.Audio()
-         with gr.Row():
-             out3 = gr.Audio()
-             out4 = gr.Audio()
-         btn = gr.Button("Run")
-     btn.click(fn=greet, inputs=[inp,image_ins], outputs=[out1,out2,out3,out4])
-
- demo.launch()
+ import gradio as gr
+ from transformers import (
+     AutoProcessor,
+     VisionEncoderDecoderModel,
+     ViTImageProcessor,
+     AutoTokenizer,
+     MusicgenForConditionalGeneration,
+ )
+ import torch
+ from PIL import Image
+ from openai import OpenAI
+ from scipy.io import wavfile
+ import numpy as np
+ import pic2song
+ import uuid
+
+
+ def greet(image, image_ins):
+     file = pic2song.pic2song(image, image_ins)
+     return file
+
+
+ with gr.Blocks() as demo:
+     with gr.Column():
+         with gr.Row():
+             inp = gr.Image(type="pil", image_mode="RGB", height="500px")
+             with gr.Column():
+                 image_ins = gr.Slider(
+                     minimum=1,
+                     maximum=60,
+                     step=1,
+                     value=5,
+                     label="Generating length (seconds) 生成长度(秒)",
+                 )
+         with gr.Row():
+             out1 = gr.Audio()
+             out2 = gr.Audio()
+         with gr.Row():
+             out3 = gr.Audio()
+             out4 = gr.Audio()
+         btn = gr.Button("Run")
+     with gr.Column():
+         with gr.Row():
+             video_out1 = gr.Video(interactive=False,height="300px",show_download_button=True)
+
+             video_out2 = gr.Video(interactive=False,height="300px",show_download_button=True)
+
+             video_out3 = gr.Video(interactive=False,height="300px",show_download_button=True)
+
+             video_out4 = gr.Video(interactive=False,height="300px",show_download_button=True)
+
+     btn.click(
+         fn=greet,
+         inputs=[inp, image_ins],
+         outputs=[
+             out1,
+             video_out1,
+             out2,
+             video_out2,
+             out3,
+             video_out3,
+             out4,
+             video_out4,
+         ],
+     )
+
+ demo.launch()
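The click handler above wires eight output components to the single value returned by greet, so pic2song.pic2song is expected to hand back eight file paths interleaved as (audio, video) pairs, one pair per generated clip. A minimal stub sketching that contract (hypothetical, for illustration only, not part of this commit):

# Hypothetical stub showing the return shape app.py expects:
# eight paths, alternating .wav (out1..out4) and .mp4 (video_out1..video_out4).
def pic2song_stub(image, image_ins):
    paths = []
    for i in range(4):
        paths.append(f"out/clip_{i}.wav")  # mapped to out{i+1}
        paths.append(f"out/clip_{i}.mp4")  # mapped to video_out{i+1}
    return paths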
ffmpeg ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f63b646b3ba2f936002476792ee6809db934581e0c19a1611c995804b7cf63b3
+ size 124581480
ffmpeg.exe ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:445335a651bdc74e9ed3a998fda1cbb0d1ca114732241f023421d8c4994a85b2
+ size 122695680
pic2song.py CHANGED
@@ -12,6 +12,7 @@ from scipy.io import wavfile
  import numpy as np
  import uuid
  import os
+ import subprocess

  # Load the VisionEncoderDecoderModel from the pretrained checkpoint
  model = VisionEncoderDecoderModel.from_pretrained(
@@ -29,11 +30,11 @@ feature_extractor = ViTImageProcessor.from_pretrained(
  tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

  # Detect and set the device to cuda or cpu
- #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- device = torch.device("cpu")
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ # device = torch.device("cpu")
  # Move the models to the selected device
- model.to(device)
- Musicgenmodel.to(device)
+ model.to(torch.device("cpu"))
+ Musicgenmodel.to(torch.device("cuda"))
  sampling_rate = Musicgenmodel.config.audio_encoder.sampling_rate

  # Set the maximum generated-text length and the number of beams for beam search
@@ -49,9 +50,11 @@ def predict_step(image_paths):
      image_paths = image_paths.convert(mode="RGB")

      # Use the feature_extractor to convert the image into a pixel-value tensor
-     pixel_values = feature_extractor(images=image_paths, return_tensors="pt").pixel_values
+     pixel_values = feature_extractor(
+         images=image_paths, return_tensors="pt"
+     ).pixel_values
      # Move the pixel-value tensor to the selected device
-     pixel_values = pixel_values.to(device)
+     pixel_values = pixel_values.to(torch.device("cpu"))

      # Use the model to generate the token-id sequence for the image caption
      output_ids = model.generate(pixel_values, **gen_kwargs)
@@ -68,8 +71,8 @@ def pic2song(image_paths, image_ins):
      in_max_new_tokens = int(image_ins * 50)
      # Call the predict_step function and print the predicted image caption
      print(image_paths)
-     user_messagge = predict_step(image_paths)
-     print(user_messagge)
+     user_message = predict_step(image_paths)
+     print(user_message)

      client = OpenAI(
          api_key=os.environ.get("deepseekapi"),
@@ -82,7 +85,7 @@ def pic2song(image_paths, image_ins):
              {"role": "system", "content": "You are a helpful assistant"},
              {
                  "role": "user",
-                 "content": user_messagge
+                 "content": user_message
                  + " 根据这个提示,编一个类似'a catchy beat for a podcast intro.'这样的描述,英文一句话,回复结果必须符合'a catchy beat for a podcast intro.'格式",
              },
          ],
@@ -90,7 +93,6 @@ def pic2song(image_paths, image_ins):
      )

      music_message = response.choices[0].message.content
-     # music_message = 'a catchy beat for tiktok.'
      print(music_message)

      inputs = processor(
@@ -99,63 +101,41 @@ def pic2song(image_paths, image_ins):
          return_tensors="pt",
      )

-
-     audio_values = Musicgenmodel.generate(
-         **inputs.to(device),
-         do_sample=True,
-         guidance_scale=3,
-         max_new_tokens=in_max_new_tokens
-     )
-
-
-     out_directory = "out"
-
-     # Check whether the directory exists
-     if not os.path.exists(out_directory):
-         # Create it if it does not exist
-         os.makedirs(out_directory)
-         print("已创建 out 目录")
-
-
-     output_path1 = "out/" + str(uuid.uuid4()) + ".wav"
-     wavfile.write(
-         output_path1, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()
-     )
-
-     audio_values = Musicgenmodel.generate(
-         **inputs.to(device),
-         do_sample=True,
-         guidance_scale=3,
-         max_new_tokens=in_max_new_tokens
-     )
-
-     output_path2 = "out/" + str(uuid.uuid4()) + ".wav"
-     wavfile.write(
-         output_path2, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()
-     )
-
-     audio_values = Musicgenmodel.generate(
-         **inputs.to(device),
-         do_sample=True,
-         guidance_scale=3,
-         max_new_tokens=in_max_new_tokens
-     )
-
-     output_path3 = "out/" + str(uuid.uuid4()) + ".wav"
-     wavfile.write(
-         output_path3, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()
-     )
-
-     audio_values = Musicgenmodel.generate(
-         **inputs.to(device),
-         do_sample=True,
-         guidance_scale=3,
-         max_new_tokens=in_max_new_tokens
-     )
-
-     output_path4 = "out/" + str(uuid.uuid4()) + ".wav"
-     wavfile.write(
-         output_path4, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()
-     )
-
-     return output_path1, output_path2, output_path3, output_path4
+     # Generate the audio
+     output_paths = []
+     for i in range(4):
+         audio_values = Musicgenmodel.generate(
+             **inputs.to(device),
+             do_sample=True,
+             guidance_scale=3,
+             max_new_tokens=in_max_new_tokens,
+         )
+
+         output_path = f"out/{uuid.uuid4()}.wav"
+         wavfile.write(
+             output_path, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()
+         )
+         output_paths.append(output_path)
+         voutput_path = f"out/{uuid.uuid4()}.mp4"
+         temp_image_path = "temp_image.jpg"
+         image_paths.save(temp_image_path)
+         ffmpeg_cmd = [
+             'ffmpeg',
+             '-loop', '1',
+             '-i', temp_image_path,
+             '-i', output_path,
+             '-c:v', 'libx264',
+             '-preset', 'slow',
+             '-tune', 'stillimage',
+             '-c:a', 'aac',
+             '-b:a', '320k',
+             '-pix_fmt', 'yuv420p',
+             '-shortest',
+             voutput_path
+         ]
+         subprocess.run(ffmpeg_cmd)
+
+         # Delete the temporary image file
+         os.remove(temp_image_path)
+         output_paths.append(voutput_path)
+     return output_paths
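Note that subprocess.run(ffmpeg_cmd) resolves 'ffmpeg' from the system PATH, while this commit ships the Linux and Windows binaries as LFS files in the repository root. A minimal sketch of pointing the call at the bundled binary instead, assuming it sits next to pic2song.py and the LFS blob has been pulled (the path handling and the chmod step are assumptions, not part of the commit):

import os
import stat
import subprocess

# Hypothetical: use the LFS-tracked ffmpeg next to pic2song.py instead of a system-wide install.
ffmpeg_bin = os.path.join(os.path.dirname(os.path.abspath(__file__)), "ffmpeg")
os.chmod(ffmpeg_bin, os.stat(ffmpeg_bin).st_mode | stat.S_IEXEC)  # ensure the binary is executable
subprocess.run([ffmpeg_bin, "-version"], check=True)  # fails fast if only the LFS pointer was fetched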