Spaces:
Running
Running
Julian-Hans
committed on
Commit
·
a15bc9b
1
Parent(s):
9b3501f
Removed force push; load models separately to optimize RAM usage
Browse files
- .github/workflows/sync.yml +1 -1
- .gitignore +2 -1
- app.py +50 -5
.github/workflows/sync.yml
CHANGED
@@ -17,4 +17,4 @@ jobs:
|
|
17 |
- name: Push to hub
|
18 |
env:
|
19 |
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
20 |
-
run: git push https://oxmraz-mldo:$HF_TOKEN@huggingface.co/spaces/Group17WPIMLDO24/Case-Study-1 main
|
|
|
17 |
- name: Push to hub
|
18 |
env:
|
19 |
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
20 |
+
run: git push https://oxmraz-mldo:$HF_TOKEN@huggingface.co/spaces/Group17WPIMLDO24/Case-Study-1 main
|
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
-
/__pycache__
|
|
|
|
1 |
+
/__pycache__
|
2 |
+
*.wav
|
app.py
CHANGED
@@ -1,7 +1,10 @@
|
|
1 |
# external imports
|
|
|
|
|
2 |
import time
|
3 |
import uuid
|
4 |
import gradio as gr
|
|
|
5 |
|
6 |
# local imports
|
7 |
from blip_image_caption_large import Blip_Image_Caption_Large
|
@@ -9,12 +12,11 @@ from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct
|
|
9 |
from musicgen_small import Musicgen_Small
|
10 |
import config
|
11 |
|
|
|
|
|
|
|
12 |
class Image_To_Music:
|
13 |
def __init__(self):
|
14 |
-
self.image_caption_model = Blip_Image_Caption_Large()
|
15 |
-
self.text_generation_model = Phi3_Mini_4k_Instruct()
|
16 |
-
self.music_generation_model = Musicgen_Small()
|
17 |
-
|
18 |
self.image_path = None
|
19 |
self.generated_caption = None
|
20 |
self.generated_description = None
|
@@ -23,28 +25,71 @@ class Image_To_Music:
|
|
23 |
self.caption_generation_duration = -1
|
24 |
self.description_generation_duration = -1
|
25 |
self.music_generation_duration = -1
|
26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
def caption_image(self, image_path):
|
|
|
28 |
caption_start_time = time.time()
|
|
|
|
|
|
|
|
|
29 |
self.image_path = image_path
|
30 |
self.generated_caption = self.image_caption_model.caption_image_local_pipeline(self.image_path)[0]["generated_text"]
|
|
|
|
|
|
|
|
|
|
|
31 |
self.caption_generation_duration = time.time() - caption_start_time
|
|
|
32 |
return self.generated_caption
|
33 |
|
34 |
def generate_description(self):
|
|
|
35 |
description_start_time = time.time()
|
|
|
|
|
|
|
|
|
36 |
messages = [
|
37 |
{"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."},
|
38 |
{"role": "user", "content": self.generated_caption},
|
39 |
]
|
40 |
self.generated_description = self.text_generation_model.generate_text_local_pipeline(messages)[-1]['generated_text'][-1]['content']
|
|
|
|
|
|
|
|
|
|
|
41 |
self.description_generation_duration = time.time() - description_start_time
|
|
|
42 |
return self.generated_description
|
43 |
|
44 |
def generate_music(self):
|
|
|
45 |
music_start_time = time.time()
|
|
|
|
|
|
|
|
|
46 |
self.music_generation_model.generate_music_local_pipeline(self.generated_description, self.audio_path)
|
|
|
|
|
|
|
|
|
|
|
47 |
self.music_generation_duration = time.time() - music_start_time
|
|
|
48 |
return self.audio_path
|
49 |
|
50 |
def get_durations(self):
|
|
|
1 |
# external imports
|
2 |
+
import gc
|
3 |
+
import logging as log
|
4 |
import time
|
5 |
import uuid
|
6 |
import gradio as gr
|
7 |
+
import os
|
8 |
|
9 |
# local imports
|
10 |
from blip_image_caption_large import Blip_Image_Caption_Large
|
|
|
12 |
from musicgen_small import Musicgen_Small
|
13 |
import config
|
14 |
|
15 |
+
log.basicConfig(level=log.INFO)
|
16 |
+
|
17 |
+
|
18 |
class Image_To_Music:
|
19 |
def __init__(self):
|
|
|
|
|
|
|
|
|
20 |
self.image_path = None
|
21 |
self.generated_caption = None
|
22 |
self.generated_description = None
|
|
|
25 |
self.caption_generation_duration = -1
|
26 |
self.description_generation_duration = -1
|
27 |
self.music_generation_duration = -1
|
28 |
+
self.create_output_folder()
|
29 |
+
|
30 |
+
|
31 |
+
# ----ATTRIBUTION-START----
|
32 |
+
# LLM: Github Copilot
|
33 |
+
# PROMPT: create an output folder for the generated audio files
|
34 |
+
# EDITS: /
|
35 |
+
def create_output_folder(self):
|
36 |
+
os.makedirs(config.AUDIO_DIR, exist_ok=True)
|
37 |
+
# -----ATTRIBUTION-END-----
|
38 |
+
|
39 |
def caption_image(self, image_path):
|
40 |
+
log.info("Captioning Image...")
|
41 |
caption_start_time = time.time()
|
42 |
+
|
43 |
+
# load model
|
44 |
+
self.image_caption_model = Blip_Image_Caption_Large()
|
45 |
+
|
46 |
self.image_path = image_path
|
47 |
self.generated_caption = self.image_caption_model.caption_image_local_pipeline(self.image_path)[0]["generated_text"]
|
48 |
+
|
49 |
+
# delete model to free up ram
|
50 |
+
del self.image_caption_model
|
51 |
+
gc.collect()
|
52 |
+
|
53 |
self.caption_generation_duration = time.time() - caption_start_time
|
54 |
+
log.info(f"Captioning Complete in {self.caption_generation_duration:.2f} seconds: {self.generated_caption}")
|
55 |
return self.generated_caption
|
56 |
|
57 |
def generate_description(self):
|
58 |
+
log.info("Generating Music Description...")
|
59 |
description_start_time = time.time()
|
60 |
+
|
61 |
+
# load model
|
62 |
+
self.text_generation_model = Phi3_Mini_4k_Instruct()
|
63 |
+
|
64 |
messages = [
|
65 |
{"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."},
|
66 |
{"role": "user", "content": self.generated_caption},
|
67 |
]
|
68 |
self.generated_description = self.text_generation_model.generate_text_local_pipeline(messages)[-1]['generated_text'][-1]['content']
|
69 |
+
|
70 |
+
# delete model to free up ram
|
71 |
+
del self.text_generation_model
|
72 |
+
gc.collect()
|
73 |
+
|
74 |
self.description_generation_duration = time.time() - description_start_time
|
75 |
+
log.info(f"Description Generation Complete in {self.description_generation_duration:.2f} seconds: {self.generated_description}")
|
76 |
return self.generated_description
|
77 |
|
78 |
def generate_music(self):
|
79 |
+
log.info("Generating Music...")
|
80 |
music_start_time = time.time()
|
81 |
+
|
82 |
+
# load model
|
83 |
+
self.music_generation_model = Musicgen_Small()
|
84 |
+
|
85 |
self.music_generation_model.generate_music_local_pipeline(self.generated_description, self.audio_path)
|
86 |
+
|
87 |
+
# delete model to free up ram
|
88 |
+
del self.music_generation_model
|
89 |
+
gc.collect()
|
90 |
+
|
91 |
self.music_generation_duration = time.time() - music_start_time
|
92 |
+
log.info(f"Music Generation Complete in {self.music_generation_duration:.2f} seconds: {self.audio_path}")
|
93 |
return self.audio_path
|
94 |
|
95 |
def get_durations(self):
|