Julian-Hans committed on
Commit
ddf2ccc
·
1 Parent(s): 6b9fb43

Renamed poc_app.py to app.py, cleaned up the app logic, implemented a function that yields intermediate results, added generation-length parameters to config, and changed path handling in app.py.

Browse files
README.md ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Case-Study-1: Image-To-Music
3
+ emoji: 🎼
4
+ colorFrom: gray
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+ ## Case-Study-1: Image-To-Music 🎼
12
+
13
+ An image-to-music converter, built with the following models:
14
+ - https://huggingface.co/Salesforce/blip-image-captioning-large for Image Captioning
15
+ - https://huggingface.co/microsoft/Phi-3-mini-4k-instruct for generating the audio prompt from the image caption
16
+ - https://huggingface.co/facebook/musicgen-small for Music Generation
17
+
18
+ Currently supports .jpg, .jpeg, and .png!
__pycache__/blip_image_caption_large.cpython-311.pyc ADDED
Binary file (1.11 kB). View file
 
__pycache__/config.cpython-311.pyc ADDED
Binary file (484 Bytes). View file
 
__pycache__/musicgen_small.cpython-311.pyc ADDED
Binary file (1.46 kB). View file
 
__pycache__/phi3_mini_4k_instruct.cpython-311.pyc ADDED
Binary file (1.36 kB). View file
 
app.py CHANGED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # external imports
2
+ import time
3
+ import uuid
4
+ import gradio as gr
5
+
6
+ # local imports
7
+ from blip_image_caption_large import Blip_Image_Caption_Large
8
+ from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct
9
+ from musicgen_small import Musicgen_Small
10
+ import config
11
+
12
class Image_To_Music:
    """Three-stage pipeline: image -> caption -> music description -> audio.

    Each stage stores its result and its wall-clock duration on the instance
    so the UI can display them. Durations are ``-1`` until the corresponding
    stage has run.
    """

    # Instruction given to the text-generation model; the image caption is
    # sent as the user message that follows it.
    _SYSTEM_PROMPT = (
        "You are an image caption to song description converter with a deep "
        "understanding of Music and Art. You are given the caption of an image. "
        "Your task is to generate a textual description of a musical piece that "
        "fits the caption. The description should be detailed and vivid, and "
        "should include the genre, mood, instruments, tempo, and other relevant "
        "information about the music. You should also use your knowledge of art "
        "and visual aesthetics to create a musical piece that complements the "
        "image. Only output the description of the music, without any "
        "explanation or introduction. Be concise."
    )

    def __init__(self):
        """Instantiate the three model wrappers and reset all pipeline state."""
        self.image_caption_model = Blip_Image_Caption_Large()
        self.text_generation_model = Phi3_Mini_4k_Instruct()
        self.music_generation_model = Musicgen_Small()

        self.image_path = None
        self.generated_caption = None
        self.generated_description = None
        # NOTE: the output path is chosen once per instance, so every call to
        # generate_music() on this instance writes to the same file.
        self.audio_path = config.AUDIO_DIR + str(uuid.uuid4()) + ".wav"

        self.caption_generation_duration = -1
        self.description_generation_duration = -1
        self.music_generation_duration = -1

    def caption_image(self, image_path):
        """Caption the image at ``image_path``; store and return the caption."""
        caption_start_time = time.time()
        self.image_path = image_path
        result = self.image_caption_model.caption_image_local_pipeline(self.image_path)
        self.generated_caption = result[0]["generated_text"]
        self.caption_generation_duration = time.time() - caption_start_time
        return self.generated_caption

    def generate_description(self):
        """Turn the stored caption into a music description; store and return it.

        Reads ``self.generated_caption``, so ``caption_image`` must have run
        first.
        """
        description_start_time = time.time()
        messages = [
            {"role": "system", "content": self._SYSTEM_PROMPT},
            {"role": "user", "content": self.generated_caption},
        ]
        result = self.text_generation_model.generate_text_local_pipeline(messages)
        # The reply is the content of the last message of the last result.
        self.generated_description = result[-1]['generated_text'][-1]['content']
        self.description_generation_duration = time.time() - description_start_time
        return self.generated_description

    def generate_music(self):
        """Generate audio for the stored description and return its file path.

        Reads ``self.generated_description``; the music wrapper is handed
        ``self.audio_path`` as the output location.
        """
        music_start_time = time.time()
        self.music_generation_model.generate_music_local_pipeline(
            self.generated_description, self.audio_path
        )
        self.music_generation_duration = time.time() - music_start_time
        return self.audio_path

    def get_durations(self):
        """Return a four-line, human-readable report of the stage timings."""
        caption_s = self.caption_generation_duration
        description_s = self.description_generation_duration
        music_s = self.music_generation_duration
        return (
            f"Caption Generation Time: {caption_s:.2f} seconds\n"
            f"Description Generation Time: {description_s:.2f} seconds\n"
            f"Music Generation Time: {music_s:.2f} seconds\n"
            f"Total Time: {caption_s + description_s + music_s:.2f} seconds"
        )

    def run_yield(self, image_path):
        """Run the pipeline, yielding the partial results after every stage.

        Each yielded item is ``[caption, description, audio_path, durations]``
        with ``None`` for values that are not available yet. The last item
        yielded is the complete result including the timing report.
        """
        self.caption_image(image_path)
        yield [self.generated_caption, None, None, None]
        self.generate_description()
        yield [self.generated_caption, self.generated_description, None, None]
        self.generate_music()
        yield [self.generated_caption, self.generated_description, self.audio_path, None]
        # Must be a yield: a `return value` inside a generator only ends up in
        # StopIteration.value, which a consumer iterating the generator never
        # sees, so the timing report would be silently dropped.
        yield [self.generated_caption, self.generated_description, self.audio_path, self.get_durations()]

    def run(self, image_path):
        """Run all stages and return ``[caption, description, audio_path, durations]``."""
        self.caption_image(image_path)
        self.generate_description()
        self.generate_music()
        return [self.generated_caption, self.generated_description, self.audio_path, self.get_durations()]
68
+
69
+
70
+ # Gradio UI
71
def gradio():
    """Build the image-to-music Gradio Blocks page and launch it."""
    # Interface layout, information from (https://www.gradio.app/docs/chatinterface)
    with gr.Blocks() as demo:
        gr.Markdown("<h1 style='text-align: center;'> ⛺ Image to Music Generator 🎼</h1>")
        image_in = gr.Image(type="filepath", label="Upload Image")
        with gr.Row():
            caption_box = gr.Textbox(label="Image Caption")
            description_box = gr.Textbox(label="Music Description")
            timing_box = gr.Textbox(
                label="Processing Times",
                interactive=False,
                placeholder="Time statistics will appear here",
            )

        audio_out = gr.Audio(label="Generated Music")
        # Clicking this button runs the whole pipeline on the uploaded image.
        run_button = gr.Button("Generate Music")
        pipeline = Image_To_Music()
        # Output order matches the list returned by Image_To_Music.run.
        result_widgets = [caption_box, description_box, audio_out, timing_box]
        run_button.click(fn=pipeline.run, inputs=image_in, outputs=result_widgets)
    # Start serving the app.
    demo.launch()
88
+
89
+ gradio()
config.py CHANGED
@@ -1,3 +1,10 @@
1
  IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
 
2
  LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
3
- MUSICGEN_MODEL = "facebook/musicgen-small"
 
 
 
 
 
 
 
1
  IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
2
+
3
  LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
4
+ LLM_MAX_LENGTH = 50
5
+ LLM_MAX_NEW_TOKENS = 50
6
+
7
+ MUSICGEN_MODEL = "facebook/musicgen-small"
8
+ MUSICGEN_MAX_NEW_TOKENS = 256 # 5 seconds of audio
9
+
10
+ AUDIO_DIR = "Case-Study-1/data/"
musicgen_small.py CHANGED
@@ -9,6 +9,6 @@ class Musicgen_Small:
9
  def __init__(self):
10
  self.local_pipeline = pipeline("text-to-audio", model=config.MUSICGEN_MODEL)
11
 
12
- def generate_music_local_pipeline(self, prompt):
13
- music = self.local_pipeline(prompt, forward_params={"do_sample": True})
14
- scipy.io.wavfile.write("data/musicgen_out.wav", rate=music["sampling_rate"], data=music["audio"])
 
9
  def __init__(self):
10
  self.local_pipeline = pipeline("text-to-audio", model=config.MUSICGEN_MODEL)
11
 
12
+ def generate_music_local_pipeline(self, prompt, audio_path):
13
+ music = self.local_pipeline(prompt, forward_params={"do_sample": True, "max_new_tokens": config.MUSICGEN_MAX_NEW_TOKENS})
14
+ scipy.io.wavfile.write(audio_path, rate=music["sampling_rate"], data=music["audio"])
phi3_mini_4k_instruct.py CHANGED
@@ -8,6 +8,8 @@ import config
8
  class Phi3_Mini_4k_Instruct:
9
  def __init__(self):
10
  self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True)
 
 
11
 
12
  def generate_text_local_pipeline(self, messages):
13
  result = self.local_pipeline(messages)
 
8
  class Phi3_Mini_4k_Instruct:
9
  def __init__(self):
10
  self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True)
11
+ self.local_pipeline.model.config.max_length = config.LLM_MAX_LENGTH
12
+ self.local_pipeline.model.config.max_new_tokens = config.LLM_MAX_NEW_TOKENS
13
 
14
  def generate_text_local_pipeline(self, messages):
15
  result = self.local_pipeline(messages)
poc_app.py DELETED
@@ -1,80 +0,0 @@
1
- # external imports
2
- import time
3
- import gradio as gr
4
-
5
- # local imports
6
- from blip_image_caption_large import Blip_Image_Caption_Large
7
- from phi3_mini_4k_instruct import Phi3_Mini_4k_Instruct
8
- from musicgen_small import Musicgen_Small
9
-
10
- #image_to_music function
11
- def image_to_music(image_path):
12
- # test image captioning
13
- image_caption_start_time = time.time()
14
- image_caption_model = Blip_Image_Caption_Large()
15
-
16
- test_caption = image_caption_model.caption_image_local_pipeline(image_path)
17
-
18
- print(test_caption)
19
-
20
- image_caption_end_time = time.time()
21
-
22
- # test text generation
23
- text_generation_start_time = time.time()
24
- text_generation_model = Phi3_Mini_4k_Instruct()
25
-
26
- #TODO: move this to a config file
27
- text_generation_model.local_pipeline.model.config.max_new_tokens = 200
28
-
29
-
30
- #TODO: move system prompt somewhere else, allow for genre override
31
- messages = [
32
- {"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."},
33
- {"role": "user", "content": test_caption[0]["generated_text"]},
34
- ]
35
- test_text = text_generation_model.generate_text_local_pipeline(messages)
36
- print(test_text)
37
- text_generation_end_time = time.time()
38
-
39
-
40
- # test audio generation
41
- music_generation_start_time = time.time()
42
- music_generation_model = Musicgen_Small()
43
- music_generation_model.generate_music_local_pipeline(str(test_text[-1]['generated_text'][-1]['content']))
44
- music_generation_end_time = time.time()
45
-
46
-
47
- # calculate durations
48
- image_caption_duration = image_caption_end_time - image_caption_start_time
49
- text_generation_duration = text_generation_end_time - text_generation_start_time
50
- music_generation_duration = music_generation_end_time - music_generation_start_time
51
- total_duration = music_generation_end_time - image_caption_start_time
52
-
53
- # output generated_text, audio and duration to gradio
54
- return (test_caption[0]["generated_text"], test_text[-1]['generated_text'][-1]['content'], "data/musicgen_out.wav",
55
- f"Image Captioning Duration: {image_caption_duration} sec",
56
- f"Text Generation Duration: {text_generation_duration} sec",
57
- f"Music Generation Duration: {music_generation_duration} sec",
58
- f"Total Duration: {total_duration} sec")
59
-
60
- # Gradio UI
61
- def gradio():
62
- # Define Gradio Interface, information from (https://www.gradio.app/docs/chatinterface)
63
- with gr.Blocks() as demo:
64
- gr.Markdown("<h1 style='text-align: center;'> ⛺ Image to Music Generator 🎼</h1>")
65
- image_input = gr.Image(type="filepath", label="Upload Image")
66
- with gr.Row():
67
- caption_output = gr.Textbox(label="Image Caption")
68
- music_description_output = gr.Textbox(label="Music Description")
69
- durations = gr.Textbox(label="Processing Times", interactive=False, placeholder="Time statistics will appear here")
70
-
71
- music_output = gr.Audio(label="Generated Music")
72
- # Button to trigger the process
73
- generate_button = gr.Button("Generate Music")
74
-
75
- generate_button.click(fn=image_to_music, inputs=[image_input], outputs=[caption_output, music_description_output, music_output, durations])
76
-
77
- # Launch Gradio app
78
- demo.launch()
79
-
80
- gradio()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,27 +1,70 @@
 
 
 
 
1
  certifi==2024.8.30
2
  charset-normalizer==3.3.2
 
 
 
 
 
3
  filelock==3.16.0
 
4
  fsspec==2024.9.0
 
 
 
 
 
5
  huggingface-hub==0.24.6
6
  idna==3.8
 
7
  Jinja2==3.1.4
 
 
8
  MarkupSafe==2.1.5
 
 
9
  mpmath==1.3.0
10
  networkx==3.3
11
  numpy==2.1.1
 
12
  packaging==24.1
 
13
  pillow==10.4.0
 
 
 
 
 
 
 
 
 
14
  PyYAML==6.0.2
15
  regex==2024.9.11
16
  requests==2.32.3
 
 
17
  safetensors==0.4.5
18
  scipy==1.14.1
 
 
 
 
 
19
  sympy==1.13.2
20
  tokenizers==0.19.1
 
21
  torch==2.4.1
22
  torchaudio==2.4.1
23
  torchvision==0.19.1
24
  tqdm==4.66.5
25
  transformers==4.44.2
 
26
  typing_extensions==4.12.2
 
27
  urllib3==2.2.2
 
 
 
1
+ accelerate==0.34.2
2
+ aiofiles==23.2.1
3
+ annotated-types==0.7.0
4
+ anyio==4.4.0
5
  certifi==2024.8.30
6
  charset-normalizer==3.3.2
7
+ click==8.1.7
8
+ contourpy==1.3.0
9
+ cycler==0.12.1
10
+ fastapi==0.114.2
11
+ ffmpy==0.4.0
12
  filelock==3.16.0
13
+ fonttools==4.53.1
14
  fsspec==2024.9.0
15
+ gradio==4.44.0
16
+ gradio_client==1.3.0
17
+ h11==0.14.0
18
+ httpcore==1.0.5
19
+ httpx==0.27.2
20
  huggingface-hub==0.24.6
21
  idna==3.8
22
+ importlib_resources==6.4.5
23
  Jinja2==3.1.4
24
+ kiwisolver==1.4.7
25
+ markdown-it-py==3.0.0
26
  MarkupSafe==2.1.5
27
+ matplotlib==3.9.2
28
+ mdurl==0.1.2
29
  mpmath==1.3.0
30
  networkx==3.3
31
  numpy==2.1.1
32
+ orjson==3.10.7
33
  packaging==24.1
34
+ pandas==2.2.2
35
  pillow==10.4.0
36
+ psutil==6.0.0
37
+ pydantic==2.9.1
38
+ pydantic_core==2.23.3
39
+ pydub==0.25.1
40
+ Pygments==2.18.0
41
+ pyparsing==3.1.4
42
+ python-dateutil==2.9.0.post0
43
+ python-multipart==0.0.9
44
+ pytz==2024.2
45
  PyYAML==6.0.2
46
  regex==2024.9.11
47
  requests==2.32.3
48
+ rich==13.8.1
49
+ ruff==0.6.5
50
  safetensors==0.4.5
51
  scipy==1.14.1
52
+ semantic-version==2.10.0
53
+ shellingham==1.5.4
54
+ six==1.16.0
55
+ sniffio==1.3.1
56
+ starlette==0.38.5
57
  sympy==1.13.2
58
  tokenizers==0.19.1
59
+ tomlkit==0.12.0
60
  torch==2.4.1
61
  torchaudio==2.4.1
62
  torchvision==0.19.1
63
  tqdm==4.66.5
64
  transformers==4.44.2
65
+ typer==0.12.5
66
  typing_extensions==4.12.2
67
+ tzdata==2024.1
68
  urllib3==2.2.2
69
+ uvicorn==0.30.6
70
+ websockets==12.0