Julian-Hans committed on
Commit
fa554aa
·
1 Parent(s): a15bc9b

implemented option to use inference endpoints, implemented parameter selection, updated UI, cleaned up return formats of models

Browse files
Files changed (5) hide show
  1. app.py +50 -9
  2. blip_image_caption_large.py +15 -2
  3. config.py +14 -1
  4. musicgen_small.py +33 -1
  5. phi3_mini_4k_instruct.py +18 -3
app.py CHANGED
@@ -16,7 +16,12 @@ log.basicConfig(level=log.INFO)
16
 
17
 
18
  class Image_To_Music:
19
- def __init__(self):
 
 
 
 
 
20
  self.image_path = None
21
  self.generated_caption = None
22
  self.generated_description = None
@@ -44,14 +49,14 @@ class Image_To_Music:
44
  self.image_caption_model = Blip_Image_Caption_Large()
45
 
46
  self.image_path = image_path
47
- self.generated_caption = self.image_caption_model.caption_image_local_pipeline(self.image_path)[0]["generated_text"]
48
 
49
  # delete model to free up ram
50
  del self.image_caption_model
51
  gc.collect()
52
 
53
  self.caption_generation_duration = time.time() - caption_start_time
54
- log.info(f"Captioning Complete in {self.caption_generation_duration:.2f} seconds: {self.generated_caption}")
55
  return self.generated_caption
56
 
57
  def generate_description(self):
@@ -65,14 +70,14 @@ class Image_To_Music:
65
  {"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."},
66
  {"role": "user", "content": self.generated_caption},
67
  ]
68
- self.generated_description = self.text_generation_model.generate_text_local_pipeline(messages)[-1]['generated_text'][-1]['content']
69
 
70
  # delete model to free up ram
71
  del self.text_generation_model
72
  gc.collect()
73
 
74
  self.description_generation_duration = time.time() - description_start_time
75
- log.info(f"Description Generation Complete in {self.description_generation_duration:.2f} seconds: {self.generated_description}")
76
  return self.generated_description
77
 
78
  def generate_music(self):
@@ -82,14 +87,14 @@ class Image_To_Music:
82
  # load model
83
  self.music_generation_model = Musicgen_Small()
84
 
85
- self.music_generation_model.generate_music_local_pipeline(self.generated_description, self.audio_path)
86
 
87
  # delete model to free up ram
88
  del self.music_generation_model
89
  gc.collect()
90
 
91
  self.music_generation_duration = time.time() - music_start_time
92
- log.info(f"Music Generation Complete in {self.music_generation_duration:.2f} seconds: {self.audio_path}")
93
  return self.audio_path
94
 
95
  def get_durations(self):
@@ -112,12 +117,49 @@ class Image_To_Music:
112
  return [self.generated_caption, self.generated_description, self.audio_path, self.get_durations()]
113
 
114
 
 
 
 
 
 
 
 
 
115
  # Gradio UI
116
  def gradio():
117
  # Define Gradio Interface, information from (https://www.gradio.app/docs/chatinterface)
118
  with gr.Blocks() as demo:
119
  gr.Markdown("<h1 style='text-align: center;'> ⛺ Image to Music Generator 🎼</h1>")
120
  image_input = gr.Image(type="filepath", label="Upload Image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  with gr.Row():
122
  caption_output = gr.Textbox(label="Image Caption")
123
  music_description_output = gr.Textbox(label="Music Description")
@@ -126,8 +168,7 @@ def gradio():
126
  music_output = gr.Audio(label="Generated Music")
127
  # Button to trigger the process
128
  generate_button = gr.Button("Generate Music")
129
- itm = Image_To_Music()
130
- generate_button.click(fn=itm.run, inputs=image_input, outputs=[caption_output, music_description_output, music_output, durations])
131
  # Launch Gradio app
132
  demo.launch()
133
 
 
16
 
17
 
18
  class Image_To_Music:
19
+ def __init__(self, use_local_caption=False, use_local_llm=False, use_local_musicgen=False):
20
+
21
+ self.use_local_llm = use_local_llm
22
+ self.use_local_caption = use_local_caption
23
+ self.use_local_musicgen = use_local_musicgen
24
+
25
  self.image_path = None
26
  self.generated_caption = None
27
  self.generated_description = None
 
49
  self.image_caption_model = Blip_Image_Caption_Large()
50
 
51
  self.image_path = image_path
52
+ self.generated_caption = self.image_caption_model.caption_image(self.image_path, self.use_local_caption)
53
 
54
  # delete model to free up ram
55
  del self.image_caption_model
56
  gc.collect()
57
 
58
  self.caption_generation_duration = time.time() - caption_start_time
59
+ log.info(f"Captioning Complete in {self.caption_generation_duration:.2f} seconds: {self.generated_caption} - used local model: {self.use_local_caption}")
60
  return self.generated_caption
61
 
62
  def generate_description(self):
 
70
  {"role": "system", "content": "You are an image caption to song description converter with a deep understanding of Music and Art. You are given the caption of an image. Your task is to generate a textual description of a musical piece that fits the caption. The description should be detailed and vivid, and should include the genre, mood, instruments, tempo, and other relevant information about the music. You should also use your knowledge of art and visual aesthetics to create a musical piece that complements the image. Only output the description of the music, without any explanation or introduction. Be concise."},
71
  {"role": "user", "content": self.generated_caption},
72
  ]
73
+ self.generated_description = self.text_generation_model.generate_text(messages, self.use_local_llm)
74
 
75
  # delete model to free up ram
76
  del self.text_generation_model
77
  gc.collect()
78
 
79
  self.description_generation_duration = time.time() - description_start_time
80
+ log.info(f"Description Generation Complete in {self.description_generation_duration:.2f} seconds: {self.generated_description} - used local model: {self.use_local_llm}")
81
  return self.generated_description
82
 
83
  def generate_music(self):
 
87
  # load model
88
  self.music_generation_model = Musicgen_Small()
89
 
90
+ self.music_generation_model.generate_music(self.generated_description, self.audio_path, self.use_local_musicgen)
91
 
92
  # delete model to free up ram
93
  del self.music_generation_model
94
  gc.collect()
95
 
96
  self.music_generation_duration = time.time() - music_start_time
97
+ log.info(f"Music Generation Complete in {self.music_generation_duration:.2f} seconds: {self.audio_path} - used local model: {self.use_local_musicgen}")
98
  return self.audio_path
99
 
100
  def get_durations(self):
 
117
  return [self.generated_caption, self.generated_description, self.audio_path, self.get_durations()]
118
 
119
 
120
def run_image_to_music(image_path, llm_max_new_tokens, llm_temperature, llm_top_p, musicgen_max_seconds, use_local_caption, use_local_llm, use_local_musicgen):
    """Apply the UI-selected generation parameters and run one image-to-music request.

    Args:
        image_path: Path of the uploaded image, forwarded to ``Image_To_Music.run``.
        llm_max_new_tokens: Upper bound on tokens the LLM may generate.
        llm_temperature: Sampling temperature for the LLM.
        llm_top_p: Nucleus-sampling probability mass for the LLM.
        musicgen_max_seconds: Requested audio length in seconds.
        use_local_caption: Run image captioning locally instead of via the API.
        use_local_llm: Run the LLM locally instead of via the API.
        use_local_musicgen: Run music generation locally instead of via the API.

    Returns:
        Whatever ``Image_To_Music.run`` returns for ``image_path``.
    """
    # config.py documents 256 tokens as 5 seconds of audio, i.e. roughly 51 tokens per second.
    musicgen_tokens_per_second = 51

    # NOTE(review): these assignments mutate module-level config shared by every request;
    # concurrent requests could overwrite each other's settings - confirm this is acceptable.
    # Token counts must be integers; slider values may arrive as floats, so cast explicitly.
    config.LLM_MAX_NEW_TOKENS = int(llm_max_new_tokens)
    config.LLM_TEMPERATURE = llm_temperature
    config.LLM_TOP_P = llm_top_p
    config.MUSICGEN_MAX_NEW_TOKENS = int(musicgen_max_seconds * musicgen_tokens_per_second)

    itm = Image_To_Music(use_local_caption=use_local_caption, use_local_llm=use_local_llm, use_local_musicgen=use_local_musicgen)
    return itm.run(image_path)
127
+
128
  # Gradio UI
129
  def gradio():
130
  # Define Gradio Interface, information from (https://www.gradio.app/docs/chatinterface)
131
  with gr.Blocks() as demo:
132
  gr.Markdown("<h1 style='text-align: center;'> ⛺ Image to Music Generator 🎼</h1>")
133
  image_input = gr.Image(type="filepath", label="Upload Image")
134
+
135
+
136
+ # ----ATTRIBUTION-START----
137
+ # LLM: ChatGPT4o
138
+ # PROMPT: i need 3 checkbox fields that pass booleans to the run_image_to_music function. it should be "Use local Image Captioning" "Use local LLM" "Use local Music Generation". please make it a nice parameter selector
139
+ # EDITS: /
140
+
141
+ # Checkbox parameters
142
+ with gr.Row():
143
+ local_captioning = gr.Checkbox(label="Use local Image Captioning", value=False)
144
+ local_llm = gr.Checkbox(label="Use local LLM", value=False)
145
+ local_music_gen = gr.Checkbox(label="Use local Music Generation", value=False)
146
+ # -----ATTRIBUTION-END-----
147
+
148
+ # ----ATTRIBUTION-START----
149
+ # LLM: ChatGPT4o
150
+ # PROMPT: now, i need sliders for the different models that are used in the product:\n LLM_MAX_NEW_TOKENS = 50\nLLM_TEMPERATURE = 0.7\nLLM_TOP_P = 0.95\nMUSICGEN_MAX_NEW_TOKENS = 256 # 256 = 5 seconds of audio\n they should be in a hidden menu that opens when i click on "advanced options"\nplease label them for the end user and fit them nicely in the following ui: <code>
151
+ # EDITS: added interactive flags
152
+ # Advanced options with sliders
153
+ with gr.Accordion("Advanced Options", open=False):
154
+ gr.Markdown("<h3>LLM Settings</h3>")
155
+ llm_max_new_tokens = gr.Slider(1, 200, value=50, step=1, label="LLM Max Tokens", interactive=True)
156
+ llm_temperature = gr.Slider(0.0, 1.0, value=0.7, step=0.01, label="LLM Temperature", interactive=True)
157
+ llm_top_p = gr.Slider(0.01, 0.99, value=0.95, step=0.01, label="LLM Top P", interactive=True)
158
+
159
+ gr.Markdown("<h3>Music Generation Settings</h3>")
160
+ musicgen_max_seconds = gr.Slider(1, 30, value=5, step=1, label="MusicGen Duration in Seconds (local model only)", interactive=True)
161
+ # -----ATTRIBUTION-END-----
162
+
163
  with gr.Row():
164
  caption_output = gr.Textbox(label="Image Caption")
165
  music_description_output = gr.Textbox(label="Music Description")
 
168
  music_output = gr.Audio(label="Generated Music")
169
  # Button to trigger the process
170
  generate_button = gr.Button("Generate Music")
171
+ generate_button.click(fn=run_image_to_music, inputs=[image_input, llm_max_new_tokens, llm_temperature, llm_top_p, musicgen_max_seconds, local_captioning, local_llm, local_music_gen], outputs=[caption_output, music_description_output, music_output, durations])
 
172
  # Launch Gradio app
173
  demo.launch()
174
 
blip_image_caption_large.py CHANGED
@@ -1,13 +1,26 @@
1
  # external imports
2
  from transformers import pipeline
 
3
 
4
  # local imports
5
  import config
6
 
7
  class Blip_Image_Caption_Large:
8
  def __init__(self):
9
- self.local_pipeline = pipeline("image-to-text", model=config.IMAGE_CAPTION_MODEL)
10
 
 
 
 
 
 
 
11
  def caption_image_local_pipeline(self, image_path):
12
- result = self.local_pipeline(image_path)
 
13
  return result
 
 
 
 
 
 
1
  # external imports
2
  from transformers import pipeline
3
+ from huggingface_hub import InferenceClient
4
 
5
  # local imports
6
  import config
7
 
8
  class Blip_Image_Caption_Large:
9
  def __init__(self):
10
+ pass
11
 
12
+ def caption_image(self, image_path, use_local_caption):
13
+ if use_local_caption:
14
+ return self.caption_image_local_pipeline(image_path)
15
+ else:
16
+ return self.caption_image_api(image_path)
17
+
18
  def caption_image_local_pipeline(self, image_path):
19
+ self.local_pipeline = pipeline("image-to-text", model=config.IMAGE_CAPTION_MODEL)
20
+ result = self.local_pipeline(image_path)[0]['generated_text']
21
  return result
22
+
23
+ def caption_image_api(self, image_path):
24
+ client = InferenceClient(config.IMAGE_CAPTION_MODEL, token=config.HF_API_TOKEN)
25
+ result = client.image_to_text(image_path).generated_text
26
+ return result
config.py CHANGED
@@ -1,10 +1,23 @@
 
 
 
 
1
  IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"
2
 
3
  LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
4
  LLM_MAX_LENGTH = 50
5
  LLM_MAX_NEW_TOKENS = 50
 
 
6
 
7
  MUSICGEN_MODEL = "facebook/musicgen-small"
 
8
  MUSICGEN_MAX_NEW_TOKENS = 256 # 5 seconds of audio
9
 
10
- AUDIO_DIR = "Case-Study-1/data/"
 
 
 
 
 
 
 
1
import os
import logging as log

log.basicConfig(level=log.INFO)

# Model used for image captioning (local pipeline and inference API).
IMAGE_CAPTION_MODEL = "Salesforce/blip-image-captioning-large"

# Text-generation model and its default generation parameters.
LLM_MODEL = "microsoft/Phi-3-mini-4k-instruct"
LLM_MAX_LENGTH = 50
LLM_MAX_NEW_TOKENS = 50
LLM_TEMPERATURE = 0.7
LLM_TOP_P = 0.95

# Music-generation model, its hosted endpoint and default length.
MUSICGEN_MODEL = "facebook/musicgen-small"
MUSICGEN_MODEL_API_URL = f"https://api-inference.huggingface.co/models/{MUSICGEN_MODEL}"
MUSICGEN_MAX_NEW_TOKENS = 256  # 5 seconds of audio

AUDIO_DIR = "Case-Study-1/data/"

# API token for the hosted inference endpoints; None when the variable is unset.
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
if HF_API_TOKEN:
    # Never write any part of the secret to the log; only record that it was found.
    log.info("Read HF_API_TOKEN from environment variables.")
else:
    # Use the logger (not print) so the message follows the configured log handling.
    log.warning("HF_API_TOKEN not found in environment variables.")
musicgen_small.py CHANGED
@@ -1,5 +1,7 @@
1
  # external imports
2
  from transformers import pipeline
 
 
3
  import scipy
4
 
5
  # local imports
@@ -7,8 +9,38 @@ import config
7
 
8
  class Musicgen_Small:
9
  def __init__(self):
10
- self.local_pipeline = pipeline("text-to-audio", model=config.MUSICGEN_MODEL)
11
 
 
 
 
 
 
 
12
  def generate_music_local_pipeline(self, prompt, audio_path):
 
13
  music = self.local_pipeline(prompt, forward_params={"do_sample": True, "max_new_tokens": config.MUSICGEN_MAX_NEW_TOKENS})
14
  scipy.io.wavfile.write(audio_path, rate=music["sampling_rate"], data=music["audio"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # external imports
2
  from transformers import pipeline
3
+ from io import BytesIO
4
+ import requests
5
  import scipy
6
 
7
  # local imports
 
9
 
10
  class Musicgen_Small:
11
  def __init__(self):
12
+ pass
13
 
14
+ def generate_music(self, prompt, audio_path, use_local_musicgen):
15
+ if use_local_musicgen:
16
+ self.generate_music_local_pipeline(prompt, audio_path)
17
+ else:
18
+ self.generate_music_api(prompt, audio_path)
19
+
20
  def generate_music_local_pipeline(self, prompt, audio_path):
21
+ self.local_pipeline = pipeline("text-to-audio", model=config.MUSICGEN_MODEL)
22
  music = self.local_pipeline(prompt, forward_params={"do_sample": True, "max_new_tokens": config.MUSICGEN_MAX_NEW_TOKENS})
23
  scipy.io.wavfile.write(audio_path, rate=music["sampling_rate"], data=music["audio"])
24
+
25
+ def generate_music_api(self, prompt, audio_path):
26
+ headers = {"Authorization": f"Bearer {config.HF_API_TOKEN}"}
27
+ payload = {
28
+ "inputs": prompt
29
+ }
30
+
31
+ response = requests.post(config.MUSICGEN_MODEL_API_URL, headers=headers, json=payload)
32
+
33
+ # ----ATTRIBUTION-START----
34
+ # LLM: ChatGPT4o
35
+ # PROMPT: please save the audio to a .wav file
36
+ # EDITS: changed variables to match the code
37
+
38
+ # Convert the byte content into an audio array
39
+ audio_buffer = BytesIO(response.content)
40
+
41
+ # Use scipy to save the audio, assuming it's a WAV format audio stream
42
+ # If it's raw PCM audio, you would need to decode it first.
43
+ with open(audio_path, "wb") as f:
44
+ f.write(audio_buffer.read())
45
+ # -----ATTRIBUTION-END-----
46
+
phi3_mini_4k_instruct.py CHANGED
@@ -1,5 +1,6 @@
1
  # external imports
2
  from transformers import pipeline
 
3
 
4
  # local imports
5
  import config
@@ -7,10 +8,24 @@ import config
7
 
8
  class Phi3_Mini_4k_Instruct:
9
  def __init__(self):
 
 
 
 
 
 
 
 
 
10
  self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True)
11
  self.local_pipeline.model.config.max_length = config.LLM_MAX_LENGTH
12
  self.local_pipeline.model.config.max_new_tokens = config.LLM_MAX_NEW_TOKENS
13
-
14
- def generate_text_local_pipeline(self, messages):
15
- result = self.local_pipeline(messages)
16
  return result
 
 
 
 
 
 
1
  # external imports
2
  from transformers import pipeline
3
+ from huggingface_hub import InferenceClient
4
 
5
  # local imports
6
  import config
 
8
 
9
  class Phi3_Mini_4k_Instruct:
10
  def __init__(self):
11
+ pass
12
+
13
+ def generate_text(self, messages, use_local_llm):
14
+ if use_local_llm:
15
+ return self.generate_text_local_pipeline(messages)
16
+ else:
17
+ return self.generate_text_api(messages)
18
+
19
+ def generate_text_local_pipeline(self, messages):
20
  self.local_pipeline = pipeline("text-generation", model=config.LLM_MODEL, trust_remote_code=True)
21
  self.local_pipeline.model.config.max_length = config.LLM_MAX_LENGTH
22
  self.local_pipeline.model.config.max_new_tokens = config.LLM_MAX_NEW_TOKENS
23
+ self.local_pipeline.model.config.temperature = config.LLM_TEMPERATURE
24
+ self.local_pipeline.model.config.top_p = config.LLM_TOP_P
25
+ result = self.local_pipeline(messages)[-1]['generated_text'][-1]['content']
26
  return result
27
+
28
+ def generate_text_api(self, messages):
29
+ client = InferenceClient(config.LLM_MODEL, token=config.HF_API_TOKEN)
30
+ result = client.chat_completion(messages, max_tokens=config.LLM_MAX_NEW_TOKENS, temperature=config.LLM_TEMPERATURE, top_p=config.LLM_TOP_P).choices[0].message.content
31
+ return result