lmzjms committed on
Commit
bb882ba
·
1 Parent(s): 212bdfd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -14
app.py CHANGED
@@ -158,20 +158,20 @@ class ConversationBot:
158
  return gr.Button.update(visible=False)
159
  def init_agent(self, openai_api_key):
160
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
161
- # self.t2i = T2I(device="cuda:0")
162
  # self.i2t = ImageCaptioning(device="cuda:0")
163
  self.t2a = T2A(device="cuda:0")
164
  self.tts = TTS(device="cpu")
165
  self.t2s = T2S(device="cpu")
166
- # self.i2a = I2A(device="cpu")
167
  # self.a2t = A2T(device="cpu")
168
  # self.asr = ASR(device="cuda:0")
169
- # self.inpaint = Inpaint(device="cpu")
170
  # self.tts_ood = TTS_OOD(device="cpu")
171
  self.tools = [
172
- # Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
173
- # description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
174
- # "The input to this tool should be a string, representing the text used to generate image. "),
175
  # Tool(name="Get Photo Description", func=self.i2t.inference,
176
  # description="useful for when you want to know what is inside the photo. receives image_path as input. "
177
  # "The input to this tool should be a string, representing the image_path. "),
@@ -191,16 +191,16 @@ class ConversationBot:
191
  "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
192
  Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
193
  description="useful for when you want to convert a user input text into speech audio it saved it to a file."
194
- "The input to this tool should be a string, representing the text used to be converted to speech.")]
195
- # Tool(name="Generate Audio From The Image", func=self.i2a.inference,
196
- # description="useful for when you want to generate an audio based on an image."
197
- # "The input to this tool should be a string, representing the image_path. "),
198
  # Tool(name="Generate Text From The Audio", func=self.a2t.inference,
199
  # description="useful for when you want to describe an audio in text, receives audio_path as input."
200
  # "The input to this tool should be a string, representing the audio_path.")]
201
- # Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
202
- # description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
203
- # "The input to this tool should be a string, representing the audio_path.")]
204
  # Tool(name="Transcribe speech", func=self.asr.inference,
205
  # description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
206
  # "The input to this tool should be a string, representing the audio_path.")]
@@ -219,7 +219,7 @@ class ConversationBot:
219
  if __name__ == '__main__':
220
  bot = ConversationBot()
221
 
222
- with gr.Blocks(css="#chatbot .overflow-y-auto{height:500px}") as demo:
223
  with gr.Row():
224
  openai_api_key_textbox = gr.Textbox(
225
  placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",
 
158
  return gr.Button.update(visible=False)
159
  def init_agent(self, openai_api_key):
160
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
161
+ self.t2i = T2I(device="cuda:0")
162
  # self.i2t = ImageCaptioning(device="cuda:0")
163
  self.t2a = T2A(device="cuda:0")
164
  self.tts = TTS(device="cpu")
165
  self.t2s = T2S(device="cpu")
166
+ self.i2a = I2A(device="cuda:0")
167
  # self.a2t = A2T(device="cpu")
168
  # self.asr = ASR(device="cuda:0")
169
+ self.inpaint = Inpaint(device="cuda:0")
170
  # self.tts_ood = TTS_OOD(device="cpu")
171
  self.tools = [
172
+ Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
173
+ description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
174
+ "The input to this tool should be a string, representing the text used to generate image. "),
175
  # Tool(name="Get Photo Description", func=self.i2t.inference,
176
  # description="useful for when you want to know what is inside the photo. receives image_path as input. "
177
  # "The input to this tool should be a string, representing the image_path. "),
 
191
  "The input to this tool should be a comma seperated string of three, representing text, note and duration sequence since User Input Text, Note and Duration Sequence are all provided."),
192
  Tool(name="Synthesize Speech Given the User Input Text", func=self.tts.inference,
193
  description="useful for when you want to convert a user input text into speech audio it saved it to a file."
194
+ "The input to this tool should be a string, representing the text used to be converted to speech."),
195
+ Tool(name="Generate Audio From The Image", func=self.i2a.inference,
196
+ description="useful for when you want to generate an audio based on an image."
197
+ "The input to this tool should be a string, representing the image_path. "),
198
  # Tool(name="Generate Text From The Audio", func=self.a2t.inference,
199
  # description="useful for when you want to describe an audio in text, receives audio_path as input."
200
  # "The input to this tool should be a string, representing the audio_path.")]
201
+ Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
202
+ description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
203
+ "The input to this tool should be a string, representing the audio_path.")]
204
  # Tool(name="Transcribe speech", func=self.asr.inference,
205
  # description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
206
  # "The input to this tool should be a string, representing the audio_path.")]
 
219
  if __name__ == '__main__':
220
  bot = ConversationBot()
221
 
222
+ with gr.Blocks(css="#chatbot {overflow:auto; height:600px;}") as demo:
223
  with gr.Row():
224
  openai_api_key_textbox = gr.Textbox(
225
  placeholder="Paste your OpenAI API key here to start AudioGPT(sk-...) and press Enter ↵️",