lmzjms committed on
Commit
08cf77a
·
1 Parent(s): 7f627b0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -14
app.py CHANGED
@@ -154,30 +154,30 @@ class ConversationBot:
154
  def init_agent(self, openai_api_key):
155
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
156
  self.t2i = T2I(device="cuda:0")
157
- self.i2t = ImageCaptioning(device="cuda:0")
158
  self.t2a = T2A(device="cuda:0")
159
  self.tts = TTS(device="cpu")
160
  self.t2s = T2S(device="cpu")
161
  self.i2a = I2A(device="cuda:0")
162
  self.a2t = A2T(device="cpu")
163
- # self.asr = ASR(device="cuda:0")
164
  self.inpaint = Inpaint(device="cuda:0")
165
- # self.tts_ood = TTS_OOD(device="cpu")
166
  self.tools = [
167
  Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
168
  description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
169
  "The input to this tool should be a string, representing the text used to generate image. "),
170
- Tool(name="Get Photo Description", func=self.i2t.inference,
171
- description="useful for when you want to know what is inside the photo. receives image_path as input. "
172
- "The input to this tool should be a string, representing the image_path. "),
173
  Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
174
  description="useful for when you want to generate an audio from a user input text and it saved it to a file."
175
  "The input to this tool should be a string, representing the text used to generate audio."),
176
- # Tool(
177
- # name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
178
- # description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
179
- # "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
180
- # "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
181
  Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
182
  description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
183
  "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
@@ -195,10 +195,10 @@ class ConversationBot:
195
  "The input to this tool should be a string, representing the audio_path."),
196
  Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
197
  description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
 
 
 
198
  "The input to this tool should be a string, representing the audio_path.")]
199
- # Tool(name="Transcribe speech", func=self.asr.inference,
200
- # description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
201
- # "The input to this tool should be a string, representing the audio_path.")]
202
  self.agent = initialize_agent(
203
  self.tools,
204
  self.llm,
 
154
  def init_agent(self, openai_api_key):
155
  self.llm = OpenAI(temperature=0, openai_api_key=openai_api_key)
156
  self.t2i = T2I(device="cuda:0")
157
+ # self.i2t = ImageCaptioning(device="cuda:0")
158
  self.t2a = T2A(device="cuda:0")
159
  self.tts = TTS(device="cpu")
160
  self.t2s = T2S(device="cpu")
161
  self.i2a = I2A(device="cuda:0")
162
  self.a2t = A2T(device="cpu")
163
+ self.asr = ASR(device="cpu")
164
  self.inpaint = Inpaint(device="cuda:0")
165
+ self.tts_ood = TTS_OOD(device="cpu")
166
  self.tools = [
167
  Tool(name="Generate Image From User Input Text", func=self.t2i.inference,
168
  description="useful for when you want to generate an image from a user input text and it saved it to a file. like: generate an image of an object or something, or generate an image that includes some objects. "
169
  "The input to this tool should be a string, representing the text used to generate image. "),
170
+ # Tool(name="Get Photo Description", func=self.i2t.inference,
171
+ # description="useful for when you want to know what is inside the photo. receives image_path as input. "
172
+ # "The input to this tool should be a string, representing the image_path. "),
173
  Tool(name="Generate Audio From User Input Text", func=self.t2a.inference,
174
  description="useful for when you want to generate an audio from a user input text and it saved it to a file."
175
  "The input to this tool should be a string, representing the text used to generate audio."),
176
+ Tool(
177
+ name="Generate human speech with style derived from a speech reference and user input text and save it to a file", func= self.tts_ood.inference,
178
+ description="useful for when you want to generate speech samples with styles (e.g., timbre, emotion, and prosody) derived from a reference custom voice."
179
+ "Like: Generate a speech with style transferred from this voice. The text is xxx., or speak using the voice of this audio. The text is xxx."
180
+ "The input to this tool should be a comma seperated string of two, representing reference audio path and input text."),
181
  Tool(name="Generate singing voice From User Input Text, Note and Duration Sequence", func= self.t2s.inference,
182
  description="useful for when you want to generate a piece of singing voice (Optional: from User Input Text, Note and Duration Sequence) and save it to a file."
183
  "If Like: Generate a piece of singing voice, the input to this tool should be \"\" since there is no User Input Text, Note and Duration Sequence ."
 
195
  "The input to this tool should be a string, representing the audio_path."),
196
  Tool(name="Audio Inpainting", func=self.inpaint.show_mel_fn,
197
  description="useful for when you want to inpaint a mel spectrum of an audio and predict this audio, this tool will generate a mel spectrum and you can inpaint it, receives audio_path as input, "
198
+ "The input to this tool should be a string, representing the audio_path."),
199
+ Tool(name="Transcribe speech", func=self.asr.inference,
200
+ description="useful for when you want to know the text corresponding to a human speech, receives audio_path as input."
201
  "The input to this tool should be a string, representing the audio_path.")]
 
 
 
202
  self.agent = initialize_agent(
203
  self.tools,
204
  self.llm,