yuangongfdu committed on
Commit
8afc49d
·
1 Parent(s): bf54fbf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -7
app.py CHANGED
@@ -51,14 +51,15 @@ if __name__ == '__main__':
51
  sample_audio_text = "[sample audios from AudioSet evaluation set]"
52
  demo = gr.Interface(fn=predict,
53
  inputs=[gr.Audio(type="filepath"),
54
- gr.Textbox(value='What can be inferred from the spoken text and sounds? Why?',
55
  label='Edit the textbox to ask your own questions!')],
56
  outputs=[gr.Textbox(label="LTU Output")],
57
  cache_examples=True,
58
- title="Demo of LTU-2 Beta",
59
- description="LTU-2 an improved version of LTU. LTU-2 is stronger in spoken text understanding and music understanding. <br>" +
60
- "LTU is authored by Yuan Gong, Alexander H. Liu, Hongyin Luo, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). <br>" +
61
- "**Please note that the model is under construction and may be buggy. It is trained with some new techniques that are not described in LTU paper. I.e., using method described in LTU paper cannot reproduce this model.**<br>" +
62
- "Input should be wav file sampled at 16kHz. This demo trim input audio to 10 seconds. <br>"
63
- "**Research Demo, No Commercial Use (Due to license of LLaMA).**")
 
64
  demo.launch(debug=False, share=False)
 
51
  sample_audio_text = "[sample audios from AudioSet evaluation set]"
52
  demo = gr.Interface(fn=predict,
53
  inputs=[gr.Audio(type="filepath"),
54
+ gr.Textbox(value='What can be inferred from the audio? Why?',
55
  label='Edit the textbox to ask your own questions!')],
56
  outputs=[gr.Textbox(label="LTU Output")],
57
  cache_examples=True,
58
+ title="Quick Demo of Listen, Think, and Understand (LTU)",
59
+ description="LTU is a new audio model that bridges audio perception and advanced reasoning, it can answer any open-ended question about the given audio." + f"<a href='{paper_link}'>{paper_text}</a> " + f"<a href='{link}'>{text}</a> <br>" +
60
+ "LTU is authored by Yuan Gong, Hongyin Luo, Alexander H. Liu, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab). <br>" +
61
+ "**Note LTU is not an ASR and has limited ability to recognize the speech content, it focuses on general audio perception and understanding.**<br>" +
62
+ "Input an audio and ask quesions! Audio will be converted to 16kHz and padded or trim to 10 seconds. Don't have an audio sample on hand? Try some samples from AudioSet evaluation set: " +
63
+ f"<a href='{sample_audio_link}'>{sample_audio_text}</a><br>" +
64
+ "**Research Demo, Not for Commercial Use (Due to license of LLaMA).**")
65
  demo.launch(debug=False, share=False)