<!-- Doctype is required to keep browsers out of quirks mode; lang enables
     correct hyphenation, voice selection, and translation tooling. -->
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description" content="LLaSM: Large Language and Speech Model">
  <meta name="keywords" content="speech-language, multi-modal, LLM, LLaSM">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>LLaSM: Large Language and Speech Model</title>

  <!-- Fonts and page styles (Bulma + Nerfies template assets). -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.svg">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>

  <!-- for LLaSM demo -->
  <!-- NOTE(review): "bootsrap.min.css" looks like a typo for "bootstrap.min.css".
       Left as-is because the file on disk may carry the same misspelled name —
       confirm and rename both sides together. -->
  <link rel="stylesheet" href="./static/css/bootsrap.min.css">
  <link rel="stylesheet" href="./static/css/styles.css">
  <script src="./static/js/recorder.mp3.min.js"></script>
  <script src="./static/js/waveview.js"></script>
  <!-- / for LLaSM demo -->
</head>
<body>

<!-- Top navigation: burger toggle (wired up by the template JS) plus a
     "More Research" dropdown linking to related LinkSoul projects. -->
<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
  <div class="navbar-menu">
    <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
      <a class="navbar-item" href="https://keunhong.com">
        <span class="icon">
          <i class="fas fa-home"></i>
        </span>
      </a>
      <div class="navbar-item has-dropdown is-hoverable">
        <a class="navbar-link">
          More Research
        </a>
        <div class="navbar-dropdown">
          <a class="navbar-item" href="https://huggingface.co/spaces/LinkSoul/Chinese-LLaVA" target="_blank">
            Chinese-LLaVA
          </a>
          <a class="navbar-item" href="https://huggingface.co/LinkSoul/Chinese-Llama-2-7b" target="_blank">
            Chinese-Llama-2-7B
          </a>
        </div>
      </div>
    </div>
  </div>
</nav>
<!-- Hero: title, author list (color-coded by affiliation), and resource
     buttons (arXiv / model / code / dataset). -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">LLaSM: Large Language and Speech Model</h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block" style="color:#008AD7;font-weight:normal;">
              Yu Shu<sup>2</sup>,</span>
            <span class="author-block" style="color:#008AD7;font-weight:normal;">
              Siwei Dong<sup>2</sup>,</span>
            <span class="author-block" style="color:#ed2f09;font-weight:normal;">
              Guangyao Chen<sup>1,3</sup>,
            </span>
            <span class="author-block" style="color:#cc00d7;font-weight:normal;">
              Wenhao Huang<sup>4</sup>,
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Rita Zhang,
              <!-- Rita Zhang<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Daochen Shi,
              <!-- Daochen Shi<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Qiqi Xiang,
              <!-- Qiqi Xiang<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#f68946;font-weight:normal;">
              Yemin Shi<sup>1*</sup>
            </span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block" style="color:#f68946;font-weight:normal;"><sup>1</sup>LinkSoul.AI,</span>
            <span class="author-block" style="color:#008AD7;font-weight:normal;"><sup>2</sup>Beijing Academy of Artificial Intelligence, China,</span>
            <span class="author-block" style="color:#ed2f09;font-weight:normal;"><sup>3</sup>Peking University, China</span>
            <span class="author-block" style="color:#cc00d7;font-weight:normal;"><sup>4</sup>01.ai</span>
          </div>
          <div>
            <span class="author-block"><sup>*</sup>Corresponding author: [email protected]</span>
          </div>
          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <!-- <span class="link-block">
                <a href="" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span> -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2308.15930" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- Model Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/LinkSoul/LLaSM-Cllama2" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-atom"></i>
                  </span>
                  <span>Model</span>
                </a>
              </span>
              <!-- Code Link. -->
              <span class="link-block">
                <a href="https://github.com/LinkSoul-AI/LLaSM" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Dataset Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="far fa-images"></i>
                  </span>
                  <span>Data</span>
                </a>
              </span><!-- fix: this closing </span> was missing, leaving the
                          Data link-block unclosed inside publication-links -->
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Abstract and contribution list. -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Multi-modal large language models have garnered significant interest recently. Though,
            most of the works are focusing on vision-language multi-modal models providing
            strong capabilities in following vision-and-language instructions. However, we claim
            that speech is also an important modality through which human interact with the
            world. Hence, it is crucial for a general-purpose assistant to be able to follow multi-modal
            speech-and-language instructions. In this work, we propose <b>L</b>arge <b>L</b>anguage
            <b>a</b>nd <b>S</b>peech <b>M</b>odel (<b>LLaSM</b>). <b>LLaSM</b> is an end-to-end trained large multi-modal
            speech-language model with cross-modal conversational abilities, capable of following
            speech-and-language instructions. Our early experiments show that <b>LLaSM</b> demonstrates
            a more convenient and natural way for human to interact with artificial intelligence.
            Specifically, we also release a large Speech Instruction Following data set <b>LLaSM-Audio-Instruction</b>.
          </p>
          <p>
            Our paper makes the following contributions:
          </p>
          <!-- fix: was <ui>…</ui>, which is not an HTML element; <ul> restores
               proper list semantics and bullet rendering -->
          <ul>
            <li>
              We build a speech-language multi-modal assistant that can understand and follow the speech-language instructions, which provides a more convenient and natural way for humans to interact with artificial intelligence.
            </li>
            <li>
              We construct and release <a href="https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions" target="_blank">LLaSM-Audio-Instructions</a>, a large scale Chinese and English speech-text cross-modal instruction following dataset.
            </li>
            <li>
              We release the code in <a href="https://github.com/LinkSoul-AI/LLaSM" target="_blank">https://github.com/LinkSoul-AI/LLaSM</a>.
            </li>
            <li>
              We release the models in <a href="https://huggingface.co/LinkSoul/LLaSM-Cllama2" target="_blank">LLaSM-Chinese-Llama-2-7B</a> and <a href="https://huggingface.co/LinkSoul/LLaSM-Baichuan" target="_blank">LLaSM-Baichuan-7B</a>.
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
  </div>
</section>
<!-- Interactive demo: chat window with text input plus record/delete/send
     buttons; #llasaLoading overlay is shown until index_demo.js reveals
     #llasa (initially opacity: 0). -->
<section class="hero is-light is-small">
  <div class="hero-body">
    <h2 class="title is-3" style="text-align: center;">Demo</h2>
    <!-- LLaSM Demo -->
    <div id="llasaLoading" style="position: absolute; width: 100%; z-index: 1; display: flex; justify-content: center; align-items: center;">
      <div style="text-align: center;">
        <img src="./images/duck.gif" alt="loading" />
        <h3>Loading...</h3>
      </div>
    </div>
    <div class="container" id="llasa" style="opacity: 0;">
      <div class="row mt-5 justify-content-center">
        <div class="col-md-12 mt-3">
          <div id="chat-window" class="card p-2">
            <div class="container my-3">
              <!-- <div id="info"></div> -->
              <div id="results" class="results">
              </div>
              <!-- Preview of a recorded clip before it is sent; the waveform
                   div and the (hidden) audio element are driven by the demo JS. -->
              <fieldset id="temp_audio" style="text-align: center; height: 100px; border: 1.4px solid #ddd;">
                <legend style="float: initial;
                               text-align: initial;
                               width: initial;
                               margin-left: 10px;
                               font-size: initial;">Audio preview</legend>
                <div id="waveform" style="text-align: center; height: 50px; width: 100%;"></div>
                <audio id="audioPlayer" style="height: 50px; width: 100%; display: none; padding: 0 20px 0 20px;" controls src=""></audio>
              </fieldset>
            </div>
          </div>
          <div id="user-input" class="mt-2">
            <div class="input-group">
              <!-- fix: dropped invalid type="text" — <textarea> has no type attribute -->
              <textarea id="user-text" style="height: 60px; padding: 10px 150px 5px 10px;" placeholder="Type in your message or press record button to speak..."></textarea>
              <div id="input-audio" class="input-group-append p-2">
                <button id="delete_button" class="mb-2 p-2">
                  <img id="delete_img" class="mb-2" src="images/error.png" alt="Del">
                </button>
                <button id="start_button" class="mb-2 p-2">
                  <img id="start_img" class="mb-2" src="images/microphone.png" alt="Record">
                </button>
                <button id="send_button" class="mb-2 p-2">
                  <img id="send_text_img" class="mb-2" src="images/paper-plane.png" alt="Start">
                </button>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- / LLaSM Demo -->
  </div>
</section>
<!-- Usage tips for the demo (in Chinese, matching the demo's audience). -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Demo Tips. -->
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Tips</h2>
        <div class="content has-text-justified">
          <h4>
            Demo 试用教程
          </h4>
          <ul>
            <li>
              文本框输入文字,点击最右侧发送按钮即可发送消息,开始聊天。
            </li>
            <li>
              点击语音按钮,开始录音,再次点击,结束录音。点击发送按钮,即可发送语音消息。
            </li>
            <li>
              语音未发送之前可在音频预览区检查,聊天框中的历史语音消息同样支持回放。
            </li>
            <li>
              点击重置按钮可清空历史对话信息。
            </li>
            <li>
              注:本 demo 仅作为 LLaSM 的模型能力展示,对多轮对话中话题切换支持不足。切换聊天话题时,建议清空历史以获得更好的体验。
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!--/ Demo Tips. -->
  </div>
</section>
<!-- Citation block; <pre><code> preserves the BibTeX layout verbatim. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title">BibTeX</h2>
        <pre>
<code>
@misc{shu2023llasm,
    title={LLaSM: Large Language and Speech Model},
    author={Yu Shu and Siwei Dong and Guangyao Chen and Wenhao Huang and Ruihua Zhang and Daochen Shi and Qiqi Xiang and Yemin Shi},
    year={2023},
    eprint={2308.15930},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
</code>
</pre>
      </div>
    </div>
  </div>
</section>
<!-- Template attribution (required by the Nerfies CC BY-SA license) and
     upstream model credits. -->
<section class="section" id="Acknowledgement">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title">Acknowledgement</h2>
        <p>
          This website is adapted from <a href="https://github.com/nerfies/nerfies.github.io" target="_blank">Nerfies</a>, licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
          Commons Attribution-ShareAlike 4.0 International License</a>. We thank the open-source projects for giving us access to their models, including <a href="https://huggingface.co/LinkSoul/Chinese-Llama-2-7b" target="_blank">Chinese-Llama-2-7B</a> and <a href="https://huggingface.co/openai/whisper-large-v2" target="_blank">Whisper</a> and <a href="https://huggingface.co/baichuan-inc/Baichuan-7B" target="_blank">Baichuan-7B</a>.
        </p>
      </div>
    </div>
  </div>
</section>
<!-- for LLaSM demo -->
<script src="./static/js/index_demo.js"></script>
<!-- / for LLaSM demo -->

</body>
</html>