<!-- Doctype is required to keep browsers out of quirks mode; lang enables
     correct hyphenation, voice selection, and translation tooling. -->
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="description" content="LLaSM: Large Language and Speech Model">
  <meta name="keywords" content="speech-language, multi-modal, LLM, LLaSM">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>LLaSM: Large Language and Speech Model</title>

  <!-- Fonts and page styles (Bulma + Nerfies template assets). -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">
  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/favicon.svg">

  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>

  <!-- for LLaSM demo -->
  <!-- NOTE(review): "bootsrap.min.css" looks like a typo for "bootstrap.min.css".
       Left as-is because the file on disk may carry the same misspelled name —
       confirm and rename both sides together. -->
  <link rel="stylesheet" href="./static/css/bootsrap.min.css">
  <link rel="stylesheet" href="./static/css/styles.css">
  <script src="./static/js/recorder.mp3.min.js"></script>
  <script src="./static/js/waveview.js"></script>
  <!-- / for LLaSM demo -->
</head>
<body>

<!-- Top navigation: burger toggle (wired up by the template JS) plus a
     "More Research" dropdown linking to related LinkSoul projects. -->
<nav class="navbar" role="navigation" aria-label="main navigation">
  <div class="navbar-brand">
    <a role="button" class="navbar-burger" aria-label="menu" aria-expanded="false">
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
      <span aria-hidden="true"></span>
    </a>
  </div>
  <div class="navbar-menu">
    <div class="navbar-start" style="flex-grow: 1; justify-content: center;">
      <a class="navbar-item" href="https://keunhong.com">
        <span class="icon">
          <i class="fas fa-home"></i>
        </span>
      </a>
      <div class="navbar-item has-dropdown is-hoverable">
        <a class="navbar-link">
          More Research
        </a>
        <div class="navbar-dropdown">
          <a class="navbar-item" href="https://huggingface.co/spaces/LinkSoul/Chinese-LLaVA" target="_blank">
            Chinese-LLaVA
          </a>
          <a class="navbar-item" href="https://huggingface.co/LinkSoul/Chinese-Llama-2-7b" target="_blank">
            Chinese-Llama-2-7B
          </a>
        </div>
      </div>
    </div>
  </div>
</nav>
<!-- Hero: title, author list (color-coded by affiliation), and resource
     buttons (arXiv / model / code / dataset). -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">LLaSM: Large Language and Speech Model</h1>
          <div class="is-size-5 publication-authors">
            <span class="author-block" style="color:#008AD7;font-weight:normal;">
              Yu Shu<sup>2</sup>,</span>
            <span class="author-block" style="color:#008AD7;font-weight:normal;">
              Siwei Dong<sup>2</sup>,</span>
            <span class="author-block" style="color:#ed2f09;font-weight:normal;">
              Guangyao Chen<sup>1,3</sup>,
            </span>
            <span class="author-block" style="color:#cc00d7;font-weight:normal;">
              Wenhao Huang<sup>4</sup>,
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Rita Zhang,
              <!-- Rita Zhang<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Daochen Shi,
              <!-- Daochen Shi<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#19e706;font-weight:normal;">
              Qiqi Xiang,
              <!-- Qiqi Xiang<sup>5</sup>, -->
            </span>
            <span class="author-block" style="color:#f68946;font-weight:normal;">
              Yemin Shi<sup>1*</sup>
            </span>
          </div>
          <div class="is-size-5 publication-authors">
            <span class="author-block" style="color:#f68946;font-weight:normal;"><sup>1</sup>LinkSoul.AI,</span>
            <span class="author-block" style="color:#008AD7;font-weight:normal;"><sup>2</sup>Beijing Academy of Artificial Intelligence, China,</span>
            <span class="author-block" style="color:#ed2f09;font-weight:normal;"><sup>3</sup>Peking University, China</span>
            <span class="author-block" style="color:#cc00d7;font-weight:normal;"><sup>4</sup>01.ai</span>
          </div>
          <div>
            <span class="author-block"><sup>*</sup>Corresponding author: [email protected]</span>
          </div>
          <div class="column has-text-centered">
            <div class="publication-links">
              <!-- PDF Link. -->
              <!-- <span class="link-block">
                <a href="" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-file-pdf"></i>
                  </span>
                  <span>Paper</span>
                </a>
              </span> -->
              <span class="link-block">
                <a href="https://arxiv.org/abs/2308.15930" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="ai ai-arxiv"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <!-- Model Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/LinkSoul/LLaSM-Cllama2" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fas fa-atom"></i>
                  </span>
                  <span>Model</span>
                </a>
              </span>
              <!-- Code Link. -->
              <span class="link-block">
                <a href="https://github.com/LinkSoul-AI/LLaSM" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="fab fa-github"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
              <!-- Dataset Link. -->
              <span class="link-block">
                <a href="https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions" target="_blank"
                   class="external-link button is-normal is-rounded is-dark">
                  <span class="icon">
                    <i class="far fa-images"></i>
                  </span>
                  <span>Data</span>
                </a>
              </span><!-- fix: this closing </span> was missing, leaving the
                          Data link-block unclosed inside publication-links -->
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>
<!-- Abstract and contribution list. -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Multi-modal large language models have garnered significant interest recently. Though,
            most of the works are focusing on vision-language multi-modal models providing
            strong capabilities in following vision-and-language instructions. However, we claim
            that speech is also an important modality through which human interact with the
            world. Hence, it is crucial for a general-purpose assistant to be able to follow multi-modal
            speech-and-language instructions. In this work, we propose <b>L</b>arge <b>L</b>anguage
            <b>a</b>nd <b>S</b>peech <b>M</b>odel (<b>LLaSM</b>). <b>LLaSM</b> is an end-to-end trained large multi-modal
            speech-language model with cross-modal conversational abilities, capable of following
            speech-and-language instructions. Our early experiments show that <b>LLaSM</b> demonstrates
            a more convenient and natural way for human to interact with artificial intelligence.
            Specifically, we also release a large Speech Instruction Following data set <b>LLaSM-Audio-Instruction</b>.
          </p>
          <p>
            Our paper makes the following contributions:
          </p>
          <!-- fix: was <ui>…</ui>, which is not an HTML element; <ul> restores
               proper list semantics and bullet rendering -->
          <ul>
            <li>
              We build a speech-language multi-modal assistant that can understand and follow the speech-language instructions, which provides a more convenient and natural way for humans to interact with artificial intelligence.
            </li>
            <li>
              We construct and release <a href="https://huggingface.co/datasets/LinkSoul/LLaSM-Audio-Instructions" target="_blank">LLaSM-Audio-Instructions</a>, a large scale Chinese and English speech-text cross-modal instruction following dataset.
            </li>
            <li>
              We release the code in <a href="https://github.com/LinkSoul-AI/LLaSM" target="_blank">https://github.com/LinkSoul-AI/LLaSM</a>.
            </li>
            <li>
              We release the models in <a href="https://huggingface.co/LinkSoul/LLaSM-Cllama2" target="_blank">LLaSM-Chinese-Llama-2-7B</a> and <a href="https://huggingface.co/LinkSoul/LLaSM-Baichuan" target="_blank">LLaSM-Baichuan-7B</a>.
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!--/ Abstract. -->
  </div>
</section>
<!-- Interactive demo: chat window with text input plus record/delete/send
     buttons; #llasaLoading overlay is shown until index_demo.js reveals
     #llasa (initially opacity: 0). -->
<section class="hero is-light is-small">
  <div class="hero-body">
    <h2 class="title is-3" style="text-align: center;">Demo</h2>
    <!-- LLaSM Demo -->
    <div id="llasaLoading" style="position: absolute; width: 100%; z-index: 1; display: flex; justify-content: center; align-items: center;">
      <div style="text-align: center;">
        <img src="./images/duck.gif" alt="loading" />
        <h3>Loading...</h3>
      </div>
    </div>
    <div class="container" id="llasa" style="opacity: 0;">
      <div class="row mt-5 justify-content-center">
        <div class="col-md-12 mt-3">
          <div id="chat-window" class="card p-2">
            <div class="container my-3">
              <!-- <div id="info"></div> -->
              <div id="results" class="results">
              </div>
              <!-- Preview of a recorded clip before it is sent; the waveform
                   div and the (hidden) audio element are driven by the demo JS. -->
              <fieldset id="temp_audio" style="text-align: center; height: 100px; border: 1.4px solid #ddd;">
                <legend style="float: initial;
                               text-align: initial;
                               width: initial;
                               margin-left: 10px;
                               font-size: initial;">Audio preview</legend>
                <div id="waveform" style="text-align: center; height: 50px; width: 100%;"></div>
                <audio id="audioPlayer" style="height: 50px; width: 100%; display: none; padding: 0 20px 0 20px;" controls src=""></audio>
              </fieldset>
            </div>
          </div>
          <div id="user-input" class="mt-2">
            <div class="input-group">
              <!-- fix: dropped invalid type="text" — <textarea> has no type attribute -->
              <textarea id="user-text" style="height: 60px; padding: 10px 150px 5px 10px;" placeholder="Type in your message or press record button to speak..."></textarea>
              <div id="input-audio" class="input-group-append p-2">
                <button id="delete_button" class="mb-2 p-2">
                  <img id="delete_img" class="mb-2" src="images/error.png" alt="Del">
                </button>
                <button id="start_button" class="mb-2 p-2">
                  <img id="start_img" class="mb-2" src="images/microphone.png" alt="Record">
                </button>
                <button id="send_button" class="mb-2 p-2">
                  <img id="send_text_img" class="mb-2" src="images/paper-plane.png" alt="Start">
                </button>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
    <!-- / LLaSM Demo -->
  </div>
</section>
<!-- Usage tips for the demo (in Chinese, matching the demo's audience). -->
<section class="section">
  <div class="container is-max-desktop">
    <!-- Demo Tips. -->
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Tips</h2>
        <div class="content has-text-justified">
          <h4>
            Demo 试用教程
          </h4>
          <ul>
            <li>
              文本框输入文字,点击最右侧发送按钮即可发送消息,开始聊天。
            </li>
            <li>
              点击语音按钮,开始录音,再次点击,结束录音。点击发送按钮,即可发送语音消息。
            </li>
            <li>
              语音未发送之前可在音频预览区检查,聊天框中的历史语音消息同样支持回放。
            </li>
            <li>
              点击重置按钮可清空历史对话信息。
            </li>
            <li>
              注:本 demo 仅作为 LLaSM 的模型能力展示,对多轮对话中话题切换支持不足。切换聊天话题时,建议清空历史以获得更好的体验。
            </li>
          </ul>
        </div>
      </div>
    </div>
    <!--/ Demo Tips. -->
  </div>
</section>
<!-- Citation block; <pre><code> preserves the BibTeX layout verbatim. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title">BibTeX</h2>
        <pre>
<code>
@misc{shu2023llasm,
    title={LLaSM: Large Language and Speech Model},
    author={Yu Shu and Siwei Dong and Guangyao Chen and Wenhao Huang and Ruihua Zhang and Daochen Shi and Qiqi Xiang and Yemin Shi},
    year={2023},
    eprint={2308.15930},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
</code>
</pre>
      </div>
    </div>
  </div>
</section>
<!-- Template attribution (required by the Nerfies CC BY-SA license) and
     upstream model credits. -->
<section class="section" id="Acknowledgement">
  <div class="container is-max-desktop">
    <div class="columns is-centered">
      <div class="column is-four-fifths">
        <h2 class="title">Acknowledgement</h2>
        <p>
          This website is adapted from <a href="https://github.com/nerfies/nerfies.github.io" target="_blank">Nerfies</a>, licensed under a <a rel="license" href="http://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
          Commons Attribution-ShareAlike 4.0 International License</a>. We thank the open-source projects for giving us access to their models, including <a href="https://huggingface.co/LinkSoul/Chinese-Llama-2-7b" target="_blank">Chinese-Llama-2-7B</a> and <a href="https://huggingface.co/openai/whisper-large-v2" target="_blank">Whisper</a> and <a href="https://huggingface.co/baichuan-inc/Baichuan-7B" target="_blank">Baichuan-7B</a>.
        </p>
      </div>
    </div>
  </div>
</section>
<!-- for LLaSM demo -->
<script src="./static/js/index_demo.js"></script>
<!-- / for LLaSM demo -->

</body>
</html>