Commit f65fe2e (verified) · Stardust-minus committed · Parent(s): 492fb71

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -41,3 +41,6 @@ examples/English.wav filter=lfs diff=lfs merge=lfs -text
  examples/French.wav filter=lfs diff=lfs merge=lfs -text
  examples/German.wav filter=lfs diff=lfs merge=lfs -text
  examples/Spanish.wav filter=lfs diff=lfs merge=lfs -text
+ 022b2161-8f56-4432-a9ae-b4bd514e4821.mp3 filter=lfs diff=lfs merge=lfs -text
+ output.wav filter=lfs diff=lfs merge=lfs -text
+ ref.wav filter=lfs diff=lfs merge=lfs -text
022b2161-8f56-4432-a9ae-b4bd514e4821.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb37bcf53feb185993aeb8a7f9b96f055b60ed6d0d96fe5a6833db1c0efba0f0
+ size 647000
examples/Arabic.wav CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4a3c902c13fcf408c95353d91ab65f839d27584d8929c7345317956d1e9ea5bd
- size 131
+ oid sha256:79baad393ddae4d975e0a1e04065fe18d655104b6dd3db1e035e28f391c4d78f
+ size 128
examples/English.wav CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ed744820849c8f16e03cb68e45b7d7d4b8697476a162d50ffe2cd6612a621aa6
- size 131
+ oid sha256:295ab67b022169527d1b3d564df6163900e8d45e39e069890f8b7b912f0bda5d
+ size 128
examples/French.wav CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dee830ddff631df6e0db0911a20099ddf6438a80d1da597536470ba36e2d645c
- size 131
+ oid sha256:d1db0708d546351aa9e757adad0f97f8376962f8fcbfd28dd7d574ec6929f3bb
+ size 128
examples/German.wav CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:cc076529638f0a4bb8d19b509b7781372c26abadcc74a7dcbc5b72b6b1e680fd
- size 131
+ oid sha256:c23b9798f9e0eb659d0d1ae7d98a74dbc728cd06899e5e84e0fcc519c4613e70
+ size 128
examples/Japanese.wav CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ba2a2c07770cb6ab36a5aa6ee953c9914773368e223359e4710897d425a25402
+ oid sha256:277101318b7c174690280daea1402701e4e176abce625853c585a256b776d685
  size 128
examples/Korean.wav CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:09c122b25a3ad99247179be77deeaa6ead7d93b40092347801948fea34797e48
+ oid sha256:a91634e1c44008d2d9a01ff9d63f50551080c8102d6902c99e6ff00077e8d715
  size 128
examples/Nice English Ref.wav CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b895ec0d49173630cf9253c70579888cde65129fbaeda167e3b4f91593715eca
+ oid sha256:7846d8d9cf1f149a4f9fb454040561c44401f2a9878c565ab4781d346e8a9436
  size 128
examples/Spanish.wav CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c22d63058f58f46c6a65b6ced8faa969f403b065e822a274342b520e8e20b65f
- size 131
+ oid sha256:d434eb34ed3d103579f1e93a0cf87d84a9e0f70b19d08fdb5b32a4f6e40cc3e1
+ size 128
fish_speech/models/text2semantic/inference.py CHANGED
@@ -339,7 +339,7 @@ def generate_long(
      temperature: float = 0.8,
      compile: bool = False,
      iterative_prompt: bool = True,
-     chunk_length: int = 150,
+     chunk_length: int = 512,
      prompt_text: Optional[str | list[str]] = None,
      prompt_tokens: Optional[torch.Tensor | list[torch.Tensor]] = None,
  ):
@@ -365,6 +365,24 @@ def generate_long(
      texts = split_text(text, chunk_length) if iterative_prompt else [text]
      max_length = model.config.max_seq_len
  
+     # if use_prompt:
+     #     base_content_sequence.append(
+     #         [
+     #             TextPart(text=prompt_text[0]),
+     #             VQPart(codes=prompt_tokens[0]),
+     #         ],
+     #         add_end=True,
+     #     )
+ 
+     # for text in texts:
+     #     content_sequence = ContentSequence(modality=None)
+     #     base_content_sequence.append(
+     #         [
+     #             TextPart(text=text),
+     #         ],
+     #         add_end=True,
+     #     )
+ 
      if use_prompt:
          for t, c in zip(prompt_text, prompt_tokens):
              base_content_sequence.append(
@@ -385,7 +403,7 @@
  
      encoded = []
      for text in texts:
-         content_sequence = ContentSequence(modality=None)
+         content_sequence = ContentSequence(modality="text")
          content_sequence.append(TextPart(text=text))
          encoded.append(
              content_sequence.encode_for_inference(
fish_speech/models/text2semantic/llama.py CHANGED
@@ -48,7 +48,7 @@ class BaseModelArgs:
  
      # Codebook configs
      codebook_size: int = 160
-     num_codebooks: int = 4
+     num_codebooks: int = 9
  
      # Gradient checkpointing
      use_gradient_checkpointing: bool = True
generate_cli.py ADDED
@@ -0,0 +1,227 @@
+ import os
+ import json
+ import queue
+ from pathlib import Path
+ from typing import Optional
+ 
+ import click
+ import torch
+ import soundfile as sf
+ from loguru import logger
+ 
+ from fish_speech.models.text2semantic.inference import (
+     CodebookSamplingParams,
+     SamplingParams,
+     generate_long,
+     launch_thread_safe_queue,
+     GenerateRequest,
+     WrappedGenerateResponse,
+ )
+ from fish_speech.models.text2semantic.llama import BaseTransformer
+ from fish_speech.models.dac.inference import load_model as load_decoder_model
+ from fish_speech.text import clean_text
+ from fish_speech.inference_engine.vq_manager import VQManager
+ from tools.api import load_audio
+ 
+ 
+ def load_llm_model(model_path: str, device: str, compile: bool = False):
+     """Load the LLM model."""
+     logger.info(f"Loading LLM model from {model_path}")
+     model = BaseTransformer.from_pretrained(
+         path=model_path,
+         load_weights=True,
+     )
+     model = model.to(device=device, dtype=torch.bfloat16)
+ 
+     if isinstance(model, model.__class__.__bases__[0].__subclasses__()[1]):  # DualARTransformer
+         from fish_speech.models.text2semantic.inference import decode_one_token_ar as decode_one_token
+         logger.info("Using DualARTransformer")
+     else:
+         from fish_speech.models.text2semantic.inference import decode_one_token_naive as decode_one_token
+         logger.info("Using NaiveTransformer")
+ 
+     if compile:
+         logger.info("Compiling decode function...")
+         decode_one_token = torch.compile(
+             decode_one_token,
+             fullgraph=True,
+             backend="inductor" if torch.cuda.is_available() else "aot_eager",
+             mode="reduce-overhead" if torch.cuda.is_available() else None,
+         )
+ 
+     return model.eval(), decode_one_token
+ 
+ 
+ def load_dac_model(config_name: str, checkpoint_path: str, device: str):
+     """Load the DAC model."""
+     logger.info(f"Loading DAC model from {checkpoint_path}")
+     model = load_decoder_model(
+         config_name=config_name,
+         checkpoint_path=checkpoint_path,
+         device=device,
+     )
+     return model
+ 
+ 
+ @click.command()
+ # @click.argument("text", type=str)
+ @click.option("--llm-model-path", type=str, required=True, help="Path to the LLM model")
+ @click.option("--dac-model-path", type=str, required=True, help="Path to the DAC model")
+ @click.option("--dac-config-name", type=str, default="modded_dac_vq", help="DAC model config name")
+ @click.option("--output-path", type=str, required=True, help="Path to save the output audio")
+ @click.option("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device to use")
+ @click.option("--max-new-tokens", type=int, default=4096, help="Maximum new tokens to generate")
+ @click.option("--chunk-length", type=int, default=1000, help="Chunk length for synthesis")
+ @click.option("--compile", is_flag=True, help="Whether to compile the model")
+ @click.option("--iterative-prompt", is_flag=True, help="Whether to use iterative prompt")
+ @click.option("--params-file", type=str, default="sampling_params_example.json", help="Path to JSON file containing sampling parameters")
+ @click.option(
+     "--ref-audio",
+     type=click.Path(path_type=Path, exists=True),
+     default="ref.wav",
+     help="Path to the reference audio file (defaults to ref.wav)",
+ )
+ def main(
+     # text: str,
+     llm_model_path: str,
+     dac_model_path: str,
+     dac_config_name: str,
+     output_path: str,
+     device: str,
+     max_new_tokens: int,
+     chunk_length: int,
+     compile: bool,
+     iterative_prompt: bool,
+     params_file: Optional[str],
+     ref_audio: Path,
+ ):
+     """Generate speech in two steps: LLM token generation, then DAC audio decoding."""
+ 
+     # Set precision
+     precision = torch.half if torch.cuda.is_available() else torch.bfloat16
+ 
+     # Load the LLM model (behind a thread-safe request queue)
+     logger.info("Loading LLM model...")
+     llama_queue = launch_thread_safe_queue(
+         checkpoint_path=llm_model_path,
+         device="cuda:0",
+         precision=precision,
+         compile=compile,
+     )
+     logger.info("LLM model loaded")
+ 
+     # Load the DAC model
+     logger.info("Loading DAC model...")
+     dac_model = load_decoder_model(
+         config_name=dac_config_name,
+         checkpoint_path=dac_model_path,
+         device="cuda:1",
+     )
+     logger.info("DAC model loaded")
+ 
+     # Load sampling parameters
+     if params_file:
+         with open(params_file, "r", encoding="utf-8") as f:
+             params_data = json.load(f)
+         text = params_data.get("text", "")
+ 
+         semantic_params = CodebookSamplingParams(**params_data.get("semantic", {}))
+         codebook_params = [
+             CodebookSamplingParams(**params) for params in params_data.get("codebooks", [])
+         ]
+         sampling_params = SamplingParams(
+             semantic=semantic_params,
+             codebooks=codebook_params,
+         )
+     else:
+         sampling_params = SamplingParams()
+ 
+     # Clean the text
+     text = clean_text(text)
+ 
+     # Load the reference audio
+     if ref_audio.exists():
+         ref_audio_data, ref_sr = sf.read(ref_audio)
+         logger.info(f"Loaded reference audio: {ref_audio}, shape={ref_audio_data.shape}, sr={ref_sr}")
+         # Encode the reference audio into prompt_tokens
+         vq_manager = VQManager()
+         vq_manager.decoder_model = dac_model
+         vq_manager.load_audio = load_audio
+         prompt_tokens = vq_manager.encode_reference(ref_audio, enable_reference_audio=True)
+         logger.info(f"Encoded reference audio to prompt_tokens, shape={prompt_tokens.shape if prompt_tokens is not None else None}")
+     else:
+         prompt_tokens = []
+         logger.warning(f"Reference audio {ref_audio} not found.")
+ 
+     # Generate speech
+     logger.info(f"Generating speech for text: {text}")
+     logger.info(f"Using sampling parameters: {sampling_params}")
+ 
+     output_path = Path(output_path)
+     if not output_path.suffix:
+         output_path = output_path.with_suffix(".wav")
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+ 
+     # Create the response queue
+     response_queue = queue.Queue()
+ 
+     # Prepare the request
+     request = dict(
+         device=device,
+         max_new_tokens=max_new_tokens,
+         text=text,
+         sampling_params=sampling_params,
+         compile=compile,
+         iterative_prompt=iterative_prompt,
+         chunk_length=chunk_length,
+         prompt_text=[],
+         prompt_tokens=[prompt_tokens] if prompt_tokens is not None and len(prompt_tokens) else [],
+         # prompt_text=["Through the dense morning fog that rolled across the peaceful valley, the distant church bells chimed their melodic song, echoing off ancient stone walls and mingling with the gentle rustling of maple leaves in the cool breeze. Inside the cozy lakeside cottage, fresh bread baked in the old clay oven filled every corner with its rich, comforting aroma, while steam rose lazily from ceramic mugs of fresh-brewed coffee on the handcrafted pine table. The persistent rain finally gave way to brilliant sunshine, transforming ordinary dewdrops into countless sparkling diamonds scattered across the vibrant garden flowers."],
+     )
+ 
+     # Send the request to the LLM model
+     llama_queue.put(GenerateRequest(request=request, response_queue=response_queue))
+ 
+     # Collect the generated tokens
+     all_tokens = []
+     while True:
+         wrapped_result: WrappedGenerateResponse = response_queue.get()
+ 
+         if wrapped_result.status == "error":
+             error = wrapped_result.response if isinstance(wrapped_result.response, Exception) else Exception("Unknown error")
+             logger.error(f"Error during generation: {error}")
+             break
+ 
+         result = wrapped_result.response
+         if result.action == "next":
+             break
+ 
+         all_tokens.append(result.codes)
+         logger.info(f"Generated chunk {len(all_tokens)}")
+ 
+     if not all_tokens:
+         logger.error("No tokens generated")
+         return
+ 
+     # Concatenate all generated tokens
+     if len(all_tokens) > 1:
+         tokens = torch.cat(all_tokens, dim=1)
+     else:
+         tokens = all_tokens[0]
+ 
+     # Decode the tokens to audio with the DAC model
+     logger.info("Converting tokens to audio...")
+     feature_lengths = torch.tensor([tokens.shape[1]], device=device)
+     audio, _ = dac_model.decode(
+         indices=tokens[None].to("cuda:1"),
+         feature_lengths=feature_lengths.to("cuda:1"),
+     )
+ 
+     # Save the audio
+     audio = audio[0, 0].detach().float().cpu().numpy()
+     sf.write(output_path, audio, dac_model.sample_rate)
+     logger.info(f"Saved audio to {output_path}")
+ 
+ 
+ if __name__ == "__main__":
+     main()
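
A quick way to exercise the new script end to end is click's test runner rather than a shell. The sketch below is illustrative only: the two checkpoint paths are placeholders, not files shipped with this commit, while the params file and reference audio defaults do come from this commit.

# Smoke-test sketch for generate_cli.py (checkpoint paths are placeholders).
from click.testing import CliRunner

from generate_cli import main

runner = CliRunner()
result = runner.invoke(
    main,
    [
        "--llm-model-path", "checkpoints/llm",            # placeholder path
        "--dac-model-path", "checkpoints/dac/codec.pth",  # placeholder path
        "--output-path", "output.wav",
        "--params-file", "sampling_params_example.json",
        "--ref-audio", "ref.wav",
    ],
)
print(result.exit_code, result.output)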
output.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:043464cdadefbc6144a48155c38f69016d44a1d3eaab261a634719eb5d9162ee
+ size 888876
ref.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1acb52e60f3b9eaa66bd289a37a3da7c7b5c64511f42cd0bc8245b57122f354
+ size 3566670
sampling_params_example.json ADDED
@@ -0,0 +1,55 @@
+ {
+     "text": "(excited, joyful tone) We're going to DISNEY WORLD! (squeal of delight) I've been saving for (emphasis) three years (breathless) and finally, FINALLY we can go! The look on your face right now is worth every extra shift I worked! (angry) After everything we've been through (break) I can't believe you would (emphasize) betray me like this. I gave you EVERYTHING! And now I'm left with nothing but memories and broken promises!",
+     "semantic": {
+         "temperature": 0.9,
+         "top_p": 0.9,
+         "repetition_penalty": 1.05
+     },
+     "codebooks": [
+         {
+             "temperature": 0.9,
+             "top_p": 0.9,
+             "repetition_penalty": 1.05
+         },
+         {
+             "temperature": 0.8,
+             "top_p": 0.8,
+             "repetition_penalty": 1.1
+         },
+         {
+             "temperature": 0.8,
+             "top_p": 0.8,
+             "repetition_penalty": 1.1
+         },
+         {
+             "temperature": 0.7,
+             "top_p": 0.7,
+             "repetition_penalty": 1.1
+         },
+         {
+             "temperature": 0.7,
+             "top_p": 0.7,
+             "repetition_penalty": 1.1
+         },
+         {
+             "temperature": 0.65,
+             "top_p": 0.65,
+             "repetition_penalty": 1.1
+         },
+         {
+             "temperature": 0.65,
+             "top_p": 0.65,
+             "repetition_penalty": 1.1
+         },
+         {
+             "temperature": 0.6,
+             "top_p": 0.6,
+             "repetition_penalty": 1.1
+         },
+         {
+             "temperature": 0.6,
+             "top_p": 0.4,
+             "repetition_penalty": 1.5
+         }
+     ]
+ }
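
The nine entries under "codebooks" appear intended to pair one sampling configuration with each of the nine codebooks configured in llama.py above; that pairing is an assumption about intent, not something the commit enforces. A throwaway consistency check under that assumption:

# Sanity-check sketch: one sampling entry per codebook (assumed convention).
import json

with open("sampling_params_example.json", "r", encoding="utf-8") as f:
    params = json.load(f)

assert len(params["codebooks"]) == 9, "expected one entry per codebook (num_codebooks = 9)"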
tools/api.py CHANGED
@@ -136,7 +136,7 @@ async def other_exception_handler(exc: "Exception"):
  
  
  def load_audio(reference_audio, sr):
-     if len(reference_audio) > 255 or not Path(reference_audio).exists():
+     if len(str(reference_audio)) > 255 or not Path(reference_audio).exists():
          audio_data = reference_audio
          reference_audio = io.BytesIO(audio_data)
  
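The str() wrapper matters because load_audio can receive either a filesystem path (including a pathlib.Path, as generate_cli.py now passes) or raw audio bytes, and len() is not defined for Path objects. A standalone illustration of the behaviour the fix relies on:

# Illustration only (not part of the commit): len() raises on pathlib.Path,
# so the length check has to stringify its argument first.
from pathlib import Path

ref = Path("ref.wav")
try:
    len(ref)  # TypeError: Path objects have no len()
except TypeError as err:
    print("len(Path) fails:", err)

print(len(str(ref)) > 255)  # False -> short string, treated as a file path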
 
tools/vqgan/inference.py CHANGED
@@ -14,7 +14,7 @@ from omegaconf import OmegaConf
  from tools.file import AUDIO_EXTENSIONS
  
  # register eval resolver
- OmegaConf.register_new_resolver("eval", eval)
+ #OmegaConf.register_new_resolver("eval", eval)
  
  
  def load_model(config_name, checkpoint_path, device="cuda"):
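
Commenting out the registration sidesteps the error OmegaConf raises when the "eval" resolver is registered a second time (for example, when another module has already registered it); that motivation is inferred, not stated in the commit. If the resolver is still wanted here, a guarded registration is one alternative, assuming OmegaConf >= 2.1 where register_new_resolver accepts replace:

# Alternative sketch (assumes OmegaConf >= 2.1): tolerate a prior registration
# instead of removing the resolver entirely.
from omegaconf import OmegaConf

OmegaConf.register_new_resolver("eval", eval, replace=True)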