diff --git a/README.md b/README.md
index 4b7d4962157d6f5fe517d2144ba8b60fb5eed4a1..b7aeea8324880edd07404c39f47ba9c3119b0635 100644
--- a/README.md
+++ b/README.md
@@ -9,4 +9,92 @@ app_file: app.py
 pinned: false
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
+# 🎼 SonicVerse
+
+An interactive demo for SonicVerse, a music captioning model. Upload audio of up to 10 seconds and generate a natural-language caption
+that includes a general description of the music as well as musical features such as key, instruments, genre, mood/theme, and vocal gender.
+
+---
+
+## 🚀 Demo
+
+Check out the live Space here:
+[![Hugging Face Space](https://img.shields.io/badge/HuggingFace-Space-blue?logo=huggingface)](https://huggingface.co/spaces/annabeth97c/SonicVerse)
+
+---
+
+## 🚀 Samples
+
+Short captions
+
+---
+
+## 📦 Features
+
+✅ Upload a 10-second music clip and get a caption
+
+✅ Upload a longer music clip (up to 1 minute for a reliable demo) and get a detailed caption for the whole clip
+
+---
+
+## 🛠️ How to Run Locally
+
+```bash
+# Clone the repo
+git clone https://github.com/AMAAI-Lab/SonicVerse
+cd SonicVerse
+
+# Install dependencies
+pip install -r requirements.txt
+
+# Alternatively, set up a conda environment
+conda env create -f environment.yml
+conda activate sonicverse
+
+# Run the app
+python app.py
+```
+
+---
+
+## 💡 Usage
+
+To use the app:
+1. Select an audio clip to input.
+2. Click the **Generate** button.
+3. See the model's output below.
+
+---
+
+## 🧹 Built With
+
+- [Hugging Face Spaces](https://huggingface.co/spaces)
+- [Gradio](https://gradio.app/)
+- [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1)
+- [MERT 95M](https://huggingface.co/m-a-p/MERT-v1-95M)
+
+---
diff --git a/app.py b/app.py
index 5441968ab798fa095433c41cbb75cd7d42436aeb..5119d4e7afacc368e6bb4c827e23545b910bd70b 100644
--- a/app.py
+++ b/app.py
@@ -18,11 +18,17 @@ import torch
 import transformers
 import torchaudio
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from openai import OpenAI
+client = OpenAI()
+MODEL = "gpt-4"
+SLEEP_BETWEEN_CALLS = 1.0
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
+
+CHUNK_LENGTH = 10
 
 @dataclass
 class ServeArguments(ModelArguments):
@@ -31,7 +37,6 @@ class ServeArguments(ModelArguments):
     temperature: float = field(default=0.01)
 
 
-# Load arguments and model
 logging.getLogger().setLevel(logging.INFO)
 
 parser = transformers.HfArgumentParser((ServeArguments,))
@@ -45,10 +50,82 @@ model, tokenizer = load_trained_lora_model(
     tasks_config=serve_args.tasks_config
 )
 
-@spaces.GPU(duration=60)
-def generate_caption(audio_file):
-    # waveform, sample_rate = torchaudio.load(audio_file)
+def caption_audio(audio_file):
+    chunk_audio_files = split_audio(audio_file, CHUNK_LENGTH)
+    chunk_captions = []
+    for audio_chunk in chunk_audio_files:
+        chunk_captions.append(generate_caption(audio_chunk))
+
+    if len(chunk_captions) > 1:
+        audio_name = os.path.splitext(os.path.basename(audio_file))[0]
+        long_caption = summarize_song(audio_name, chunk_captions)
+
+        delete_files(chunk_audio_files)
+
+        return long_caption
+    elif len(chunk_captions) == 1:
+        return chunk_captions[0]
+    else:
+        return ""
+
+def summarize_song(song_name, chunks):
+    prompt = f"""
+You are a music critic. Given the following chronological 10-second chunk descriptions of a single piece, write one flowing, detailed description of the entire song: its structure, instrumentation, and standout moments. Mention transition points with timestamps. If the description of a chunk does not fit with those of the chunks before and after it, treat it as a lower-accuracy description and do not incorporate its information. Retain concrete musical attributes such as key, chords, and tempo.
+
+Chunks for "{song_name}":
+"""
+    for i, c in enumerate(chunks, 1):
+        prompt += f"\n{(i - 1) * 10} to {i * 10} seconds. {c.strip()}"
+    prompt += "\n\nFull song description:"
+
+    resp = client.chat.completions.create(
+        model=MODEL,
+        messages=[
+            {"role": "system", "content": "You are an expert music writer."},
+            {"role": "user", "content": prompt},
+        ],
+        temperature=0.0,
+        max_tokens=1000,
+    )
+    return resp.choices[0].message.content.strip()
+
+def delete_files(file_paths):
+    for path in file_paths:
+        try:
+            if os.path.isfile(path):
+                os.remove(path)
+                print(f"Deleted: {path}")
+            else:
+                print(f"Skipped (not a file or doesn't exist): {path}")
+        except Exception as e:
+            print(f"Error deleting {path}: {e}")
+
+def split_audio(input_path, chunk_length_seconds):
+    waveform, sample_rate = torchaudio.load(input_path)
+    num_channels, total_samples = waveform.shape
+    chunk_samples = int(chunk_length_seconds * sample_rate)
+
+    num_chunks = (total_samples + chunk_samples - 1) // chunk_samples
+
+    base, ext = os.path.splitext(input_path)
+    output_paths = []
+
+    if num_chunks <= 1:
+        return [input_path]
+
+    for i in range(num_chunks):
+        start = i * chunk_samples
+        end = min((i + 1) * chunk_samples, total_samples)
+        chunk_waveform = waveform[:, start:end]
+
+        output_file = f"{base}_{i+1:03d}{ext}"
+        torchaudio.save(output_file, chunk_waveform, sample_rate)
+        output_paths.append(output_file)
+
+    return output_paths
+
+def generate_caption(audio_file):
     req_json = {
         "messages": [
             {"role": "user", "content": "Describe the music. "}
"} @@ -79,7 +156,7 @@ def generate_caption(audio_file): demo = gr.Interface( - fn=generate_caption, + fn=caption_audio, inputs=gr.Audio(type="filepath", label="Upload an audio file"), outputs=gr.Textbox(label="Generated Caption"), title="SonicVerse", diff --git a/requirements.txt b/requirements.txt index cb780ca3dfebd2f48d9d854304d91851247b2893..0fb40a87265c49a4597869c5e7d36f686ccf7dd0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -74,6 +74,7 @@ mdurl==0.1.2 mpmath==1.3.0 msgpack==1.0.8 multidict==6.0.5 +git+https://huggingface.co/spaces/annabeth97c/temp#egg=multi_token&subdirectory=src/sonicverse multiprocess==0.70.16 narwhals==1.40.0 networkx==3.2.1 @@ -93,6 +94,7 @@ nvidia-cusparse-cu12==12.1.0.106 nvidia-nccl-cu12==2.20.5 nvidia-nvjitlink-cu12==12.5.82 nvidia-nvtx-cu12==12.1.105 +openai==1.82.0 orjson==3.10.18 packaging==24.1 pandas==2.2.2 diff --git a/src/sonicverse/multi_token.egg-info/PKG-INFO b/src/sonicverse/multi_token.egg-info/PKG-INFO deleted file mode 100644 index 38ae45b8de0c8699e8a4b7b05956ca44dae62373..0000000000000000000000000000000000000000 --- a/src/sonicverse/multi_token.egg-info/PKG-INFO +++ /dev/null @@ -1,6 +0,0 @@ -Metadata-Version: 2.1 -Name: multi-token -Version: 0.0.4 -Home-page: https://github.com/sshh12/multi_token -Author: Shrivu Shankar -License: Apache License 2.0 diff --git a/src/sonicverse/multi_token.egg-info/SOURCES.txt b/src/sonicverse/multi_token.egg-info/SOURCES.txt deleted file mode 100644 index 2294aae83797dcafadbf5b6d906074799c192973..0000000000000000000000000000000000000000 --- a/src/sonicverse/multi_token.egg-info/SOURCES.txt +++ /dev/null @@ -1,6 +0,0 @@ -setup.py -multi_token.egg-info/PKG-INFO -multi_token.egg-info/SOURCES.txt -multi_token.egg-info/dependency_links.txt -multi_token.egg-info/requires.txt -multi_token.egg-info/top_level.txt \ No newline at end of file diff --git a/src/sonicverse/multi_token.egg-info/requires.txt b/src/sonicverse/multi_token.egg-info/requires.txt deleted file mode 100644 index f129b5fe1018aed6ca7016b870189e6323615197..0000000000000000000000000000000000000000 --- a/src/sonicverse/multi_token.egg-info/requires.txt +++ /dev/null @@ -1,8 +0,0 @@ -transformers>=4.34.0 -accelerate>=0.21.0 -scipy>=1.11.3 -bitsandbytes>=0.41.0 -datasets>=2.14.5 -sentencepiece>=0.1.99 -peft>=0.4.0 -deepspeed==0.9.5 diff --git a/src/sonicverse/requirements.txt b/src/sonicverse/requirements.txt index f129b5fe1018aed6ca7016b870189e6323615197..0fb40a87265c49a4597869c5e7d36f686ccf7dd0 100644 --- a/src/sonicverse/requirements.txt +++ b/src/sonicverse/requirements.txt @@ -1,8 +1,167 @@ -transformers>=4.34.0 -accelerate>=0.21.0 -scipy>=1.11.3 -bitsandbytes>=0.41.0 -datasets>=2.14.5 -sentencepiece>=0.1.99 -peft>=0.4.0 +absl-py==2.1.0 +accelerate==0.29.3 +aiofiles==23.2.1 +aiohttp==3.9.5 +aiosignal==1.3.1 +altair==5.5.0 +anyio==4.9.0 +argbind==0.3.9 +asttokens==2.4.1 +async-timeout==4.0.3 +attrs==23.2.0 +audioread==3.0.1 +bert-score==0.3.13 +bitsandbytes==0.43.1 +blinker==1.8.2 +certifi==2024.7.4 +cffi==1.16.0 +charset-normalizer==3.3.2 +click==8.1.7 +contourpy==1.2.1 +cycler==0.12.1 +datasets==2.19.0 +decorator==5.1.1 deepspeed==0.9.5 +descript-audio-codec==1.0.0 +descript-audiotools==0.7.2 +dill==0.3.8 +docstring_parser==0.16 +einops==0.8.0 +evaluate==0.4.3 +exceptiongroup==1.2.2 +executing==2.0.1 +fastapi==0.115.12 +ffmpy==0.3.2 +filelock==3.15.4 +fire==0.6.0 +Flask==3.0.3 +flatten-dict==0.4.2 +fonttools==4.53.1 +frozenlist==1.4.1 +fsspec==2024.3.1 +future==1.0.0 +gradio==3.50.2 +gradio_client==0.6.1 +graphviz==0.20.3 +grpcio==1.64.1 
+h11==0.16.0
+hjson==3.1.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.23.4
+idna==3.7
+importlib_metadata==8.0.0
+importlib_resources==6.4.0
+ipython==8.18.1
+itsdangerous==2.2.0
+jedi==0.19.1
+Jinja2==3.1.4
+joblib==1.4.2
+jsonschema==4.23.0
+jsonschema-specifications==2025.4.1
+julius==0.2.7
+kiwisolver==1.4.5
+lazy_loader==0.4
+librosa==0.10.2.post1
+llvmlite==0.43.0
+Markdown==3.6
+markdown-it-py==3.0.0
+markdown2==2.5.0
+MarkupSafe==2.1.5
+matplotlib==3.9.1
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mpmath==1.3.0
+msgpack==1.0.8
+multidict==6.0.5
+git+https://huggingface.co/spaces/annabeth97c/temp#egg=multi_token&subdirectory=src/sonicverse
+multiprocess==0.70.16
+narwhals==1.40.0
+networkx==3.2.1
+ninja==1.11.1.1
+nltk==3.8.1
+numba==0.60.0
+numpy==1.26.4
+nvidia-cublas-cu12==12.1.3.1
+nvidia-cuda-cupti-cu12==12.1.105
+nvidia-cuda-nvrtc-cu12==12.1.105
+nvidia-cuda-runtime-cu12==12.1.105
+nvidia-cudnn-cu12==8.9.2.26
+nvidia-cufft-cu12==11.0.2.54
+nvidia-curand-cu12==10.3.2.106
+nvidia-cusolver-cu12==11.4.5.107
+nvidia-cusparse-cu12==12.1.0.106
+nvidia-nccl-cu12==2.20.5
+nvidia-nvjitlink-cu12==12.5.82
+nvidia-nvtx-cu12==12.1.105
+openai==1.82.0
+orjson==3.10.18
+packaging==24.1
+pandas==2.2.2
+parso==0.8.4
+peft==0.10.0
+pexpect==4.9.0
+pillow==10.3.0
+platformdirs==4.2.2
+pooch==1.8.2
+prompt_toolkit==3.0.47
+protobuf==3.19.6
+psutil==6.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.2
+py-cpuinfo==9.0.0
+pyarrow==16.1.0
+pyarrow-hotfix==0.6
+pycparser==2.22
+pydantic==1.10.17
+pydub==0.25.1
+Pygments==2.18.0
+pyloudnorm==0.1.1
+pyparsing==3.1.2
+pystoi==0.4.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2024.1
+PyYAML==6.0.1
+randomname==0.2.1
+referencing==0.36.2
+regex==2024.5.15
+requests==2.32.3
+rich==13.7.1
+rouge_score==0.1.2
+rpds-py==0.25.0
+safetensors==0.4.3
+scikit-learn==1.5.1
+scipy==1.13.0
+semantic-version==2.10.0
+sentencepiece==0.2.0
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.3.7
+stack-data==0.6.3
+starlette==0.46.2
+sympy==1.13.0
+tensorboard==2.17.0
+tensorboard-data-server==0.7.2
+termcolor==2.4.0
+threadpoolctl==3.5.0
+tokenizers==0.19.1
+torch==2.3.1
+torch-stoi==0.2.1
+torchaudio==2.3.1
+torchviz==0.0.2
+tqdm==4.66.4
+traitlets==5.14.3
+transformers==4.40.1
+triton==2.3.1
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.2
+uvicorn==0.34.2
+wcwidth==0.2.13
+websockets==11.0.3
+Werkzeug==3.0.3
+xxhash==3.4.1
+yarl==1.9.4
+zipp==3.19.2
diff --git a/src/sonicverse/scripts/clap_gpt_build_finetune_dataset.py b/src/sonicverse/scripts/clap_gpt_build_finetune_dataset.py
index 05241954f868cd064080d8ea03f8895dace10e16..d695730487cbd6e965e0a248f7e0edc90e7e4c30 100644
--- a/src/sonicverse/scripts/clap_gpt_build_finetune_dataset.py
+++ b/src/sonicverse/scripts/clap_gpt_build_finetune_dataset.py
@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping train a sound assistant that can take audio inputs and output text.
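Editor's note on the `app.py` changes above: uploads are split into fixed 10-second windows before each window is captioned. A minimal standalone sketch of that chunking arithmetic, assuming a hypothetical local file `example.wav` (the file name and output naming are for illustration only, not part of this PR):

```python
# Standalone sketch of the split_audio() chunking in app.py above.
# "example.wav" is a hypothetical input used only for illustration.
import torchaudio

CHUNK_LENGTH = 10  # seconds, matching CHUNK_LENGTH in app.py

waveform, sample_rate = torchaudio.load("example.wav")
total_samples = waveform.shape[1]
chunk_samples = CHUNK_LENGTH * sample_rate

# Ceiling division: a 25 s clip yields chunks covering 0-10 s, 10-20 s, 20-25 s.
num_chunks = (total_samples + chunk_samples - 1) // chunk_samples

for i in range(num_chunks):
    # Slicing past the end is safe; the last chunk is simply shorter.
    chunk = waveform[:, i * chunk_samples : (i + 1) * chunk_samples]
    torchaudio.save(f"example_{i + 1:03d}.wav", chunk, sample_rate)
```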
diff --git a/src/sonicverse/scripts/clap_gpt_build_pretrain_dataset.py b/src/sonicverse/scripts/clap_gpt_build_pretrain_dataset.py
index ab750b5bee431c8db93de90cb7c20f52921aed8e..d1ca4d18caa23488a667f2183f9a74a723b04cdc 100644
--- a/src/sonicverse/scripts/clap_gpt_build_pretrain_dataset.py
+++ b/src/sonicverse/scripts/clap_gpt_build_pretrain_dataset.py
@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping write captions for audio clips.
diff --git a/src/sonicverse/scripts/document_build_finetune_dataset.py b/src/sonicverse/scripts/document_build_finetune_dataset.py
index 2669ca3dae9b369c1a57e74997bed6d7477cff92..32e38e077775b713b82a8873af95a3479239e1a9 100644
--- a/src/sonicverse/scripts/document_build_finetune_dataset.py
+++ b/src/sonicverse/scripts/document_build_finetune_dataset.py
@@ -7,8 +7,8 @@ import json
 
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
-from multi_token.modalities.document_gte import (
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.modalities.document_gte import (
     split_text_into_documents,
 )
diff --git a/src/sonicverse/scripts/document_build_pretrain_dataset.py b/src/sonicverse/scripts/document_build_pretrain_dataset.py
index 1c2b639182a4c96218f3c19a94d70a45064f97b1..39d6782567f1ab0d4f65ebc747d79b7865654e69 100644
--- a/src/sonicverse/scripts/document_build_pretrain_dataset.py
+++ b/src/sonicverse/scripts/document_build_pretrain_dataset.py
@@ -5,8 +5,8 @@ import argparse
 
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
-from multi_token.modalities.document_gte import (
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.modalities.document_gte import (
     split_text_into_documents,
 )
diff --git a/src/sonicverse/scripts/evaluate_model.py b/src/sonicverse/scripts/evaluate_model.py
index c7e33d05bf7cb608fdf4a12d31740090f5b16930..d38bde2d7d56042e404d450b932fdced8f71a5d6 100644
--- a/src/sonicverse/scripts/evaluate_model.py
+++ b/src/sonicverse/scripts/evaluate_model.py
@@ -7,12 +7,12 @@ import torch
 
 from datasets import load_from_disk
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import (
     ModelArguments,
 )
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 import evaluate
diff --git a/src/sonicverse/scripts/evaluate_model_latest.py b/src/sonicverse/scripts/evaluate_model_latest.py
index 1607b2305cd0404ddf49ad22a7693ee012d7ed0a..e4035d0b598a20be14a990ed2a0656e41fb9efba 100644
--- a/src/sonicverse/scripts/evaluate_model_latest.py
+++ b/src/sonicverse/scripts/evaluate_model_latest.py
@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
diff --git a/src/sonicverse/scripts/evaluate_model_mullama.py b/src/sonicverse/scripts/evaluate_model_mullama.py
index c0352b162bc665885e3f06cf746f56b856971710..97cbfa26e934e4462cf263b3c32ff35f7419a8ff 100644
--- a/src/sonicverse/scripts/evaluate_model_mullama.py
+++ b/src/sonicverse/scripts/evaluate_model_mullama.py
@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
diff --git a/src/sonicverse/scripts/evaluate_model_mullama_musiccaps.py b/src/sonicverse/scripts/evaluate_model_mullama_musiccaps.py
index 8a0aecc478949d1c856619edf2893c2d679f7b05..ced29b3cb347272feeea23420298716aef827b1d 100644
--- a/src/sonicverse/scripts/evaluate_model_mullama_musiccaps.py
+++ b/src/sonicverse/scripts/evaluate_model_mullama_musiccaps.py
@@ -5,10 +5,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
diff --git a/src/sonicverse/scripts/evaluate_model_mullama_musiccaps_fixed_prompt.py b/src/sonicverse/scripts/evaluate_model_mullama_musiccaps_fixed_prompt.py
index d57fc57288b92adee612ab49dae567b7fcfc6bc9..28717492904d2c162b14aa4e397cdbfba6d68e81 100644
--- a/src/sonicverse/scripts/evaluate_model_mullama_musiccaps_fixed_prompt.py
+++ b/src/sonicverse/scripts/evaluate_model_mullama_musiccaps_fixed_prompt.py
@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
diff --git a/src/sonicverse/scripts/evaluate_mullama.py b/src/sonicverse/scripts/evaluate_mullama.py
index 02dfeb91b7f9890949ab1eb3a61d806b891f78b9..2801458cf7062ef8bcc2465387d53f166c26ff0f 100644
--- a/src/sonicverse/scripts/evaluate_mullama.py
+++ b/src/sonicverse/scripts/evaluate_mullama.py
@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
diff --git a/src/sonicverse/scripts/evaluate_temp.py b/src/sonicverse/scripts/evaluate_temp.py
index 4f4ad924d29e8d28f0e587f8c47636393e8c5cca..333a2798a32d9ea561d71bab8475a045d254ce56 100644
--- a/src/sonicverse/scripts/evaluate_temp.py
+++ b/src/sonicverse/scripts/evaluate_temp.py
@@ -4,10 +4,10 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 from datasets import load_from_disk
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 import evaluate
 import random
 import bert_score
diff --git a/src/sonicverse/scripts/gym_lunar_lander_build_dataset.py b/src/sonicverse/scripts/gym_lunar_lander_build_dataset.py
index 432ad2452a9a9d46e7748a47aea99712e1e19768..f6d8ad33304c4b0d665350726a09f7648c6fb5ee 100644
--- a/src/sonicverse/scripts/gym_lunar_lander_build_dataset.py
+++ b/src/sonicverse/scripts/gym_lunar_lander_build_dataset.py
@@ -12,7 +12,7 @@ import torch.nn as nn
 import numpy as np
 import torch
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 LUNAR_LANDER_OPTIONS = (
     "[FIRE LEFT ENGINE], [FIRE RIGHT ENGINE], [FIRE MAIN ENGINE], [NOTHING]".split(", ")
diff --git a/src/sonicverse/scripts/gym_lunar_lander_client.py b/src/sonicverse/scripts/gym_lunar_lander_client.py
index e780efff68daee878daa8b8800cdd9bbd8fd055a..cabb061b3e42ea2691a07c09861783e9766497aa 100644
--- a/src/sonicverse/scripts/gym_lunar_lander_client.py
+++ b/src/sonicverse/scripts/gym_lunar_lander_client.py
@@ -6,7 +6,7 @@ import os
 from PIL import Image
 import gymnasium as gym
 
-from multi_token.constants import ROLE_USER
+from sonicverse.constants import ROLE_USER
 
 LUNAR_LANDER_OPTIONS = (
     "[FIRE LEFT ENGINE], [FIRE RIGHT ENGINE], [FIRE MAIN ENGINE], [NOTHING]".split(", ")
diff --git a/src/sonicverse/scripts/imagebind_build_llava_finetune_dataset.py b/src/sonicverse/scripts/imagebind_build_llava_finetune_dataset.py
index dc71f1004c4e52bc372c2b3ba8e961321af85997..708bb5de2444b34c7794d8f89723f41011605db8 100644
--- a/src/sonicverse/scripts/imagebind_build_llava_finetune_dataset.py
+++ b/src/sonicverse/scripts/imagebind_build_llava_finetune_dataset.py
@@ -6,7 +6,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 TYPES = ["audio", "image", "text"]
diff --git a/src/sonicverse/scripts/imagebind_build_llava_pretrain_dataset.py b/src/sonicverse/scripts/imagebind_build_llava_pretrain_dataset.py
index 71088a13222b78959a179c61bd61812db08f61d7..23754b12cd9f1e542da6002e0fefabfd6b33c746 100644
--- a/src/sonicverse/scripts/imagebind_build_llava_pretrain_dataset.py
+++ b/src/sonicverse/scripts/imagebind_build_llava_pretrain_dataset.py
@@ -6,7 +6,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 TYPES = ["audio", "image", "text"]
diff --git a/src/sonicverse/scripts/llava_build_finetune_dataset.py b/src/sonicverse/scripts/llava_build_finetune_dataset.py
index 9776fe613fa8e4b7f800c03ec61ea40dbdf7387c..c199004f3da11f210cd0a93affa815f141dc3e17 100644
--- a/src/sonicverse/scripts/llava_build_finetune_dataset.py
+++ b/src/sonicverse/scripts/llava_build_finetune_dataset.py
@@ -5,7 +5,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 def _convert_convo(convo) -> List:
diff --git a/src/sonicverse/scripts/llava_build_pretrain_dataset.py b/src/sonicverse/scripts/llava_build_pretrain_dataset.py
index 6e27ac0c92c8d93f36b774defdbf69732555ab96..0149c15036fcdb8a975ccc35f45beb40e986bf42 100644
--- a/src/sonicverse/scripts/llava_build_pretrain_dataset.py
+++ b/src/sonicverse/scripts/llava_build_pretrain_dataset.py
@@ -5,7 +5,7 @@ import os
 
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 def _convert_convo(convo) -> List:
diff --git a/src/sonicverse/scripts/llava_gpt_build_multi_image_finetune_dataset.py b/src/sonicverse/scripts/llava_gpt_build_multi_image_finetune_dataset.py
index 0bab1b3c264449ac3f697f1b03965e7f3923780c..0f644d91a7ba3485b068ee6f6a45a1332a4d4172 100644
--- a/src/sonicverse/scripts/llava_gpt_build_multi_image_finetune_dataset.py
+++ b/src/sonicverse/scripts/llava_gpt_build_multi_image_finetune_dataset.py
@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PROMPT = """
 You are helping train a chat vision assistant that can take several image inputs and output text.
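Editor's note on the GPT-4 summarization step added in `app.py` above: chunk captions are stitched into a single prompt with explicit timestamps, then sent through the Chat Completions API. A hedged sketch of that call pattern with made-up captions (assumes `OPENAI_API_KEY` is set in the environment; the captions and song name are hypothetical):

```python
# Sketch of the summarize_song() call pattern from app.py, using made-up
# chunk captions. Requires the OPENAI_API_KEY environment variable.
from openai import OpenAI

client = OpenAI()

chunks = [
    "Upbeat synth-pop in C major with a driving four-on-the-floor beat.",
    "A female vocal enters over layered pads; the harmony shifts to A minor.",
]

# Label each caption with its 10-second window, mirroring the prompt above.
prompt = 'Chunks for "demo_song":\n' + "\n".join(
    f"{i * 10} to {(i + 1) * 10} seconds. {c}" for i, c in enumerate(chunks)
) + "\n\nFull song description:"

resp = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {"role": "system", "content": "You are an expert music writer."},
        {"role": "user", "content": prompt},
    ],
    temperature=0.0,
    max_tokens=1000,
)
print(resp.choices[0].message.content.strip())
```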
diff --git a/src/sonicverse/scripts/serve_model.py b/src/sonicverse/scripts/serve_model.py
index 398678d443b0e3d665861c6a594e2a9a4d7d8145..d2a8710159131c26c7b64ca45779010446e7b312 100644
--- a/src/sonicverse/scripts/serve_model.py
+++ b/src/sonicverse/scripts/serve_model.py
@@ -5,12 +5,12 @@ from flask import Flask, request, jsonify
 import transformers
 import torch
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import (
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import (
     ModelArguments,
 )
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 
 @dataclass
diff --git a/src/sonicverse/scripts/serve_model_gradio.py b/src/sonicverse/scripts/serve_model_gradio.py
index db96cd771e00e0eb6a0c94ea1fc6678434ddeef3..f68a7f902a89a8dd2711ded0cc64c4b2d70aaae3 100644
--- a/src/sonicverse/scripts/serve_model_gradio.py
+++ b/src/sonicverse/scripts/serve_model_gradio.py
@@ -6,10 +6,10 @@ import torch
 import transformers
 import torchaudio
 
-from multi_token.model_utils import MultiTaskType
-from multi_token.training import ModelArguments
-from multi_token.inference import load_trained_lora_model
-from multi_token.data_tools import encode_chat
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.training import ModelArguments
+from sonicverse.inference import load_trained_lora_model
+from sonicverse.data_tools import encode_chat
 
 
 @dataclass
diff --git a/src/sonicverse/scripts/train_model.py b/src/sonicverse/scripts/train_model.py
index a2bfeb304dba9e763fb2996d73acfe2e1c4e3d04..5edc326d6093640e4e3132eb47e779ae8e8ea246 100644
--- a/src/sonicverse/scripts/train_model.py
+++ b/src/sonicverse/scripts/train_model.py
@@ -1,20 +1,20 @@
 import transformers
 import logging
 
-from multi_token.training import (
+from sonicverse.training import (
     TrainingArguments,
     ModelArguments,
     train_for_modalities,
 )
-from multi_token.training_data import (
+from sonicverse.training_data import (
     DataArguments,
     TrainDataArguments,
     EvaluationDataArguments,
 )
-from multi_token.model_utils import MultiTaskType
-from multi_token.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
-from multi_token.modalities import MODALITY_BUILDERS
+from sonicverse.model_utils import MultiTaskType
+from sonicverse.language_models import LANGUAGE_MODEL_NAME_TO_CLASS
+from sonicverse.modalities import MODALITY_BUILDERS
 
 if __name__ == "__main__":
     logging.getLogger().setLevel(logging.INFO)
diff --git a/src/sonicverse/scripts/whisper_build_pretrain_dataset.py b/src/sonicverse/scripts/whisper_build_pretrain_dataset.py
index 3c1c92e81ad8692a8c173709c544a27b1b71cd72..36ac5348b14433a52a1d64c5f4af6a0fb8795172 100644
--- a/src/sonicverse/scripts/whisper_build_pretrain_dataset.py
+++ b/src/sonicverse/scripts/whisper_build_pretrain_dataset.py
@@ -5,7 +5,7 @@ import argparse
 
 from datasets import load_dataset
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 DATASET_ARGS = dict(
     path="mozilla-foundation/common_voice_15_0", name="en", split="train"
diff --git a/src/sonicverse/scripts/whisper_gpt_build_finetune_dataset.py b/src/sonicverse/scripts/whisper_gpt_build_finetune_dataset.py
index f7bff923ac0fb77b5807506d995c3394d13a7618..ce3991b77b2ee72b8e4bdd8511a2d2c2e3ffbbcd 100644
--- a/src/sonicverse/scripts/whisper_gpt_build_finetune_dataset.py
+++ b/src/sonicverse/scripts/whisper_gpt_build_finetune_dataset.py
@@ -7,7 +7,7 @@ import openai
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 DATASET_ARGS = dict(
     path="mozilla-foundation/common_voice_15_0", name="en", split="train"
diff --git a/src/sonicverse/scripts/xclip_build_finetune_dataset.py b/src/sonicverse/scripts/xclip_build_finetune_dataset.py
index 2aaa4323fb34057b5cf0d559b484c8f6244ebd02..ca8fef9e91b3ea0684bfb018d27917783884b248 100644
--- a/src/sonicverse/scripts/xclip_build_finetune_dataset.py
+++ b/src/sonicverse/scripts/xclip_build_finetune_dataset.py
@@ -5,7 +5,7 @@ import json
 
 from datasets import Dataset, load_dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 def _write_convo(row) -> List:
diff --git a/src/sonicverse/scripts/xclip_build_pretrain_dataset.py b/src/sonicverse/scripts/xclip_build_pretrain_dataset.py
index e705dbcd1f5c0f06bd14303c3e20b73c7022ce37..d8c0f90220565b81fe87444bd0b2e287173bd890 100644
--- a/src/sonicverse/scripts/xclip_build_pretrain_dataset.py
+++ b/src/sonicverse/scripts/xclip_build_pretrain_dataset.py
@@ -6,7 +6,7 @@ import json
 
 from huggingface_hub import hf_hub_download
 from datasets import Dataset
 
-from multi_token.constants import ROLE_ASSISTANT, ROLE_USER
+from sonicverse.constants import ROLE_ASSISTANT, ROLE_USER
 
 PRETRAIN_PHRASES = [
     "Repeat the content of the video