jan-hq committed on
Commit
8472057
·
verified ·
1 Parent(s): 657d868

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +28 -21
README.md CHANGED
@@ -32,38 +32,53 @@ We continue to expand [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-lla
32
 
33
  ## How to Get Started with the Model
34
 
 
 
35
  ```python
36
  import torch
37
  import torchaudio
38
  from encodec import EncodecModel
39
  from encodec.utils import convert_audio
40
- from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
41
 
42
- # Audio to Sound Tokens
43
  def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"):
 
44
  model = EncodecModel.encodec_model_24khz()
45
  model.set_target_bandwidth(target_bandwidth)
46
  model.to(device)
47
 
 
48
  wav, sr = torchaudio.load(audio_path)
49
  wav = convert_audio(wav, sr, model.sample_rate, model.channels)
50
  wav = wav.unsqueeze(0).to(device)
51
 
 
52
  with torch.no_grad():
53
  encoded_frames = model.encode(wav)
54
  codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
55
 
 
56
  audio_code1, audio_code2 = codes[0][0], codes[0][1]
57
  flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist()
58
 
 
59
  result = ''.join(f'<|sound_{num}|>' for num in flatten_tokens)
60
  return f'<|sound_start|>{result}<|sound_end|>'
61
 
62
- # LLM Pipeline Setup
 
 
 
 
 
 
 
 
 
63
  def setup_pipeline(model_path, use_4bit=True):
64
  tokenizer = AutoTokenizer.from_pretrained(model_path)
65
 
66
  model_kwargs = {"device_map": "auto"}
 
67
  if use_4bit:
68
  model_kwargs["quantization_config"] = BitsAndBytesConfig(
69
  load_in_4bit=True,
@@ -73,9 +88,9 @@ def setup_pipeline(model_path, use_4bit=True):
73
  )
74
 
75
  model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
 
76
  return pipeline("text-generation", model=model, tokenizer=tokenizer)
77
 
78
- # Text Generation
79
  def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False):
80
  generation_args = {
81
  "max_new_tokens": max_new_tokens,
@@ -83,26 +98,18 @@ def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=
83
  "temperature": temperature,
84
  "do_sample": do_sample,
85
  }
 
86
  output = pipe(messages, **generation_args)
87
  return output[0]['generated_text']
88
 
89
- # Main process
90
- def audio_to_text(audio_path, model_path, use_4bit=True):
91
- # Convert audio to sound tokens
92
- sound_tokens = audio_to_sound_tokens(audio_path)
93
-
94
- # Setup LLM pipeline
95
- pipe = setup_pipeline(model_path, use_4bit)
96
-
97
- # Generate text
98
- messages = [{"role": "user", "content": sound_tokens}]
99
- return generate_text(pipe, messages)
100
-
101
- # Usage example
102
- audio_path = "/path/to/your/audio/file"
103
- model_path = "jan-hq/Jan-Llama3-0708"
104
-
105
- generated_text = audio_to_text(audio_path, model_path)
106
  ```
107
 
108
  ## Training process
 
32
 
33
  ## How to Get Started with the Model
34
 
35
+ First, we need to convert the audio file to sound tokens:
36
+
37
  ```python
38
  import torch
39
  import torchaudio
40
  from encodec import EncodecModel
41
  from encodec.utils import convert_audio
 
42
 
 
43
  def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"):
44
+ # Initialize Encodec
45
  model = EncodecModel.encodec_model_24khz()
46
  model.set_target_bandwidth(target_bandwidth)
47
  model.to(device)
48
 
49
+ # Load and preprocess audio
50
  wav, sr = torchaudio.load(audio_path)
51
  wav = convert_audio(wav, sr, model.sample_rate, model.channels)
52
  wav = wav.unsqueeze(0).to(device)
53
 
54
+ # Encode audio
55
  with torch.no_grad():
56
  encoded_frames = model.encode(wav)
57
  codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1)
58
 
59
+ # Flatten codes
60
  audio_code1, audio_code2 = codes[0][0], codes[0][1]
61
  flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist()
62
 
63
+ # Convert to sound tokens
64
  result = ''.join(f'<|sound_{num}|>' for num in flatten_tokens)
65
  return f'<|sound_start|>{result}<|sound_end|>'
66
 
67
+ # Usage
68
+ sound_tokens = audio_to_sound_tokens("/path/to/your/audio/file")
69
+ ```
70
+
71
+ Then, we can run inference with the model the same way as with any other LLM.
72
+
73
+ ```python
74
+ import torch
75
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
76
+
77
  def setup_pipeline(model_path, use_4bit=True):
78
  tokenizer = AutoTokenizer.from_pretrained(model_path)
79
 
80
  model_kwargs = {"device_map": "auto"}
81
+
82
  if use_4bit:
83
  model_kwargs["quantization_config"] = BitsAndBytesConfig(
84
  load_in_4bit=True,
 
88
  )
89
 
90
  model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
91
+
92
  return pipeline("text-generation", model=model, tokenizer=tokenizer)
93
 
 
94
  def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False):
95
  generation_args = {
96
  "max_new_tokens": max_new_tokens,
 
98
  "temperature": temperature,
99
  "do_sample": do_sample,
100
  }
101
+
102
  output = pipe(messages, **generation_args)
103
  return output[0]['generated_text']
104
 
105
+ # Usage
106
+ llm_path = "jan-hq/Jan-Llama3-0708"
107
+ pipe = setup_pipeline(llm_path, use_4bit=True)
108
+ messages = [
109
+ {"role": "user", "content": sound_tokens},
110
+ ]
111
+ generated_text = generate_text(pipe, messages)
112
+ print(generated_text)
 
 
 
 
 
 
 
 
 
113
  ```
114
 
115
  ## Training process