jan-hq committed on
Commit
3398a60
·
verified ·
1 Parent(s): c8bbc93

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +72 -1
README.md CHANGED
@@ -32,7 +32,78 @@ We continue to expand [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-lla
32
 
33
  ## How to Get Started with the Model
34
 
35
- > TODO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  ## Training process
38
  **Training Metrics Image**: Below is a snapshot of the training loss curve visualized.
 
32
 
33
  ## How to Get Started with the Model
34
 
35
+ ```
36
+ import torch
37
+ import torchaudio
38
+ from encodec import EncodecModel
39
+ from encodec.utils import convert_audio
40
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
41
+
42
+ # Audio to Sound Tokens
43
def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"):
    """Encode an audio file into a sound-token prompt string.

    The audio is loaded with ``torchaudio``, converted to the sample rate and
    channel count of the 24 kHz EnCodec model, and encoded. The first two
    codebook rows of the result are interleaved element by element and
    rendered as ``<|sound_N|>`` tokens.

    Args:
        audio_path: Path handed to ``torchaudio.load``.
        target_bandwidth: Value passed to ``set_target_bandwidth`` on the codec.
        device: Torch device the codec and the waveform are moved to.

    Returns:
        The concatenated ``<|sound_N|>`` tokens wrapped in
        ``<|sound_start|>`` and ``<|sound_end|>``.
    """
    codec = EncodecModel.encodec_model_24khz()
    codec.set_target_bandwidth(target_bandwidth)
    codec.to(device)

    waveform, sample_rate = torchaudio.load(audio_path)
    waveform = convert_audio(waveform, sample_rate, codec.sample_rate, codec.channels)
    # Add a leading batch dimension before moving the audio to the device.
    batch = waveform.unsqueeze(0).to(device)

    with torch.no_grad():
        frames = codec.encode(batch)
        frame_codes = [frame[0] for frame in frames]
        codes = torch.cat(frame_codes, dim=-1)

    # Only the first two rows of the first batch item are used
    # (presumably the two codebooks produced at the default bandwidth -- confirm).
    first_book = codes[0][0]
    second_book = codes[0][1]
    # Stacking on dim=1 and flattening yields first[0], second[0], first[1], ...
    interleaved = torch.stack((first_book, second_book), dim=1).flatten().tolist()

    pieces = [f'<|sound_{num}|>' for num in interleaved]
    return '<|sound_start|>' + ''.join(pieces) + '<|sound_end|>'
61
+
62
+ # LLM Pipeline Setup
63
def setup_pipeline(model_path, use_4bit=True):
    """Build a ``text-generation`` pipeline for the model at ``model_path``.

    Args:
        model_path: Identifier or path passed to ``from_pretrained`` for both
            the tokenizer and the causal language model.
        use_4bit: When true, the model is loaded with a 4-bit NF4
            ``BitsAndBytesConfig`` (double quantization, bfloat16 compute).

    Returns:
        The object returned by ``pipeline("text-generation", ...)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    if use_4bit:
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        llm = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            quantization_config=quantization,
        )
    else:
        llm = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

    return pipeline("text-generation", model=llm, tokenizer=tokenizer)
77
+
78
+ # Text Generation
79
def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False):
    """Run ``pipe`` on ``messages`` and return the first generated text.

    Args:
        pipe: Callable invoked as ``pipe(messages, **generation_args)``; it
            must return a sequence whose first item has a ``'generated_text'`` key.
        messages: Prompt payload forwarded to ``pipe`` unchanged.
        max_new_tokens: Forwarded to ``pipe`` as ``max_new_tokens``.
        temperature: Sampling temperature; forwarded only when ``do_sample``
            is true.
        do_sample: Forwarded to ``pipe`` as ``do_sample``.

    Returns:
        ``output[0]['generated_text']`` from the pipeline result.
    """
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "return_full_text": False,
        "do_sample": do_sample,
    }
    # Temperature only affects sampling. Sending it (especially the default
    # 0.0) with do_sample=False makes transformers warn about an unused
    # sampling flag, so it is omitted for greedy decoding.
    if do_sample:
        generation_args["temperature"] = temperature

    output = pipe(messages, **generation_args)
    return output[0]['generated_text']
88
+
89
+ # Main process
90
def audio_to_text(audio_path, model_path, use_4bit=True):
    """Turn an audio file into generated text.

    Converts the audio to a sound-token string, builds the LLM pipeline, and
    sends the tokens as a single user message.

    Args:
        audio_path: Path of the audio file to encode.
        model_path: Model identifier or path used to build the pipeline.
        use_4bit: Passed to ``setup_pipeline`` to toggle 4-bit loading.

    Returns:
        The text returned by ``generate_text`` with its default settings.
    """
    prompt = audio_to_sound_tokens(audio_path)
    llm_pipe = setup_pipeline(model_path, use_4bit)
    conversation = [{"role": "user", "content": prompt}]
    return generate_text(llm_pipe, conversation)
100
+
101
+ # Usage example
102
# Placeholder: replace with the path of the audio file to process.
audio_path = "/path/to/your/audio/file"
# Model identifier handed to `from_pretrained` for the tokenizer and the model.
model_path = "jan-hq/Jan-Llama3-0708"

# Runs the full audio -> sound tokens -> LLM flow; 4-bit loading is on by default.
generated_text = audio_to_text(audio_path, model_path)
106
+ ```
107
 
108
  ## Training process
109
  **Training Metrics Image**: Below is a snapshot of the training loss curve visualized.