Update README.md
README.md CHANGED
@@ -236,15 +236,13 @@ transcription.
 >>> processor = WhisperProcessor.from_pretrained("openai/whisper-large")
 >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
 
->>> decoder_input_ids = processor.tokenizer.encode("<|startoftranscript|><|fr|><|transcribe|><|notimestamps|>", return_tensors="pt")
-
 >>> # load dummy dataset and read soundfiles
 >>> ds = load_dataset("common_voice", "fr", split="test", streaming=True)
 >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
 >>> input_speech = next(iter(ds))["audio"]["array"]
 >>> # tokenize
 >>> input_features = processor(input_speech, return_tensors="pt").input_features
->>> predicted_ids = model.generate(input_features
+>>> predicted_ids = model.generate(input_features)
 >>> transcription = processor.batch_decode(predicted_ids)
 ['<|startoftranscript|><|fr|><|transcribe|><|notimestamps|> Un vrai travail intéressant va enfin être mené sur ce sujet.<|endoftext|>']
 
@@ -266,15 +264,15 @@ The "<|translate|>" is used as the first decoder input token to specify the tran
 >>> processor = WhisperProcessor.from_pretrained("openai/whisper-large")
 >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
 
->>> decoder_input_ids = processor.tokenizer.encode("<|startoftranscript|><|fr|><|translate|><|notimestamps|>", return_tensors="pt")
-
 >>> # load dummy dataset and read soundfiles
 >>> ds = load_dataset("common_voice", "fr", split="test", streaming=True)
 >>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
 >>> input_speech = next(iter(ds))["audio"]["array"]
 >>> # tokenize
 >>> input_features = processor(input_speech, return_tensors="pt").input_features
->>>
+>>> forced_decoder_ids = processor._get_decoder_prompt_ids(language = "fr", task = "translate")
+
+>>> predicted_ids = model.generate(input_features, forced_decoder_ids = forced_decoder_ids)
 >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens = True)
 [' A real interesting work will be done on this subject.']
 ```
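For readers applying the change by hand, here is a minimal, self-contained sketch of the translation example as the diff leaves it, assuming a transformers release with Whisper support and the datasets library. One caveat: the diff writes `processor._get_decoder_prompt_ids(...)`, while released versions of transformers expose the same helper publicly as `get_decoder_prompt_ids`; the public spelling is used below.

```python
# Sketch of the usage pattern this diff lands on (translation task).
# Checkpoint and dataset names are taken from the README hunks above.
import datasets
from datasets import load_dataset
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")

# Stream one French sample and resample it to the 16 kHz Whisper expects.
ds = load_dataset("common_voice", "fr", split="test", streaming=True)
ds = ds.cast_column("audio", datasets.Audio(sampling_rate=16_000))
input_speech = next(iter(ds))["audio"]["array"]
input_features = processor(input_speech, return_tensors="pt").input_features

# Instead of hand-encoding "<|startoftranscript|><|fr|><|translate|>..." as
# the removed lines did, ask the processor for the prompt ids and let
# generate() force them at the start of decoding.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="fr", task="translate")
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True))
```

The design point of the change: rather than passing manually encoded special-token strings as `decoder_input_ids`, the task and language tokens are forced during `generate` via `forced_decoder_ids`, which keeps the prompt consistent with the tokenizer's special-token map and removes the unbalanced, partially deleted lines the old README carried.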