nguyenbh committed on
Commit 17df1f5 · 1 Parent(s): bd4b39b

Update examples

Files changed (1)
  1. README.md +58 -23
README.md CHANGED
@@ -330,59 +330,94 @@ After obtaining the Phi-4-Mini-MM-Instruct model checkpoints, users can use this
 import requests
 import torch
 import os
+import io
 from PIL import Image
-import soundfile
-from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig,pipeline,AutoTokenizer
+import soundfile as sf
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
 
-processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+# Define model path
+model_path = "microsoft/Phi-4-multimodal-instruct"
 
+# Load model and processor
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModelForCausalLM.from_pretrained(
-    "microsoft/Phi-4-multimodal-instruct",
+    model_path,
     device_map="cuda",
     torch_dtype="auto",
     trust_remote_code=True,
-    _attn_implementation='flash_attention_2',
+    attn_implementation='flash_attention_2',
 ).cuda()
 
-generation_config = GenerationConfig.from_pretrained(model_path, 'generation_config.json')
+# Load generation config
+generation_config = GenerationConfig.from_pretrained(model_path)
 
+# Define prompt structure
 user_prompt = '<|user|>'
 assistant_prompt = '<|assistant|>'
 prompt_suffix = '<|end|>'
 
+# Part 1: Image Processing
+print("\n--- IMAGE PROCESSING ---")
+image_url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
 prompt = f'{user_prompt}<|image_1|>What is shown in this image?{prompt_suffix}{assistant_prompt}'
-url = 'https://www.ilankelman.org/stopsigns/australia.jpg'
 print(f'>>> Prompt\n{prompt}')
-image = Image.open(requests.get(url, stream=True).raw)
+
+# Download and open image
+image = Image.open(requests.get(image_url, stream=True).raw)
 inputs = processor(text=prompt, images=image, return_tensors='pt').to('cuda:0')
+
+# Generate response
 generate_ids = model.generate(
     **inputs,
     max_new_tokens=1000,
     generation_config=generation_config,
 )
-generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
 response = processor.batch_decode(
     generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
 
-
+# Part 2: Audio Processing
+print("\n--- AUDIO PROCESSING ---")
+audio_url = "https://voiceage.com/wbsamples/in_mono/Trailer.wav"
 speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
 prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
-
 print(f'>>> Prompt\n{prompt}')
-audio = soundfile.read('https://voiceage.com/wbsamples/in_mono/Trailer.wav')
-inputs = processor(text=prompt, audios=[audio], return_tensors='pt').to('cuda:0')
-generate_ids = model.generate(
-    **inputs,
-    max_new_tokens=1000,
-    generation_config=generation_config,
-)
-generate_ids = generate_ids[:, inputs['input_ids'].shape[1] :]
-response = processor.batch_decode(
-    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)[0]
-print(f'>>> Response\n{response}')
+
+# Download audio file
+audio_response = requests.get(audio_url)
+if audio_response.status_code == 200:
+    # First save audio to a temporary file
+    temp_audio_path = "temp_audio.wav"
+    with open(temp_audio_path, "wb") as f:
+        f.write(audio_response.content)
+
+    # Read using soundfile
+    audio, samplerate = sf.read(temp_audio_path)
+
+    # Process with the model
+    inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
+
+    generate_ids = model.generate(
+        **inputs,
+        max_new_tokens=1000,
+        generation_config=generation_config,
+    )
+    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+    response = processor.batch_decode(
+        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )[0]
+    print(f'>>> Response\n{response}')
+
+    # Clean up
+    try:
+        os.remove(temp_audio_path)
+        print(f"Temporary file {temp_audio_path} removed successfully")
+    except Exception as e:
+        print(f"Error removing temporary file: {e}")
+else:
+    print(f"Failed to download audio file: {audio_response.status_code}")
 ```
 
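Two notes on the updated example. First, the diff replaces the private `_attn_implementation` kwarg with the public `attn_implementation` argument. Since `flash_attention_2` requires the optional `flash-attn` package and a supported GPU, a minimal sketch of a guarded fallback may be useful; the `sdpa` fallback here is our assumption, not part of this commit:

```python
import importlib.util

from transformers import AutoModelForCausalLM

# Use FlashAttention-2 when the flash-attn package is importable,
# otherwise fall back to PyTorch's built-in SDPA kernels.
attn_impl = "flash_attention_2" if importlib.util.find_spec("flash_attn") else "sdpa"

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    attn_implementation=attn_impl,
).cuda()
```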
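Second, the new version imports `io` but still round-trips the audio clip through a temporary file. Because `soundfile` accepts file-like objects, the download can be decoded entirely in memory; a sketch assuming the same `audio_url`, `prompt`, and `processor` as in the example above:

```python
import io

import requests
import soundfile as sf

# Download the clip and decode it straight from memory; no temp file or cleanup step.
audio_response = requests.get(audio_url)
audio_response.raise_for_status()  # raise instead of checking status_code by hand
audio, samplerate = sf.read(io.BytesIO(audio_response.content))

# Same processor call as in the committed example.
inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
```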
  ## Responsible AI Considerations