added technical report
Browse files- .gitattributes +1 -0
- README.md +31 -42
- phi_4_mm.tech_report.02252025.pdf +3 -0
.gitattributes
CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
@@ -42,13 +42,13 @@ Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian
|
|
42 |
- Vision: English
|
43 |
- Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese
|
44 |
|
45 |
-
🏡 [Phi-4-multimodal Portal]() <br>
|
46 |
-
📰 [Phi-4-multimodal Microsoft Blog]() <br>
|
47 |
-
📖 [Phi-4-multimodal Technical Report]() <br>
|
48 |
-
👩‍🍳 [Phi-4-multimodal Cookbook]() <br>
|
49 |
🖥️ [Try It](https://aka.ms/try-phi4mm) <br>
|
50 |
|
51 |
-
**Phi-4**: [[multimodal-instruct](https://huggingface.co/microsoft/Phi-
|
52 |
|
53 |
## Intended Uses
|
54 |
|
@@ -218,10 +218,14 @@ torch==2.6.0
|
|
218 |
transformers==4.48.2
|
219 |
accelerate==1.3.0
|
220 |
soundfile==0.13.1
|
221 |
-
pillow==
|
|
|
|
|
|
|
|
|
222 |
```
|
223 |
|
224 |
-
Phi-4-multimodal-instruct is also available in [Azure AI Studio]()
|
225 |
|
226 |
### Tokenizer
|
227 |
|
@@ -324,7 +328,7 @@ If it is a square image, the resolution would be around (8*448 by 8*448). For mu
|
|
324 |
|
325 |
### Loading the model locally
|
326 |
|
327 |
-
After obtaining the Phi-4-
|
328 |
|
329 |
```python
|
330 |
import requests
|
@@ -334,6 +338,8 @@ import io
|
|
334 |
from PIL import Image
|
335 |
import soundfile as sf
|
336 |
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
|
|
|
|
|
337 |
|
338 |
# Define model path
|
339 |
model_path = "microsoft/Phi-4-multimodal-instruct"
|
@@ -380,44 +386,27 @@ print(f'>>> Response\n{response}')
|
|
380 |
|
381 |
# Part 2: Audio Processing
|
382 |
print("\n--- AUDIO PROCESSING ---")
|
383 |
-
audio_url = "https://
|
384 |
speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
|
385 |
prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
|
386 |
print(f'>>> Prompt\n{prompt}')
|
387 |
|
388 |
-
#
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
max_new_tokens=1000,
|
405 |
-
generation_config=generation_config,
|
406 |
-
)
|
407 |
-
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
|
408 |
-
response = processor.batch_decode(
|
409 |
-
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
410 |
-
)[0]
|
411 |
-
print(f'>>> Response\n{response}')
|
412 |
-
|
413 |
-
# Clean up
|
414 |
-
try:
|
415 |
-
os.remove(temp_audio_path)
|
416 |
-
print(f"Temporary file {temp_audio_path} removed successfully")
|
417 |
-
except Exception as e:
|
418 |
-
print(f"Error removing temporary file: {e}")
|
419 |
-
else:
|
420 |
-
print(f"Failed to download audio file: {audio_response.status_code}")
|
421 |
```
|
422 |
|
423 |
## Responsible AI Considerations
|
|
|
42 |
- Vision: English
|
43 |
- Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese
|
44 |
|
45 |
+
🏡 [Phi-4-multimodal Portal](https://aka.ms/phi-4-multimodal/azure) <br>
|
46 |
+
📰 [Phi-4-multimodal Microsoft Blog](https://aka.ms/phi4techblog-feb2025) <br>
|
47 |
+
📖 [Phi-4-multimodal Technical Report](https://aka.ms/phi-4-multimodal/techreport) <br>
|
48 |
+
👩‍🍳 [Phi-4-multimodal Cookbook](https://github.com/microsoft/PhiCookBook) <br>
|
49 |
🖥️ [Try It](https://aka.ms/try-phi4mm) <br>
|
50 |
|
51 |
+
**Phi-4**: [[multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | [onnx](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)]; [[mini-instruct]](https://huggingface.co/microsoft/Phi-4-mini-instruct);
|
52 |
|
53 |
## Intended Uses
|
54 |
|
|
|
218 |
transformers==4.48.2
|
219 |
accelerate==1.3.0
|
220 |
soundfile==0.13.1
|
221 |
+
pillow==11.1.0
|
222 |
+
scipy==1.15.2
|
223 |
+
torchvision==0.21.0
|
224 |
+
backoff==2.2.1
|
225 |
+
peft==0.13.2
|
226 |
```
|
227 |
|
228 |
+
Phi-4-multimodal-instruct is also available in [Azure AI Studio](https://aka.ms/phi-4-multimodal/azure)
|
229 |
|
230 |
### Tokenizer
|
231 |
|
|
|
328 |
|
329 |
### Loading the model locally
|
330 |
|
331 |
+
After obtaining the Phi-4-multimodal-instruct model checkpoints, users can use this sample code for inference.
|
332 |
|
333 |
```python
|
334 |
import requests
|
|
|
338 |
from PIL import Image
|
339 |
import soundfile as sf
|
340 |
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
|
341 |
+
from urllib.request import urlopen
|
342 |
+
|
343 |
|
344 |
# Define model path
|
345 |
model_path = "microsoft/Phi-4-multimodal-instruct"
|
|
|
386 |
|
387 |
# Part 2: Audio Processing
|
388 |
print("\n--- AUDIO PROCESSING ---")
|
389 |
+
audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
|
390 |
speech_prompt = "Transcribe the audio to text, and then translate the audio to French. Use <sep> as a separator between the original transcript and the translation."
|
391 |
prompt = f'{user_prompt}<|audio_1|>{speech_prompt}{prompt_suffix}{assistant_prompt}'
|
392 |
print(f'>>> Prompt\n{prompt}')
|
393 |
|
394 |
+
# Download and open audio file
|
395 |
+
audio, samplerate = sf.read(io.BytesIO(urlopen(audio_url).read()))
|
396 |
+
|
397 |
+
# Process with the model
|
398 |
+
inputs = processor(text=prompt, audios=[(audio, samplerate)], return_tensors='pt').to('cuda:0')
|
399 |
+
|
400 |
+
generate_ids = model.generate(
|
401 |
+
**inputs,
|
402 |
+
max_new_tokens=1000,
|
403 |
+
generation_config=generation_config,
|
404 |
+
)
|
405 |
+
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
|
406 |
+
response = processor.batch_decode(
|
407 |
+
generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
408 |
+
)[0]
|
409 |
+
print(f'>>> Response\n{response}')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
410 |
```
|
411 |
|
412 |
## Responsible AI Considerations
|
phi_4_mm.tech_report.02252025.pdf
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a5469d9123cbee2b41729db3217cacfeaa96eaf543868caa2eeec7cf2d24547d
|
3 |
+
size 5295165
|