import os
from transformers import AutoTokenizer
from llmcompressor.transformers import SparseAutoModelForCausalLM
from llmcompressor.transformers import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
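# Assumes an llm-compressor release that still exposes SparseAutoModelForCausalLM
# from llmcompressor.transformers (newer releases load models with the plain
# transformers AutoModelForCausalLM instead).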

def get_user_input():
    """Get model configuration from user input"""
    print("\n=== Model Quantization Configuration ===")
    
    while True:
        model_id = input("\nEnter the HuggingFace model ID (e.g., meta-llama/Llama-2-7b-chat-hf): ").strip()
        if model_id:
            break
        print("Model ID cannot be empty. Please try again.")
    
    return model_id

def quantize_model_fp8(model_id):
    """

    Quantize a model to FP8 Dynamic format using llm-compressor on CPU.

    

    Args:

        model_id (str): HuggingFace model ID

    """
    try:
        print(f"\nLoading model and tokenizer: {model_id}")
        model = SparseAutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="cpu",
            torch_dtype="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)

        print("\nConfiguring FP8 quantization recipe...")
        recipe = QuantizationModifier(
            targets="Linear",
            scheme="FP8_DYNAMIC",
            ignore=["lm_head"]
        )

        print("\nApplying quantization (this may take a while)...")
        oneshot(model=model, recipe=recipe)

        model_name = model_id.split("/")[-1]
        save_dir = f"{model_name}-FP8-Dynamic"
        
        print(f"\nSaving quantized model to: {save_dir}")
        model.save_pretrained(save_dir, save_compressed=True)
        tokenizer.save_pretrained(save_dir)
        
        print("\nβœ… Quantization completed successfully!")
        print(f"πŸ“ Quantized model saved to: {os.path.abspath(save_dir)}")
        return save_dir

    except Exception as e:
        print(f"\n❌ Error during quantization: {str(e)}")
        return None

if __name__ == "__main__":
    print("""

╔══════════════════════════════════════╗

β•‘     Model Quantization to FP8        β•‘

β•‘        (Dynamic Per-Token)           β•‘

β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•    

    """)
    
    model_id = get_user_input()
    
    print("\n=== Configuration Summary ===")
    print(f"Model ID: {model_id}")
    print("Quantization Type: FP8 Dynamic (per-token)")
    print("Device: CPU")
    
    while True:
        confirm = input("\nProceed with quantization? (y/n): ").lower().strip()
        if confirm in ['y', 'n']:
            break
        print("Please enter 'y' for yes or 'n' for no.")
    
    if confirm == 'y':
        quantized_model_path = quantize_model_fp8(model_id)
    else:
        print("\nQuantization cancelled.")