"""Post-training static INT8 quantization of a CoNLL-2003 NER model.

Loads a token-classification checkpoint from the Hugging Face Hub, builds a
calibration set from the CoNLL-2003 validation split, applies OpenVINO static
quantization via ``OVQuantizer``, saves the result in OpenVINO IR format, and
reloads it for inference.
"""
from functools import partial

from optimum.intel.openvino import OVModelForTokenClassification, OVQuantizer
from transformers import AutoModelForTokenClassification, AutoTokenizer

# Alternative checkpoints that were tried; only the last, uncommented one is used.
# model_id = "elastic/distilbert-base-uncased-finetuned-conll03-english"
# model_id = "xlm-roberta-large-finetuned-conll03-english"
model_id = "dbmdz/bert-large-cased-finetuned-conll03-english"

model = AutoModelForTokenClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)


def preprocess_fn(examples, tokenizer):
    """Tokenize a batch of pre-split CoNLL-2003 examples to fixed-length inputs.

    ``examples["tokens"]`` holds lists of words (the dataset is pre-tokenized),
    hence ``is_split_into_words=True``. Padding/truncating everything to 128
    tokens keeps calibration input shapes static for quantization.
    """
    return tokenizer(
        examples["tokens"],
        padding="max_length",
        max_length=128,
        truncation=True,
        is_split_into_words=True,
    )


quantizer = OVQuantizer.from_pretrained(model)
calibration_dataset = quantizer.get_calibration_dataset(
    "conll2003",
    preprocess_function=partial(preprocess_fn, tokenizer=tokenizer),
    num_samples=300,
    dataset_split="validation",
    preprocess_batch=True,
)

# The directory where the quantized model will be saved.
# NOTE(review): model_id contains "/", so this resolves to a NESTED path
# ("dbmdz/bert-..._ov_int8"). Use model_id.split("/")[-1] if a flat
# directory is intended — kept as-is to preserve existing behavior.
save_dir = f"{model_id}_ov_int8"

# Apply static quantization and save the resulting model in the OpenVINO IR format.
quantizer.quantize(calibration_dataset=calibration_dataset, save_directory=save_dir)

# Load the quantized model back for inference.
optimized_model = OVModelForTokenClassification.from_pretrained(save_dir)