Spaces: Runtime error
Commit: 305d23f · Parent: d4461bd
first commit

Files changed:
- README.md +5 -5
- app.py +80 -0
- quantization.py +83 -0
- requirements.txt +8 -0

README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Optimized SentenceTransformer Space
+emoji: π
+colorFrom: pink
+colorTo: green
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.28.3
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py
ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+from transformers import pipeline
+import numpy as np
+from time import perf_counter
+from setfit import SetFitModel
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from quantization import OnnxSetFitModel
+from transformers import AutoTokenizer
+# Load the models: the baseline SetFit model, plus the quantized ONNX body
+# recombined with the baseline model's classification head.
+model1 = SetFitModel.from_pretrained("hsmashiana/base_model_hpml")
+ort_model = ORTModelForFeatureExtraction.from_pretrained("hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx")
+tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
+model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)
+
+decode = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
+
+def compare_models(text):
+    # result1 = model1(text)
+    # result2 = model2(text)
+    # # Including model names in the results
+    # output1 = {"Model": "BERT Base Uncased", "Output": result1}
+    # output2 = {"Model": "RoBERTa Base", "Output": result2}
+    # return output1, output2
+
+    times = []
+    # Warm-up phase to ensure fair timing
+    for _ in range(5):
+        model1([text])
+    # Measure the execution time of model predictions
+    for _ in range(20):
+        start = perf_counter()
+        out1 = model1([text])
+        end = perf_counter()
+        times.append(end - start)
+    # Calculate the mean latency in milliseconds
+    avg_latency_ms_model_1 = np.mean(times) * 1000
+
+    # times = []
+    # # Warm-up phase to ensure fair timing
+    # for _ in range(5):
+    #     model2([text])
+    # # Measure the execution time of model predictions
+    # for _ in range(20):
+    #     start = perf_counter()
+    #     out2 = model2([text])
+    #     end = perf_counter()
+    #     times.append(end - start)
+    # # Calculate the mean latency in milliseconds
+    # avg_latency_ms_model_2 = np.mean(times) * 1000
+
+    times = []
+    # Warm-up phase to ensure fair timing
+    for _ in range(5):
+        model3.predict([text])
+    # Measure the execution time of model predictions
+    for _ in range(20):
+        start = perf_counter()
+        out3 = model3([text])
+        end = perf_counter()
+        times.append(end - start)
+    # Calculate the mean latency in milliseconds
+    avg_latency_ms_model_3 = np.mean(times) * 1000
+
+    return {"answer": decode[out1.numpy()[0]], "avgtime": avg_latency_ms_model_1}, {"answer": decode[out3[0]], "avgtime": avg_latency_ms_model_3}
+
+# Create a Gradio interface
+iface = gr.Interface(
+    fn=compare_models,
+    inputs="text",
+    outputs=[
+        gr.components.JSON(label="Base miniLM"),
+        gr.components.JSON(label="Quantized Distilled miniLM"),
+    ],
+    title="Compare Sentence Classification Models",
+    description="Enter a sentence to see how each model classifies it.",
+)
+
+# Run the interface
+iface.launch()
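
Note: app.py loads a ready-made model_quantized.onnx from the hsmashiana/optimized_model_hpml repo. For context, here is a minimal sketch of how the plain ONNX body that quantization.py expects could be exported with optimum. It is not part of this commit: the local onnx/ output directory is an assumption, and export=True is the conversion flag in recent optimum releases (older releases used from_transformers=True).

# Hypothetical export step (not part of this commit): write onnx/model.onnx
# from the baseline model body so quantization.py can quantize it.
from pathlib import Path

from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

onnx_path = Path("onnx")  # assumed local output directory

# Convert the transformer body of the baseline model to ONNX.
ort_model = ORTModelForFeatureExtraction.from_pretrained(
    "hsmashiana/base_model_hpml", export=True
)
ort_model.save_pretrained(onnx_path)  # writes onnx/model.onnx
AutoTokenizer.from_pretrained("hsmashiana/base_model_hpml").save_pretrained(onnx_path)
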
quantization.py
ADDED
@@ -0,0 +1,83 @@
+from neural_compressor.experimental import Quantization, common
+
+import functools
+
+import evaluate
+import onnxruntime
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+from setfit.exporters.utils import mean_pooling
+
+accuracy = evaluate.load("accuracy")
+
+class OnnxSetFitModel:  # ONNX body + original SetFit head, used by app.py
+    def __init__(self, ort_model, tokenizer, model_head):
+        self.ort_model = ort_model
+        self.tokenizer = tokenizer
+        self.model_head = model_head
+
+    def predict(self, inputs):
+        encoded_inputs = self.tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        outputs = self.ort_model(**encoded_inputs)
+        embeddings = mean_pooling(
+            outputs["last_hidden_state"], encoded_inputs["attention_mask"]
+        )
+        return self.model_head.predict(embeddings)
+
+    def __call__(self, inputs):
+        return self.predict(inputs)
+
+class myquantizer:  # dynamic quantization of the ONNX body via Intel Neural Compressor
+    def __init__(self, onnx_path, model_head, tokenizer, test_dataset):
+        self.onnx_path = onnx_path
+        self.head = model_head
+        self.tokenizer = tokenizer
+        self.test_dataset = test_dataset
+
+    def eval_func(self, model):
+        print(self.onnx_path)
+        ort_model = ORTModelForFeatureExtraction.from_pretrained(self.onnx_path)
+        ort_model.model = onnxruntime.InferenceSession(model.SerializeToString(), None)  # swap in the candidate quantized graph
+        onnx_setfit_model = OnnxSetFitModel(ort_model, self.tokenizer, self.head)
+        preds = []
+        chunk_size = 100  # evaluate in batches of 100 texts
+        for i in tqdm(range(0, len(self.test_dataset["text"]), chunk_size)):
+            preds.extend(
+                onnx_setfit_model.predict(self.test_dataset["text"][i : i + chunk_size])
+            )
+        labels = self.test_dataset["label"]
+        accuracy_calc = accuracy.compute(predictions=preds, references=labels)
+        return accuracy_calc["accuracy"]
+
+    def build_dynamic_quant_yaml(self):
+        yaml = """
+model:
+  name: bert
+  framework: onnxrt_integerops
+
+device: cpu
+
+quantization:
+  approach: post_training_dynamic_quant
+
+tuning:
+  accuracy_criterion:
+    relative: 0.01
+  exit_policy:
+    timeout: 0
+  random_seed: 9527
+"""
+        with open("build.yaml", "w", encoding="utf-8") as f:
+            f.write(yaml)
+    def quantizer_model(self):
+        self.build_dynamic_quant_yaml()
+        onnx_output_path = "onnx/model_quantized.onnx"
+        quantizer = Quantization("build.yaml")
+        model_is_at = str(self.onnx_path / "model.onnx")
+        quantizer.model = common.Model(model_is_at)
+        quantizer.eval_func = functools.partial(self.eval_func)  # accuracy-driven tuning callback
+        quantized_model = quantizer()
+        quantized_model.save(onnx_output_path)
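
For reference, a minimal sketch of how myquantizer could be driven end to end. The AG News test split (matching the label map in app.py) and the local onnx/ directory are assumptions, not part of this commit:

# Hypothetical driver for myquantizer (paths and dataset are assumptions).
from pathlib import Path

from datasets import load_dataset
from setfit import SetFitModel
from transformers import AutoTokenizer

from quantization import myquantizer

onnx_path = Path("onnx")  # must already contain model.onnx and tokenizer files
tokenizer = AutoTokenizer.from_pretrained(onnx_path)

# Reuse the trained SetFit head; only the transformer body is quantized.
head = SetFitModel.from_pretrained("hsmashiana/base_model_hpml").model_head

# eval_func reads test_dataset["text"] and test_dataset["label"].
test_dataset = load_dataset("ag_news", split="test")

q = myquantizer(onnx_path, head, tokenizer, test_dataset)
q.quantizer_model()  # tunes against eval_func, writes onnx/model_quantized.onnx
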
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+torch
+transformers
+gradio
+setfit
+neural_compressor
+optimum[onnxruntime]
+onnxruntime_extensions
+wandb