Spaces: Runtime error
Commit: 305d23f · Parent: d4461bd
first commit

Files changed:
- README.md +5 -5
- app.py +80 -0
- quantization.py +83 -0
- requirements.txt +8 -0

README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: Optimized SentenceTransformer Space
+emoji: π
+colorFrom: pink
+colorTo: green
 sdk: gradio
-sdk_version: 4.
+sdk_version: 4.28.3
 app_file: app.py
 pinned: false
 license: apache-2.0

app.py
ADDED
@@ -0,0 +1,80 @@
+import gradio as gr
+from transformers import pipeline
+import numpy as np
+from time import perf_counter
+from setfit import SetFitModel
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from quantization import OnnxSetFitModel
+from transformers import AutoTokenizer
+# Load the models: the baseline SetFit model, plus the quantized ONNX body
+# recombined with the baseline model's classification head.
+model1 = SetFitModel.from_pretrained("hsmashiana/base_model_hpml")
+ort_model = ORTModelForFeatureExtraction.from_pretrained("hsmashiana/optimized_model_hpml", file_name="model_quantized.onnx")
+tokenizer = AutoTokenizer.from_pretrained("hsmashiana/optimized_model_hpml")
+model3 = OnnxSetFitModel(ort_model, tokenizer, model1.model_head)
+
+decode = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
+
+def compare_models(text):
+    # result1 = model1(text)
+    # result2 = model2(text)
+    # # Including model names in the results
+    # output1 = {"Model": "BERT Base Uncased", "Output": result1}
+    # output2 = {"Model": "RoBERTa Base", "Output": result2}
+    # return output1, output2
+
+    times = []
+    # Warm-up phase to ensure fair timing
+    for _ in range(5):
+        model1([text])
+    # Measure the execution time of model predictions
+    for _ in range(20):
+        start = perf_counter()
+        out1 = model1([text])
+        end = perf_counter()
+        times.append(end - start)
+    # Calculate the mean latency in milliseconds
+    avg_latency_ms_model_1 = np.mean(times) * 1000
+
+    # times = []
+    # # Warm-up phase to ensure fair timing
+    # for _ in range(5):
+    #     model2([text])
+    # # Measure the execution time of model predictions
+    # for _ in range(20):
+    #     start = perf_counter()
+    #     out2 = model2([text])
+    #     end = perf_counter()
+    #     times.append(end - start)
+    # # Calculate the mean latency in milliseconds
+    # avg_latency_ms_model_2 = np.mean(times) * 1000
+
+    times = []
+    # Warm-up phase to ensure fair timing
+    for _ in range(5):
+        model3.predict([text])
+    # Measure the execution time of model predictions
+    for _ in range(20):
+        start = perf_counter()
+        out3 = model3([text])
+        end = perf_counter()
+        times.append(end - start)
+    # Calculate the mean latency in milliseconds
+    avg_latency_ms_model_3 = np.mean(times) * 1000
+
+    return {"answer": decode[out1.numpy()[0]], "avgtime": avg_latency_ms_model_1}, {"answer": decode[out3[0]], "avgtime": avg_latency_ms_model_3}
+
+# Create a Gradio interface
+iface = gr.Interface(
+    fn=compare_models,
+    inputs="text",
+    outputs=[
+        gr.components.JSON(label="Base miniLM"),
+        gr.components.JSON(label="Quantized Distilled miniLM"),
+    ],
+    title="Compare Sentence Classification Models",
+    description="Enter a sentence to see how each model classifies it.",
+)
+
+# Run the interface
+iface.launch()
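
Note: app.py loads a ready-made model_quantized.onnx from the hsmashiana/optimized_model_hpml repo. For context, here is a minimal sketch of how the plain ONNX body that quantization.py expects could be exported with optimum. It is not part of this commit: the local onnx/ output directory is an assumption, and export=True is the conversion flag in recent optimum releases (older releases used from_transformers=True).

# Hypothetical export step (not part of this commit): write onnx/model.onnx
# from the baseline model body so quantization.py can quantize it.
from pathlib import Path

from optimum.onnxruntime import ORTModelForFeatureExtraction
from transformers import AutoTokenizer

onnx_path = Path("onnx")  # assumed local output directory

# Convert the transformer body of the baseline model to ONNX.
ort_model = ORTModelForFeatureExtraction.from_pretrained(
    "hsmashiana/base_model_hpml", export=True
)
ort_model.save_pretrained(onnx_path)  # writes onnx/model.onnx
AutoTokenizer.from_pretrained("hsmashiana/base_model_hpml").save_pretrained(onnx_path)
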
quantization.py
ADDED
@@ -0,0 +1,83 @@
+from neural_compressor.experimental import Quantization, common
+
+import functools
+
+import evaluate
+import onnxruntime
+from optimum.onnxruntime import ORTModelForFeatureExtraction
+from sklearn.linear_model import LogisticRegression
+from tqdm import tqdm
+from setfit.exporters.utils import mean_pooling
+
+accuracy = evaluate.load("accuracy")
+
+class OnnxSetFitModel:  # ONNX body + original SetFit head, used by app.py
+    def __init__(self, ort_model, tokenizer, model_head):
+        self.ort_model = ort_model
+        self.tokenizer = tokenizer
+        self.model_head = model_head
+
+    def predict(self, inputs):
+        encoded_inputs = self.tokenizer(
+            inputs, padding=True, truncation=True, return_tensors="pt"
+        )
+        outputs = self.ort_model(**encoded_inputs)
+        embeddings = mean_pooling(
+            outputs["last_hidden_state"], encoded_inputs["attention_mask"]
+        )
+        return self.model_head.predict(embeddings)
+
+    def __call__(self, inputs):
+        return self.predict(inputs)
+
+class myquantizer:  # dynamic quantization of the ONNX body via Intel Neural Compressor
+    def __init__(self, onnx_path, model_head, tokenizer, test_dataset):
+        self.onnx_path = onnx_path
+        self.head = model_head
+        self.tokenizer = tokenizer
+        self.test_dataset = test_dataset
+
+    def eval_func(self, model):
+        print(self.onnx_path)
+        ort_model = ORTModelForFeatureExtraction.from_pretrained(self.onnx_path)
+        ort_model.model = onnxruntime.InferenceSession(model.SerializeToString(), None)  # swap in the candidate quantized graph
+        onnx_setfit_model = OnnxSetFitModel(ort_model, self.tokenizer, self.head)
+        preds = []
+        chunk_size = 100  # evaluate in batches of 100 texts
+        for i in tqdm(range(0, len(self.test_dataset["text"]), chunk_size)):
+            preds.extend(
+                onnx_setfit_model.predict(self.test_dataset["text"][i : i + chunk_size])
+            )
+        labels = self.test_dataset["label"]
+        accuracy_calc = accuracy.compute(predictions=preds, references=labels)
+        return accuracy_calc["accuracy"]
+
+    def build_dynamic_quant_yaml(self):
+        yaml = """
+model:
+  name: bert
+  framework: onnxrt_integerops
+
+device: cpu
+
+quantization:
+  approach: post_training_dynamic_quant
+
+tuning:
+  accuracy_criterion:
+    relative: 0.01
+  exit_policy:
+    timeout: 0
+  random_seed: 9527
+"""
+        with open("build.yaml", "w", encoding="utf-8") as f:
+            f.write(yaml)
+    def quantizer_model(self):
+        self.build_dynamic_quant_yaml()
+        onnx_output_path = "onnx/model_quantized.onnx"
+        quantizer = Quantization("build.yaml")
+        model_is_at = str(self.onnx_path / "model.onnx")
+        quantizer.model = common.Model(model_is_at)
+        quantizer.eval_func = functools.partial(self.eval_func)  # accuracy-driven tuning callback
+        quantized_model = quantizer()
+        quantized_model.save(onnx_output_path)
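
For reference, a minimal sketch of how myquantizer could be driven end to end. The AG News test split (matching the label map in app.py) and the local onnx/ directory are assumptions, not part of this commit:

# Hypothetical driver for myquantizer (paths and dataset are assumptions).
from pathlib import Path

from datasets import load_dataset
from setfit import SetFitModel
from transformers import AutoTokenizer

from quantization import myquantizer

onnx_path = Path("onnx")  # must already contain model.onnx and tokenizer files
tokenizer = AutoTokenizer.from_pretrained(onnx_path)

# Reuse the trained SetFit head; only the transformer body is quantized.
head = SetFitModel.from_pretrained("hsmashiana/base_model_hpml").model_head

# eval_func reads test_dataset["text"] and test_dataset["label"].
test_dataset = load_dataset("ag_news", split="test")

q = myquantizer(onnx_path, head, tokenizer, test_dataset)
q.quantizer_model()  # tunes against eval_func, writes onnx/model_quantized.onnx
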
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+torch
+transformers
+gradio
+setfit
+neural_compressor
+optimum[onnxruntime]
+onnxruntime_extensions
+wandb