Commit 339541e (verified) · techAInewb committed · 1 Parent(s): 64116c6

Update app.py

Files changed (1): app.py (+79, -35)
app.py CHANGED
@@ -1,8 +1,12 @@
 import gradio as gr
 import numpy as np
 import onnxruntime as ort
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
+import gc
+import os
+import time
+
+from transformers import AutoTokenizer, AutoModelForCausalLM
 from huggingface_hub import hf_hub_download, HfFolder
 
 token = HfFolder.get_token() or os.getenv("HF_TOKEN")
@@ -11,43 +15,83 @@ HF_MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"
 HF_ONNX_REPO = "techAInewb/mistral-nemo-2407-fp32"
 ONNX_MODEL_FILE = "model.onnx"
 
-# Load tokenizer
+# Shared tokenizer
 tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, token=token)
 
-# Load PyTorch model
-pt_model = AutoModelForCausalLM.from_pretrained(HF_MODEL_ID, torch_dtype=torch.float32, token=token)
-pt_model.eval()
-
-# Load ONNX model
-onnx_path = hf_hub_download(repo_id=HF_ONNX_REPO, filename=ONNX_MODEL_FILE)
-onnx_session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+def greedy_decode_onnx(session, input_ids, attention_mask, max_new_tokens=50):
+    generated = input_ids.copy()
+    for _ in range(max_new_tokens):
+        outputs = session.run(None, {
+            "input_ids": generated,
+            "attention_mask": attention_mask
+        })
+        next_token_logits = outputs[0][:, -1, :]
+        next_token = np.argmax(next_token_logits, axis=-1).reshape(-1, 1)
+        generated = np.concatenate((generated, next_token), axis=1)
+        attention_mask = np.concatenate(
+            (attention_mask, np.ones((1, 1), dtype=np.int64)), axis=1)
+        if next_token[0][0] == tokenizer.eos_token_id:
+            break
+    return tokenizer.decode(generated[0], skip_special_tokens=True)
 
 def compare_outputs(prompt):
-    inputs = tokenizer(prompt, return_tensors="np", padding=False)
-    torch_inputs = tokenizer(prompt, return_tensors="pt")
-
-    # Run PyTorch
-    with torch.no_grad():
-        pt_outputs = pt_model(**torch_inputs).logits
-    pt_top = torch.topk(pt_outputs[0, -1], 5).indices.tolist()
-
-    # Run ONNX
-    ort_outputs = onnx_session.run(None, {
-        "input_ids": inputs["input_ids"],
-        "attention_mask": inputs["attention_mask"]
-    })
-    ort_logits = ort_outputs[0]
-    ort_top = np.argsort(ort_logits[0, -1])[::-1][:5].tolist()
-
-    pt_tokens = tokenizer.convert_ids_to_tokens(pt_top)
-    ort_tokens = tokenizer.convert_ids_to_tokens(ort_top)
-
-    return f"PyTorch Top Tokens: {pt_tokens}", f"ONNX Top Tokens: {ort_tokens}"
-
-iface = gr.Interface(fn=compare_outputs,
-                     inputs=gr.Textbox(lines=2, placeholder="Enter a prompt..."),
-                     outputs=["text", "text"],
-                     title="ONNX vs PyTorch Model Comparison",
-                     description="Run both PyTorch and ONNX inference on a prompt and compare top predicted tokens.")
+    summary_log = []
+
+    # 🔹 PyTorch Generate
+    pt_output_text = ""
+    pt_start = time.time()
+    try:
+        torch_inputs = tokenizer(prompt, return_tensors="pt")
+        pt_model = AutoModelForCausalLM.from_pretrained(HF_MODEL_ID, torch_dtype=torch.float32, token=token)
+        pt_model.eval()
+        with torch.no_grad():
+            pt_outputs = pt_model.generate(**torch_inputs, max_new_tokens=50)
+        pt_output_text = tokenizer.decode(pt_outputs[0], skip_special_tokens=True)
+        pt_time = time.time() - pt_start
+        summary_log.append(f"🧠 PyTorch output length: {pt_outputs.shape[1]} tokens | Time: {pt_time:.2f}s")
+    finally:
+        del pt_model
+        gc.collect()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    # 🔹 ONNX Generate (Greedy)
+    ort_output_text = ""
+    ort_start = time.time()
+    ort_inputs = tokenizer(prompt, return_tensors="np")
+    onnx_path = hf_hub_download(repo_id=HF_ONNX_REPO, filename=ONNX_MODEL_FILE)
+    ort_session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
+    ort_output_text = greedy_decode_onnx(
+        ort_session, ort_inputs["input_ids"], ort_inputs["attention_mask"], max_new_tokens=50
+    )
+    ort_time = time.time() - ort_start
+    summary_log.append(f"⚙️ ONNX output length: {len(tokenizer(ort_output_text)['input_ids'])} tokens | Time: {ort_time:.2f}s")
+
+    # Final notes
+    summary_log.append(f"🧪 Tokenizer source: {tokenizer.name_or_path} | Vocab size: {tokenizer.vocab_size}")
+    summary_log.append("💡 Note: Future versions will include quantized ONNX (INT8) + Vitis AI support.")
+
+    return pt_output_text, ort_output_text, "\n".join(summary_log)
+
+example_prompts = [
+    "Who was the first president of the United States?",
+    "If you have 3 apples and eat 1, how many are left?",
+    "Write a short poem about memory and time.",
+    "Explain the laws of motion in simple terms.",
+    "What happens when you mix baking soda and vinegar?"
+]
+
+iface = gr.Interface(
+    fn=compare_outputs,
+    inputs=gr.Textbox(lines=2, placeholder="Enter a prompt..."),
+    outputs=[
+        gr.Textbox(label="PyTorch Output"),
+        gr.Textbox(label="ONNX Output"),
+        gr.Textbox(label="Test Summary & Metadata")
+    ],
+    title="ONNX vs PyTorch (Full Output Comparison)",
+    description="Sequentially runs both models on your prompt and returns decoded output + metadata.",
+    examples=[[p] for p in example_prompts]
+)
 
  iface.launch()
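
For quick local verification of the greedy decoding loop introduced in this commit, a minimal sketch follows. It assumes a hypothetical MockSession stand-in (a tiny class whose run() returns random logits) and a helper greedy_decode_ids that mirrors greedy_decode_onnx but works on raw token ids, so it runs without downloading the full Mistral-Nemo checkpoint; none of these names are part of app.py.

# Hypothetical smoke test for the greedy decoding loop added in this commit.
# MockSession and greedy_decode_ids are illustrative stand-ins, not part of app.py;
# the real app passes an onnxruntime.InferenceSession over model.onnx and decodes with the tokenizer.
import numpy as np

class MockSession:
    """Mimics InferenceSession.run(): returns logits of shape (batch, seq_len, vocab)."""
    def __init__(self, vocab_size=32, seed=0):
        self.vocab_size = vocab_size
        self.rng = np.random.default_rng(seed)

    def run(self, output_names, feed):
        ids = feed["input_ids"]
        return [self.rng.standard_normal((ids.shape[0], ids.shape[1], self.vocab_size))]

def greedy_decode_ids(session, input_ids, attention_mask, eos_id, max_new_tokens=5):
    # Same loop shape as greedy_decode_onnx, but returns token ids so no tokenizer is needed.
    generated = input_ids.copy()
    for _ in range(max_new_tokens):
        logits = session.run(None, {"input_ids": generated, "attention_mask": attention_mask})[0]
        next_token = np.argmax(logits[:, -1, :], axis=-1).reshape(-1, 1)
        generated = np.concatenate((generated, next_token), axis=1)
        attention_mask = np.concatenate((attention_mask, np.ones((1, 1), dtype=np.int64)), axis=1)
        if next_token[0][0] == eos_id:
            break
    return generated

ids = np.array([[1, 5, 9]], dtype=np.int64)
print(greedy_decode_ids(MockSession(), ids, np.ones_like(ids), eos_id=2))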