techAInewb committed
Commit 52389d5 · verified · 1 Parent(s): 339541e

Update app.py

Files changed (1)
  1. app.py +60 -34
app.py CHANGED
@@ -18,27 +18,20 @@ ONNX_MODEL_FILE = "model.onnx"
 # Shared tokenizer
 tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, token=token)

-def greedy_decode_onnx(session, input_ids, attention_mask, max_new_tokens=50):
-    generated = input_ids.copy()
-    for _ in range(max_new_tokens):
-        outputs = session.run(None, {
-            "input_ids": generated,
-            "attention_mask": attention_mask
-        })
-        next_token_logits = outputs[0][:, -1, :]
-        next_token = np.argmax(next_token_logits, axis=-1).reshape(-1, 1)
-        generated = np.concatenate((generated, next_token), axis=1)
-        attention_mask = np.concatenate(
-            (attention_mask, np.ones((1, 1), dtype=np.int64)), axis=1)
-        if next_token[0][0] == tokenizer.eos_token_id:
-            break
-    return tokenizer.decode(generated[0], skip_special_tokens=True)
-
-def compare_outputs(prompt):
+def compare_outputs(prompt, show_tokens):
     summary_log = []
+    pt_output_text = ""
+    ort_output_text = ""
+    pt_tokens = []
+    ort_tokens = []
+
+    try:
+        import psutil
+        ram_used = f"{psutil.virtual_memory().used / 1e9:.2f} GB"
+    except:
+        ram_used = "Unavailable"

     # 🔹 PyTorch Generate
-    pt_output_text = ""
     pt_start = time.time()
     try:
         torch_inputs = tokenizer(prompt, return_tensors="pt")
@@ -46,9 +39,10 @@ def compare_outputs(prompt):
         pt_model.eval()
         with torch.no_grad():
             pt_outputs = pt_model.generate(**torch_inputs, max_new_tokens=50)
-        pt_output_text = tokenizer.decode(pt_outputs[0], skip_special_tokens=True)
+        pt_output_ids = pt_outputs[0].tolist()
+        pt_output_text = tokenizer.decode(pt_output_ids, skip_special_tokens=True)
+        pt_tokens = tokenizer.convert_ids_to_tokens(pt_output_ids)
         pt_time = time.time() - pt_start
-        summary_log.append(f"🧠 PyTorch output length: {pt_outputs.shape[1]} tokens | Time: {pt_time:.2f}s")
     finally:
         del pt_model
         gc.collect()
@@ -56,22 +50,49 @@
             torch.cuda.empty_cache()

     # 🔹 ONNX Generate (Greedy)
-    ort_output_text = ""
     ort_start = time.time()
     ort_inputs = tokenizer(prompt, return_tensors="np")
     onnx_path = hf_hub_download(repo_id=HF_ONNX_REPO, filename=ONNX_MODEL_FILE)
     ort_session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
-    ort_output_text = greedy_decode_onnx(
-        ort_session, ort_inputs["input_ids"], ort_inputs["attention_mask"], max_new_tokens=50
-    )
+    ort_output_ids = []
+    generated = ort_inputs["input_ids"]
+    attention_mask = ort_inputs["attention_mask"]
+    for _ in range(50):
+        ort_outputs = ort_session.run(None, {
+            "input_ids": generated,
+            "attention_mask": attention_mask
+        })
+        next_token_logits = ort_outputs[0][:, -1, :]
+        next_token = np.argmax(next_token_logits, axis=-1).reshape(-1, 1)
+        ort_output_ids.append(next_token[0][0])
+        generated = np.concatenate((generated, next_token), axis=1)
+        attention_mask = np.concatenate((attention_mask, np.ones((1, 1), dtype=np.int64)), axis=1)
+        if next_token[0][0] == tokenizer.eos_token_id:
+            break
     ort_time = time.time() - ort_start
-    summary_log.append(f"⚙️ ONNX output length: {len(tokenizer(ort_output_text)['input_ids'])} tokens | Time: {ort_time:.2f}s")
+    ort_tokens = tokenizer.convert_ids_to_tokens(ort_inputs["input_ids"][0].tolist() + ort_output_ids)
+    ort_output_text = tokenizer.decode(ort_inputs["input_ids"][0].tolist() + ort_output_ids, skip_special_tokens=True)

-    # Final notes
-    summary_log.append(f"🧪 Tokenizer source: {tokenizer.name_or_path} | Vocab size: {tokenizer.vocab_size}")
-    summary_log.append("💡 Note: Future versions will include quantized ONNX (INT8) + Vitis AI support.")
+    # 📊 Summary
+    summary_log.append("| Model | Tokens | Time (s) | Time/Token |")
+    summary_log.append("|---------|--------|----------|------------|")
+    summary_log.append(f"| PyTorch | {len(pt_tokens)} | {pt_time:.2f} | {pt_time / max(1, len(pt_tokens)):.4f} |")
+    summary_log.append(f"| ONNX | {len(ort_tokens)} | {ort_time:.2f} | {ort_time / max(1, len(ort_tokens)):.4f} |")
+    summary_log.append(f"\n📦 RAM Used: {ram_used}")
+    summary_log.append(f"📚 Tokenizer: {tokenizer.name_or_path} | Vocab size: {tokenizer.vocab_size}")
+    summary_log.append("🛠️ Note: This ONNX export is FP32. INT8 + Vitis AI variants coming soon.")

-    return pt_output_text, ort_output_text, "\n".join(summary_log)
+    outputs = [pt_output_text, ort_output_text, "\n".join(summary_log)]
+
+    if show_tokens:
+        outputs += [
+            ", ".join(pt_tokens),
+            ", ".join(ort_tokens)
+        ]
+    else:
+        outputs += ["", ""]
+
+    return outputs

 example_prompts = [
     "Who was the first president of the United States?",
@@ -83,15 +104,20 @@ example_prompts = [

 iface = gr.Interface(
     fn=compare_outputs,
-    inputs=gr.Textbox(lines=2, placeholder="Enter a prompt..."),
+    inputs=[
+        gr.Textbox(lines=2, placeholder="Enter a prompt..."),
+        gr.Checkbox(label="Show Token IDs")
+    ],
     outputs=[
         gr.Textbox(label="PyTorch Output"),
         gr.Textbox(label="ONNX Output"),
-        gr.Textbox(label="Test Summary & Metadata")
+        gr.Textbox(label="Evaluation Summary"),
+        gr.Textbox(label="PyTorch Tokens"),
+        gr.Textbox(label="ONNX Tokens")
     ],
-    title="ONNX vs PyTorch (Full Output Comparison)",
-    description="Sequentially runs both models on your prompt and returns decoded output + metadata.",
-    examples=[[p] for p in example_prompts]
+    title="ONNX vs PyTorch (Full Output + Token Trace)",
+    description="Run both models on your prompt and compare output text, timing, and token traces. Sequential model loading avoids OOM.",
+    examples=[[p, False] for p in example_prompts]
 )

 iface.launch()
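
Not part of the commit: for quick local testing outside the Gradio UI, the greedy loop this commit inlines into compare_outputs can be lifted into a standalone helper. The sketch below uses placeholder repo IDs (HF_MODEL_ID and HF_ONNX_REPO are defined earlier in app.py, outside the hunks shown) and, like the commit's loop, assumes the exported graph takes input_ids/attention_mask and returns logits as its first output with no KV cache.

# Standalone sketch of the greedy ONNX decoding loop used in this commit.
import numpy as np
import onnxruntime as ort
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

HF_MODEL_ID = "your-org/your-model"        # placeholder, set in app.py
HF_ONNX_REPO = "your-org/your-model-onnx"  # placeholder, set in app.py
ONNX_MODEL_FILE = "model.onnx"

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID)
onnx_path = hf_hub_download(repo_id=HF_ONNX_REPO, filename=ONNX_MODEL_FILE)
session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

def greedy_decode(prompt, max_new_tokens=50):
    # Assumes logits are the first graph output and no past key values are used.
    enc = tokenizer(prompt, return_tensors="np")
    generated = enc["input_ids"]
    attention_mask = enc["attention_mask"]
    for _ in range(max_new_tokens):
        logits = session.run(None, {"input_ids": generated,
                                    "attention_mask": attention_mask})[0]
        # Pick the most likely next token from the last position (greedy).
        next_token = np.argmax(logits[:, -1, :], axis=-1).reshape(-1, 1)
        generated = np.concatenate((generated, next_token), axis=1)
        attention_mask = np.concatenate(
            (attention_mask, np.ones((1, 1), dtype=np.int64)), axis=1)
        if next_token[0][0] == tokenizer.eos_token_id:
            break
    return tokenizer.decode(generated[0], skip_special_tokens=True)

print(greedy_decode("Who was the first president of the United States?"))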