techAInewb committed
Commit 15335db · verified · 1 Parent(s): 52389d5

Delete app.py

Files changed (1)
  1. app.py +0 -123
app.py DELETED
@@ -1,123 +0,0 @@
- import gradio as gr
- import numpy as np
- import onnxruntime as ort
- import torch
- import gc
- import os
- import time
-
- from transformers import AutoTokenizer, AutoModelForCausalLM
- from huggingface_hub import hf_hub_download, HfFolder
-
- token = HfFolder.get_token() or os.getenv("HF_TOKEN")
-
- HF_MODEL_ID = "mistralai/Mistral-Nemo-Instruct-2407"
- HF_ONNX_REPO = "techAInewb/mistral-nemo-2407-fp32"
- ONNX_MODEL_FILE = "model.onnx"
-
- # Shared tokenizer
- tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_ID, token=token)
-
- def compare_outputs(prompt, show_tokens):
-     summary_log = []
-     pt_output_text = ""
-     ort_output_text = ""
-     pt_tokens = []
-     ort_tokens = []
-
-     # Report host RAM usage when psutil is available.
-     try:
-         import psutil
-         ram_used = f"{psutil.virtual_memory().used / 1e9:.2f} GB"
-     except Exception:
-         ram_used = "Unavailable"
-
-     # 🔹 PyTorch Generate
-     pt_start = time.time()
-     pt_model = None  # initialized so the finally block is safe if loading fails
-     try:
-         torch_inputs = tokenizer(prompt, return_tensors="pt")
-         pt_model = AutoModelForCausalLM.from_pretrained(HF_MODEL_ID, torch_dtype=torch.float32, token=token)
-         pt_model.eval()
-         with torch.no_grad():
-             pt_outputs = pt_model.generate(**torch_inputs, max_new_tokens=50)
-         pt_output_ids = pt_outputs[0].tolist()
-         pt_output_text = tokenizer.decode(pt_output_ids, skip_special_tokens=True)
-         pt_tokens = tokenizer.convert_ids_to_tokens(pt_output_ids)
-         pt_time = time.time() - pt_start
-     finally:
-         # Free the PyTorch model before loading the ONNX session (sequential loading avoids OOM).
-         del pt_model
-         gc.collect()
-         if torch.cuda.is_available():
-             torch.cuda.empty_cache()
-
-     # 🔹 ONNX Generate (Greedy)
-     ort_start = time.time()
-     ort_inputs = tokenizer(prompt, return_tensors="np")
-     onnx_path = hf_hub_download(repo_id=HF_ONNX_REPO, filename=ONNX_MODEL_FILE)
-     ort_session = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])
-     ort_output_ids = []
-     generated = ort_inputs["input_ids"]
-     attention_mask = ort_inputs["attention_mask"]
-     for _ in range(50):
-         ort_outputs = ort_session.run(None, {
-             "input_ids": generated,
-             "attention_mask": attention_mask
-         })
-         # Greedy step: argmax over the logits at the last position.
-         next_token_logits = ort_outputs[0][:, -1, :]
-         next_token = np.argmax(next_token_logits, axis=-1).reshape(-1, 1)
-         ort_output_ids.append(int(next_token[0][0]))
-         generated = np.concatenate((generated, next_token), axis=1)
-         attention_mask = np.concatenate((attention_mask, np.ones((1, 1), dtype=np.int64)), axis=1)
-         if next_token[0][0] == tokenizer.eos_token_id:
-             break
-     ort_time = time.time() - ort_start
-     ort_tokens = tokenizer.convert_ids_to_tokens(ort_inputs["input_ids"][0].tolist() + ort_output_ids)
-     ort_output_text = tokenizer.decode(ort_inputs["input_ids"][0].tolist() + ort_output_ids, skip_special_tokens=True)
-
-     # 📊 Summary
-     summary_log.append("| Model | Tokens | Time (s) | Time/Token |")
-     summary_log.append("|---------|--------|----------|------------|")
-     summary_log.append(f"| PyTorch | {len(pt_tokens)} | {pt_time:.2f} | {pt_time / max(1, len(pt_tokens)):.4f} |")
-     summary_log.append(f"| ONNX | {len(ort_tokens)} | {ort_time:.2f} | {ort_time / max(1, len(ort_tokens)):.4f} |")
-     summary_log.append(f"\n📦 RAM Used: {ram_used}")
-     summary_log.append(f"📚 Tokenizer: {tokenizer.name_or_path} | Vocab size: {tokenizer.vocab_size}")
-     summary_log.append("🛠️ Note: This ONNX export is FP32. INT8 + Vitis AI variants coming soon.")
-
-     outputs = [pt_output_text, ort_output_text, "\n".join(summary_log)]
-
-     if show_tokens:
-         outputs += [
-             ", ".join(pt_tokens),
-             ", ".join(ort_tokens)
-         ]
-     else:
-         outputs += ["", ""]
-
-     return outputs
-
- example_prompts = [
-     "Who was the first president of the United States?",
-     "If you have 3 apples and eat 1, how many are left?",
-     "Write a short poem about memory and time.",
-     "Explain the laws of motion in simple terms.",
-     "What happens when you mix baking soda and vinegar?"
- ]
-
- iface = gr.Interface(
-     fn=compare_outputs,
-     inputs=[
-         gr.Textbox(lines=2, placeholder="Enter a prompt..."),
-         gr.Checkbox(label="Show Token IDs")
-     ],
-     outputs=[
-         gr.Textbox(label="PyTorch Output"),
-         gr.Textbox(label="ONNX Output"),
-         gr.Textbox(label="Evaluation Summary"),
-         gr.Textbox(label="PyTorch Tokens"),
-         gr.Textbox(label="ONNX Tokens")
-     ],
-     title="ONNX vs PyTorch (Full Output + Token Trace)",
-     description="Run both models on your prompt and compare output text, timing, and token traces. Sequential model loading avoids OOM.",
-     examples=[[p, False] for p in example_prompts]
- )
-
- iface.launch()