prasanna kumar committed on
Commit
c6a1e30
·
1 Parent(s): b60f3da

Final commit for Llama-based model

Browse files
Files changed (5) hide show
  1. .DS_Store +0 -0
  2. .gitattributes +0 -35
  3. .gitignore +3 -1
  4. app.py +10 -18
  5. models/.DS_Store +0 -0
.DS_Store DELETED
Binary file (6.15 kB)
 
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -1,6 +1,8 @@
1
  # Project Specific
 
2
  viz_env/
3
-
 
4
  test.py
5
  test_123.py
6
  # Byte-compiled / optimized / DLL files
 
1
  # Project Specific
2
+ # added git lfs for models
3
  viz_env/
4
+ .DS_Store
5
+ models/gemma-2-2b
6
  test.py
7
  test_123.py
8
  # Byte-compiled / optimized / DLL files
app.py CHANGED
@@ -13,28 +13,19 @@ def process_input(input_type, input_value, model_name):
13
  tokenizer = AutoTokenizer.from_pretrained(model_path+model_name)
14
 
15
  if input_type == "Text":
 
16
  # Tokenize the text
17
- tokens = tokenizer.tokenize(input_value)
18
- token_ids = tokenizer.encode(input_value)
19
-
20
- # Create output strings
21
- # tokens_str = [f"{i+1}. {token}" for i, token in enumerate(tokens)]
22
- # token_ids_str = " ".join(map(str, token_ids))
23
-
24
- return f"Total tokens: {len(tokens)}", tokens, token_ids
25
 
26
  elif input_type == "Token IDs":
27
  try:
28
  token_ids = ast.literal_eval(input_value)
29
- # Convert string of token IDs to list of integers
30
- # token_ids = list(map(int, input_value.split()))
31
  # Convert token IDs back to text
32
- text = tokenizer.decode(token_ids)
33
- # print("The decoded text",text)
34
- # Tokenize the text to get individual tokens
35
-
36
  # Create output strings
37
- return f"Total tokens: {len(token_ids)}", text, input_value
38
  except ValueError:
39
  return "Error", "Invalid input. Please enter space-separated integers for Token IDs.", ""
40
 
@@ -48,13 +39,14 @@ iface = gr.Interface(
48
  ],
49
  outputs=[
50
  gr.Textbox(label="Token Count"),
 
51
  gr.Textbox(label="Tokens", lines=10),
52
  gr.Textbox(label="Token IDS", lines=5)
53
  ],
54
- title="LLM Tokenization and Token ID Converter",
55
- description="Enter text or token IDs and select a model to see the conversion results."
56
  )
57
 
58
  if __name__ == "__main__":
59
  iface.queue()
60
- iface.launch()
 
13
  tokenizer = AutoTokenizer.from_pretrained(model_path+model_name)
14
 
15
  if input_type == "Text":
16
+ character_count = len(input_value)
17
  # Tokenize the text
18
+ token_ids = tokenizer.encode(input_value,add_special_tokens=True)
19
+ tokens = tokenizer.convert_ids_to_tokens(token_ids)
20
+ return len(tokens),character_count, tokens, token_ids
 
 
 
 
 
21
 
22
  elif input_type == "Token IDs":
23
  try:
24
  token_ids = ast.literal_eval(input_value)
 
 
25
  # Convert token IDs back to text
26
+ text = tokenizer.decode(token_ids)
 
 
 
27
  # Create output strings
28
+ return len(token_ids),len(token_ids), text, input_value,
29
  except ValueError:
30
  return "Error", "Invalid input. Please enter space-separated integers for Token IDs.", ""
31
 
 
39
  ],
40
  outputs=[
41
  gr.Textbox(label="Token Count"),
42
+ gr.Textbox(label="Character Count"),
43
  gr.Textbox(label="Tokens", lines=10),
44
  gr.Textbox(label="Token IDS", lines=5)
45
  ],
46
+ title="LLM Tokenization - Convert Text to tokens and vice versa!",
47
+ description="Enter text or token IDs and select a model to see the results."
48
  )
49
 
50
  if __name__ == "__main__":
51
  iface.queue()
52
+ iface.launch()
models/.DS_Store DELETED
Binary file (6.15 kB)