5w4n committed
Commit: 6841f1c
Parent: 0158b9f

Add token distribution chart and token count variability

Files changed (1): app.py (+61 -54)
app.py CHANGED
@@ -1,6 +1,7 @@
 import streamlit as st
 import pandas as pd
 import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 import numpy as np


@@ -21,31 +22,41 @@ def reload_example_text_data(selected_language, selected_tokenizers):

 val_data = load_data()

+tokenizer_names_to_test = [
+    "openai/gpt4",
+    "Xenova/gpt-4o",
+    "Xenova/claude-tokenizer",
+    "CohereForAI/aya-101",
+    "meta-llama/Meta-Llama-3-70B",
+    "mistralai/Mixtral-8x22B-v0.1",
+    "google/gemma-7b",
+    "facebook/nllb-200-distilled-600M",
+    "xlm-roberta-base",
+    "bert-base-uncased",
+    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    "bigscience/bloom",
+    "StabilityAI/stablelm-base-alpha-7b",
+    "google/flan-t5-base",
+    "facebook/mbart-large-50",
+    "EleutherAI/gpt-neox-20b",
+]
+
 with st.sidebar:
-    tokenizer_names_to_test = [
-        "openai/gpt4",
-        "Xenova/gpt-4o",
-        "Xenova/claude-tokenizer",
-        "CohereForAI/aya-101",
-        "meta-llama/Meta-Llama-3-70B",
-        "mistralai/Mixtral-8x22B-v0.1",
-        "google/gemma-7b",
-        "facebook/nllb-200-distilled-600M",
-        "xlm-roberta-base",
-        "bert-base-uncased",
-        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
-        "bigscience/bloom",
-        "StabilityAI/stablelm-base-alpha-7b",
-        "google/flan-t5-base",
-        "facebook/mbart-large-50",
-        "EleutherAI/gpt-neox-20b",
-    ]
-    selected_tokenizers = st.multiselect(
-        "Select tokenizers",
-        options=tokenizer_names_to_test,
-        default=["openai/gpt4", "Xenova/gpt-4o"],
-        label_visibility="collapsed",
-    )
+    all_tokenizers = st.checkbox("Select All Tokenizers")
+    if all_tokenizers:
+        selected_tokenizers = tokenizer_names_to_test
+    else:
+        selected_tokenizers = st.multiselect(
+            "Select tokenizers",
+            options=tokenizer_names_to_test,
+            default=[
+                "openai/gpt4",
+                "Xenova/gpt-4o",
+                "CohereForAI/aya-101",
+                "Xenova/claude-tokenizer",
+            ],
+            label_visibility="collapsed",
+        )
     links = [
         (
             f"[{tokenizer_name}](https://huggingface.co/{tokenizer_name})"
@@ -70,37 +81,33 @@ st.subheader(f"**Sampled Text:** `{selected_text}`")
 st.subheader("Number of Tokens")
 st.table(st.session_state.examplesdf)

-# Calculate metrics for each tokenizer
-tokenizer_metrics = {}
-for tokenizer in selected_tokenizers:
-    tokens = val_data[tokenizer].dropna()
-    median = np.median(tokens)
-    min_tokens = np.min(tokens)
-    max_tokens = np.max(tokens)
-    std_dev = np.std(tokens)
-    tokenizer_metrics[tokenizer] = {
-        "Median": median,
-        "Min": min_tokens,
-        "Max": max_tokens,
-        "Range": max_tokens - min_tokens,
-        "Standard Deviation": std_dev,
-    }
-
-# Display metrics
-st.subheader("Tokenizer Metrics")
-st.json(tokenizer_metrics)
-
-# Plot for top tokenizers by median token length
-sorted_tokenizers = sorted(tokenizer_metrics.items(), key=lambda x: x[1]["Median"])
-shortest_median = sorted_tokenizers[:5]
-longest_median = sorted_tokenizers[-5:]
+# Create a distribution plot for token density across selected tokenizers
+import plotly.figure_factory as ff

-fig = go.Figure()
-for name, metrics in shortest_median + longest_median:
-    fig.add_trace(go.Bar(x=[name], y=[metrics["Median"]], name=name))
+# Collecting data for all selected tokenizers
+hist_data = [val_data[tokenizer].dropna() for tokenizer in selected_tokenizers]
+
+# Creating the distplot with optional histogram
+fig = ff.create_distplot(
+    hist_data, selected_tokenizers, show_hist=False, show_rug=False
+)
 fig.update_layout(
-    title="Top Tokenizers by Shortest and Longest Median Token Length",
-    xaxis_title="Tokenizer",
-    yaxis_title="Median Token Length",
+    title="Token Distribution Density",
+    xaxis_title="Number of Tokens",
+    yaxis_title="Density",
+    height=500,
 )
+st.plotly_chart(fig, use_container_width=True)
+
+
+tokenizer_to_num_tokens = {
+    name: val_data[name].tolist() for name in selected_tokenizers
+}
+
+fig = go.Figure()
+for tokenizer_name in selected_tokenizers:
+    fig.add_trace(
+        go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
+    )
+fig.update_layout(title="Token Count Variability")
 st.plotly_chart(fig)
 
 
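The density chart relies on plotly.figure_factory.create_distplot, which fits one KDE curve per group; show_hist=False and show_rug=False leave only the curves. A self-contained sketch with synthetic token counts (the names and distributions here are made up for illustration; create_distplot needs scipy installed for the KDE):

import numpy as np
import plotly.figure_factory as ff

rng = np.random.default_rng(0)
# Synthetic per-sample token counts for two hypothetical tokenizers.
hist_data = [rng.normal(120, 20, 500), rng.normal(150, 35, 500)]
labels = ["tokenizer-a", "tokenizer-b"]

# One KDE curve per tokenizer, no histogram bars or rug marks.
fig = ff.create_distplot(hist_data, labels, show_hist=False, show_rug=False)
fig.update_layout(
    title="Token Distribution Density",
    xaxis_title="Number of Tokens",
    yaxis_title="Density",
)
fig.show()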
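The variability chart adds one go.Box trace per tokenizer, so Plotly computes the median, quartiles, and outliers from the raw counts. The same synthetic data as above, standalone:

import numpy as np
import plotly.graph_objects as go

rng = np.random.default_rng(0)
counts = {  # synthetic token counts, one array per hypothetical tokenizer
    "tokenizer-a": rng.normal(120, 20, 500),
    "tokenizer-b": rng.normal(150, 35, 500),
}

fig = go.Figure()
for name, values in counts.items():
    # Each Box trace summarizes one tokenizer's token-count spread.
    fig.add_trace(go.Box(y=values, name=name))
fig.update_layout(title="Token Count Variability")
fig.show()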