Make it better?
- dashboard.py +42 -16
- opendashboards/assets/inspect.py +7 -4
- opendashboards/assets/io.py +21 -6
- opendashboards/assets/metric.py +43 -17
- opendashboards/assets/plot.py +14 -0
- opendashboards/utils/plotting.py +41 -5
dashboard.py CHANGED

@@ -40,13 +40,13 @@ with st.spinner(text=f'Checking wandb...'):
 
 
 ### Wandb Runs ###
-with st.sidebar:
+# with st.sidebar:
 
-    st.markdown('#')
-    st.sidebar.header(":violet[Select] Runs")
+#     st.markdown('#')
+#     st.sidebar.header(":violet[Select] Runs")
 
-df_runs_subset = io.filter_dataframe(df_runs, demo_selection=df_runs.id.isin(DEFAULT_SELECTED_RUNS))
-n_runs = len(df_runs_subset)
+# df_runs_subset = io.filter_dataframe(df_runs, demo_selection=df_runs.id.isin(DEFAULT_SELECTED_RUNS))
+# n_runs = len(df_runs_subset)
 
 metric.wandb(df_runs)
 

@@ -64,26 +64,28 @@ with tab1:
     st.subheader(":violet[Run] Data")
     with st.expander(f'Show :violet[raw] wandb data'):
 
-        filter_selected_checkbox = st.checkbox('Filter to selected runs', value=True)
-        df_to_show = df_runs_subset if filter_selected_checkbox else df_runs
+        # filter_selected_checkbox = st.checkbox('Filter to selected runs', value=True)
+        # df_to_show = df_runs_subset if filter_selected_checkbox else df_runs
 
-        st.dataframe(df_to_show…
-        ).set_index('Selected').sort_index(ascending=False),#.style.highlight_max(subset=df_runs_subset.index, color='lightgreen', axis=1),
+        edited_df = st.data_editor(
+            df_runs.assign(Select=False).set_index('Select'),
+            column_config={"Select": st.column_config.CheckboxColumn(required=True)},
+            disabled=df_runs.columns,
             use_container_width=True,
         )
+        df_runs_subset = df_runs[edited_df.index==True]
+        n_runs = len(df_runs_subset)
 
     if n_runs:
         df = io.load_data(df_runs_subset, load=True, save=True)
+        df = inspect.clean_data(df)
         df_long = inspect.explode_data(df)
         df_weights = inspect.weights(df)
     else:
         st.info(f'You must select at least one run to load data')
         st.stop()
 
-    metric.runs(df_long)
+    metric.runs(df_long, n_runs)
 
     st.markdown('#')
     st.subheader(":violet[Event] Data")
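Note on the new selection pattern above: st.data_editor exposes a boolean 'Select' column as the editable index, and the ticked rows are recovered with a positional mask. A minimal runnable sketch (the toy df_runs below is illustrative, not the real wandb runs table; st.data_editor and st.column_config.CheckboxColumn need a recent Streamlit, roughly 1.23+):

    import pandas as pd
    import streamlit as st

    df_runs = pd.DataFrame({'id': ['a1', 'b2', 'c3'], 'name': ['run-1', 'run-2', 'run-3']})

    # Prepend a boolean 'Select' column and make it the index; every real
    # column is listed in `disabled`, so only the checkboxes are editable.
    edited_df = st.data_editor(
        df_runs.assign(Select=False).set_index('Select'),
        column_config={"Select": st.column_config.CheckboxColumn(required=True)},
        disabled=df_runs.columns,
        use_container_width=True,
    )

    # The edited index holds the checkbox states in row order, so comparing
    # it to True yields a boolean mask aligned with df_runs.
    df_runs_subset = df_runs[edited_df.index == True]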
@@ -93,10 +95,12 @@ with tab1:
         num_rows = raw_data_col2.slider('Number of rows:', min_value=1, max_value=100, value=10, key='num_rows')
         st.dataframe(df_long.head(num_rows) if use_long_checkbox else df.head(num_rows),
                      use_container_width=True)
 
 
 ### UID Health ###
+# TODO: Live time - time elapsed since moving_averaged_score for selected UID was 0 (lower bound so use >Time)
+# TODO: Weight - Most recent weight for selected UID (Add warning if weight is 0 or most recent timestamp is not current)
 with tab2:
 
     st.markdown('#')

@@ -106,10 +110,31 @@ with tab2:
     uid_src = st.radio('Select one:', ['followup', 'answer'], horizontal=True, key='uid_src')
 
     metric.uids(df_long, uid_src)
-
-    with st.expander(f'Show UID …
-
-        uids = st.multiselect('UID:', sorted(df_long[f'{uid_src}_uids'].unique()), key='uid')
+    uids = st.multiselect('UID:', sorted(df_long[f'{uid_src}_uids'].unique()), key='uid')
+    with st.expander(f'Show UID health data for **{n_runs} selected runs** and **{len(uids)} selected UIDs**'):
+        st.markdown('#')
+        st.subheader(f"UID {uid_src.title()} :violet[Health]")
+        agg_uid_checkbox = st.checkbox('Aggregate UIDs', value=True)
+        if agg_uid_checkbox:
+            metric.uids(df_long, uid_src, uids)
+        else:
+            for uid in uids:
+                st.caption(f'UID: {uid}')
+                metric.uids(df_long, uid_src, [uid])
+
+        st.subheader(f'Cumulative completion frequency')
+
+        freq_col1, freq_col2 = st.columns(2)
+        freq_ntop = freq_col1.slider('Number of Completions:', min_value=10, max_value=1000, value=100, key='freq_ntop')
+        freq_rm_empty = freq_col2.checkbox('Remove empty (failed)', value=True, key='freq_rm_empty')
+        freq_cumulative = freq_col2.checkbox('Cumulative', value=False, key='freq_cumulative')
+        freq_normalize = freq_col2.checkbox('Normalize', value=True, key='freq_normalize')
+
+        plot.uid_completion_counts(df_long, uids=uids, src=uid_src, ntop=freq_ntop, rm_empty=freq_rm_empty, cumulative=freq_cumulative, normalize=freq_normalize)
+
+    with st.expander(f'Show UID weights data for **{n_runs} selected runs** and **{len(uids)} selected UIDs**'):
+
         st.markdown('#')
         st.subheader(f"UID {uid_src.title()} :violet[Weights]")
 

@@ -189,6 +214,7 @@ with tab3:
             ntop=completion_ntop,
             completions=completion_select,
         )
+        # TODO: show the UIDs which have used the selected completions
 
 
     with st.expander(f'Show **{completion_src}** completion length data for **{n_runs} selected runs**'):
opendashboards/assets/inspect.py CHANGED (paired -/+ lines below differ only in whitespace)

@@ -3,6 +3,9 @@ import streamlit as st
 import pandas as pd
 import opendashboards.utils.utils as utils
 
+def clean_data(df):
+    return df.dropna(subset=df.filter(regex='completions|rewards').columns, how='all')
+
 @st.cache_data
 def explode_data(df):
     list_cols = utils.get_list_col_lengths(df)

@@ -28,10 +31,10 @@ def weights(df, index='_timestamp'):
 
     # rename columns
     scores.rename({i: f'UID-{i}' for i in range(scores.shape[1])}, axis=1, inplace=True)
-    return scores
-
+    return scores
+
 def run_event_data(df_runs, df, selected_runs):
-
+
     st.markdown('#')
 
     show_col1, show_col2 = st.columns(2)

@@ -52,6 +55,6 @@ def run_event_data(df_runs, df, selected_runs):
             "url": st.column_config.LinkColumn("URL"),
         }
     )
-
+
 def highlight_row(row, expr, color='lightgrey', bg_color='white'):
     return [f'background-color:{color}' if expr else f'background-color:{bg_color}'] * len(row)
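clean_data drops only the rows in which every completions/rewards column is null. A quick illustration on a toy frame (column names follow the f'{src}_completions' convention used elsewhere):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({
        'step': [1, 2, 3],
        'answer_completions': ['hi', np.nan, np.nan],
        'answer_rewards': [0.5, np.nan, 0.1],
    })

    # how='all' keeps any row with at least one non-null value among the
    # completions/rewards columns; only the all-NaN middle row is dropped.
    clean = df.dropna(subset=df.filter(regex='completions|rewards').columns, how='all')
    print(clean.step.tolist())  # [1, 3]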
opendashboards/assets/io.py CHANGED (deleted lines marked … were truncated in the capture)

@@ -16,15 +16,25 @@ from pandas.api.types import (
 @st.cache_data
 def load_runs(project, filters, min_steps=10):
     runs = []
+    n_events = 0
+    successful = 0
+    progress = st.progress(0, 'Fetching runs from wandb')
     msg = st.empty()
-    …
-    …
+
+    all_runs = utils.get_runs(project, filters, api_key=st.secrets['WANDB_API_KEY'])
+    for i, run in enumerate(all_runs):
+
+        summary = run.summary
+        step = summary.get('_step',0)
         if step < min_steps:
             msg.warning(f'Skipped run `{run.name}` because it contains {step} events (<{min_steps})')
             continue
-        …
-        …
-        …
+
+        prog_msg = f'Loading data {i/len(all_runs)*100:.0f}% ({successful}/{len(all_runs)} runs, {n_events} events)'
+        progress.progress(i/len(all_runs),f'{prog_msg}... **fetching** `{run.name}`')
+
+        duration = summary.get('_runtime')
+        end_time = summary.get('_timestamp')
         # extract values for selected tags
         rules = {'hotkey': re.compile('^[0-9a-z]{48}$',re.IGNORECASE), 'version': re.compile('^\\d\.\\d+\.\\d+$'), 'spec_version': re.compile('\\d{4}$')}
         tags = {k: tag for k, rule in rules.items() for tag in run.tags if rule.match(tag)}

@@ -34,6 +44,7 @@ def load_runs(project, filters, min_steps=10):
         runs.append({
             'state': run.state,
             'num_steps': step,
+            'num_completions': step*sum(len(v) for k, v in run.summary.items() if k.endswith('completions') and isinstance(v, list)),
             'entity': run.entity,
             'id': run.id,
             'name': run.name,

@@ -42,9 +53,13 @@ def load_runs(project, filters, min_steps=10):
             'path': os.path.join(run.entity, run.project, run.id),
             'start_time': pd.to_datetime(end_time-duration, unit="s"),
             'end_time': pd.to_datetime(end_time, unit="s"),
-            'duration': pd.…
+            'duration': pd.to_timedelta(duration, unit="s").round('s'),
             **tags
         })
+        n_events += step
+        successful += 1
+
+    progress.empty()
     msg.empty()
     return pd.DataFrame(runs).astype({'state': 'category', 'hotkey': 'category', 'version': 'category', 'spec_version': 'category'})
 
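The new 'duration' field stores the wandb _runtime (a float number of seconds) as a Timedelta rounded to whole seconds, for example:

    import pandas as pd

    duration = 3661.7  # seconds, as returned in run.summary['_runtime']
    print(pd.to_timedelta(duration, unit="s").round("s"))  # 0 days 01:01:02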
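num_completions multiplies the step count by the number of entries in each list-valued *completions field of the run summary. Roughly, with a hypothetical summary dict (field names and sizes are assumptions for illustration):

    step = 1200
    summary = {
        '_step': step,
        'followup_completions': [''] * 50,  # 50 completions per event
        'answer_completions': [''] * 50,
    }
    num_completions = step * sum(
        len(v) for k, v in summary.items()
        if k.endswith('completions') and isinstance(v, list)
    )
    print(num_completions)  # 120000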
opendashboards/assets/metric.py CHANGED (deleted lines marked … were truncated in the capture)

@@ -1,7 +1,20 @@
 import time
+import numerize
 import pandas as pd
 import streamlit as st
 
+def fmt(number):
+    units = ['', 'k', 'M', 'B']
+    magnitude = 0
+    while abs(number) >= 1000 and magnitude < len(units) - 1:
+        magnitude += 1
+        number /= 1000
+
+    if units[magnitude]:
+        return f'{number:.2f}{units[magnitude]}'
+    else:
+        return f'{number:.0f}{units[magnitude]}'
+
 
 @st.cache_data
 def wandb(df_runs):

@@ -9,50 +22,63 @@ def wandb(df_runs):
     # get rows where start time is older than 24h ago
     df_runs_old = df_runs.loc[df_runs.start_time < pd.to_datetime(time.time()-24*60*60, unit='s')]
 
-    col1, col2, col3 = st.columns(…
-    …
-    …
-    …
+    col1, col2, col3, col4 = st.columns(4)
+
+    # Convert to appropriate units e.g. 1.2k instead of 1200.
+    col1.metric('Runs', fmt(df_runs.shape[0]), delta=fmt(df_runs.shape[0]-df_runs_old.shape[0])+' (24h)')
+    col2.metric('Hotkeys', fmt(df_runs.hotkey.nunique()), delta=fmt(df_runs.hotkey.nunique()-df_runs_old.hotkey.nunique())+' (24h)')
+    col3.metric('Events', fmt(df_runs.num_steps.sum()), delta=fmt(df_runs.num_steps.sum()-df_runs_old.num_steps.sum())+' (24h)')
+    col4.metric('Completions', fmt(df_runs.num_completions.sum()), delta=fmt(df_runs.num_completions.sum()-df_runs_old.num_completions.sum())+' (24h)')
+
     st.markdown('----')
 
 
 @st.cache_data
-def runs(df_long):
+def runs(df_long, n_runs):
 
     col1, col2, col3 = st.columns(3)
-    col1.metric(label="Runs", value=…
-    col1.metric(label="Events", value=df_long.shape[0])
+    col1.metric(label="Runs", value=n_runs)
+    col1.metric(label="Events", value=df_long.shape[0])
     col2.metric(label="Followup UIDs", value=df_long.followup_uids.nunique())
     col2.metric(label="Answer UIDs", value=df_long.answer_uids.nunique())
-    col3.metric(label="…
-    col3.metric(label="…
+    col3.metric(label="Unique Followups", value=df_long.followup_completions.nunique())
+    col3.metric(label="Unique Answers", value=df_long.answer_completions.nunique())
     st.markdown('----')
 
 
 @st.cache_data
-def uids(df_long, src, …
+def uids(df_long, src, uids=None):
 
     uid_col = f'{src}_uids'
     completion_col = f'{src}_completions'
     nsfw_col = f'{src}_nsfw_scores'
     reward_col = f'{src}_rewards'
 
-    if …
-        df_long = df_long.loc[df_long[uid_col]…
+    if uids:
+        df_long = df_long.loc[df_long[uid_col].isin(uids)]
 
-    col1, col2, col3 = st.columns(…
+    col1, col2, col3, col4 = st.columns(4)
     col1.metric(
         label="Success %",
-        value=f'{df_long.loc[df_long[completion_col].str.len() > 0].shape[0]/df_long.shape[0] * 100:.1f}'
+        value=f'{df_long.loc[df_long[completion_col].str.len() > 0].shape[0]/df_long.shape[0] * 100:.1f}',
+        help='Number of successful completions divided by total number of events'
     )
     col2.metric(
         label="Diversity %",
-        value=f'{df_long[completion_col].nunique()/df_long.shape[0] * 100:.1f}'
+        value=f'{df_long[completion_col].nunique()/df_long.shape[0] * 100:.1f}',
+        help='Number of unique completions divided by total number of events'
     )
+    # uniqueness can be expressed as the average number of unique completions per uid divided by all unique completions
+
     col3.metric(
+        label="Uniqueness %",
+        value=f'{df_long.groupby(uid_col)[completion_col].nunique().mean()/df_long[completion_col].nunique() * 100:.1f}',
+        help='Average number of unique completions per uid divided by all unique completions'
+    )
+    col4.metric(
         label="Toxicity %",
-        value=f'{df_long[nsfw_col].mean() * 100:.1f}' if nsfw_col in df_long.columns else '…
+        value=f'{df_long[nsfw_col].mean() * 100:.1f}' if nsfw_col in df_long.columns else '--',
+        help='Average toxicity score of all events'
     )
     st.markdown('----')
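fmt abbreviates large counts for the metric cards (note the numerize import is not actually used here; fmt hand-rolls the same behaviour). Expected output:

    print(fmt(999))        # 999
    print(fmt(1234))       # 1.23k
    print(fmt(5_600_000))  # 5.60M
    print(fmt(-2500))      # -2.50k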
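The new Uniqueness % metric is the mean per-UID count of unique completions divided by the global count of unique completions. Worked on a toy df_long (illustrative data):

    import pandas as pd

    df_long = pd.DataFrame({
        'answer_uids':        [1,   1,   2,   2],
        'answer_completions': ['a', 'b', 'a', 'a'],
    })

    per_uid = df_long.groupby('answer_uids')['answer_completions'].nunique()  # UID 1 -> 2, UID 2 -> 1
    uniqueness = per_uid.mean() / df_long['answer_completions'].nunique()     # 1.5 / 2
    print(f'{uniqueness * 100:.1f}')  # 75.0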
opendashboards/assets/plot.py CHANGED

@@ -65,4 +65,18 @@ def completion_length_time(df, completion_col, uid_col, time_col, words=False):
             words=words
         ),
         use_container_width=True
     )
+
+def uid_completion_counts(df, uids, src, rm_empty, ntop=100, cumulative=False, normalize=True):
+    return st.plotly_chart(
+        plotting.plot_uid_completion_counts(
+            df,
+            uids=uids,
+            src=src,
+            rm_empty=rm_empty,
+            ntop=ntop,
+            cumulative=cumulative,
+            normalize=normalize
+        ),
+        use_container_width=True
+    )
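uid_completion_counts is a thin wrapper that renders the plotting helper inside Streamlit. A usage sketch mirroring the dashboard call (the import path assumes the dashboard's `from opendashboards.assets import plot`; the UID list is illustrative):

    from opendashboards.assets import plot

    plot.uid_completion_counts(
        df_long,              # the exploded event data built in the dashboard
        uids=[1, 2, 3],
        src='answer',
        rm_empty=True,
        ntop=100,
        cumulative=False,
        normalize=True,
    )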
opendashboards/utils/plotting.py CHANGED (paired -/+ lines below differ only in whitespace)

@@ -249,7 +249,6 @@ def plot_leaderboard(
     else:
         index = rankings.index.astype(str)
 
-    print(f"Using top {ntop} {group_on} by {agg_col}: \n{rankings}")
     return px.bar(
         x=rankings,
         y=index,

@@ -307,16 +306,16 @@ def plot_completion_length_time(
     uid_col: str = "answer_uids",
     completion_col: str = "answer_completions",
     time_col: str = "answer_times",
-    words: bool = False,
+    words: bool = False,
 ) -> go.Figure:
-
+
     df = df[[uid_col, completion_col, time_col]].explode(column=[uid_col, completion_col, time_col])
     df["time"] = df[time_col].astype(float)
     if words:
         df["completion_length"] = df[completion_col].str.split().str.len()
     else:
         df["completion_length"] = df[completion_col].str.len()
-
+
     return px.scatter(
         df,
         x='completion_length',

@@ -329,7 +328,44 @@ def plot_completion_length_time(
         opacity=0.35,
         **plotly_config,
     )
-
+
+
+def plot_uid_completion_counts(
+    df: pd.DataFrame,
+    uids: List[int],
+    src: str = 'answer',
+    rm_empty: bool = True,
+    ntop: int = 100,
+    cumulative: bool = False,
+    normalize: bool = True,
+) -> go.Figure:
+
+    completion_col = f'{src}_completions'
+    uid_col = f'{src}_uids'
+    if rm_empty:
+        df = df.loc[df[completion_col].str.len()>0]
+
+    df = df.loc[df[uid_col].isin(uids)]
+
+    g = df.groupby(uid_col)[completion_col].value_counts(normalize=normalize).reset_index(level=1)
+    y_col = g.columns[-1]
+
+    # rescale each group to have a max of 1 if normalize is True
+    if cumulative:
+        g[y_col] = g.groupby(level=0)[y_col].cumsum().transform(lambda x: x/x.max() if normalize else x)
+
+    # get top n completions
+    g = g.groupby(level=0).head(ntop)
+
+    # create a rank column which increments by one and resets when the uid changes
+    g['rank'] = g.groupby(level=0).cumcount()+1
+
+    return px.line(g.sort_index().reset_index(),
+        x='rank',y=y_col,color=uid_col,
+        labels={'rank':'Top Completions',uid_col:'UID',y_col:y_col.replace('_',' ').title()},
+        title=f'{src.title()} Completion {y_col.replace("_"," ").title()}s by Rank',
+        **plotly_config,
+    ).update_traces(opacity=0.7)
+
 
 def plot_network_embedding(
     df: pd.DataFrame,
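The groupby/value_counts chain in plot_uid_completion_counts builds one ranked frequency series per UID. A small trace with normalize=True and cumulative=False (on pandas 2.x the frequency column is named 'proportion', which is why the function reads y_col = g.columns[-1] rather than hard-coding a name; the List[int] annotation assumes typing.List is already imported in this module):

    import pandas as pd

    df = pd.DataFrame({
        'answer_uids':        [1,   1,   1,   2,   2],
        'answer_completions': ['a', 'a', 'b', 'c', 'c'],
    })

    # Per-UID relative frequencies, most frequent first; reset_index(level=1)
    # moves the completion strings into a column and keeps UIDs as the index.
    g = (df.groupby('answer_uids')['answer_completions']
           .value_counts(normalize=True)
           .reset_index(level=1))
    g['rank'] = g.groupby(level=0).cumcount() + 1
    print(g)
    #              answer_completions  proportion  rank
    # answer_uids
    # 1                             a    0.666667     1
    # 1                             b    0.333333     2
    # 2                             c    1.000000     1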