James McCool committed on
Commit 62a6685 · 1 Parent(s): 2ceda65

Add unique and under-5/under-10 duplicate counts to duplication frame in app.py

- Introduced calculations for 'uniques', 'under_5', and 'under_10' metrics in the working DataFrame, enhancing the analysis of duplicate lineups.
- Updated the duplication frame to include these metrics alongside the raw 'dupes' count in the displayed table.

Files changed (1)
  1. app.py +15 -2
app.py CHANGED
@@ -187,6 +187,19 @@ with tab2:
         axis=1
     )
     working_df['dupes'] = working_df.groupby('sorted').transform('size')
+
+    working_df['uniques'] = working_df.groupby('BaseName').apply(
+        lambda x: (x['dupes'] == 1).sum()
+    ).reindex(working_df['BaseName']).values
+
+    working_df['under_5'] = working_df.groupby('BaseName').apply(
+        lambda x: (x['dupes'] <= 5).sum()
+    ).reindex(working_df['BaseName']).values
+
+    working_df['under_10'] = working_df.groupby('BaseName').apply(
+        lambda x: (x['dupes'] <= 10).sum()
+    ).reindex(working_df['BaseName']).values
+
     working_df = working_df.reset_index()
     working_df['percentile_finish'] = working_df['index'].rank(pct=True)
     working_df['finish'] = working_df['index']
@@ -433,8 +446,8 @@ with tab2:
     dupe_frame['uniques%'] = dupe_frame['uniques'] / dupe_frame['EntryCount']
     dupe_frame['under_5%'] = dupe_frame['under_5'] / dupe_frame['EntryCount']
     dupe_frame['under_10%'] = dupe_frame['under_10'] / dupe_frame['EntryCount']
-    st.session_state['duplication_frame'] = dupe_frame[['BaseName', 'EntryCount', 'average_dupes', 'uniques', 'uniques%', 'under_5', 'under_5%', 'under_10', 'under_10%']].drop_duplicates(subset='BaseName', keep='first')
+    st.session_state['duplication_frame'] = dupe_frame[['BaseName', 'EntryCount', 'average_dupes', 'dupes', 'uniques', 'uniques%', 'under_5', 'under_5%', 'under_10', 'under_10%']].drop_duplicates(subset='BaseName', keep='first')
     st.dataframe(st.session_state['duplication_frame'].style.
                  background_gradient(cmap='RdYlGn', subset=['uniques%', 'under_5%', 'under_10%'], axis=0).
-                 background_gradient(cmap='RdYlGn_r', subset=['average_dupes', 'uniques', 'under_5', 'under_10'], axis=0).
+                 background_gradient(cmap='RdYlGn_r', subset=['uniques', 'under_5', 'under_10'], axis=0).
                  format(precision=2), hide_index=True)
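
The three new columns in the first hunk all use the same groupby → apply → reindex pattern to broadcast a per-BaseName count back onto every lineup row. Below is a minimal, self-contained sketch of that pattern; the sample frame and its values are made up for illustration and are not taken from app.py:

import pandas as pd

# Toy stand-in for working_df: one row per submitted lineup.
# 'sorted' is the canonical lineup key, 'BaseName' the entrant name (illustrative data).
working_df = pd.DataFrame({
    'BaseName': ['alice', 'alice', 'alice', 'bob', 'bob'],
    'sorted':   ['L1',    'L4',    'L2',    'L2',  'L3'],
})

# How many entries share each exact lineup (the 'dupes' column from the diff).
working_df['dupes'] = working_df.groupby('sorted')['sorted'].transform('size')

# Per-BaseName count of unduplicated lineups, broadcast back to each row:
# the apply yields one value per BaseName, and reindexing on the BaseName
# column repeats that value for every row belonging to that name.
working_df['uniques'] = working_df.groupby('BaseName').apply(
    lambda x: (x['dupes'] == 1).sum()
).reindex(working_df['BaseName']).values

print(working_df)
#   BaseName sorted  dupes  uniques
# 0    alice     L1      1        2
# 1    alice     L4      1        2
# 2    alice     L2      2        2
# 3      bob     L2      2        1
# 4      bob     L3      1        1

An equivalent, slightly more compact form would be working_df.groupby('BaseName')['dupes'].transform(lambda s: (s == 1).sum()), which broadcasts within the groupby itself and skips the reindex/.values step; the committed code applies the reindex form uniformly to all three metrics.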