Ashoka74 committed on
Commit
ea97c58
·
verified ·
1 Parent(s): 3f33ebc

Update analyzing.py

Browse files
Files changed (1) hide show
  1. analyzing.py +82 -80
analyzing.py CHANGED
@@ -594,88 +594,90 @@ st.session_state['stage'] = 1
594
 
595
 
596
  if st.session_state['stage'] > 0 :
597
- columns_to_analyze = st.multiselect(
598
- label='Select columns to analyze',
599
- options=st.session_state['parsed_responses'].columns
600
- )
601
- if columns_to_analyze:
602
- analyzers = []
603
- col_names = []
604
- clusters = {}
605
- for column in columns_to_analyze:
606
- with torch.no_grad():
607
- with st.status(f"Processing {column}", expanded=True) as status:
608
- analyzer = UAPAnalyzer(st.session_state['parsed_responses'], column)
609
- st.write(f"Processing {column}...")
610
- analyzer.preprocess_data(top_n=32)
611
- st.write("Reducing dimensionality...")
612
- analyzer.reduce_dimensionality(method='UMAP', n_components=2, n_neighbors=15, min_dist=0.1)
613
- st.write("Clustering data...")
614
- analyzer.cluster_data(method='HDBSCAN', min_cluster_size=15)
615
- analyzer.get_tf_idf_clusters(top_n=3)
616
- st.write("Naming clusters...")
617
- analyzers.append(analyzer)
618
- col_names.append(column)
619
- clusters[column] = analyzer.merge_similar_clusters(cluster_terms=analyzer.__dict__['cluster_terms'], cluster_labels=analyzer.__dict__['cluster_labels'])
620
-
621
- # Run the visualization
622
- # fig = datamapplot.create_plot(
623
- # analyzer.__dict__['reduced_embeddings'],
624
- # analyzer.__dict__['cluster_labels'].astype(str),
625
- # #label_font_size=11,
626
- # label_wrap_width=20,
627
- # use_medoids=True,
628
- # )#.to_html(full_html=False, include_plotlyjs='cdn')
629
- # st.pyplot(fig.savefig())
630
- status.update(label=f"Processing {column} complete", expanded=False)
631
- st.session_state['analyzers'] = analyzers
632
- st.session_state['col_names'] = col_names
633
- st.session_state['clusters'] = clusters
 
 
 
 
 
 
 
 
634
 
635
- # save space
636
- parsed = None
637
- analyzers = None
638
- col_names = None
639
- clusters = None
640
-
641
- if st.session_state['clusters'] is not None:
642
- try:
643
- new_data, parsed_responses = analyze_and_predict(st.session_state['parsed_responses'], st.session_state['analyzers'], st.session_state['col_names'], st.session_state['clusters'])
644
- st.session_state['dataset'] = parsed_responses
645
- st.session_state['new_data'] = new_data
646
- st.session_state['data_processed'] = True
647
- except Exception as e:
648
- st.write(f"Error processing data: {e}")
649
-
650
- if st.session_state['data_processed']:
651
- try:
652
- visualizer = UAPVisualizer(data=st.session_state['new_data'])
653
- #new_data = pd.DataFrame() # Assuming new_data is prepared earlier in the code
654
- fig2 = visualizer.plot_cramers_v_heatmap(data=st.session_state['new_data'], significance_level=0.05)
655
- with st.status(f"Cramer's V Chart", expanded=True) as statuss:
656
- st.pyplot(fig2)
657
- statuss.update(label="Cramer's V chart plotted", expanded=False)
658
- except Exception as e:
659
- st.write(f"Error plotting Cramers V: {e}")
660
-
661
- for i, column in enumerate(st.session_state['col_names']):
662
- #if stateful_button(f"Show {column} clusters {i}", key=f"show_{column}_clusters"):
663
- # if st.session_state['data_processed']:
664
- # with st.status(f"Show clusters {column}", expanded=True) as stats:
665
- # fig3 = st.session_state['analyzers'][i].plot_embeddings4(title=f"{column} clusters", cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'], cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'], reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'], column=f'Analyzer_{column}', data=st.session_state['new_data'])
666
- # stats.update(label=f"Show clusters {column} complete", expanded=False)
667
  if st.session_state['data_processed']:
668
- with st.status(f"Show clusters {column}", expanded=True) as stats:
669
- fig3 = st.session_state['analyzers'][i].plot_embeddings4(
670
- title=f"{column} clusters",
671
- cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'],
672
- cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'],
673
- reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'],
674
- column=column, # Use the original column name here
675
- data=st.session_state['parsed_responses'] # Use the original dataset here
676
- )
677
- stats.update(label=f"Show clusters {column} complete", expanded=False)
678
- st.session_state['analysis_complete'] = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
679
 
680
 
681
  # this will check if the dataframe is not empty
 
594
 
595
 
596
  if st.session_state['stage'] > 0 :
597
+ with st.form(border=True, key='Select Columns for Analysis'):
598
+ columns_to_analyze = st.multiselect(
599
+ label='Select columns to analyze',
600
+ options=st.session_state['parsed_responses'].columns
601
+ )
602
+ if st.form_submit_button("Process Data"):
603
+ if columns_to_analyze:
604
+ analyzers = []
605
+ col_names = []
606
+ clusters = {}
607
+ for column in columns_to_analyze:
608
+ with torch.no_grad():
609
+ with st.status(f"Processing {column}", expanded=True) as status:
610
+ analyzer = UAPAnalyzer(st.session_state['parsed_responses'], column)
611
+ st.write(f"Processing {column}...")
612
+ analyzer.preprocess_data(top_n=32)
613
+ st.write("Reducing dimensionality...")
614
+ analyzer.reduce_dimensionality(method='UMAP', n_components=2, n_neighbors=15, min_dist=0.1)
615
+ st.write("Clustering data...")
616
+ analyzer.cluster_data(method='HDBSCAN', min_cluster_size=15)
617
+ analyzer.get_tf_idf_clusters(top_n=3)
618
+ st.write("Naming clusters...")
619
+ analyzers.append(analyzer)
620
+ col_names.append(column)
621
+ clusters[column] = analyzer.merge_similar_clusters(cluster_terms=analyzer.__dict__['cluster_terms'], cluster_labels=analyzer.__dict__['cluster_labels'])
622
+
623
+ # Run the visualization
624
+ # fig = datamapplot.create_plot(
625
+ # analyzer.__dict__['reduced_embeddings'],
626
+ # analyzer.__dict__['cluster_labels'].astype(str),
627
+ # #label_font_size=11,
628
+ # label_wrap_width=20,
629
+ # use_medoids=True,
630
+ # )#.to_html(full_html=False, include_plotlyjs='cdn')
631
+ # st.pyplot(fig.savefig())
632
+ status.update(label=f"Processing {column} complete", expanded=False)
633
+ st.session_state['analyzers'] = analyzers
634
+ st.session_state['col_names'] = col_names
635
+ st.session_state['clusters'] = clusters
636
+
637
+ # save space
638
+ parsed = None
639
+ analyzers = None
640
+ col_names = None
641
+ clusters = None
642
 
643
+ if st.session_state['clusters'] is not None:
644
+ try:
645
+ new_data, parsed_responses = analyze_and_predict(st.session_state['parsed_responses'], st.session_state['analyzers'], st.session_state['col_names'], st.session_state['clusters'])
646
+ st.session_state['dataset'] = parsed_responses
647
+ st.session_state['new_data'] = new_data
648
+ st.session_state['data_processed'] = True
649
+ except Exception as e:
650
+ st.write(f"Error processing data: {e}")
651
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
652
  if st.session_state['data_processed']:
653
+ try:
654
+ visualizer = UAPVisualizer(data=st.session_state['new_data'])
655
+ #new_data = pd.DataFrame() # Assuming new_data is prepared earlier in the code
656
+ fig2 = visualizer.plot_cramers_v_heatmap(data=st.session_state['new_data'], significance_level=0.05)
657
+ with st.status(f"Cramer's V Chart", expanded=True) as statuss:
658
+ st.pyplot(fig2)
659
+ statuss.update(label="Cramer's V chart plotted", expanded=False)
660
+ except Exception as e:
661
+ st.write(f"Error plotting Cramers V: {e}")
662
+
663
+ for i, column in enumerate(st.session_state['col_names']):
664
+ #if stateful_button(f"Show {column} clusters {i}", key=f"show_{column}_clusters"):
665
+ # if st.session_state['data_processed']:
666
+ # with st.status(f"Show clusters {column}", expanded=True) as stats:
667
+ # fig3 = st.session_state['analyzers'][i].plot_embeddings4(title=f"{column} clusters", cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'], cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'], reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'], column=f'Analyzer_{column}', data=st.session_state['new_data'])
668
+ # stats.update(label=f"Show clusters {column} complete", expanded=False)
669
+ if st.session_state['data_processed']:
670
+ with st.status(f"Show clusters {column}", expanded=True) as stats:
671
+ fig3 = st.session_state['analyzers'][i].plot_embeddings4(
672
+ title=f"{column} clusters",
673
+ cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'],
674
+ cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'],
675
+ reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'],
676
+ column=column, # Use the original column name here
677
+ data=st.session_state['parsed_responses'] # Use the original dataset here
678
+ )
679
+ stats.update(label=f"Show clusters {column} complete", expanded=False)
680
+ st.session_state['analysis_complete'] = True
681
 
682
 
683
  # this will check if the dataframe is not empty