Update analyzing.py
analyzing.py  CHANGED  (+82 -80)
@@ -594,88 +594,90 @@ st.session_state['stage'] = 1
 
 
 if st.session_state['stage'] > 0 :
-    … (old lines 597-643 were removed; their content is not rendered in this view)
-            st.session_state['dataset'] = parsed_responses
-            st.session_state['new_data'] = new_data
-            st.session_state['data_processed'] = True
-        except Exception as e:
-            st.write(f"Error processing data: {e}")
-
-    if st.session_state['data_processed']:
-        try:
-            visualizer = UAPVisualizer(data=st.session_state['new_data'])
-            #new_data = pd.DataFrame() # Assuming new_data is prepared earlier in the code
-            fig2 = visualizer.plot_cramers_v_heatmap(data=st.session_state['new_data'], significance_level=0.05)
-            with st.status(f"Cramer's V Chart", expanded=True) as statuss:
-                st.pyplot(fig2)
-                statuss.update(label="Cramer's V chart plotted", expanded=False)
-        except Exception as e:
-            st.write(f"Error plotting Cramers V: {e}")
-
-    for i, column in enumerate(st.session_state['col_names']):
-        #if stateful_button(f"Show {column} clusters {i}", key=f"show_{column}_clusters"):
-        #    if st.session_state['data_processed']:
-        #        with st.status(f"Show clusters {column}", expanded=True) as stats:
-        #            fig3 = st.session_state['analyzers'][i].plot_embeddings4(title=f"{column} clusters", cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'], cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'], reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'], column=f'Analyzer_{column}', data=st.session_state['new_data'])
-        #            stats.update(label=f"Show clusters {column} complete", expanded=False)
-        if st.session_state['data_processed']:
-            … (old lines 668-678, the previous plot_embeddings4 call, are not rendered in this view; only its closing parenthesis survives)
-            )
+    with st.form(border=True, key='Select Columns for Analysis'):
+        columns_to_analyze = st.multiselect(
+            label='Select columns to analyze',
+            options=st.session_state['parsed_responses'].columns
+        )
+        if st.form_submit_button("Process Data"):
+            if columns_to_analyze:
+                analyzers = []
+                col_names = []
+                clusters = {}
+                for column in columns_to_analyze:
+                    with torch.no_grad():
+                        with st.status(f"Processing {column}", expanded=True) as status:
+                            analyzer = UAPAnalyzer(st.session_state['parsed_responses'], column)
+                            st.write(f"Processing {column}...")
+                            analyzer.preprocess_data(top_n=32)
+                            st.write("Reducing dimensionality...")
+                            analyzer.reduce_dimensionality(method='UMAP', n_components=2, n_neighbors=15, min_dist=0.1)
+                            st.write("Clustering data...")
+                            analyzer.cluster_data(method='HDBSCAN', min_cluster_size=15)
+                            analyzer.get_tf_idf_clusters(top_n=3)
+                            st.write("Naming clusters...")
+                            analyzers.append(analyzer)
+                            col_names.append(column)
+                            clusters[column] = analyzer.merge_similar_clusters(cluster_terms=analyzer.__dict__['cluster_terms'], cluster_labels=analyzer.__dict__['cluster_labels'])
+
+                            # Run the visualization
+                            # fig = datamapplot.create_plot(
+                            #     analyzer.__dict__['reduced_embeddings'],
+                            #     analyzer.__dict__['cluster_labels'].astype(str),
+                            #     #label_font_size=11,
+                            #     label_wrap_width=20,
+                            #     use_medoids=True,
+                            # )#.to_html(full_html=False, include_plotlyjs='cdn')
+                            # st.pyplot(fig.savefig())
+                            status.update(label=f"Processing {column} complete", expanded=False)
+                st.session_state['analyzers'] = analyzers
+                st.session_state['col_names'] = col_names
+                st.session_state['clusters'] = clusters
+
+                # save space
+                parsed = None
+                analyzers = None
+                col_names = None
+                clusters = None
+
+    if st.session_state['clusters'] is not None:
+        try:
+            new_data, parsed_responses = analyze_and_predict(st.session_state['parsed_responses'], st.session_state['analyzers'], st.session_state['col_names'], st.session_state['clusters'])
+            st.session_state['dataset'] = parsed_responses
+            st.session_state['new_data'] = new_data
+            st.session_state['data_processed'] = True
+        except Exception as e:
+            st.write(f"Error processing data: {e}")
+
+    if st.session_state['data_processed']:
+        try:
+            visualizer = UAPVisualizer(data=st.session_state['new_data'])
+            #new_data = pd.DataFrame() # Assuming new_data is prepared earlier in the code
+            fig2 = visualizer.plot_cramers_v_heatmap(data=st.session_state['new_data'], significance_level=0.05)
+            with st.status(f"Cramer's V Chart", expanded=True) as statuss:
+                st.pyplot(fig2)
+                statuss.update(label="Cramer's V chart plotted", expanded=False)
+        except Exception as e:
+            st.write(f"Error plotting Cramers V: {e}")
+
+        for i, column in enumerate(st.session_state['col_names']):
+            #if stateful_button(f"Show {column} clusters {i}", key=f"show_{column}_clusters"):
+            #    if st.session_state['data_processed']:
+            #        with st.status(f"Show clusters {column}", expanded=True) as stats:
+            #            fig3 = st.session_state['analyzers'][i].plot_embeddings4(title=f"{column} clusters", cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'], cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'], reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'], column=f'Analyzer_{column}', data=st.session_state['new_data'])
+            #            stats.update(label=f"Show clusters {column} complete", expanded=False)
+            if st.session_state['data_processed']:
+                with st.status(f"Show clusters {column}", expanded=True) as stats:
+                    fig3 = st.session_state['analyzers'][i].plot_embeddings4(
+                        title=f"{column} clusters",
+                        cluster_terms=st.session_state['analyzers'][i].__dict__['cluster_terms'],
+                        cluster_labels=st.session_state['analyzers'][i].__dict__['cluster_labels'],
+                        reduced_embeddings=st.session_state['analyzers'][i].__dict__['reduced_embeddings'],
+                        column=column,  # Use the original column name here
+                        data=st.session_state['parsed_responses']  # Use the original dataset here
+                    )
+                    stats.update(label=f"Show clusters {column} complete", expanded=False)
+                    st.session_state['analysis_complete'] = True
 
 
# this will check if the dataframe is not empty
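The UAPAnalyzer methods chained in the added block (preprocess_data, reduce_dimensionality, cluster_data, get_tf_idf_clusters, merge_similar_clusters) are defined elsewhere in this Space and are not part of the diff. As a rough, self-contained sketch of the pipeline those calls imply (vectorize a text column, project it with UMAP, cluster with HDBSCAN, name each cluster by its top TF-IDF terms), assuming the umap-learn, hdbscan, and scikit-learn packages; the function and parameter choices below are illustrative, not the repo's actual API:

# Minimal sketch, not the code in this repo.
import numpy as np
import hdbscan
import umap
from sklearn.feature_extraction.text import TfidfVectorizer

def cluster_and_name(texts, top_n=3):
    # TF-IDF vectors stand in for whatever embeddings preprocess_data() produces.
    vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
    X = vectorizer.fit_transform(texts)

    # 2-D projection, mirroring reduce_dimensionality(method='UMAP', n_components=2, ...).
    reduced = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1).fit_transform(X)

    # Density-based clustering, mirroring cluster_data(method='HDBSCAN', min_cluster_size=15).
    labels = hdbscan.HDBSCAN(min_cluster_size=15).fit_predict(reduced)

    # Name each cluster by its top TF-IDF terms, mirroring get_tf_idf_clusters(top_n=3).
    terms = np.array(vectorizer.get_feature_names_out())
    cluster_names = {}
    for label in set(labels):
        if label == -1:  # HDBSCAN marks noise points as -1
            continue
        mean_tfidf = X[labels == label].mean(axis=0).A1
        cluster_names[label] = ", ".join(terms[mean_tfidf.argsort()[::-1][:top_n]])
    return reduced, labels, cluster_names

HDBSCAN labels low-density points as -1, which is why they are skipped when naming clusters; the real get_tf_idf_clusters may handle noise differently.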
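The call plot_cramers_v_heatmap(data=..., significance_level=0.05) suggests a pairwise Cramér's V matrix over categorical columns, with the significance level presumably used to mask associations whose chi-squared test is not significant. UAPVisualizer's implementation is not in this diff; a minimal sketch of the standard computation, using pandas and scipy (names are illustrative), looks like this:

# Minimal sketch, not the code in this repo.
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramers_v(x: pd.Series, y: pd.Series) -> float:
    # Chi-squared test on the contingency table of two categorical columns.
    table = pd.crosstab(x, y)
    chi2, p, dof, expected = chi2_contingency(table)
    n = table.to_numpy().sum()
    r, k = table.shape
    # Cramer's V = sqrt(chi2 / (n * (min(r, k) - 1))), ranging from 0 to 1.
    return float(np.sqrt(chi2 / (n * (min(r, k) - 1))))

def cramers_v_matrix(df: pd.DataFrame) -> pd.DataFrame:
    # Symmetric matrix of pairwise associations, the kind of table the heatmap would plot.
    cols = list(df.columns)
    out = pd.DataFrame(np.eye(len(cols)), index=cols, columns=cols)
    for i, a in enumerate(cols):
        for b in cols[i + 1:]:
            v = cramers_v(df[a], df[b])
            out.loc[a, b] = out.loc[b, a] = v
    return out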
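The whole block is gated on Streamlit session-state flags ('stage', 'clusters', 'data_processed', 'analysis_complete'). analyzing.py presumably initializes them earlier (the hunk context shows st.session_state['stage'] = 1); the usual pattern for making such flags safe to read on the first script run is a guarded default, sketched here with assumed default values:

# Minimal sketch, not the code in this repo: defaults so the gating flags exist before first use.
import streamlit as st

for key, default in {
    "stage": 0,
    "data_processed": False,
    "clusters": None,
    "analysis_complete": False,
}.items():
    if key not in st.session_state:
        st.session_state[key] = default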