kambris committed on
Commit
67dbb5a
·
verified ·
1 Parent(s): f57ddb1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -47
app.py CHANGED
@@ -447,52 +447,52 @@ if uploaded_file is not None:
447
 
448
  # Folio Clustering Section
449
  with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
450
- st.write("""
451
- This section groups folios into clusters based on their word usage patterns.
452
- - **PCA**: Reduces the data to 2D using Principal Component Analysis.
453
- - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
454
- - **K-Means**: Groups folios into clusters based on their word frequencies.
455
- """)
456
-
457
- # Feature Extraction
458
- all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
459
- word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=list(all_words), data=0) # Convert set to list
460
-
461
- for folio, word_counter in folio_word_map.items():
462
- for word, count in word_counter.items():
463
- word_freq_matrix.loc[folio, word] = count
464
 
465
- # Dimensionality Reduction Option
466
- dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")
467
-
468
- if dim_reduction_method == "PCA":
469
- reducer = PCA(n_components=2)
470
- folio_coords = reducer.fit_transform(word_freq_matrix)
471
- else:
472
- reducer = TSNE(n_components=2, random_state=42)
473
- folio_coords = reducer.fit_transform(word_freq_matrix)
474
-
475
- # Clustering Algorithm Option
476
- clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")
477
-
478
- if clustering_method == "K-Means":
479
- # K-Means Clustering
480
- n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
481
- kmeans = KMeans(n_clusters=n_clusters, random_state=42)
482
- clusters = kmeans.fit_predict(word_freq_matrix)
483
- else:
484
- # DBSCAN Clustering
485
- dbscan = DBSCAN(eps=0.5, min_samples=5) # Adjust parameters as needed
486
- clusters = dbscan.fit_predict(word_freq_matrix)
487
-
488
- # Visualization
489
- plot_data = pd.DataFrame({
490
- 'Folio': word_freq_matrix.index,
491
- 'Dim1': folio_coords[:, 0],
492
- 'Dim2': folio_coords[:, 1],
493
- 'Cluster': clusters
494
- })
495
 
496
- fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
497
- hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method}, {clustering_method})")
498
- st.plotly_chart(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
447
 
448
  # Folio Clustering Section
449
  with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
450
+ st.write("""
451
+ This section groups folios into clusters based on their word usage patterns.
452
+ - **PCA**: Reduces the data to 2D using Principal Component Analysis.
453
+ - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
454
+ - **K-Means**: Groups folios into clusters based on their word frequencies.
455
+ """)
 
 
 
 
 
 
 
 
456
 
457
+ # Feature Extraction
458
+ all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
459
+ word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=list(all_words), data=0) # Convert set to list
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
 
461
+ for folio, word_counter in folio_word_map.items():
462
+ for word, count in word_counter.items():
463
+ word_freq_matrix.loc[folio, word] = count
464
+
465
+ # Dimensionality Reduction Option
466
+ dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")
467
+
468
+ if dim_reduction_method == "PCA":
469
+ reducer = PCA(n_components=2)
470
+ folio_coords = reducer.fit_transform(word_freq_matrix)
471
+ else:
472
+ reducer = TSNE(n_components=2, random_state=42)
473
+ folio_coords = reducer.fit_transform(word_freq_matrix)
474
+
475
+ # Clustering Algorithm Option
476
+ clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")
477
+
478
+ if clustering_method == "K-Means":
479
+ # K-Means Clustering
480
+ n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
481
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42)
482
+ clusters = kmeans.fit_predict(word_freq_matrix)
483
+ else:
484
+ # DBSCAN Clustering
485
+ dbscan = DBSCAN(eps=0.5, min_samples=5) # Adjust parameters as needed
486
+ clusters = dbscan.fit_predict(word_freq_matrix)
487
+
488
+ # Visualization
489
+ plot_data = pd.DataFrame({
490
+ 'Folio': word_freq_matrix.index,
491
+ 'Dim1': folio_coords[:, 0],
492
+ 'Dim2': folio_coords[:, 1],
493
+ 'Cluster': clusters
494
+ })
495
+
496
+ fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
497
+ hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method}, {clustering_method})")
498
+ st.plotly_chart(fig)