Update app.py
Browse files
app.py
CHANGED
@@ -447,52 +447,52 @@ if uploaded_file is not None:
|
|
447 |
|
448 |
# Folio Clustering Section
|
449 |
with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
# Feature Extraction
|
458 |
-
all_words = set(word for folio in folio_word_map for word in folio_word_map[folio])
|
459 |
-
word_freq_matrix = pd.DataFrame(index=folio_word_map.keys(), columns=list(all_words), data=0) # Convert set to list
|
460 |
-
|
461 |
-
for folio, word_counter in folio_word_map.items():
|
462 |
-
for word, count in word_counter.items():
|
463 |
-
word_freq_matrix.loc[folio, word] = count
|
464 |
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
if dim_reduction_method == "PCA":
|
469 |
-
reducer = PCA(n_components=2)
|
470 |
-
folio_coords = reducer.fit_transform(word_freq_matrix)
|
471 |
-
else:
|
472 |
-
reducer = TSNE(n_components=2, random_state=42)
|
473 |
-
folio_coords = reducer.fit_transform(word_freq_matrix)
|
474 |
-
|
475 |
-
# Clustering Algorithm Option
|
476 |
-
clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")
|
477 |
-
|
478 |
-
if clustering_method == "K-Means":
|
479 |
-
# K-Means Clustering
|
480 |
-
n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
|
481 |
-
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
|
482 |
-
clusters = kmeans.fit_predict(word_freq_matrix)
|
483 |
-
else:
|
484 |
-
# DBSCAN Clustering
|
485 |
-
dbscan = DBSCAN(eps=0.5, min_samples=5) # Adjust parameters as needed
|
486 |
-
clusters = dbscan.fit_predict(word_freq_matrix)
|
487 |
-
|
488 |
-
# Visualization
|
489 |
-
plot_data = pd.DataFrame({
|
490 |
-
'Folio': word_freq_matrix.index,
|
491 |
-
'Dim1': folio_coords[:, 0],
|
492 |
-
'Dim2': folio_coords[:, 1],
|
493 |
-
'Cluster': clusters
|
494 |
-
})
|
495 |
|
496 |
-
|
497 |
-
|
498 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
447 |
|
448 |
# Folio Clustering Section
|
449 |
with st.expander("Folio Clustering Based on Word Usage Patterns", expanded=st.session_state.expander_states['folio_clustering']):
    # Explain every option offered by the two selectboxes below.
    st.write("""
    This section groups folios into clusters based on their word usage patterns.
    - **PCA**: Reduces the data to 2D using Principal Component Analysis.
    - **t-SNE**: Reduces the data to 2D using t-Distributed Stochastic Neighbor Embedding.
    - **K-Means**: Groups folios into clusters based on their word frequencies.
    - **DBSCAN**: Groups folios by density; folios that fit no cluster are labelled -1 (noise).
    """)

    # Feature Extraction
    # Build the folio x word count matrix in a single constructor call instead
    # of assigning one cell at a time through .loc. The vocabulary keeps
    # first-seen order so the column layout is the same on every run.
    vocabulary = list(dict.fromkeys(word for folio in folio_word_map for word in folio_word_map[folio]))
    all_words = set(vocabulary)
    word_freq_matrix = (
        pd.DataFrame(
            [dict(word_counter.items()) for word_counter in folio_word_map.values()],
            index=list(folio_word_map.keys()),
        )
        .reindex(columns=vocabulary)
        .fillna(0)  # a word absent from a folio counts as 0 occurrences
    )
    n_folios = len(word_freq_matrix)

    # Dimensionality Reduction Option
    dim_reduction_method = st.selectbox("Select Dimensionality Reduction Method", ["PCA", "t-SNE"], key="dim_reduction_method")

    if dim_reduction_method == "PCA":
        reducer = PCA(n_components=2)
    else:
        # scikit-learn requires perplexity < n_samples, so the default of 30
        # fails for 30 or fewer folios. Keep 30 whenever it is valid.
        perplexity = max(1.0, min(30.0, n_folios - 1))
        reducer = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    folio_coords = reducer.fit_transform(word_freq_matrix)

    # Clustering Algorithm Option
    clustering_method = st.selectbox("Select Clustering Algorithm", ["K-Means", "DBSCAN"], key="clustering_method")

    if clustering_method == "K-Means":
        # K-Means Clustering
        n_clusters = st.slider("Select Number of Clusters", 2, 10, 3, key="n_clusters")
        if n_clusters > n_folios:
            # KMeans raises when asked for more clusters than samples.
            st.warning(f"Only {n_folios} folios are available; using {n_folios} clusters.")
            n_clusters = n_folios
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        clusters = kmeans.fit_predict(word_freq_matrix)
    else:
        # DBSCAN Clustering
        # NOTE(review): assuming the counts are integers, two different folios
        # are at least distance 1 apart, so eps=0.5 would presumably mark
        # every folio as noise (-1) - confirm and tune eps/min_samples.
        dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust parameters as needed
        clusters = dbscan.fit_predict(word_freq_matrix)

    # Visualization
    plot_data = pd.DataFrame({
        'Folio': word_freq_matrix.index,
        'Dim1': folio_coords[:, 0],
        'Dim2': folio_coords[:, 1],
        # String labels make Plotly Express use one discrete colour per
        # cluster instead of a continuous colour bar.
        'Cluster': clusters.astype(str),
    })

    fig = px.scatter(plot_data, x='Dim1', y='Dim2', color='Cluster',
                     hover_name='Folio', title=f"Folio Clustering ({dim_reduction_method}, {clustering_method})")
    st.plotly_chart(fig)
|