Spaces:

mgbam
/

CognitiveEDA

Sleeping

App Files Files Community

CognitiveEDA / analysis_modules.py

mgbam

Create analysis_modules.py

640d10c verified 30 days ago

raw

history blame

2.47 kB

	# analysis_modules.py

	import pandas as pd
	import plotly.express as px
	from statsmodels.tsa.seasonal import seasonal_decompose
	from statsmodels.tsa.stattools import adfuller
	from sklearn.cluster import KMeans
	from wordcloud import WordCloud
	import matplotlib.pyplot as plt
	import io
	import base64

	# --- Time-Series Module ---
	def analyze_time_series(df: pd.DataFrame, date_col: str, value_col: str, period: int = 12):
	    """Decompose a time series and test it for stationarity.

	    The series is built from ``df[value_col]`` indexed by the dates in
	    ``df[date_col]`` (parsed with ``pd.to_datetime``). Missing values are
	    dropped and the observations are put in chronological order before
	    any analysis. The caller's ``df`` is left untouched.

	    Args:
	        df: Table containing the date and value columns.
	        date_col: Name of the column holding the timestamps.
	        value_col: Name of the column holding the observations.
	        period: Number of observations per seasonal cycle, forwarded to
	            ``seasonal_decompose``. Defaults to 12, i.e. monthly data with
	            a yearly cycle.

	    Returns:
	        A ``(fig_decomp, adf_md)`` tuple: a Plotly line figure with the
	        trend, seasonal and residual components, and a Markdown string
	        summarising the Augmented Dickey-Fuller test.

	    Raises:
	        ValueError: If fewer than ``2 * period`` non-missing observations
	            are available (an additive decomposition needs two complete
	            cycles).
	    """
	    # Build the series from fresh objects so the parsed dates are never
	    # written back into the caller's DataFrame.
	    date_index = pd.Index(pd.to_datetime(df[date_col]), name=date_col)
	    ts = pd.Series(df[value_col].to_numpy(), index=date_index, name=value_col)

	    # Both the decomposition and the ADF test depend on observation order,
	    # so sort by date; a stable sort keeps rows with equal dates in input order.
	    ts = ts.dropna().sort_index(kind="mergesort")

	    # Fail fast with an actionable message instead of an error from deep
	    # inside statsmodels.
	    required = 2 * period
	    if len(ts) < required:
	        raise ValueError(
	            f"Need at least {required} non-missing observations of "
	            f"'{value_col}' to decompose with period={period}; got {len(ts)}."
	        )

	    # Decomposition
	    decomposition = seasonal_decompose(ts, model='additive', period=period)
	    components = pd.DataFrame({
	        'trend': decomposition.trend,
	        'seasonal': decomposition.seasonal,
	        'residual': decomposition.resid,
	    })
	    fig_decomp = px.line(components, title=f"Time-Series Decomposition of {value_col}")

	    # Stationarity Test (ADF): the first two entries of the result are the
	    # test statistic and the p-value.
	    adf_stat, p_value = adfuller(ts)[:2]
	    verdict = 'stationary' if p_value < 0.05 else 'non-stationary'

	    # Assemble the report line by line so no source indentation leaks into
	    # the text; indented lines would be rendered as a Markdown code block.
	    adf_md = "\n".join([
	        "",
	        "### Stationarity Analysis (ADF Test)",
	        f"- Test Statistic: `{adf_stat:.4f}`",
	        f"- p-value: `{p_value:.4f}`",
	        f"- Conclusion: The series is likely {verdict}.",
	        "",
	    ])
	    return fig_decomp, adf_md

	# --- Text Analysis Module ---
	def generate_word_cloud(df: pd.DataFrame, text_col: str) -> str:
	    """Render a word cloud from a text column as a PNG data URI.

	    All non-missing values of ``df[text_col]`` are converted to strings
	    and joined with spaces to form the corpus for the word cloud.

	    Args:
	        df: Table containing the text column.
	        text_col: Name of the column whose values make up the corpus.

	    Returns:
	        A ``data:image/png;base64,...`` string holding the rendered
	        800x400 word cloud on a white background.

	    Raises:
	        ValueError: If the column holds no text at all (every value is
	            missing, empty or whitespace).
	    """
	    text = ' '.join(df[text_col].dropna().astype(str))

	    # An empty corpus cannot be rendered; say so explicitly rather than
	    # surfacing an error from inside the word-cloud library.
	    if not text.strip():
	        raise ValueError(
	            f"Column '{text_col}' contains no text to build a word cloud from."
	        )

	    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

	    # Serialise the rendered image to PNG in memory and embed it as a
	    # base64 data URI.
	    with io.BytesIO() as buf:
	        wordcloud.to_image().save(buf, format='png')
	        encoded = base64.b64encode(buf.getvalue()).decode('utf-8')
	    return "data:image/png;base64," + encoded

	# --- Clustering Module ---
	def perform_clustering(df: pd.DataFrame, numeric_cols: list, n_clusters: int = 4):
	    """Run K-Means on the selected columns and return a scatter plot.

	    Rows with a missing value in any selected column are dropped before
	    fitting. The input ``df`` is not modified.

	    Args:
	        df: Table containing the feature columns.
	        numeric_cols: Names of the columns to cluster on. The first two
	            are used as the plot axes; when only one column is given it
	            is used for both axes.
	        n_clusters: Number of clusters to fit. Defaults to 4.

	    Returns:
	        A Plotly scatter figure with points coloured by cluster label.

	    Raises:
	        ValueError: If no columns are given, or if fewer complete rows
	            remain than ``n_clusters``.
	    """
	    # Normalise to a list so a tuple of names is not taken as a single key.
	    cols = list(numeric_cols)
	    if not cols:
	        raise ValueError("numeric_cols must name at least one column to cluster on.")

	    cluster_data = df[cols].dropna()

	    # K-Means needs at least one sample per cluster; check up front so the
	    # message refers to the caller's data.
	    if len(cluster_data) < n_clusters:
	        raise ValueError(
	            f"Need at least {n_clusters} complete rows to fit {n_clusters} "
	            f"clusters; got {len(cluster_data)} after dropping missing values."
	        )

	    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(cluster_data)

	    # String labels make Plotly colour the clusters as discrete categories.
	    plot_data = cluster_data.assign(Cluster=kmeans.labels_.astype(str))

	    # For visualization, use the first two columns; a single feature is
	    # plotted against itself rather than failing after the model is fitted.
	    x_col = cols[0]
	    y_col = cols[1] if len(cols) > 1 else cols[0]
	    fig_cluster = px.scatter(
	        plot_data,
	        x=x_col,
	        y=y_col,
	        color='Cluster',
	        title=f"K-Means Clustering (k={n_clusters})",
	    )
	    return fig_cluster