Healthcare-PHM / app.py
Akankshg's picture
Update app.py
7f7384d verified
raw
history blame
47.9 kB
#pip install streamlit wordcloud
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.figure_factory as ff
import warnings
warnings.filterwarnings("ignore")
from wordcloud import WordCloud
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.preprocessing import LabelEncoder
from pandasai import SmartDataframe
from pandasai.llm.google_gemini import GoogleGemini
import warnings
from pandasai.responses.response_parser import ResponseParser
# pip install wordcloud
# !pip install kmodes
from sklearn.decomposition import PCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from kmodes.kprototypes import KPrototypes
import plotly.graph_objects as go
import streamlit as st
#pip install google-generativeai
import os
from huggingface_hub import hf_hub_download
# Hugging Face Hub dataset repository and the parquet file used for the EDA views.
repo_id = "Akankshg/ML_DATA"
filename = "EDA_DATA.parquet"

# The Hub token is read from the environment; a missing variable raises KeyError.
token = os.environ["HUGGING_FACE_HUB_TOKEN"]

# Download the file and keep the path returned by the Hub client.
local_file = hf_hub_download(
    repo_id=repo_id,
    filename=filename,
    repo_type="dataset",
    token=token,
)
class StreamlitResponse(ResponseParser):
    """Response parser that shows PandasAI results through Streamlit widgets."""

    def __init__(self, context) -> None:
        super().__init__(context)

    def format_dataframe(self, result):
        """Display the dataframe stored under ``result["value"]``."""
        frame = result["value"]
        st.dataframe(frame)

    def format_plot(self, result):
        """Display the image stored under ``result["value"]``."""
        image = result["value"]
        st.image(image)
# Page-level configuration and header.
st.set_page_config(
    page_title="Healthcare Data Analysis",
    page_icon=":bar_chart:",
    layout="wide",
)
st.title(':bar_chart: Healthcare Data Analysis Dashboard')
st.markdown(
    '<style>div.block-container{padding-top:1rem;}</style>',
    unsafe_allow_html=True,
)

# Sidebar: the selected entry decides which section of the dashboard is shown.
st.sidebar.title('Dashboard Options')
analysis_option = st.sidebar.selectbox(
    'Select Analysis',
    ['Data', 'EDA', 'Machine Learning', 'Health Care Chat Bot AI'],
)
## Loading data
@st.cache_data()
def fetch_data():
    """Read the downloaded EDA parquet file into a DataFrame.

    The result is cached through ``st.cache_data``.
    """
    return pd.read_parquet(local_file)


data = fetch_data()
def funnel_chart(df):
    """Build a funnel chart of patient visits per weekday, split by gender.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PatientID``, ``EncounterDate`` (datetime-like, the
        ``.dt`` accessor is used) and ``LegalSex``.

    Returns
    -------
    plotly figure
        One funnel per gender found in the data. Only the ``'Male'`` and
        ``'Female'`` values of ``LegalSex`` are plotted; other values are
        ignored.

    Raises
    ------
    KeyError
        If ``LegalSex`` contains neither ``'Male'`` nor ``'Female'``.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    colors = {'Male': '#2986cc', 'Female': '#c90076'}

    visits = df[['PatientID', 'EncounterDate', 'LegalSex']].copy()
    visits['WeekDay'] = visits['EncounterDate'].dt.day_name().astype('string')

    # Rows: weekdays in calendar order; columns: one visit count per LegalSex value.
    # Weekdays without any visit become NaN rows after the reindex.
    counts = visits.groupby(['WeekDay', 'LegalSex']).size().unstack(fill_value=0)
    counts = counts.reindex(weekday_order).reset_index()

    genders = [gender for gender in ('Male', 'Female') if gender in counts.columns]
    if not genders:
        raise KeyError("funnel_chart: LegalSex contains neither 'Male' nor 'Female'")

    # Long format expected by px.funnel: one row per (weekday, gender).
    frames = []
    for gender in genders:
        frame = pd.DataFrame(dict(number=counts[gender], stage=counts['WeekDay']))
        frame['Gender'] = gender
        frames.append(frame)
    df_graph = pd.concat(frames, axis=0)

    fig2 = px.funnel(
        df_graph,
        x='number',
        y='stage',
        color='Gender',
        color_discrete_map=colors,
        title='Patient Visits by Gender and Weekday',
    )
    fig2.update_layout(
        template="plotly_dark",
        xaxis_title='Number of Patients',
        yaxis_title='Weekday',
        height=500, width=250
    )
    return fig2
def scatter_man(data):
    """Bubble chart of diagnosis description versus age.

    Rows without ``GroupedICD`` or ``Age`` are discarded, the remainder is
    counted per (Age, GroupedICD, Description) and only combinations that
    occur more than 1000 times are plotted.
    """
    import plotly.express as px

    columns = ['PatientID', 'GroupedICD', 'Description', 'Age']
    diagnosed = data[columns].dropna(subset=['GroupedICD', 'Age'])

    icd_counts = (
        diagnosed
        .groupby(['Age', 'GroupedICD', 'Description'])
        .size()
        .reset_index(name='Count')
    )
    frequent = icd_counts[icd_counts['Count'] > 1000]

    # Bubble size and colour both encode the number of occurrences.
    fig5 = px.scatter(
        frequent,
        y='Age',
        x='Description',
        size='Count',
        hover_name='Age',
        color='Count',
        title='Age - ICD Relationship',
        color_continuous_scale='ylorrd',
    )
    fig5.update_layout(
        template="plotly_dark",
        xaxis_title='ICD Code',
        yaxis_title='Age',
        coloraxis_colorbar=dict(title='Count'),
        height=950,
        width=1400,
    )
    return fig5
def barplot_lab(df):
    """Bar chart of the 20 most frequent (lab component, diagnosis) pairs.

    Only rows recorded at most 7 days after the same patient's previous
    encounter are considered, and rows lacking ``ComponentName`` or
    ``GroupedICD`` are ignored.
    """
    wanted = ['PatientID', 'EncounterDate', 'ComponentName', 'GroupedICD', 'Description']
    visits = df[wanted].sort_values(by='EncounterDate', ascending=True)

    # Gap, in whole days, to the same patient's previous row in date order.
    gap = visits.groupby('PatientID')['EncounterDate'].diff().dt.days
    visits['DaysSinceLastVisit'] = gap
    recent = visits[visits['DaysSinceLastVisit'] <= 7]

    has_lab_and_icd = recent['ComponentName'].notna() & recent['GroupedICD'].notna()
    lab = recent[has_lab_and_icd]

    pair_counts = lab.groupby(['ComponentName', 'Description']).size().reset_index(name='Count')
    top_pairs = pair_counts.sort_values(by='Count', ascending=False).head(20)

    fig3 = px.bar(
        top_pairs,
        x='ComponentName',
        y='Count',
        hover_data=['ComponentName', 'Count'],
        color='ComponentName',
        height=450,
        title='Lab Test',
    )
    fig3.update_xaxes(tickangle=45)
    return fig3
def scatterplot(df):
    """Bubble chart relating lab components to diagnosis descriptions.

    Only rows recorded at most 7 days after the same patient's previous
    encounter are considered, rows lacking ``ComponentName`` or
    ``GroupedICD`` are ignored, and only (component, description) pairs that
    occur more than 2000 times are plotted.
    """
    df = df[['PatientID', 'EncounterDate', 'ComponentName', 'GroupedICD', 'Description']].copy()
    df.sort_values(by=['EncounterDate'], ascending=True, inplace=True)

    # Gap, in whole days, to the same patient's previous row in date order.
    df['DaysSinceLastVisit'] = df.groupby('PatientID')['EncounterDate'].diff().dt.days
    df = df[df['DaysSinceLastVisit'] <= 7]

    lab = df[df['ComponentName'].notna()].copy()
    lab = lab[lab['GroupedICD'].notna()].copy()

    component = lab.groupby(['ComponentName', 'Description']).size().reset_index(name='Count')
    component = component[component['Count'] > 2000]

    # Bubble size and colour both encode the pair count.
    fig = px.scatter(component, y='ComponentName', x='Description', size='Count',
                     hover_name='ComponentName', color='Count', title='Lab Component-ICD Relationship')
    fig.update_layout(template="plotly_dark", xaxis_title='ICD Code', yaxis_title='Component Name',
                      coloraxis_colorbar=dict(title='Count'),
                      height=550, width=500)
    return fig
####################################### EDA ##################################################################
def histplot_6(data):
    """Overlaid age histogram coloured by ``LegalSex``.

    Rows with a missing ``Age`` or ``LegalSex`` are left out.
    """
    complete = data[['Age', 'LegalSex']].dropna(subset=['Age', 'LegalSex'])

    fig = px.histogram(
        complete,
        x='Age',
        color='LegalSex',
        nbins=10,
        opacity=0.5,
        title='Age Distribution by Legal Sex',
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )

    # White fonts throughout for the title, axis titles and tick labels.
    white_title = dict(size=20, color='white')
    white_axis_title = dict(size=16, color='white')
    white_ticks = dict(size=14, color='white')
    fig.update_layout(
        title_font=white_title,
        xaxis_title_font=white_axis_title,
        yaxis_title_font=dict(white_axis_title),
        xaxis=dict(tickfont=white_ticks),
        yaxis=dict(tickfont=dict(white_ticks)),
    )
    return fig
def histplot_7(data):
    """Line chart of patient counts per age, one line per BP severity.

    Missing severities and the values ``'Unknown'`` and ``'BP NORMAL'`` are
    excluded.
    """
    import plotly.graph_objects as go

    bp = data[['Age', 'BP Severity']].copy()
    severity = bp['BP Severity']
    bp = bp[severity.notna() & ~severity.isin(['Unknown', 'BP NORMAL'])]

    def _trace(level):
        # Number of rows per age for one severity level, ordered by age.
        per_age = bp.loc[bp['BP Severity'] == level, 'Age'].value_counts().sort_index()
        return go.Scatter(x=per_age.index, y=per_age.values, mode='lines+markers', name=level)

    fig = go.Figure(data=[_trace(level) for level in bp['BP Severity'].unique()])
    fig.update_layout(
        title='Age Distribution by BP Severity',
        xaxis_title='Age',
        yaxis_title='Count',
        title_font=dict(size=20, color='white'),
    )
    return fig
def pie_chart_7(data):
    """Donut chart of depression severity.

    The values ``'None-minimal'`` and ``'Unknown'`` are excluded before the
    remaining severities are counted.
    """
    import plotly.graph_objects as go

    severity = data['Depression Severity']
    relevant = severity[~severity.isin(['None-minimal', 'Unknown'])]
    severity_counts = relevant.value_counts()

    # Slice colours, applied in the order of ``severity_counts``.
    colors_inner = ['#FF5733', '#FFC300', '#36A2EB', '#C71585']

    donut = go.Pie(
        labels=severity_counts.index,
        values=severity_counts,
        hole=0.6,  # a non-zero hole turns the pie into a donut
        marker=dict(colors=colors_inner),
        textinfo='label+percent',
        textfont=dict(size=10),
        insidetextorientation='radial',
    )
    fig = go.Figure(data=[donut])

    fig.update_layout(
        title_text="Distribution of Patients by Depression",
        title_font_size=20,
        title_font_color='white',
        autosize=False,
    )
    return fig
def chart_8(data):
    """Grouped box plot of BMI per BP severity, one box series per ``LegalSex``.

    Rows with a missing ``BP Severity``, ``BMI`` or ``LegalSex`` are dropped,
    as are the severities ``'Unknown'`` and ``'BP NORMAL'``.
    """
    import plotly.graph_objects as go

    needed = ['BP Severity', 'BMI', 'LegalSex']
    complete = data[needed].dropna(subset=needed)
    complete = complete[~complete['BP Severity'].isin(['Unknown', 'BP NORMAL'])]

    fig = go.Figure()
    for gender in complete['LegalSex'].unique():
        rows = complete[complete['LegalSex'] == gender]
        # 'Male' is drawn in blue; every other value shares the orange colour.
        if gender == 'Male':
            box_color = '#1f77b4'
        else:
            box_color = '#ff7f0e'
        fig.add_trace(go.Box(
            y=rows['BMI'],
            x=rows['BP Severity'],
            name=gender,
            boxmean='sd',  # also draw mean and standard deviation
            marker_color=box_color,
            text=rows['BP Severity'],  # shown in the hover tooltip
            hoverinfo='y+name+text',
        ))

    tick_style = dict(size=14, color='white')
    fig.update_layout(
        title='BMI by BP Severity and Legal Sex',
        title_font=dict(size=20, color='white'),
        xaxis_title='BP Severity',
        yaxis_title='BMI',
        xaxis=dict(tickfont=tick_style),
        yaxis=dict(tickfont=dict(tick_style)),
        boxmode='group',  # boxes of the same severity sit side by side
        height=600,
        width=800,
    )
    return fig
def chart_9(data):
    """Annotated correlation heatmap of the int64/float64 columns of ``data``.

    ``PatientID`` is left out of the matrix when it is among the selected
    columns. Undefined correlations (NaN) are shown as 0.
    """
    import plotly.graph_objects as go

    disease_data = data.select_dtypes(include=['int64', 'float64'])
    # PatientID may be absent from the selection (missing, or stored with a
    # different dtype); errors='ignore' avoids a KeyError in that case.
    disease_data = disease_data.drop(columns=['PatientID'], errors='ignore')

    # Calculate the correlation matrix
    corrmat = disease_data.corr()
    corrmat = corrmat.fillna(0)

    # Create a heatmap using Plotly
    fig = go.Figure(data=go.Heatmap(
        z=corrmat.values,
        x=corrmat.columns,
        y=corrmat.columns,
        colorscale='RdYlGn',
        text=corrmat.round(2).values,  # Add annotations
        texttemplate="%{text:.2f}",  # Format annotations
        textfont=dict(size=12, color='black')  # Set annotation font size and color
    ))

    # Update layout
    fig.update_layout(
        title='Which Feature is Mainly Involved',
        title_font=dict(size=20, color='white'),
        xaxis_title='Features',
        yaxis_title='Features',
        xaxis=dict(tickfont=dict(size=14, color='white')),
        yaxis=dict(tickfont=dict(size=14, color='white')),
        height=600,  # Set the height of the figure
        width=800  # Set the width of the figure
    )
    return fig
def chart_10(data):
    """Split violin plot of age per depression severity and ``LegalSex``.

    The severities ``'None-minimal'`` and ``'Unknown'`` are excluded, ``Age``
    is coerced to numeric, and rows lacking ``Age``, ``Depression Severity``
    or ``LegalSex`` are dropped.
    """
    import plotly.express as px
    import plotly.graph_objects as go

    severity = data['Depression Severity']
    graph_7 = data[~severity.isin(['None-minimal', 'Unknown'])].copy()
    graph_7['Age'] = pd.to_numeric(graph_7['Age'], errors='coerce')
    graph_7 = graph_7.dropna(subset=['Age', 'Depression Severity', 'LegalSex'])

    fig = go.Figure()
    for sex in graph_7['LegalSex'].unique():
        rows = graph_7[graph_7['LegalSex'] == sex]
        is_female = sex == 'Female'
        # 'Female' occupies the negative half in blue; any other value is
        # drawn on the positive half in orange.
        fig.add_trace(go.Violin(
            x=rows['Depression Severity'],
            y=rows['Age'],
            legendgroup=sex,
            scalegroup=sex,
            name=sex,
            side='negative' if is_female else 'positive',
            line_color='blue' if is_female else 'orange',
        ))

    white_ticks = dict(size=14, color='white')
    fig.update_layout(
        title="Age by Depression Severity and Legal Sex",
        xaxis_title="Depression Severity",
        yaxis_title="Age",
        xaxis=dict(tickmode='array', tickvals=graph_7['Depression Severity'].unique(), tickangle=20),
        yaxis=dict(range=[0, 80]),
        violingap=0.2,  # gap between violins
        violingroupgap=0.3,  # gap between groups
        violinmode='overlay',  # plot violins over each other
        font=dict(color='white', size=14),
        title_font=dict(size=20, color='white'),
        xaxis_tickfont=white_ticks,
        yaxis_tickfont=dict(white_ticks),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        showlegend=True,
    )
    return fig
def feature_analytics(disease_data):
    """Split the strongly correlated numeric features into two name lists.

    A feature is selected when its correlation with at least one numeric
    column exceeds 0.7. Each column correlates perfectly with itself, so any
    numeric column with a defined self-correlation qualifies. A fixed set of
    demographic and measurement columns is then removed from the selection.

    Parameters
    ----------
    disease_data : pandas.DataFrame
        Input data; only numeric columns take part in the correlation.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(vital_sign_features, other_features)``. Both lists follow the
        column order of the correlation matrix, so the result is stable
        across runs.
    """
    corr_threshold = 0.7
    excluded = {'Weight', 'DiastolicBP', 'SystolicBP', 'ComponentValue', 'Height', 'Age', 'BMI'}
    vital_signs = {'PeakFlow', 'Temperature', 'Respiration', 'Pulse', 'SPO2'}

    corrmat = disease_data.corr(numeric_only=True)

    # Row-wise "any" gives the union of the features that exceed the threshold
    # against some column. NaN correlations compare as False.
    above = (corrmat > corr_threshold).any(axis=1).to_numpy(dtype=bool)
    selected = [name for name in corrmat.index[above] if name not in excluded]

    found = [name for name in selected if name in vital_signs]
    others = [name for name in selected if name not in vital_signs]
    return found, others
def chart_11(disease_data):
    """Scatter plot of age against the first feature from ``feature_analytics``.

    The x axis uses the first entry of the first list returned by
    ``feature_analytics``; an empty list raises ``IndexError``. A dashed red
    vertical line marks the mean of that feature.
    """
    import plotly.express as px

    select, featurel = feature_analytics(disease_data)
    Top_feature_Lab = select[0]

    complete = disease_data.dropna(subset=[Top_feature_Lab, 'Age', 'LegalSex'])

    fig = px.scatter(
        complete,
        x=Top_feature_Lab,
        y="Age",
        color="LegalSex",
        color_discrete_sequence=px.colors.qualitative.Set2,
        title=f'Age group: {Top_feature_Lab}',
        labels={Top_feature_Lab: Top_feature_Lab, 'Age': 'Age'},
        size_max=200,
    )

    # Vertical reference line at the mean of the plotted feature.
    feature_mean = complete[Top_feature_Lab].mean()
    fig.add_vline(x=feature_mean, line=dict(color='red', dash='dash'))

    axis_title_font = dict(size=16, color='white')
    tick_font = dict(size=14, color='white')
    fig.update_layout(
        title_font=dict(size=20, color='white'),
        xaxis_title_font=axis_title_font,
        yaxis_title_font=dict(axis_title_font),
        xaxis=dict(tickangle=20, tickfont=tick_font),
        yaxis=dict(tickfont=dict(tick_font), range=[0, 80]),
        plot_bgcolor='black',
        paper_bgcolor='black',
    )
    return fig
def chart_12(filtered_data):
    """Render a word cloud of the distinct immunization names.

    Parameters
    ----------
    filtered_data : pandas.DataFrame
        Must contain an ``ImmunizationName`` column.

    Returns
    -------
    matplotlib.figure.Figure or None
        A figure showing the word cloud, or ``None`` when no immunization
        name is available. Earlier versions generated the cloud but never
        drew or returned it.
    """
    names = filtered_data['ImmunizationName'].dropna()
    # Distinct, non-empty names. Sorting keeps the text fed to WordCloud the
    # same from run to run.
    unique_values = sorted({item for item in names if item})

    text = ' '.join(unique_values).strip(', ').replace(',', '')
    if not text.strip():
        # Nothing to draw; WordCloud.generate cannot work on empty text.
        return None

    title = "Immunization Word Cloud"
    cloud = WordCloud(scale=3,
                      max_words=150,
                      colormap='RdYlGn',
                      mask=None,
                      background_color='white',
                      stopwords=None,
                      collocations=True,
                      contour_color='black',
                      contour_width=1).generate(text)

    fig, ax = plt.subplots()
    ax.imshow(cloud, interpolation='bilinear')
    ax.axis('off')
    ax.set_title(title, fontsize=20)
    return fig
def mean_of_values(cell_value):
    """Return the mean of a comma-separated string of numbers.

    Parameters
    ----------
    cell_value : str, number or missing value
        A string such as ``"1.2, 3.4"``. A plain number is returned as a
        float. Missing values (``None``, NaN, ``pd.NA``) yield NaN.

    Returns
    -------
    float
        The arithmetic mean, or NaN when there is nothing to average.

    Raises
    ------
    ValueError
        If a non-empty token cannot be converted to ``float``.
    """
    if not isinstance(cell_value, str):
        if pd.isna(cell_value):  # Check if cell value is NaN
            return np.nan
        # A cell holding a single numeric value is already its own mean.
        return float(cell_value)

    # Blank tokens (empty string, trailing or doubled commas) are skipped.
    tokens = [part.strip() for part in cell_value.split(',')]
    values = [float(token) for token in tokens if token]
    if not values:
        return np.nan
    return sum(values) / len(values)
def plots(original_data):
    """Render the clustering dashboard for a clustered patient table.

    Draws, in order: cluster sizes, Age/BMI scatter, cluster distributions by
    LegalSex, BP severity, depression severity, temperature condition, weight
    condition and SPO2 condition, a scatter matrix, a textual summary per
    cluster, and one density contour plot per ``*meanvalue*`` column.

    Parameters
    ----------
    original_data : pandas.DataFrame
        Must contain ``cluster``, ``Age``, ``BMI``, ``LegalSex``,
        ``BP Severity``, ``Depression Severity``, ``Temp_condition``,
        ``weight_condition``, ``SPO2_condition``, ``SystolicBP``, ``Pulse``,
        ``Weight``, ``Age_Category``, ``Pulse_condition`` and
        ``Respiration_condition``. Columns whose name contains ``meanvalue``
        are summarised and plotted as well.

    Returns
    -------
    None
        Everything is written to the Streamlit page.
    """
    a = original_data.copy()
    st.subheader("Clustering Analysis")
    col1, col2 = st.columns(2)

    ## 1
    cluster_counts = a['cluster'].value_counts().reset_index()
    cluster_counts.columns = ['cluster', 'count']  # Rename columns
    fig_1 = px.bar(cluster_counts, y='cluster', x='count',
                   labels={'cluster': 'Cluster', 'count': 'Count'},
                   text_auto=True,  # text_auto=True displays the count on top of the bars
                   color='cluster',  # Assign different colors to each bar
                   color_continuous_scale='plasma',  # Use the plasma color scale
                   category_orders={'cluster': [0, 1, 2, 3, 4]},
                   )  # Set the order of clusters
    custom_labels = {0: 'Cluster 0', 1: 'Cluster 1', 2: 'Cluster 2', 3: 'Cluster 3', 4: 'Cluster 4'}
    fig_1.update_yaxes(tickvals=[0, 1, 2, 3, 4], ticktext=list(custom_labels.values()))
    fig_1.update_layout(
        title={'text': "Count of Data Points per Cluster", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        yaxis_title='Cluster', xaxis_title='Count',
        xaxis=dict(showline=False, showgrid=False, zeroline=False, tickfont=dict(size=14, color='white')),
        yaxis=dict(showline=False, showgrid=False, zeroline=False, tickfont=dict(size=14, color='white')),
        title_font=dict(color='white', size=18),
        title_x=0.5,  # Center the title
        legend=dict(
            font=dict(size=16, color='white'),
            bgcolor='rgba(0,0,0,0)'
        ))
    col1.plotly_chart(fig_1, use_container_width=True)

    ## 2
    fig_2 = px.scatter(a, x='Age', y='BMI',
                       color='cluster',
                       title="Cluster's Profile Based On Age And BMI",
                       color_continuous_scale='plasma')  # Use the plasma color palette
    fig_2.update_layout(
        title={'text': "Cluster's Profile Based On Age And BMI", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        xaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
        yaxis=dict(showgrid=False, showticklabels=False, zeroline=False),
        title_font=dict(color='white', size=18),  # Title font color and size
        margin=dict(l=20, r=20, t=40, b=20),  # Set margins to make the plot more compact
        legend=dict(
            font=dict(size=16, color='white'),
            bgcolor='rgba(0,0,0,0)'
        )
    )
    fig_2.update_traces(marker=dict(size=12, line=dict(width=2, color='DarkSlateGrey')))
    col2.plotly_chart(fig_2, use_container_width=True)

    col3, col4 = st.columns(2)

    ## 3
    palette = ['#636EFA', '#EF553B']  # Adjust the colors as needed
    fig_3 = go.Figure()
    for position, sex in enumerate(a['LegalSex'].unique()):
        fig_3.add_trace(go.Box(
            y=a[a['LegalSex'] == sex]['cluster'],
            name=f'Legal Sex: {sex}',
            # Cycle through the palette so more than two LegalSex values do
            # not exhaust it.
            marker_color=palette[position % len(palette)],
            boxmean=True
        ))
    fig_3.update_layout(
        title={'text': "Clusters Distribution by Legal Sex", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        title_font=dict(color='white', size=18),
        xaxis=dict(showline=False, showgrid=False, zeroline=False, tickfont=dict(size=14, color='white')),
        yaxis=dict(showline=False, showgrid=False, zeroline=False, tickfont=dict(size=14, color='white')),
        title_font_color='white',
        showlegend=True,
        legend=dict(
            font=dict(size=16, color='white'),
            bgcolor='rgba(0,0,0,0)'
        )
    )
    col3.plotly_chart(fig_3, use_container_width=True)

    ## 4
    fig_4 = px.violin(
        a,
        x="BP Severity",
        y="cluster",
        color="BP Severity",
        color_discrete_sequence=px.colors.qualitative.Vivid,
        box=True,  # Adds a box plot inside the violin plot for more detail
        points="all",  # Shows all data points
        title="Clusters Distribution by BP Severity"
    )
    fig_4.update_layout(
        title={'text': "Clusters Distribution by BP Severity", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        title_font=dict(color='white', size=18),
        xaxis_title="BP Severity",
        yaxis_title="Cluster",
        xaxis_title_font=dict(size=16, color='white'),
        yaxis_title_font=dict(size=16, color='white'),
        xaxis=dict(showline=False, showgrid=False, zeroline=False, tickfont=dict(size=14, color='white')),
        yaxis=dict(showline=False, showgrid=False, zeroline=False, tickfont=dict(size=14, color='white')),
        title_font_color='white',
        legend=dict(
            font=dict(size=16, color='white'),
            bgcolor='rgba(0,0,0,0)'
        )
    )
    fig_4.update_xaxes(tickangle=45)  # Rotate the x-axis labels for better readability
    col4.plotly_chart(fig_4, use_container_width=True)

    col5, col6 = st.columns(2)

    ## 5
    fig_5 = px.histogram(a, x="Depression Severity", color="cluster",
                         color_discrete_sequence=px.colors.diverging.RdYlBu,
                         title='Clusters Distribution by Depression Severity')
    fig_5.update_layout(
        title={'text': "Clusters Distribution by Depression Severity", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        title_font=dict(color='white', size=18),
        title_font_color='white',
        xaxis_title='Depression Severity',
        yaxis_title='Count',
        xaxis_title_font_color='white',
        yaxis_title_font_color='white',
        legend=dict(
            font=dict(size=16, color='white'),
            bgcolor='rgba(0,0,0,0)'
        ),
        xaxis=dict(
            tickfont=dict(color='white', size=14),
            title_font=dict(color='white', size=16),
            showline=False,
            showgrid=False,
            ticks=''
        ),
        yaxis=dict(
            tickfont=dict(color='white', size=14),
            title_font=dict(color='white', size=16),
            showline=False,
            showgrid=False,
            ticks=''
        ),
        coloraxis_colorbar=dict(
            tickfont=dict(color='white')
        )
    )
    col5.plotly_chart(fig_5, use_container_width=True)

    ## 6
    fig_6 = px.violin(a, y="cluster", x="Temp_condition", box=True, points="all",
                      color="Temp_condition", color_discrete_sequence=px.colors.diverging.RdYlBu,
                      title='Clusters Distribution by Temp_condition')
    fig_6.update_layout(
        title={'text': "Clusters Distribution by Temp_condition", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        title_font=dict(color='white', size=18),
        title_font_color='white',
        xaxis_title='Temp_condition',
        yaxis_title='Clusters',
        xaxis_title_font_color='white',
        yaxis_title_font_color='white',
        legend=dict(
            font=dict(size=16, color='white'),
            bgcolor='rgba(0,0,0,0)'
        ),
        xaxis=dict(
            tickfont=dict(color='white', size=14),
            title_font=dict(color='white', size=16),
            showline=False,
            showgrid=False,
            ticks=''
        ),
        yaxis=dict(
            tickfont=dict(color='white', size=14),
            title_font=dict(color='white', size=16),
            showline=False,
            showgrid=False,
            ticks=''
        ),
        coloraxis_colorbar=dict(
            tickfont=dict(color='white')
        )
    )
    col6.plotly_chart(fig_6, use_container_width=True)

    col7, col8 = st.columns(2)

    ## 7
    # Stacked bar chart of cluster membership per weight condition.
    ad = a.groupby(['weight_condition', 'cluster']).size().reset_index(name='count')
    fig_7 = px.bar(ad,
                   x='weight_condition',
                   y='count',
                   color='cluster',
                   title='Clusters Distribution by Weight Condition',
                   text='count',
                   barmode='stack',
                   color_discrete_sequence=px.colors.diverging.RdYlBu)  # Use a color scale or palette of your choice
    fig_7.update_layout(
        title={'text': 'Clusters Distribution by Weight Condition', 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        title_font=dict(color='white', size=18),
        xaxis=dict(
            title='',  # Remove x-axis title
            showline=False,
            showgrid=False,
            zeroline=False,
            tickfont=dict(size=14, color='white'),
            tickangle=45  # Rotate x-axis labels for better readability
        ),
        yaxis=dict(
            title='',  # Remove y-axis title
            showline=False,
            showgrid=False,
            zeroline=False,
            tickfont=dict(size=14, color='white')
        ),
        margin=dict(l=20, r=20, t=40, b=20),  # Set margins to make the plot more compact
        legend=dict(
            font=dict(size=16, color='white'),
            bgcolor='rgba(0,0,0,0)'
        )
    )
    # Update bar text style
    fig_7.update_traces(texttemplate='%{text:.2s}', textfont_size=14, textposition='inside', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
    col7.plotly_chart(fig_7, use_container_width=True)

    ## 8
    fig_8 = px.box(a,
                   x='SPO2_condition',
                   y='Age',
                   points='all',  # Show all points
                   title="Clusters Distribution by SPO2_condition",
                   color='cluster',
                   color_discrete_sequence=px.colors.sequential.Plasma_r)
    fig_8.update_layout(
        title={'text': "Clusters Distribution by SPO2_condition", 'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'},
        title_font=dict(color='white', size=18),
        xaxis=dict(showline=False, showgrid=False, zeroline=False, tickfont=dict(size=14, color='white')),
        yaxis=dict(showline=False, showgrid=False, zeroline=False, tickfont=dict(size=14, color='white')),
        margin=dict(l=20, r=20, t=40, b=20),  # Set margins to make the plot more compact
        legend=dict(
            font=dict(size=16, color='white'),
            bgcolor='rgba(0,0,0,0)'
        )
    )
    fig_8.update_traces(
        boxmean=True,  # Add mean line
        jitter=0.3,  # Spread points along x-axis
        marker=dict(size=10, line=dict(width=2, color='DarkSlateGrey'))
    )
    col8.plotly_chart(fig_8, use_container_width=True)

    col_11 = st.columns(1)[0]
    fig_11 = px.scatter_matrix(
        a[['Age', 'SystolicBP', 'Pulse', 'Weight', 'BMI', 'cluster']],
        dimensions=['Age', 'SystolicBP', 'Pulse', 'Weight', 'BMI'],
        color='cluster',
        title="Scatter Matrix of Selected Features by Cluster",
        labels={col: col for col in ['Age', 'SystolicBP', 'Pulse', 'Weight', 'BMI']},
        color_continuous_scale=px.colors.diverging.Spectral
    )
    fig_11.update_traces(diagonal_visible=True)
    fig_11.update_layout(height=700, width=700, showlegend=True)
    col_11.plotly_chart(fig_11, use_container_width=True)

    ##### Summary per cluster
    st.subheader("Summary")
    meanvalue_columns = [col for col in list(a.columns) if 'meanvalue' in col]
    # Group data by clusters
    grouped_data = a.groupby('cluster')
    # Calculate mean for numerical columns
    numerical_columns = a.select_dtypes(include=['number']).columns
    numerical_summary = grouped_data[numerical_columns].mean()
    # Calculate mode (most frequent value) for categorical columns
    categorical_columns = a.select_dtypes(include=['object', 'category', 'string']).columns
    categorical_summary = grouped_data[categorical_columns].agg(lambda x: x.value_counts().index[0])

    # NOTE(review): the loop looks clusters up by position 0..k-1, so it
    # assumes the cluster labels are exactly those integers - confirm.
    for i in range(len(a['cluster'].value_counts())):
        cluster_traits = {
            "Age": numerical_summary.loc[i, 'Age'],
            "Age_Category": categorical_summary.loc[i, "Age_Category"],
            "SystolicBP": numerical_summary.loc[i, 'SystolicBP'],
            "Depression Severity": categorical_summary.loc[i, 'Depression Severity'],
            "Weight Condition": categorical_summary.loc[i, 'weight_condition'],
            "BP Severity": categorical_summary.loc[i, 'BP Severity'],
            "Pulse_condition": categorical_summary.loc[i, 'Pulse_condition'],
            "Respiration_condition": categorical_summary.loc[i, 'Respiration_condition'],
            "SPO2_condition": categorical_summary.loc[i, 'SPO2_condition'],
        }
        # Writing the summary
        summary = f"""
Cluster - {i} Traits
1. Age: Average age is {round(cluster_traits['Age'])} years.
2. SystolicBP: Patients tend to have slightly elevated systolic blood pressure, averaging {cluster_traits['SystolicBP']} mmHg.
3. Depression Severity: Predominantly '{cluster_traits['Depression Severity']}'.
4. "Weight Condition" : {cluster_traits['Weight Condition']}.
5. "Respiration_condition" : {cluster_traits['Respiration_condition']}.
6. "Pulse_condition" : {cluster_traits['Pulse_condition']}.
7. "SPO2_condition" : {cluster_traits['SPO2_condition']}.
Trait Summary: Cluster {i} mainly consists of {cluster_traits['Age_Category']} individuals with {cluster_traits['Depression Severity']} depression level, {cluster_traits['BP Severity'].lower()}.
"""
        st.write(summary)
    st.write(round(numerical_summary[meanvalue_columns], 2))

    st.subheader("Density Contour Plot")
    with st.container():
        # One joint density plot (with marginal histograms) per mean-value column
        for i in meanvalue_columns:
            fig = px.density_contour(
                a,
                y="Age",
                x=i,
                color="cluster",
                marginal_x="histogram",
                marginal_y="histogram",
                template="simple_white",
                color_discrete_sequence=px.colors.qualitative.Set1
            )
            fig.update_traces(bingroup="fill")
            # Axis title fonts are set through title.font; the legacy
            # `titlefont` axis attribute is deprecated in Plotly.
            fig.update_layout(
                title=f"Joint Density Contour of {i} vs Age by Clusters",
                yaxis_title="Age",
                xaxis_title=i,
                xaxis=dict(
                    title=dict(text=i, font=dict(size=16, color='white')),
                    showline=False,
                    showgrid=False,
                    zeroline=False,
                    tickfont=dict(size=14, color='white'),
                    tickangle=45,  # Rotate x-axis labels for better readability
                ),
                yaxis=dict(
                    title=dict(text='Age', font=dict(size=16, color='white')),
                    showline=False,
                    showgrid=False,
                    zeroline=False,
                    tickfont=dict(size=14, color='white'),
                ),
                plot_bgcolor='black',
                paper_bgcolor='black',
                title_font_color='white',
                legend_title="Clusters",
                width=1500,  # Adjust width as needed
                height=800  # Increase height to make the plot taller
            )
            st.plotly_chart(fig, use_container_width=True)
def ML(filtered_data, scaler, unscaled_data):
    """Cluster the prepared data with K-Prototypes, show a PCA view and the dashboard.

    Parameters
    ----------
    filtered_data : pandas.DataFrame
        Prepared table containing ``PatientID`` and ``VisitID`` (both are
        dropped) plus numeric and categorical feature columns. Rows with any
        missing value are discarded.
    scaler :
        Fitted scaler whose ``inverse_transform`` is applied to the numeric
        columns. NOTE(review): presumably fitted on exactly these numeric
        columns, in this order - confirm against the caller.
    unscaled_data :
        Not used inside this function.

    Returns
    -------
    The return value of ``plots`` for the clustered table.
    """
    man = filtered_data.copy()
    man=man.dropna()
    man.drop(columns=['PatientID','VisitID'],inplace=True)
    numerical_columns = list(man.select_dtypes(include=['int', 'float']).columns)
    categorial_columns = list(man.select_dtypes(exclude=['int', 'float','datetime']).columns)
    # K-Prototypes needs the positional indexes of the categorical columns.
    categorical_indexes = []
    for c in categorial_columns:
        categorical_indexes.append(man.columns.get_loc(c))
    t = man.shape
    # st.write(t)
    # Number of clusters depends on the row count: 6-9 rows -> 3,
    # at most 4 rows -> 1, everything else (including exactly 5 rows) -> 4.
    if 5 < t[0] < 10:
        ki = 3
    elif t[0] <= 4 :
        ki = 1
    else:
        ki = 4
    kproto = KPrototypes(n_clusters= ki, init='Huang', n_init = 25, random_state=42)
    kproto.fit_predict(man, categorical= categorical_indexes)
    cluster_labels = kproto.labels_
    # Undo the scaling of the numeric features.
    original_numeric_data = scaler.inverse_transform(man[numerical_columns])
    # Convert back to DataFrame and add cluster labels
    original_data = pd.DataFrame(original_numeric_data, columns=numerical_columns)
    original_data["cluster"] = cluster_labels
    original_data["cluster"] = original_data["cluster"].astype('category')
    ## PCA Graph
    pca = PCA(n_components=4)
    pca_df = pca.fit_transform(original_data[numerical_columns])
    d = list(original_data[numerical_columns].columns)
    # NOTE(review): the four PCA outputs are labelled with the names of the
    # first four numeric columns, and those names are reused as axis titles
    # below - confirm this labelling is intended.
    pca_df = pd.DataFrame(pca_df, columns=d[:4])
    import plotly.graph_objects as go
    st.subheader("PCA")
    # 3D scatter of the first three PCA outputs, coloured by cluster.
    fig_9 = go.Figure(
        go.Scatter3d(mode='markers',
                     x = pca_df.iloc[:, 0],
                     y = pca_df.iloc[:, 1],
                     z = pca_df.iloc[:, 2],
                     marker=dict(size = 4, color = original_data['cluster'], colorscale = 'spectral')
                     )
    )
    fig_9.update_layout(
        scene=dict(
            xaxis_title=d[0],
            yaxis_title=d[1],
            zaxis_title=d[2],
            # bgcolor='black', # Background color inside the 3D plot
            xaxis=dict(color='white'), # Axis label color
            yaxis=dict(color='white'),
            zaxis=dict(color='white')
        ),
        # plot_bgcolor='black', # Background color outside the 3D plot
        # paper_bgcolor='black' # Paper (entire plot area) background color
    )
    col9 = st.columns(1)[0]
    col9.plotly_chart(fig_9, use_container_width=True)
    # Re-attach the categorical columns; both frames get a fresh positional
    # index first so the column-wise concat lines the rows up.
    mann = man[categorial_columns].copy()
    orig = original_data.reset_index(drop=True)
    mann = mann.reset_index(drop=True)
    original_data = pd.concat([orig, mann], axis=1)
    return plots(original_data)
def imputer(filtered_data):
    """Impute and standardise the numeric feature columns, then call ``ML``.

    Steps:
      1. Select the int/float columns, skipping the first two of them
         (presumably the PatientID / VisitID identifiers - confirm against
         the caller), and fill missing values with ``IterativeImputer``.
      2. Round the imputed values to 2 decimals and put them back next to the
         untouched columns; a copy of this frame is kept as the unscaled data.
      3. Standardise the same numeric columns with ``StandardScaler``.

    Args:
        filtered_data: DataFrame holding the identifier, categorical and
            numeric columns for the selected disease.

    Returns:
        Whatever ``ML(Ml_data, scaler, unscaled_data)`` returns, where
        ``Ml_data`` is the scaled frame (after ``convert_dtypes``), ``scaler``
        is the fitted ``StandardScaler`` and ``unscaled_data`` is the imputed
        but unscaled frame.
    """
    numeric_columns = filtered_data.select_dtypes(include=['int', 'float'])
    numeric_columns = numeric_columns.iloc[:, 2:].copy()

    # Setting the random_state argument for reproducibility.
    # (Named differently from the enclosing function to avoid shadowing it.)
    iterative_imputer = IterativeImputer(random_state=42)
    imputed = iterative_imputer.fit_transform(numeric_columns)

    # Keep the source index: pd.concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows whenever filtered_data does not
    # carry a default RangeIndex.
    Imputed_data = pd.DataFrame(
        imputed,
        columns=numeric_columns.columns,
        index=numeric_columns.index,
    )
    Imputed_data = round(Imputed_data, 2)

    columns_drop = Imputed_data.columns
    filtered_data = filtered_data.drop(columns=columns_drop)
    Ml_data = pd.concat([filtered_data, Imputed_data], axis=1)
    unscaled_data = Ml_data.copy()

    ## Scaling
    scaled_data = Ml_data.select_dtypes(include=['int', 'float'])
    scaled_data = scaled_data.iloc[:, 2:].copy()
    scaler = StandardScaler()
    scaler.fit(scaled_data)
    scaled_data = pd.DataFrame(
        scaler.transform(scaled_data),
        columns=scaled_data.columns,
        index=scaled_data.index,  # same alignment concern as above
    )

    columns_drop = scaled_data.columns
    Ml_data = Ml_data.drop(columns=columns_drop)
    Ml_data = pd.concat([Ml_data, scaled_data], axis=1)
    # Swap in an outlier-removed frame here if outliers should be dropped.
    Ml_data = Ml_data.convert_dtypes()
    return ML(Ml_data, scaler, unscaled_data)
# Second dataset: the parquet file that fetch_data_1() reads.
filename_1 = "ML_DATA.parquet"

# Hugging Face access token, taken from the environment
# (raises KeyError when the variable is not set).
token = os.environ["HUGGING_FACE_HUB_TOKEN"]

# Download the file from the dataset repository.
local_file_1 = hf_hub_download(
    repo_id=repo_id,
    filename=filename_1,
    repo_type="dataset",
    token=token,
)
@st.cache_data()
def fetch_data_1():
    """Read the downloaded ``local_file_1`` parquet file into a DataFrame.

    Wrapped in ``st.cache_data`` so the result is cached by Streamlit.
    """
    return pd.read_parquet(local_file_1)
if analysis_option == 'Machine Learning':
    # Machine Learning page: pick a disease, pivot its lab components into
    # columns, derive one "<lab>_meanvalue" column per key lab, then hand the
    # selected feature columns to imputer().
    data = fetch_data_1()
    problem = list(data['Description'].unique())
    st.subheader("_Select Disease_:sunglasses:")
    health_option = st.selectbox("_Select Disease_:sunglasses:", ['', *problem], label_visibility="collapsed")
    filtered_data = data[data['Description'] == health_option].copy()

    # The blank default option matches no rows, so nothing below runs until a
    # disease with at least one non-null key_lab2 entry is selected.
    has_key_labs = filtered_data['key_lab2'].notna()
    if has_key_labs.any():
        # Read the lab list from the first NON-null entry. The guard above only
        # promises that some row has a value, so reading row 0 unconditionally
        # would crash on list(NaN) when the first row happens to be null.
        key_labs = list(filtered_data.loc[has_key_labs, 'key_lab2'].iloc[0])

        column_list = ['PatientID', 'VisitID', 'GroupedICD'] + key_labs
        # One row per (patient, visit, ICD group); repeated values of a
        # component are joined into a single ", "-separated string.
        pivot_data = pd.pivot_table(
            filtered_data,
            values='ComponentValue',
            index=['PatientID', 'VisitID', 'GroupedICD'],
            columns='ComponentName',
            aggfunc=lambda x: ', '.join(map(str, x)),
        )
        pivot_data = pivot_data.reset_index(drop=False)
        pivot_data = pivot_data[column_list].copy()
        filtered_data = pd.merge(filtered_data, pivot_data, on=['PatientID', 'VisitID', 'GroupedICD'], how='left')

        # NOTE(review): the 20 is hard-coded while the number of lab columns is
        # len(key_labs); left unchanged here - confirm whether this slice
        # should follow num_columns instead.
        filtered_data.iloc[:, -20:] = filtered_data.iloc[:, -20:].convert_dtypes()

        hmm = pd.DataFrame()
        num_columns = len(key_labs)
        # Walk the trailing lab columns (last one first) and derive a
        # "<column>_meanvalue" column for each via mean_of_values, which is
        # defined elsewhere in this file.
        for i in range(1, num_columns + 1):
            existing_column = filtered_data.columns[-i]
            new_column_name = f'{existing_column}_meanvalue'
            hmm[new_column_name] = filtered_data[existing_column].apply(mean_of_values)
        filtered_data = pd.concat([filtered_data, hmm], axis=1)

        column_list = [
            ## Necessary columns
            'PatientID', 'VisitID', 'GroupedICD',
            ## Numerical values
            'Age', 'SystolicBP',
            'DiastolicBP', 'Temperature',
            'Pulse', 'Weight', 'Height', 'BMI', 'Respiration',
            'SPO2', 'PHQ_9Score',
            # 'PeakFlow'
            ## Categorial Values
            'LegalSex', 'BPLocation', 'BPPosition', 'PregnancyStatus', 'LactationStatus', 'TemperatureSource',
            'Age_Category', 'BP Severity', 'Depression Severity', 'weight_condition', 'Temp_condition', 'Pulse_condition',
            'Respiration_condition', 'SPO2_condition', 'PeakF_condition']
        last = list(hmm.columns)
        required_columns = column_list + last

        # Keep only the model inputs, collapse the per-component duplicate rows
        # and drop columns that carry no values at all.
        filtered_data = filtered_data[required_columns].copy()
        filtered_data = filtered_data.drop_duplicates().reset_index(drop=True)
        filtered_data = filtered_data.dropna(axis=1, how='all')
        imputer(filtered_data)
if analysis_option == 'Data':
    # Data page: age-filtered overview of the whole dataset.
    age_min = int(data['Age'].min())
    age_max = int(data['Age'].max())
    age_range = st.sidebar.slider('Select Age Range', age_min, age_max, (age_min, age_max))
    data = data[(data['Age'] >= age_range[0]) & (data['Age'] <= age_range[1])].copy()

    # Unique patients per sex within the selected age range.
    Sex = data.groupby('LegalSex')['PatientID'].nunique().reset_index(name='count')
    st.subheader("Distribution of Patient's by Sex", divider='rainbow')
    col1, col2, col3 = st.columns(3)

    # Look the counts up by sex label rather than by hard-coded row labels
    # 0 / 1: those only match when both sexes are present and sort into
    # exactly those rows, and otherwise raise KeyError or show the wrong
    # number. A sex that is absent from the filtered data is shown as 0.
    sex_counts = Sex.set_index('LegalSex')['count']
    col1.metric(label="Male", value=sex_counts.get('Male', 0))
    col2.metric(label="Female", value=sex_counts.get('Female', 0))

    col4, col5 = st.columns(2)
    fig2 = funnel_chart(data)
    col4.plotly_chart(fig2, use_container_width=True)
    fig = scatterplot(data)
    col5.plotly_chart(fig, use_container_width=True)

    col6 = st.columns(1)[0]
    fig_man = scatter_man(data)
    col6.plotly_chart(fig_man, use_container_width=True)

    # Preview of the first 20 filtered rows; PatientID is shown without decimals.
    st.dataframe(data.head(20).style.format({'PatientID': "{:.0f}"}))
if analysis_option == 'EDA':
    # EDA page: age-filtered charts for a single selected disease.
    age_min = int(data['Age'].min())
    age_max = int(data['Age'].max())
    age_range = st.sidebar.slider('Select Age Range', age_min, age_max, (age_min, age_max))
    data = data[(data['Age'] >= age_range[0]) & (data['Age'] <= age_range[1])].copy()

    problem = list(data['Description'].unique())
    st.subheader("_Select Disease_:sunglasses:")
    health_option = st.selectbox("_Select Disease_:sunglasses:", ['', *problem], label_visibility="collapsed")

    # The charts are drawn only once an option from the disease list is chosen.
    if health_option in problem:
        health_data = data[data['Description'] == health_option].copy()

        # Unique patients per sex for the selected disease.
        Sex = health_data.groupby('LegalSex')['PatientID'].nunique().reset_index(name='count')
        st.subheader(f"Patients for '{health_option}' by Sex", divider='rainbow')
        col1, col2, col3 = st.columns(3)
        if 'Male' in Sex['LegalSex'].values:
            col1.metric(label="Male", value=Sex[Sex['LegalSex'] == 'Male']['count'].iloc[0])
        else:
            col1.metric(label="Male", value=0)
        if 'Female' in Sex['LegalSex'].values:
            col2.metric(label="Female", value=Sex[Sex['LegalSex'] == 'Female']['count'].iloc[0])
        else:
            # Fixed: this fallback used to be labelled "Male" (copy-paste slip),
            # so a disease with no female patients showed two "Male" metrics.
            col2.metric(label="Female", value=0)

        col4, col5 = st.columns(2)
        fig2 = funnel_chart(health_data)
        col4.plotly_chart(fig2, use_container_width=True)
        fig3 = barplot_lab(health_data)
        col5.plotly_chart(fig3, use_container_width=True)

        col6, col7 = st.columns(2)
        fig4 = histplot_6(health_data)
        col6.plotly_chart(fig4, use_container_width=True)
        fig5 = histplot_7(health_data)
        col7.plotly_chart(fig5, use_container_width=True)

        col8, col9 = st.columns(2)
        fig6 = pie_chart_7(health_data)
        col8.plotly_chart(fig6, use_container_width=True)
        fig7 = chart_8(health_data)
        col9.plotly_chart(fig7, use_container_width=True)

        col10, col11 = st.columns(2)
        fig8 = chart_9(health_data)
        col10.plotly_chart(fig8, use_container_width=True)
        fig9 = chart_10(health_data)
        col11.plotly_chart(fig9, use_container_width=True)

        # Last row holds a single chart; the second column is left empty.
        col12, col13 = st.columns(2)
        fig10 = chart_11(health_data)
        col12.plotly_chart(fig10, use_container_width=True)

        # Preview of the first 20 rows; PatientID is shown without decimals.
        st.dataframe(health_data.head(20).style.format({'PatientID': "{:.0f}"}))
# Initialize Google Gemini or any other Google API client using the key
if analysis_option == 'Health Care Chat Bot AI':
    # Chat-bot page: natural-language questions answered by PandasAI on top
    # of the Gemini LLM.
    # Original author's note: start here, just add patient + vital information.
    google_api_key = os.environ.get("google_key")
    llm = GoogleGemini(api_key=google_api_key)

    # Two frames over the same data: `pandas_ai` uses StreamlitResponse, which
    # draws dataframes/plots straight into the page; `pandas_ai_2` uses the
    # default parser and its answer is written out with st.write below.
    pandas_ai = SmartDataframe(data, config={"llm": llm, "response_parser": StreamlitResponse, "verbose": True})
    pandas_ai_2 = SmartDataframe(data, config={"llm": llm, "verbose": True})  ## string

    # Streamlit app title and description
    st.title("AI-Powered Data Analysis App")
    st.write("This application allows you to interact with your dataset using natural language prompts. Just ask a question, and the AI will provide insights based on your data.")

    # Display the dataset
    st.subheader("Dataset Preview")
    st.dataframe(data.head())

    # User input for natural language prompt
    prompt = st.text_input("Enter your prompt:", placeholder="e.g., What are the top diagnoses?")

    # Process the input and display the result
    if st.button("Submit"):
        # Case-insensitive keyword match: the previous check listed only
        # 'plot', 'graph', 'PLOT' and 'Graph', so e.g. "Plot" or "GRAPH"
        # fell through to the text path.
        lowered_prompt = prompt.lower()
        wants_chart = 'plot' in lowered_prompt or 'graph' in lowered_prompt

        if wants_chart:
            try:
                # The StreamlitResponse parser renders the output itself, so
                # the returned value is not written again here.
                result = pandas_ai.chat(prompt)
                st.subheader("Result")
            except KeyError as e:
                st.error(f"Error: {e}. Unable to retrieve result.")
        elif prompt:
            try:
                result = pandas_ai_2.chat(prompt)
                st.subheader("Result")
                st.write(result)
            except KeyError as e:
                st.error(f"Error: {e}. Unable to retrieve result.")
        else:
            st.warning("Please enter a prompt.")

    # Add a footer
    st.write("Powered by PandasAI and Google Gemini.")