# NOTE(review): the three lines below were Hugging Face Spaces page chrome
# captured when this file was scraped ("Spaces: / Sleeping / Sleeping").
# They are not code; kept here as a comment so the module parses.
# Standard library first, then third-party, grouped per PEP 8.
import warnings
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Blanket-suppress library warnings so they do not leak into the app UI.
# NOTE(review): this hides *all* warnings, including deprecations — consider
# narrowing to specific categories.
warnings.filterwarnings("ignore")
def read_file(file):
    """Load an uploaded CSV or Excel file into a DataFrame.

    Parameters
    ----------
    file : file-like object with a ``.name`` attribute (as provided by
        ``gr.File``).

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the extension is unsupported, the file cannot be parsed, or it
        contains no data / no columns.
    """
    # Case-insensitive extension check (".CSV" uploads are valid too).
    name = file.name.lower()
    if name.endswith(".csv"):
        reader = pd.read_csv
    elif name.endswith((".xlsx", ".xls")):
        reader = pd.read_excel
    else:
        # Raised outside the try below so it is not double-wrapped into
        # "Error reading file: Unsupported file format..." as before.
        raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")
    try:
        df = reader(file)
    except Exception as e:
        # Normalize parser failures to ValueError; keep the cause chained.
        raise ValueError(f"Error reading file: {e}") from e
    # Ensure the file has columns / rows to work with.
    if df.empty or df.columns.size == 0:
        raise ValueError("The file has no data or valid columns to parse.")
    return df
# Clean the data
def clean_data(df):
    """Return a cleaned copy of *df*: duplicates dropped, NaNs imputed.

    Missing values in each column are filled with that column's mode (most
    frequent value).  This replaces the previous
    ``SimpleImputer(strategy="most_frequent")`` approach, whose
    ``fit_transform`` returned an object array and therefore silently turned
    every column into ``object`` dtype — which broke downstream numeric
    analysis (``select_dtypes(include=[np.number])`` found no columns).
    Filling per-column preserves the original dtypes.
    """
    # drop_duplicates returns a copy, so the caller's frame is not mutated.
    df = df.drop_duplicates().reset_index(drop=True)
    for col in df.columns:
        if df[col].isna().any():
            mode = df[col].mode(dropna=True)
            # An all-NaN column has no mode; leave it untouched.
            if not mode.empty:
                df[col] = df[col].fillna(mode.iloc[0])
    return df
# Generate summary statistics
def generate_summary(df):
    """Describe all columns (numeric and categorical), one row per column."""
    summary = df.describe(include="all")
    return summary.T
# Correlation heatmap
def generate_correlation_heatmap(df):
    """Render a correlation heatmap of *df*'s numeric columns as PNG bytes.

    Returns
    -------
    BytesIO or None
        PNG image data, or ``None`` when fewer than two numeric columns
        exist (a correlation matrix would be empty or trivial — previously
        this case crashed inside seaborn).
    """
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] < 2:
        return None  # nothing meaningful to correlate; gr.Image accepts None
    corr = numeric_df.corr()
    plt.figure(figsize=(10, 8))
    try:
        sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
        buf = BytesIO()
        plt.savefig(buf, format="png")
        buf.seek(0)
    finally:
        # Always release the figure, even if rendering/saving raises,
        # so repeated calls do not leak matplotlib state.
        plt.close()
    return buf
# Feature importance using Random Forest
def feature_importance(df):
    """Rank features by Random-Forest importance.

    The LAST column of *df* is treated as the prediction target; every other
    column is a feature.  Categorical (object) columns are label-encoded
    before fitting.

    Returns a DataFrame with 'Feature' and 'Importance' columns, sorted in
    descending importance order.
    """
    encoded = df.copy()
    encoders = {}
    # Encode categorical variables in place on the copy.
    for column in encoded.select_dtypes(include="object").columns:
        encoder = LabelEncoder()
        encoded[column] = encoder.fit_transform(encoded[column])
        encoders[column] = encoder
    # Split: last column is the target, the rest are features.
    features = encoded.iloc[:, :-1]
    target = encoded.iloc[:, -1]
    # Fixed seed keeps the importance ranking reproducible across runs.
    forest = RandomForestClassifier(random_state=42)
    forest.fit(features, target)
    ranking = pd.DataFrame(
        {"Feature": features.columns, "Importance": forest.feature_importances_}
    )
    return ranking.sort_values(by="Importance", ascending=False)
# Visualize feature importance
def plot_feature_importance(importance):
    """Draw the ranked importances as a horizontal bar chart; return PNG bytes."""
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=importance)
    plt.title("Feature Importance")
    image = BytesIO()
    plt.savefig(image, format="png")
    image.seek(0)
    plt.close()
    return image
def analyze_file(file):
    """Run the full analysis pipeline on an uploaded file.

    Returns a 4-tuple ``(summary, heatmap_png, top10_importances,
    importance_plot_png)``.  On any failure the first element is an error
    message string and the remaining three are ``None`` — the shape the four
    Gradio output widgets expect either way.
    """

    def failure(message):
        # Every error path yields the same 4-slot shape as a success.
        return (message, None, None, None)

    try:
        # Step 1: Read file
        df = read_file(file)
        if df.empty:
            return failure("The uploaded file is empty or has no valid data.")
        # Step 2: Clean data
        cleaned = clean_data(df)
        if cleaned.empty:
            return failure("The dataset contains no valid data after cleaning.")
        # Steps 3-5: summary stats, correlation heatmap, feature importance.
        summary = generate_summary(cleaned)
        heatmap = generate_correlation_heatmap(cleaned)
        importance = feature_importance(cleaned)
        importance_plot = plot_feature_importance(importance)
        # Step 6: the table shows only the top 10 important features.
        return (summary, heatmap, importance.head(10), importance_plot)
    except ValueError as ve:
        # File-format / parsing problems surfaced by the pipeline.
        return failure(f"ValueError: {str(ve)}")
    except Exception as e:
        # Catch-all boundary for anything unforeseen.
        return failure(f"An unexpected error occurred: {str(e)}")
# Gradio Interface
def gradio_interface():
    """Assemble the Gradio Blocks UI and wire the Analyze button."""
    with gr.Blocks() as interface:
        gr.Markdown("# AI Data Analytics Tool")
        gr.Markdown(
            "Upload your dataset in CSV or Excel format to analyze and generate insights automatically."
        )
        # Input row: file picker + trigger button.
        with gr.Row():
            file_input = gr.File(label="Upload your CSV or Excel file")
            analyze_button = gr.Button("Analyze")
        # Output row: one widget per element of analyze_file's 4-tuple.
        with gr.Row():
            summary_output = gr.Dataframe(label="Summary Statistics")
            heatmap_output = gr.Image(label="Correlation Heatmap")
            importance_output = gr.Dataframe(label="Feature Importance")
            importance_plot_output = gr.Image(label="Feature Importance Plot")
        # Fan analyze_file's results out to the four outputs, in order.
        analyze_button.click(
            analyze_file,
            inputs=file_input,
            outputs=[
                summary_output,
                heatmap_output,
                importance_output,
                importance_plot_output,
            ],
        )
    return interface
# Build and launch the app at import time (Hugging Face Spaces runs the
# module top-level; `debug=True` surfaces tracebacks in the server log).
interface = gradio_interface()
interface.launch(debug=True)