"""AI data analytics tool.

Upload a CSV/Excel file through a Gradio UI and get back summary statistics,
a correlation heatmap of the numeric columns, and Random Forest feature
importances (treating the last column as the target).
"""

import warnings

import gradio as gr
import matplotlib

matplotlib.use("Agg")  # headless backend: figures are rendered to arrays, never shown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")


def _fig_to_array(fig):
    """Render a Matplotlib figure to an RGBA numpy array and close it.

    gr.Image accepts numpy arrays / PIL images / file paths, but NOT BytesIO
    buffers — the original code handed it BytesIO, which Gradio cannot display.
    """
    fig.canvas.draw()
    img = np.asarray(fig.canvas.buffer_rgba()).copy()
    plt.close(fig)  # always close to avoid leaking figures across requests
    return img


def read_file(file):
    """Load an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: a Gradio file object (has a ``.name`` path attribute).

    Returns:
        pd.DataFrame with at least one row and one column.

    Raises:
        ValueError: unsupported extension, parse failure, or empty file.
    """
    try:
        name = file.name.lower()  # case-insensitive: accept .CSV / .XLSX too
        if name.endswith(".csv"):
            df = pd.read_csv(file)
        elif name.endswith((".xlsx", ".xls")):
            df = pd.read_excel(file)
        else:
            raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")
        if df.empty or df.columns.size == 0:
            raise ValueError("The file has no data or valid columns to parse.")
        return df
    except ValueError:
        # Re-raise our own messages untouched instead of double-wrapping them
        # into "Error reading file: Unsupported file format ...".
        raise
    except Exception as e:
        raise ValueError(f"Error reading file: {str(e)}") from e


def clean_data(df):
    """Drop duplicate rows and impute missing values with each column's mode.

    SimpleImputer returns an all-``object`` ndarray, which silently turned
    numeric columns into ``object`` dtype and made the correlation heatmap
    empty — so dtypes are re-inferred after imputation. Columns that are
    entirely NaN are dropped up front because SimpleImputer discards them,
    which would otherwise desync the column list.
    """
    df = df.drop_duplicates()
    df = df.dropna(axis=1, how="all")
    imputer = SimpleImputer(strategy="most_frequent")
    cleaned = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)
    return cleaned.infer_objects()  # restore numeric dtypes lost by the imputer


def generate_summary(df):
    """Return describe() over all columns, transposed to one row per column."""
    return df.describe(include="all").transpose()


def generate_correlation_heatmap(df):
    """Return an RGBA image of the numeric-column correlation heatmap.

    Returns None when fewer than two numeric columns exist (no meaningful
    correlation matrix); gr.Image renders None as an empty component.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] < 2:
        return None
    fig = plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
    return _fig_to_array(fig)


def feature_importance(df):
    """Rank features by Random Forest importance.

    NOTE(review): assumes the LAST column is a categorical target — a
    continuous target makes RandomForestClassifier inappropriate; confirm
    this matches the intended datasets.

    Returns:
        pd.DataFrame with "Feature" and "Importance" columns, sorted
        descending by importance.

    Raises:
        ValueError: fewer than two columns (nothing to split into X / y).
    """
    if df.shape[1] < 2:
        raise ValueError("Need at least two columns (features + target) for feature importance.")
    df_encoded = df.copy()
    # Label-encode object columns so the forest can consume them; astype(str)
    # guards against mixed-type columns that LabelEncoder cannot sort.
    for col in df_encoded.select_dtypes(include="object").columns:
        df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col].astype(str))
    X = df_encoded.iloc[:, :-1]
    y = df_encoded.iloc[:, -1]
    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)
    return (
        pd.DataFrame({"Feature": X.columns, "Importance": model.feature_importances_})
        .sort_values(by="Importance", ascending=False)
        .reset_index(drop=True)
    )


def plot_feature_importance(importance):
    """Return an RGBA image of a horizontal bar chart of feature importances."""
    fig = plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=importance)
    plt.title("Feature Importance")
    plt.tight_layout()
    return _fig_to_array(fig)


def analyze_file(file):
    """Full pipeline: read -> clean -> summarize -> heatmap -> importances.

    Returns a 4-tuple matching the Gradio outputs. On error the first slot
    carries a one-row DataFrame with the message (gr.Dataframe cannot render
    a bare string, which the original returned) and the rest are None.
    """

    def _error(msg):
        return pd.DataFrame({"Error": [msg]}), None, None, None

    try:
        df = read_file(file)  # raises ValueError for empty/unreadable files
        df_cleaned = clean_data(df)
        if df_cleaned.empty:
            return _error("The dataset contains no valid data after cleaning.")
        # reset_index so the column names (describe's index) are visible in
        # the Gradio table, which drops the DataFrame index.
        summary = generate_summary(df_cleaned).reset_index().rename(columns={"index": "Column"})
        heatmap_img = generate_correlation_heatmap(df_cleaned)
        importance = feature_importance(df_cleaned)
        importance_plot_img = plot_feature_importance(importance)
        return summary, heatmap_img, importance.head(10), importance_plot_img
    except ValueError as ve:
        return _error(f"ValueError: {str(ve)}")
    except Exception as e:
        return _error(f"An unexpected error occurred: {str(e)}")


def gradio_interface():
    """Build and return the Gradio Blocks UI wired to analyze_file."""
    with gr.Blocks() as interface:
        gr.Markdown("# AI Data Analytics Tool")
        gr.Markdown("Upload your dataset in CSV or Excel format to analyze and generate insights automatically.")
        with gr.Row():
            file_input = gr.File(label="Upload your CSV or Excel file")
            analyze_button = gr.Button("Analyze")
        with gr.Row():
            summary_output = gr.Dataframe(label="Summary Statistics")
            heatmap_output = gr.Image(label="Correlation Heatmap")
            importance_output = gr.Dataframe(label="Feature Importance")
            importance_plot_output = gr.Image(label="Feature Importance Plot")
        analyze_button.click(
            analyze_file,
            inputs=file_input,
            outputs=[summary_output, heatmap_output, importance_output, importance_plot_output],
        )
    return interface


if __name__ == "__main__":
    # Guarded so importing this module no longer starts a web server as a
    # side effect; running it as a script behaves exactly as before.
    interface = gradio_interface()
    interface.launch(debug=True)