"""AI data analytics tool.

Upload a CSV/Excel file through a Gradio UI and get back summary
statistics, a correlation heatmap, and Random-Forest feature importances.
"""

import tempfile
import warnings
from io import BytesIO

import gradio as gr
import matplotlib

matplotlib.use("Agg")  # headless backend: the app runs server-side with no display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings("ignore")


def read_file(file):
    """Load an uploaded CSV or Excel file into a DataFrame.

    Depending on the Gradio version, *file* is either a tempfile wrapper
    (with a ``.name`` attribute) or a plain path string — handle both.

    Raises:
        ValueError: if the extension is not .csv/.xlsx/.xls.
    """
    path = file if isinstance(file, str) else file.name
    if path.endswith(".csv"):
        return pd.read_csv(path)
    if path.endswith((".xlsx", ".xls")):
        return pd.read_excel(path)
    raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")


def clean_data(df):
    """Drop duplicate rows and impute missing values, preserving dtypes.

    Numeric columns are filled with their median, all other columns with
    their mode. Running a single imputer over the whole frame (the naive
    approach) casts every column to ``object``, which silently breaks
    ``corr()`` and ``select_dtypes`` downstream — so impute per column.
    """
    df = df.drop_duplicates().reset_index(drop=True)
    for col in df.columns:
        if not df[col].isna().any():
            continue
        if pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())
        else:
            mode = df[col].mode()
            if not mode.empty:
                df[col] = df[col].fillna(mode.iloc[0])
    return df


def generate_summary(df):
    """Return per-column summary statistics as a displayable DataFrame.

    ``gr.Dataframe`` does not render the index, so the column names are
    surfaced as an explicit "Column" column.
    """
    summary = df.describe(include="all").transpose()
    return summary.reset_index().rename(columns={"index": "Column"})


def _save_current_figure():
    """Save the active matplotlib figure to a temp PNG and return its path.

    ``gr.Image`` accepts a filepath (not a raw BytesIO buffer), so plots
    are written to disk for Gradio to serve.
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    plt.savefig(tmp.name, bbox_inches="tight")
    plt.close()
    return tmp.name


def generate_correlation_heatmap(df):
    """Render a correlation heatmap of the numeric columns; return PNG path.

    Raises:
        ValueError: if fewer than two numeric columns exist (no pairwise
        correlations to plot).
    """
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] < 2:
        raise ValueError("Need at least two numeric columns for a correlation heatmap.")
    corr = numeric_df.corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
    return _save_current_figure()


def feature_importance(df):
    """Rank features by Random-Forest importance against the last column.

    NOTE(review): the last column is assumed to be a categorical target
    (a classifier is fit) — confirm this matches the uploaded data.

    Returns:
        DataFrame with "Feature" and "Importance" columns, sorted
        descending by importance.

    Raises:
        ValueError: if the frame has fewer than two columns.
    """
    if df.shape[1] < 2:
        raise ValueError("Need at least two columns (features plus a target).")
    df_encoded = df.copy()
    for col in df_encoded.select_dtypes(include="object").columns:
        # factorize is equivalent to label-encoding here and tolerates
        # mixed-type object columns via the str cast.
        df_encoded[col] = pd.factorize(df_encoded[col].astype(str))[0]

    X = df_encoded.iloc[:, :-1]
    y = df_encoded.iloc[:, -1]

    model = RandomForestClassifier(random_state=42)
    model.fit(X, y)

    importance = pd.DataFrame(
        {"Feature": X.columns, "Importance": model.feature_importances_}
    ).sort_values(by="Importance", ascending=False)
    return importance


def plot_feature_importance(importance):
    """Plot the importance ranking as a horizontal bar chart; return PNG path."""
    plt.figure(figsize=(10, 6))
    sns.barplot(x="Importance", y="Feature", data=importance)
    plt.title("Feature Importance")
    return _save_current_figure()


def analyze_file(file):
    """Run the full pipeline (read → clean → summarize → plot) on an upload.

    Returns:
        (summary DataFrame, heatmap PNG path, top-10 importance DataFrame,
        importance-plot PNG path) — matching the four declared outputs.

    Raises:
        gr.Error: on any failure. Returning a bare string (the old
        behavior) mismatches the four outputs of the click handler;
        gr.Error surfaces the message in the UI instead.
    """
    try:
        df = read_file(file)
        df_cleaned = clean_data(df)
        summary = generate_summary(df_cleaned)
        heatmap_path = generate_correlation_heatmap(df_cleaned)
        importance = feature_importance(df_cleaned)
        importance_plot_path = plot_feature_importance(importance)
        return summary, heatmap_path, importance.head(10), importance_plot_path
    except Exception as e:
        raise gr.Error(str(e))


def gradio_interface():
    """Build and return the Gradio Blocks UI (does not launch it)."""
    with gr.Blocks() as interface:
        gr.Markdown("# AI Data Analytics Tool")
        gr.Markdown(
            "Upload your dataset in CSV or Excel format to analyze and generate insights automatically."
        )
        with gr.Row():
            file_input = gr.File(label="Upload your CSV or Excel file")
            analyze_button = gr.Button("Analyze")
        with gr.Row():
            summary_output = gr.Dataframe(label="Summary Statistics")
            heatmap_output = gr.Image(label="Correlation Heatmap")
            importance_output = gr.Dataframe(label="Feature Importance")
            importance_plot_output = gr.Image(label="Feature Importance Plot")
        analyze_button.click(
            analyze_file,
            inputs=file_input,
            outputs=[
                summary_output,
                heatmap_output,
                importance_output,
                importance_plot_output,
            ],
        )
    return interface


# Keep the module-level `interface` name for importers, but only launch
# when run as a script (the old code launched unconditionally on import).
interface = gradio_interface()

if __name__ == "__main__":
    interface.launch(debug=True)