gaur3009 committed on
Commit
2f99a1c
·
verified ·
1 Parent(s): 317497b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +142 -0
app.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.ensemble import RandomForestClassifier
7
+ from sklearn.preprocessing import LabelEncoder
8
+ from sklearn.impute import SimpleImputer
9
+ from io import BytesIO
10
+ import warnings
11
+
12
+ warnings.filterwarnings("ignore")
13
+
14
+ # Function to read and process uploaded file
15
def read_file(file):
    """Load an uploaded CSV or Excel file into a DataFrame.

    Args:
        file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            object exposing a ``name`` attribute with the file name. If the
            object is file-like (has ``read``), it is handed to pandas
            directly; otherwise its ``name`` is used as the path.

    Returns:
        pandas.DataFrame: The parsed table.

    Raises:
        ValueError: If no file was provided or the extension is not
            ``.csv``, ``.xlsx`` or ``.xls``.
    """
    import os

    if file is None:
        raise ValueError("No file provided. Please upload a CSV or Excel file.")

    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = getattr(file, "name", None)
    if not isinstance(path, str) or not path:
        raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")

    # Keep handing open file objects to pandas as before; fall back to the
    # path for plain strings or objects that only carry a name.
    source = file if hasattr(file, "read") else path

    # Compare the extension case-insensitively so "DATA.CSV" is accepted.
    extension = os.path.splitext(path)[1].lower()
    if extension == ".csv":
        return pd.read_csv(source)
    if extension in (".xlsx", ".xls"):
        return pd.read_excel(source)
    raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")
23
+
24
+ # Clean the data
25
def clean_data(df):
    """Drop duplicate rows and fill missing values with each column's mode.

    The imputation is done with pandas so that column dtypes survive:
    pushing a mixed-type frame through an array-based imputer yields an
    all-``object`` frame, which hides numeric columns from later numeric
    selection.

    Args:
        df: Input ``pandas.DataFrame``.

    Returns:
        pandas.DataFrame: A new frame with a fresh ``RangeIndex``, no
        duplicate input rows, columns with no observed values removed, and
        remaining gaps filled with the most frequent value of their column
        (the smallest value wins on ties).
    """
    deduped = df.drop_duplicates().reset_index(drop=True)
    if deduped.empty:
        # Nothing to impute; avoid computing modes of empty columns.
        return deduped

    # A column with no observed value has no mode to impute from.
    deduped = deduped.dropna(axis=1, how="all")

    fill_values = {
        col: deduped[col].mode(dropna=True).iloc[0] for col in deduped.columns
    }
    return deduped.fillna(value=fill_values)
32
+
33
+ # Generate summary statistics
34
def generate_summary(df):
    """Return descriptive statistics with one row per input column.

    Args:
        df: ``pandas.DataFrame`` to summarise.

    Returns:
        pandas.DataFrame: ``describe(include="all")`` transposed, so each
        input column becomes a row and each statistic a column.
    """
    stats = df.describe(include="all")
    return stats.T
36
+
37
+ # Correlation heatmap
38
def generate_correlation_heatmap(df):
    """Render the correlation matrix of the numeric columns as a PNG.

    Args:
        df: ``pandas.DataFrame``; only numeric columns are used.

    Returns:
        io.BytesIO: In-memory PNG of the annotated heatmap, positioned at
        the start of the buffer.

    Raises:
        ValueError: If ``df`` has no numeric columns.
    """
    numeric_df = df.select_dtypes(include=[np.number])
    if numeric_df.shape[1] == 0:
        # Fail with a readable message instead of a zero-size-array error
        # from deep inside the plotting stack.
        raise ValueError("No numeric columns available to compute a correlation heatmap.")

    corr = numeric_df.corr()

    # Use an explicit figure rather than pyplot's implicit "current figure"
    # so the exact figure created here is the one saved and closed.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
        buf = BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

    buf.seek(0)
    return buf
48
+
49
+ # Feature importance using Random Forest
50
def feature_importance(df):
    """Rank features by Random Forest importance, using the last column as target.

    Object-typed columns are label-encoded before fitting. The last column
    of ``df`` is the classification target; all preceding columns are
    features.

    Args:
        df: ``pandas.DataFrame`` with at least one feature column followed
            by a target column, and at least one row.

    Returns:
        pandas.DataFrame: Columns ``"Feature"`` and ``"Importance"``, sorted
        by importance in descending order.

    Raises:
        ValueError: If ``df`` has fewer than two columns or no rows.
    """
    if df.shape[1] < 2:
        raise ValueError(
            "Feature importance needs at least one feature column and a target column."
        )
    if df.empty:
        raise ValueError("Feature importance cannot be computed on a dataset with no rows.")

    # Encode categorical variables on a copy so the caller's frame is untouched.
    encoded = df.copy()
    for col in encoded.select_dtypes(include="object").columns:
        encoded[col] = LabelEncoder().fit_transform(encoded[col])

    # Positional selection keeps the target a Series even if names repeat.
    features = encoded.iloc[:, :-1]
    target = encoded.iloc[:, -1]

    # Fixed seed keeps the ranking reproducible between runs.
    model = RandomForestClassifier(random_state=42)
    model.fit(features, target)

    importance = pd.DataFrame(
        {
            "Feature": features.columns,
            "Importance": model.feature_importances_,
        }
    ).sort_values(by="Importance", ascending=False)

    return importance
75
+
76
+ # Visualize feature importance
77
def plot_feature_importance(importance):
    """Render a horizontal bar chart of feature importances as a PNG.

    Args:
        importance: ``pandas.DataFrame`` with ``"Feature"`` and
            ``"Importance"`` columns.

    Returns:
        io.BytesIO: In-memory PNG of the chart, positioned at the start of
        the buffer.
    """
    # Use an explicit figure rather than pyplot's implicit "current figure"
    # so the exact figure created here is the one saved and closed.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x="Importance", y="Feature", data=importance, ax=ax)
        ax.set_title("Feature Importance")
        buf = BytesIO()
        fig.savefig(buf, format="png")
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)

    buf.seek(0)
    return buf
86
+
87
+ # Main analysis function
88
def analyze_file(file):
    """Run the full analysis pipeline on an uploaded file.

    Steps: read the file, clean it, compute summary statistics, render the
    correlation heatmap, compute feature importances and render their plot.

    Args:
        file: The uploaded file as delivered by the Gradio file input.

    Returns:
        tuple: ``(summary, heatmap_image, top_importance, importance_image)``
        where the two tables are DataFrames and the two images are uint8
        pixel arrays, matching the four output components of the interface.

    Raises:
        gr.Error: If any step fails. Returning a lone error string would
            not match the four declared outputs, so the failure is raised
            for Gradio to report to the user instead.
    """
    try:
        # Step 1: Read file
        df = read_file(file)

        # Step 2: Clean data
        df_cleaned = clean_data(df)

        # Step 3: Summary statistics. The described column names live in the
        # index; move them into a regular column so they appear in the table.
        summary = generate_summary(df_cleaned).reset_index()
        summary = summary.rename(columns={"index": "Column"})

        # Step 4: Correlation heatmap
        heatmap_image = _png_buffer_to_array(generate_correlation_heatmap(df_cleaned))

        # Step 5: Feature importance analysis
        importance = feature_importance(df_cleaned)
        importance_image = _png_buffer_to_array(plot_feature_importance(importance))
    except Exception as e:
        raise gr.Error(str(e)) from e

    # Step 6: Return results (top 10 most important features only)
    return (
        summary,
        heatmap_image,
        importance.head(10),
        importance_image,
    )


def _png_buffer_to_array(buf):
    """Decode an in-memory PNG buffer into a uint8 pixel array for gr.Image."""
    buf.seek(0)
    pixels = plt.imread(buf, format="png")
    if pixels.dtype != np.uint8:
        # PNGs are decoded as floats in [0, 1]; scale to 8-bit channels.
        pixels = (pixels * 255).round().astype(np.uint8)
    return pixels
115
+
116
+ # Gradio Interface
117
def gradio_interface():
    """Build the Gradio Blocks UI for the analytics tool.

    The layout has a title and description, one row with the file upload
    and the "Analyze" button, and one row with the four result components
    (summary table, heatmap image, importance table, importance plot). The
    button click is wired to ``analyze_file``.

    Returns:
        gr.Blocks: The assembled, not yet launched, interface.
    """
    with gr.Blocks() as demo:
        gr.Markdown("# AI Data Analytics Tool")
        gr.Markdown("Upload your dataset in CSV or Excel format to analyze and generate insights automatically.")

        with gr.Row():
            upload = gr.File(label="Upload your CSV or Excel file")
            run_button = gr.Button("Analyze")

        with gr.Row():
            # Order must match the tuple returned by analyze_file.
            result_components = [
                gr.Dataframe(label="Summary Statistics"),
                gr.Image(label="Correlation Heatmap"),
                gr.Dataframe(label="Feature Importance"),
                gr.Image(label="Feature Importance Plot"),
            ]

        run_button.click(
            analyze_file,
            inputs=upload,
            outputs=result_components,
        )

    return demo
139
+
140
# Build the interface at import time so `interface` stays available as a
# module attribute, but only start the server when run as a script.
interface = gradio_interface()

if __name__ == "__main__":
    interface.launch(debug=True)