vishalsh13 committed on
Commit
7e976f4
·
1 Parent(s): adec62c

commit update

Browse files
Files changed (2) hide show
  1. app copy.py +144 -0
  2. app.py +127 -65
app copy.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ from sklearn.ensemble import RandomForestClassifier
6
+ from sklearn.preprocessing import LabelEncoder
7
+ from fuzzywuzzy import process
8
+
9
+ # Enhanced data generation with realistic fraud patterns
10
def load_data(n_rows=1000, seed=42):
    """Build a synthetic transaction table with rule-based fraud labels.

    Args:
        n_rows: Number of transactions to generate. Defaults to 1000.
        seed: Value passed to ``np.random.seed`` before sampling. Defaults
            to 42.

    Returns:
        A ``pandas.DataFrame`` with the columns ``TransactionID``,
        ``Amount``, ``Type``, ``City``, ``Age``, ``Income`` and a 0/1
        ``Fraud`` label derived from three fixed rules.
    """
    # The global NumPy RNG is seeded (instead of using a local Generator) and
    # the columns are sampled in the same order as before, so the default
    # call keeps producing the same table.
    np.random.seed(seed)
    cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
    incomes = ['Low', 'Medium', 'High']

    data = pd.DataFrame({
        'TransactionID': range(1, n_rows + 1),
        'Amount': np.random.uniform(10, 15000, n_rows).round(2),
        'Type': np.random.choice(['Credit', 'Debit'], n_rows),
        'City': np.random.choice(cities, n_rows),
        'Age': np.random.randint(18, 70, n_rows),
        'Income': np.random.choice(incomes, n_rows, p=[0.4, 0.4, 0.2]),
    })

    # A row is labelled fraudulent when it satisfies any one of these rules.
    large_for_low_income = (data['Amount'] > 5000) & (data['Income'] == 'Low')
    very_large_credit = (data['Type'] == 'Credit') & (data['Amount'] > 8000)
    young_new_yorker = (
        (data['City'] == 'New York')
        & data['Age'].between(20, 35)
        & (data['Amount'] > 6000)
    )

    data['Fraud'] = 0
    data.loc[large_for_low_income | very_large_credit | young_new_yorker, 'Fraud'] = 1

    return data
35
+
36
# Module-level dataset: used to fit the encoders and the classifier here.
data = load_data()

# One encoder per categorical column, so each keeps its own fitted classes.
le_type, le_city, le_income = LabelEncoder(), LabelEncoder(), LabelEncoder()

# The demo fits on the full dataset; a real pipeline would fit on the
# training split only.
data['Type_encoded'] = le_type.fit_transform(data['Type'])
data['City_encoded'] = le_city.fit_transform(data['City'])
data['Income_encoded'] = le_income.fit_transform(data['Income'])

# Feature columns, in the order the model is trained on.
features = ['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']
y = data['Fraud']
X = data[features]

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
55
+
56
# A number as written in a query: digits, optional thousands separators and
# an optional two-digit decimal part.
_NUMBER_PATTERN = r'\d+(?:,\d{3})*(?:\.\d{2})?'

# An age has to be marked as one ("26-year-old", "45 yrs", "aged 30"). A bare
# number is never read as an age because it is usually the amount.
_AGE_RE = re.compile(
    r'\b(\d{1,3})\s*-?\s*(?:years?|yrs?|yo)\b|\baged?\s*:?\s*(\d{1,3})\b',
    re.IGNORECASE,
)

_KNOWN_CITIES = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']


def _find_age(query):
    """Return ``(age, start_index)`` for an explicit age mention, else ``(None, None)``."""
    match = _AGE_RE.search(query)
    if match is None:
        return None, None
    group = match.lastindex  # whichever alternative captured the digits
    return int(match.group(group)), match.start(group)


def _find_amount(query, skip_start=None):
    """Return the amount as a float, preferring a ``$``-prefixed number; ``None`` if absent."""
    dollar = re.search(r'\$\s*(' + _NUMBER_PATTERN + r')', query)
    if dollar:
        return float(dollar.group(1).replace(',', ''))
    for match in re.finditer(_NUMBER_PATTERN, query):
        # Skip the number already identified as the age.
        if match.start() != skip_start:
            return float(match.group(0).replace(',', ''))
    return None


def _find_city(query):
    """Return the known city mentioned in the query, or ``'Unknown'``."""
    lowered = query.lower()
    for name in _KNOWN_CITIES:
        if name.lower() in lowered:
            return name
    # Fuzzy matching is only a fallback for misspellings: scoring a short city
    # name against a whole sentence can fall below the cutoff even when the
    # city is spelled correctly.
    best = process.extractOne(query, _KNOWN_CITIES)
    return best[0] if best and best[1] > 70 else 'Unknown'


def process_nl_query(query):
    """Parse a free-text transaction description and return a fraud report.

    Args:
        query: Text such as "Check a $6000 credit in New York for a
            26-year-old with low income".

    Returns:
        A multi-line string with the extracted transaction details, the
        model's prediction and the rule-based risk factors, or a string
        starting with "Error" when the query cannot be processed.
    """
    try:
        age, age_start = _find_age(query)
        amount = _find_amount(query, skip_start=age_start)
        if amount is None:
            return "Error: Could not extract transaction amount."

        lowered = query.lower()
        trans_type = 'Credit' if 'credit' in lowered else 'Debit'
        city = _find_city(query)

        # Whole-word matching so that words like "below" or "allow" do not
        # count as a low-income mention.
        if re.search(r'\blow(?:er)?\b', lowered):
            income = 'Low'
        elif re.search(r'\bhigh(?:er)?\b', lowered):
            income = 'High'
        else:
            income = 'Medium'

        # Labels the encoders were not fitted on are encoded as -1.
        city_encoded = le_city.transform([city])[0] if city in le_city.classes_ else -1
        income_encoded = le_income.transform([income])[0] if income in le_income.classes_ else -1

        # The model always needs an age; fall back to the dataset median when
        # the query does not state one.
        model_age = age if age is not None else data['Age'].median()

        input_df = pd.DataFrame({
            'Amount': [amount],
            'Type_encoded': [le_type.transform([trans_type])[0]],
            'City_encoded': [city_encoded],
            'Age': [model_age],
            'Income_encoded': [income_encoded],
        })

        proba = model.predict_proba(input_df)[0][1]
        prediction = model.predict(input_df)[0]

        explanation = []
        if amount > 5000 and income == 'Low':
            explanation.append("High amount for low income")
        if amount > 8000 and trans_type == 'Credit':
            explanation.append("Unusually large credit transaction")
        # The age rule only applies when the query actually stated an age.
        if city == 'New York' and age is not None and 20 <= age <= 35 and amount > 6000:
            explanation.append("Suspicious pattern for young adults in NYC")

        return (
            f"Transaction Details:\n"
            f"- Amount: ${amount:,.2f}\n"
            f"- Type: {trans_type}\n"
            f"- City: {city}\n"
            f"- Age: {age if age is not None else 'Not specified'}\n"
            f"- Income Level: {income}\n\n"
            f"Fraud Analysis:\n"
            f"- Prediction: {'Potentially Fraudulent' if prediction else 'Likely Legitimate'}\n"
            f"- Confidence: {proba*100:.1f}%\n"
            f"- Risk Factors: {', '.join(explanation) if explanation else 'No specific risk factors identified'}"
        )

    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error processing query: {str(e)}"
122
+
123
+ # Gradio Interface
124
# Sample queries offered as one-click examples under the input box.
_EXAMPLE_QUERIES = [
    "Is a $8000 credit in Chicago for a 45-year-old medium income safe?",
    "Verify a $300 debit in Phoenix for a 60-year-old high income client",
]

# Summary statistics of the rows labelled as fraud, shown on the insights tab.
_fraud_summary = data[data['Fraud'] == 1].describe()

with gr.Blocks() as demo:
    gr.Markdown("## Enhanced Fraud Detection System")

    # Tab 1: free-text query in, formatted analysis out.
    with gr.Tab("Natural Language Query"):
        gr.Markdown("**Example:** 'Check a $6000 credit in New York for a 26-year-old with low income'")
        nl_input = gr.Textbox(label="Enter your transaction query:")
        nl_output = gr.Textbox(label="Fraud Analysis", lines=10)
        gr.Examples(examples=_EXAMPLE_QUERIES, inputs=nl_input)
        # Pressing Enter in the query box runs the analysis.
        nl_input.submit(fn=process_nl_query, inputs=nl_input, outputs=nl_output)

    # Tab 2: static table describing the fraud-labelled rows.
    with gr.Tab("Data Insights"):
        gr.Markdown("### Fraud Pattern Analysis")
        gr.DataFrame(_fraud_summary)

demo.launch()
app.py CHANGED
@@ -2,15 +2,15 @@ import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
  import re
 
5
  from sklearn.ensemble import RandomForestClassifier
6
  from sklearn.preprocessing import LabelEncoder
7
  from fuzzywuzzy import process
8
 
9
- # Enhanced data generation with realistic fraud patterns
10
  def load_data():
11
  np.random.seed(42)
12
  cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
13
- age_groups = ['18-25', '26-35', '36-45', '46-55', '56+']
14
  incomes = ['Low', 'Medium', 'High']
15
 
16
  data = pd.DataFrame({
@@ -22,7 +22,7 @@ def load_data():
22
  'Income': np.random.choice(incomes, 1000, p=[0.4, 0.4, 0.2])
23
  })
24
 
25
- # Create realistic fraud patterns
26
  data['Fraud'] = 0
27
  data.loc[
28
  ((data['Amount'] > 5000) & (data['Income'] == 'Low')) |
@@ -35,67 +35,36 @@ def load_data():
35
 
36
  data = load_data()
37
 
38
- # Initialize separate encoders for each feature
39
  le_type = LabelEncoder()
40
  le_city = LabelEncoder()
41
  le_income = LabelEncoder()
42
 
43
- # Fit encoders on full dataset (or training data in real scenarios)
44
  data['Type_encoded'] = le_type.fit_transform(data['Type'])
45
  data['City_encoded'] = le_city.fit_transform(data['City'])
46
  data['Income_encoded'] = le_income.fit_transform(data['Income'])
47
 
48
  # Train model
49
- features = ['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']
50
- X = data[features]
51
- y = data['Fraud']
52
-
53
  model = RandomForestClassifier(random_state=42, n_estimators=100)
54
- model.fit(X, y)
55
 
56
- def process_nl_query(query):
 
57
  try:
58
- # Extract amount
59
- amount_match = re.search(r'\$?(\d+(?:,\d{3})*(?:\.\d{2})?)', query)
60
- if amount_match:
61
- amount = float(amount_match.group(1).replace(',', ''))
62
- else:
63
- return "Error: Could not extract transaction amount."
64
-
65
- # Extract transaction type
66
- trans_type = 'Credit' if 'credit' in query.lower() else 'Debit'
67
-
68
- # Fuzzy match city
69
- cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
70
- city_match = process.extractOne(query, cities)
71
- city = city_match[0] if city_match[1] > 70 else 'Unknown'
72
-
73
- # Extract age
74
- age_match = re.search(r'(\d+)\s*(?:years?|yrs?)?(?:\s*old)?', query)
75
- age = int(age_match.group(1)) if age_match else None
76
 
77
- # Extract income level
78
- income = 'Low' if 'low' in query.lower() else \
79
- 'High' if 'high' in query.lower() else 'Medium'
80
-
81
- # Handle unseen labels
82
- city_encoded = le_city.transform([city])[0] if city in le_city.classes_ else -1
83
- income_encoded = le_income.transform([income])[0] if income in le_income.classes_ else -1
84
-
85
- # Prepare input
86
  input_df = pd.DataFrame({
87
  'Amount': [amount],
88
  'Type_encoded': le_type.transform([trans_type])[0],
89
- 'City_encoded': city_encoded,
90
- 'Age': [age] if age else data['Age'].median(), # Handle missing age
91
- 'Income_encoded': income_encoded
92
  })
93
 
94
- # Predict
95
  proba = model.predict_proba(input_df)[0][1]
96
  prediction = model.predict(input_df)[0]
97
 
98
- # Generate explanation
99
  explanation = []
100
  if amount > 5000 and income == 'Low':
101
  explanation.append("High amount for low income")
@@ -105,40 +74,133 @@ def process_nl_query(query):
105
  explanation.append("Suspicious pattern for young adults in NYC")
106
 
107
  return (
108
- f"Transaction Details:\n"
109
- f"- Amount: ${amount:,.2f}\n"
110
- f"- Type: {trans_type}\n"
111
- f"- City: {city}\n"
112
- f"- Age: {age}\n"
113
- f"- Income Level: {income}\n\n"
114
- f"Fraud Analysis:\n"
115
- f"- Prediction: {'Potentially Fraudulent' if prediction else 'Likely Legitimate'}\n"
116
- f"- Confidence: {proba*100:.1f}%\n"
117
- f"- Risk Factors: {', '.join(explanation) if explanation else 'No specific risk factors identified'}"
118
  )
 
 
 
 
 
 
 
 
 
 
 
 
119
 
 
120
  except Exception as e:
121
  return f"Error processing query: {str(e)}"
122
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  # Gradio Interface
124
- with gr.Blocks() as demo:
125
- gr.Markdown("## Enhanced Fraud Detection System")
126
 
127
- with gr.Tab("Natural Language Query"):
128
- gr.Markdown("**Example:** 'Check a $6000 credit in New York for a 26-year-old with low income'")
129
- nl_input = gr.Textbox(label="Enter your transaction query:")
130
- nl_output = gr.Textbox(label="Fraud Analysis", lines=10)
 
 
131
  gr.Examples(
132
  examples=[
133
- "Is a $8000 credit in Chicago for a 45-year-old medium income safe?",
134
- "Verify a $300 debit in Phoenix for a 60-year-old high income client"
135
  ],
136
  inputs=nl_input
137
  )
138
- nl_input.submit(fn=process_nl_query, inputs=nl_input, outputs=nl_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
- with gr.Tab("Data Insights"):
141
- gr.Markdown("### Fraud Pattern Analysis")
142
- gr.DataFrame(data[data['Fraud'] == 1].describe())
143
 
144
  demo.launch()
 
2
  import pandas as pd
3
  import numpy as np
4
  import re
5
+ import matplotlib.pyplot as plt
6
  from sklearn.ensemble import RandomForestClassifier
7
  from sklearn.preprocessing import LabelEncoder
8
  from fuzzywuzzy import process
9
 
10
+ # Data generation and preprocessing
11
  def load_data():
12
  np.random.seed(42)
13
  cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
 
14
  incomes = ['Low', 'Medium', 'High']
15
 
16
  data = pd.DataFrame({
 
22
  'Income': np.random.choice(incomes, 1000, p=[0.4, 0.4, 0.2])
23
  })
24
 
25
+ # Fraud patterns
26
  data['Fraud'] = 0
27
  data.loc[
28
  ((data['Amount'] > 5000) & (data['Income'] == 'Low')) |
 
35
 
36
  data = load_data()
37
 
38
+ # Initialize encoders
39
  le_type = LabelEncoder()
40
  le_city = LabelEncoder()
41
  le_income = LabelEncoder()
42
 
 
43
  data['Type_encoded'] = le_type.fit_transform(data['Type'])
44
  data['City_encoded'] = le_city.fit_transform(data['City'])
45
  data['Income_encoded'] = le_income.fit_transform(data['Income'])
46
 
47
  # Train model
 
 
 
 
48
  model = RandomForestClassifier(random_state=42, n_estimators=100)
49
+ model.fit(data[['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']], data['Fraud'])
50
 
51
+ # Fraud prediction function
52
+ def predict_fraud(amount, trans_type, city, age, income):
53
  try:
54
+ city = city if city in le_city.classes_ else 'Unknown'
55
+ income = income if income in le_income.classes_ else 'Medium'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
 
 
 
 
 
 
 
 
 
57
  input_df = pd.DataFrame({
58
  'Amount': [amount],
59
  'Type_encoded': le_type.transform([trans_type])[0],
60
+ 'City_encoded': le_city.transform([city])[0] if city in le_city.classes_ else -1,
61
+ 'Age': [age],
62
+ 'Income_encoded': le_income.transform([income])[0] if income in le_income.classes_ else -1
63
  })
64
 
 
65
  proba = model.predict_proba(input_df)[0][1]
66
  prediction = model.predict(input_df)[0]
67
 
 
68
  explanation = []
69
  if amount > 5000 and income == 'Low':
70
  explanation.append("High amount for low income")
 
74
  explanation.append("Suspicious pattern for young adults in NYC")
75
 
76
  return (
77
+ f"Prediction: {'Fraudulent' if prediction else 'Legitimate'}\n"
78
+ f"Confidence: {proba*100:.1f}%\n"
79
+ f"Risk Factors: {', '.join(explanation) if explanation else 'No specific risks'}"
 
 
 
 
 
 
 
80
  )
81
+ except Exception as e:
82
+ return f"Error: {str(e)}"
83
+
84
+ # NLP processing function
85
def process_nl_query(query):
    """Extract transaction fields from free text and run the fraud prediction.

    Args:
        query: Text such as "Credit of $6000 in New York for a 26-year-old
            with low income".

    Returns:
        The string produced by ``predict_fraud`` for the extracted fields, or
        a string starting with "Error processing query:" when the query
        cannot be handled.
    """
    try:
        number = r'\d+(?:,\d{3})*(?:\.\d{2})?'
        lowered = query.lower()

        # Accept both "26 years" and the hyphenated "26-year-old" form; fall
        # back to the dataset median when no age is stated.
        age_match = re.search(r'\b(\d{1,3})\s*-?\s*(?:years?|yrs?)\b', query, re.IGNORECASE)
        age = int(age_match.group(1)) if age_match else data['Age'].median()
        age_start = age_match.start(1) if age_match else None

        # Prefer a $-prefixed number; otherwise take the first number that is
        # not the one already read as the age.
        amount_match = re.search(r'\$\s*(' + number + r')', query)
        if amount_match is None:
            amount_match = next(
                (m for m in re.finditer(r'(' + number + r')', query) if m.start(1) != age_start),
                None,
            )
        if amount_match is None:
            return "Error processing query: could not find a transaction amount"
        amount = float(amount_match.group(1).replace(',', ''))

        trans_type = 'Credit' if 'credit' in lowered else 'Debit'
        city = process.extractOne(query, le_city.classes_)[0]
        income = 'Low' if 'low' in lowered else ('High' if 'high' in lowered else 'Medium')

        return predict_fraud(amount, trans_type, city, age, income)
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        return f"Error processing query: {str(e)}"
97
 
98
+ # Visualization function
99
def _draw_fraud_counts(ax, values, title, xlabel, rotate_labels=False):
    """Draw a bar chart of value counts on ``ax``, or a placeholder if there are none."""
    counts = values.value_counts()
    ax.set_title(title)
    if counts.empty:
        ax.text(0.5, 0.5, 'No fraud data available', ha='center', va='center')
        return
    ax.bar(counts.index, counts.values)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    if rotate_labels:
        ax.tick_params(axis='x', labelrotation=45)


def create_plot(choice):
    """Build the figure for the selected visualization.

    Args:
        choice: One of "Fraud by City", "Fraud by Income" or
            "Transaction Patterns".

    Returns:
        A ``(figure, "")`` tuple on success, or ``(None, message)`` when the
        plot could not be generated.
    """
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 6))
        fraud_data = data[data['Fraud'] == 1]

        if choice == "Fraud by City":
            _draw_fraud_counts(
                ax, fraud_data['City'], 'Fraud Cases by City', 'City',
                rotate_labels=True,
            )

        elif choice == "Fraud by Income":
            _draw_fraud_counts(
                ax, fraud_data['Income'], 'Fraud Cases by Income Level',
                'Income Level',
            )

        elif choice == "Transaction Patterns":
            legit_data = data[data['Fraud'] == 0]
            ax.set_title('Transaction Amount vs Age')
            if not fraud_data.empty:
                ax.scatter(legit_data['Amount'], legit_data['Age'],
                           alpha=0.3, label='Legitimate')
                ax.scatter(fraud_data['Amount'], fraud_data['Age'],
                           color='red', alpha=0.5, label='Fraud')
                ax.set_xlabel('Amount')
                ax.set_ylabel('Age')
                ax.legend()
            else:
                ax.text(0.5, 0.5, 'No fraud data available',
                        ha='center', va='center')

        else:
            # Make an unexpected selection visible instead of returning blank axes.
            ax.text(0.5, 0.5, f'Unknown visualization: {choice}',
                    ha='center', va='center')

        # Figure-level call so pyplot's global "current figure" is not involved.
        fig.tight_layout()
        return fig, ""

    except Exception as e:
        return None, f"Error generating plot: {str(e)}"

    finally:
        # Drop the figure from pyplot's registry on every path so figures do
        # not accumulate across requests; the returned object stays usable.
        if fig is not None:
            plt.close(fig)
151
+
152
  # Gradio Interface
153
# Four-tab UI: natural-language query, manual form, plots, and the raw table.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🕵️ Banking Fraud Detection System")

    with gr.Tab("💬 NLP Query"):
        gr.Markdown("## Analyze Transactions with Natural Language")
        with gr.Row():
            nl_input = gr.Textbox(label="Describe transaction", placeholder="e.g., 'Credit of $6000 in New York for a 26-year-old with low income'")
            nl_btn = gr.Button("Analyze", variant="primary")
        nl_output = gr.Textbox(label="Analysis Result", lines=4)
        gr.Examples(
            examples=[
                "$8000 credit in Chicago for 45-year-old with medium income",
                "Verify $300 debit in Phoenix for 60-year-old high income client"
            ],
            inputs=nl_input
        )
        nl_btn.click(process_nl_query, nl_input, nl_output)
        # Pressing Enter in the text box runs the same analysis as the button.
        nl_input.submit(process_nl_query, nl_input, nl_output)

    with gr.Tab("📝 Manual Input"):
        gr.Markdown("## Manual Transaction Analysis")
        with gr.Row():
            amount = gr.Number(label="Amount", minimum=0)
            trans_type = gr.Dropdown(["Credit", "Debit"], label="Type")
        with gr.Row():
            # Dropdown choices come from the fitted encoders' classes.
            city = gr.Dropdown(le_city.classes_.tolist(), label="City")
            age = gr.Number(label="Age", minimum=18)
            income = gr.Dropdown(le_income.classes_.tolist(), label="Income Level")
        manual_btn = gr.Button("Analyze", variant="primary")
        manual_output = gr.Textbox(label="Analysis Result", lines=4)
        manual_btn.click(predict_fraud, [amount, trans_type, city, age, income], manual_output)

    with gr.Tab("📊 Data Insights"):
        gr.Markdown("## Fraud Pattern Visualization")
        with gr.Row():
            plot_choice = gr.Radio(
                ["Fraud by City", "Fraud by Income", "Transaction Patterns"],
                label="Select Visualization",
                value="Fraud by City"
            )
        with gr.Row():
            plot_output = gr.Plot()
            error_output = gr.Textbox(label="Error Message", visible=False)

        plot_choice.change(
            fn=create_plot,
            inputs=plot_choice,
            outputs=[plot_output, error_output]
        )
        # Draw the default selection on page load; without this the plot area
        # stays empty until the user picks a different option.
        demo.load(
            fn=create_plot,
            inputs=plot_choice,
            outputs=[plot_output, error_output]
        )

    with gr.Tab("📁 Raw Data"):
        gr.Markdown("## Complete Transaction Dataset")
        gr.DataFrame(data)

demo.launch()