Spaces:

AITestingWorkSpace
/

FraudNLP

Paused

App Files Files Community

vishalsh13 committed on Feb 2

Commit

7e976f4

1 Parent(s): adec62c

commit update

Browse files

Files changed (2) hide show

app copy.py +144 -0
app.py +127 -65

app copy.py ADDED Viewed

	@@ -0,0 +1,144 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+import re
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.preprocessing import LabelEncoder
+from fuzzywuzzy import process
+# Enhanced data generation with realistic fraud patterns
+def load_data(n_rows=1000, seed=42):
+    """Build a synthetic transaction table with rule-based fraud labels.
+
+    Args:
+        n_rows: Number of transactions to generate (default 1000).
+        seed: Value passed to ``np.random.seed``; note that this reseeds
+            NumPy's global generator as a side effect.
+
+    Returns:
+        A DataFrame with the columns TransactionID, Amount, Type, City, Age,
+        Income and Fraud (0/1). With the default arguments the generated
+        data is the same as with the previous hard-coded values.
+    """
+    np.random.seed(seed)
+    cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
+    incomes = ['Low', 'Medium', 'High']
+    # The columns are drawn in this order on purpose: reordering them would
+    # change which random numbers each column receives.
+    data = pd.DataFrame({
+        'TransactionID': range(1, n_rows + 1),
+        'Amount': np.random.uniform(10, 15000, n_rows).round(2),
+        'Type': np.random.choice(['Credit', 'Debit'], n_rows),
+        'City': np.random.choice(cities, n_rows),
+        'Age': np.random.randint(18, 70, n_rows),
+        'Income': np.random.choice(incomes, n_rows, p=[0.4, 0.4, 0.2])
+    })
+    # A row is labelled fraudulent when any one of three rules holds:
+    #   1. amount above 5000 on a low income,
+    #   2. credit transaction above 8000,
+    #   3. New York, age 20-35 (inclusive), amount above 6000.
+    is_fraud = (
+        ((data['Amount'] > 5000) & (data['Income'] == 'Low')) |
+        ((data['Type'] == 'Credit') & (data['Amount'] > 8000)) |
+        ((data['City'] == 'New York') & (data['Age'].between(20, 35)) & (data['Amount'] > 6000))
+    )
+    data['Fraud'] = 0
+    data.loc[is_fraud, 'Fraud'] = 1
+    return data
+# Module-level setup: everything below runs once, at import time.
+data = load_data()
+# Initialize separate encoders for each feature
+# (one LabelEncoder per categorical column, so their class lists stay independent)
+le_type = LabelEncoder()
+le_city = LabelEncoder()
+le_income = LabelEncoder()
+# Fit encoders on full dataset (or training data in real scenarios)
+# The integer codes are stored next to the original text columns.
+data['Type_encoded'] = le_type.fit_transform(data['Type'])
+data['City_encoded'] = le_city.fit_transform(data['City'])
+data['Income_encoded'] = le_income.fit_transform(data['Income'])
+# Train model
+# The column order in `features` is the order the model is fitted with;
+# process_nl_query builds its one-row input frame with the same columns.
+features = ['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']
+X = data[features]
+y = data['Fraud']
+# NOTE(review): the model is fitted on every row; no hold-out split is made here.
+model = RandomForestClassifier(random_state=42, n_estimators=100)
+model.fit(X, y)
+def process_nl_query(query):
+    """Parse a free-text transaction description and return a fraud report.
+
+    The amount, transaction type, city, age and income level are extracted
+    from ``query`` with regexes, keyword checks and fuzzy city matching, and
+    the resulting row is scored with the module-level ``model``.
+
+    Args:
+        query: Free-text description, e.g.
+            "Check a $6000 credit in New York for a 26-year-old with low income".
+
+    Returns:
+        A multi-line report string. If no amount can be found, or any step
+        raises, a string starting with "Error" is returned instead.
+    """
+    try:
+        text = query.lower()
+        number = r'(\d+(?:,\d{3})*(?:\.\d{2})?)'
+        # Extract age. A unit is required ("26-year-old", "26 years", "26 yrs"):
+        # with an optional unit the pattern captured the first number in the
+        # query, which is normally the amount.
+        age_match = re.search(r'\b(\d{1,3})\s*-?\s*(?:years?|yrs?)\b', query, re.IGNORECASE)
+        age = int(age_match.group(1)) if age_match else None
+        # Extract amount. Prefer a $-prefixed number; otherwise take the first
+        # number that is not the age token.
+        amount_match = re.search(r'\$\s*' + number, query)
+        if amount_match is None:
+            amount_match = next(
+                (m for m in re.finditer(number, query)
+                 if age_match is None or m.start(1) != age_match.start(1)),
+                None,
+            )
+        if amount_match is None:
+            return "Error: Could not extract transaction amount."
+        amount = float(amount_match.group(1).replace(',', ''))
+        # Extract transaction type
+        trans_type = 'Credit' if 'credit' in text else 'Debit'
+        # Fuzzy match city
+        cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
+        city_match = process.extractOne(query, cities)
+        city = city_match[0] if city_match[1] > 70 else 'Unknown'
+        # Extract income level. Whole-word matches only, so words such as
+        # "below" or "highway" do not set the level.
+        if re.search(r'\blow\b', text):
+            income = 'Low'
+        elif re.search(r'\bhigh\b', text):
+            income = 'High'
+        else:
+            income = 'Medium'
+        # Handle unseen labels: -1 is a code the encoders never produce
+        city_encoded = le_city.transform([city])[0] if city in le_city.classes_ else -1
+        income_encoded = le_income.transform([income])[0] if income in le_income.classes_ else -1
+        # The model needs a number even when no age was given; fall back to
+        # the median age of the training data for scoring only.
+        model_age = age if age is not None else data['Age'].median()
+        # Prepare input (same column order the model was fitted with)
+        input_df = pd.DataFrame({
+            'Amount': [amount],
+            'Type_encoded': [le_type.transform([trans_type])[0]],
+            'City_encoded': [city_encoded],
+            'Age': [model_age],
+            'Income_encoded': [income_encoded]
+        })
+        # Predict
+        proba = model.predict_proba(input_df)[0][1]
+        prediction = model.predict(input_df)[0]
+        # Generate explanation (mirrors the labelling rules in load_data)
+        explanation = []
+        if amount > 5000 and income == 'Low':
+            explanation.append("High amount for low income")
+        if amount > 8000 and trans_type == 'Credit':
+            explanation.append("Unusually large credit transaction")
+        # Guard against a missing age: comparing None with an int raises TypeError.
+        if city == 'New York' and age is not None and 20 <= age <= 35 and amount > 6000:
+            explanation.append("Suspicious pattern for young adults in NYC")
+        age_display = age if age is not None else 'Not provided'
+        return (
+            f"Transaction Details:\n"
+            f"- Amount: ${amount:,.2f}\n"
+            f"- Type: {trans_type}\n"
+            f"- City: {city}\n"
+            f"- Age: {age_display}\n"
+            f"- Income Level: {income}\n\n"
+            f"Fraud Analysis:\n"
+            f"- Prediction: {'Potentially Fraudulent' if prediction else 'Likely Legitimate'}\n"
+            f"- Confidence: {proba*100:.1f}%\n"
+            f"- Risk Factors: {', '.join(explanation) if explanation else 'No specific risk factors identified'}"
+        )
+    except Exception as e:
+        # UI boundary: report the failure as text instead of raising into Gradio.
+        return f"Error processing query: {str(e)}"
+# Gradio Interface
+# Two-tab UI: a free-text query tab backed by process_nl_query, and a
+# read-only summary of the rows labelled as fraud.
+with gr.Blocks() as demo:
+    gr.Markdown("## Enhanced Fraud Detection System")
+    with gr.Tab("Natural Language Query"):
+        gr.Markdown("**Example:** 'Check a $6000 credit in New York for a 26-year-old with low income'")
+        nl_input = gr.Textbox(label="Enter your transaction query:")
+        nl_output = gr.Textbox(label="Fraud Analysis", lines=10)
+        # Clickable sample queries that fill nl_input.
+        gr.Examples(
+            examples=[
+                "Is a $8000 credit in Chicago for a 45-year-old medium income safe?",
+                "Verify a $300 debit in Phoenix for a 60-year-old high income client"
+            ],
+            inputs=nl_input
+        )
+        # Only the textbox's submit event is wired; there is no button in this tab.
+        nl_input.submit(fn=process_nl_query, inputs=nl_input, outputs=nl_output)
+    with gr.Tab("Data Insights"):
+        gr.Markdown("### Fraud Pattern Analysis")
+        # describe() is computed once, when the UI is built, over the fraud rows only.
+        gr.DataFrame(data[data['Fraud'] == 1].describe())
+# Runs at module level, so the app is launched as soon as this file is executed.
+demo.launch()

app.py CHANGED Viewed

@@ -2,15 +2,15 @@ import gradio as gr
 import pandas as pd
 import numpy as np
 import re
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.preprocessing import LabelEncoder
 from fuzzywuzzy import process
-# Enhanced data generation with realistic fraud patterns
 def load_data():
     np.random.seed(42)
     cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
-    age_groups = ['18-25', '26-35', '36-45', '46-55', '56+']
     incomes = ['Low', 'Medium', 'High']
     data = pd.DataFrame({
@@ -22,7 +22,7 @@ def load_data():
         'Income': np.random.choice(incomes, 1000, p=[0.4, 0.4, 0.2])
     })
-    # Create realistic fraud patterns
     data['Fraud'] = 0
     data.loc[
         ((data['Amount'] > 5000) & (data['Income'] == 'Low')) |
@@ -35,67 +35,36 @@ def load_data():
 data = load_data()
-# Initialize separate encoders for each feature
 le_type = LabelEncoder()
 le_city = LabelEncoder()
 le_income = LabelEncoder()
-# Fit encoders on full dataset (or training data in real scenarios)
 data['Type_encoded'] = le_type.fit_transform(data['Type'])
 data['City_encoded'] = le_city.fit_transform(data['City'])
 data['Income_encoded'] = le_income.fit_transform(data['Income'])
 # Train model
-features = ['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']
-X = data[features]
-y = data['Fraud']
 model = RandomForestClassifier(random_state=42, n_estimators=100)
-model.fit(X, y)
-def process_nl_query(query):
     try:
-        # Extract amount
-        amount_match = re.search(r'\$?(\d+(?:,\d{3})*(?:\.\d{2})?)', query)
-        if amount_match:
-            amount = float(amount_match.group(1).replace(',', ''))
-        else:
-            return "Error: Could not extract transaction amount."
-        # Extract transaction type
-        trans_type = 'Credit' if 'credit' in query.lower() else 'Debit'
-        # Fuzzy match city
-        cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
-        city_match = process.extractOne(query, cities)
-        city = city_match[0] if city_match[1] > 70 else 'Unknown'
-        # Extract age
-        age_match = re.search(r'(\d+)\s*(?:years?|yrs?)?(?:\s*old)?', query)
-        age = int(age_match.group(1)) if age_match else None
-        # Extract income level
-        income = 'Low' if 'low' in query.lower() else \
-                 'High' if 'high' in query.lower() else 'Medium'
-        # Handle unseen labels
-        city_encoded = le_city.transform([city])[0] if city in le_city.classes_ else -1
-        income_encoded = le_income.transform([income])[0] if income in le_income.classes_ else -1
-        # Prepare input
         input_df = pd.DataFrame({
             'Amount': [amount],
             'Type_encoded': le_type.transform([trans_type])[0],
-            'City_encoded': city_encoded,
-            'Age': [age] if age else data['Age'].median(),  # Handle missing age
-            'Income_encoded': income_encoded
         })
-        # Predict
         proba = model.predict_proba(input_df)[0][1]
         prediction = model.predict(input_df)[0]
-        # Generate explanation
         explanation = []
         if amount > 5000 and income == 'Low':
             explanation.append("High amount for low income")
@@ -105,40 +74,133 @@ def process_nl_query(query):
             explanation.append("Suspicious pattern for young adults in NYC")
         return (
-            f"Transaction Details:\n"
-            f"- Amount: ${amount:,.2f}\n"
-            f"- Type: {trans_type}\n"
-            f"- City: {city}\n"
-            f"- Age: {age}\n"
-            f"- Income Level: {income}\n\n"
-            f"Fraud Analysis:\n"
-            f"- Prediction: {'Potentially Fraudulent' if prediction else 'Likely Legitimate'}\n"
-            f"- Confidence: {proba*100:.1f}%\n"
-            f"- Risk Factors: {', '.join(explanation) if explanation else 'No specific risk factors identified'}"
         )
     except Exception as e:
         return f"Error processing query: {str(e)}"
 # Gradio Interface
-with gr.Blocks() as demo:
-    gr.Markdown("## Enhanced Fraud Detection System")
-    with gr.Tab("Natural Language Query"):
-        gr.Markdown("**Example:** 'Check a $6000 credit in New York for a 26-year-old with low income'")
-        nl_input = gr.Textbox(label="Enter your transaction query:")
-        nl_output = gr.Textbox(label="Fraud Analysis", lines=10)
         gr.Examples(
             examples=[
-                "Is a $8000 credit in Chicago for a 45-year-old medium income safe?",
-                "Verify a $300 debit in Phoenix for a 60-year-old high income client"
             ],
             inputs=nl_input
         )
-        nl_input.submit(fn=process_nl_query, inputs=nl_input, outputs=nl_output)
-    with gr.Tab("Data Insights"):
-        gr.Markdown("### Fraud Pattern Analysis")
-        gr.DataFrame(data[data['Fraud'] == 1].describe())
 demo.launch()

 import pandas as pd
 import numpy as np
 import re
+import matplotlib.pyplot as plt
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.preprocessing import LabelEncoder
 from fuzzywuzzy import process
+# Data generation and preprocessing
 def load_data():
     np.random.seed(42)
     cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix']
     incomes = ['Low', 'Medium', 'High']
     data = pd.DataFrame({
         'Income': np.random.choice(incomes, 1000, p=[0.4, 0.4, 0.2])
     })
+    # Fraud patterns
     data['Fraud'] = 0
     data.loc[
         ((data['Amount'] > 5000) & (data['Income'] == 'Low')) |
+# Module-level setup: everything below runs once, at import time.
 data = load_data()
+# Initialize one LabelEncoder per categorical column
 le_type = LabelEncoder()
 le_city = LabelEncoder()
 le_income = LabelEncoder()
+# The integer codes are stored next to the original text columns.
 data['Type_encoded'] = le_type.fit_transform(data['Type'])
 data['City_encoded'] = le_city.fit_transform(data['City'])
 data['Income_encoded'] = le_income.fit_transform(data['Income'])
 # Train model
 model = RandomForestClassifier(random_state=42, n_estimators=100)
+# Fitted on every row; the column order here is the order the model is fitted with.
+model.fit(data[['Amount', 'Type_encoded', 'City_encoded', 'Age', 'Income_encoded']], data['Fraud'])
+# Fraud prediction function
+def predict_fraud(amount, trans_type, city, age, income):
     try:
+        city = city if city in le_city.classes_ else 'Unknown'
+        income = income if income in le_income.classes_ else 'Medium'
         input_df = pd.DataFrame({
             'Amount': [amount],
             'Type_encoded': le_type.transform([trans_type])[0],
+            'City_encoded': le_city.transform([city])[0] if city in le_city.classes_ else -1,
+            'Age': [age],
+            'Income_encoded': le_income.transform([income])[0] if income in le_income.classes_ else -1
         })
         proba = model.predict_proba(input_df)[0][1]
         prediction = model.predict(input_df)[0]
         explanation = []
         if amount > 5000 and income == 'Low':
             explanation.append("High amount for low income")
             explanation.append("Suspicious pattern for young adults in NYC")
         return (
+            f"Prediction: {'Fraudulent' if prediction else 'Legitimate'}\n"
+            f"Confidence: {proba*100:.1f}%\n"
+            f"Risk Factors: {', '.join(explanation) if explanation else 'No specific risks'}"
         )
+    except Exception as e:
+        return f"Error: {str(e)}"
+# NLP processing function
+def process_nl_query(query):
+    """Parse a free-text transaction description and score it with predict_fraud.
+
+    Args:
+        query: Free-text description, e.g.
+            "Credit of $6000 in New York for a 26-year-old with low income".
+
+    Returns:
+        The string returned by ``predict_fraud``, or a string starting with
+        "Error" when no amount can be found or parsing raises.
+    """
+    try:
+        number = r'(\d+(?:,\d{3})*(?:\.\d{2})?)'
+        # Accept "26-year-old", "26 years" and "26 yrs". The previous pattern
+        # did not allow the hyphen, so the UI's own examples never yielded an age.
+        age_match = re.search(r'\b(\d{1,3})\s*-?\s*(?:years?|yrs?)\b', query, re.IGNORECASE)
+        # Prefer a $-prefixed number; otherwise take the first number that is
+        # not the age token.
+        amount_match = re.search(r'\$\s*' + number, query)
+        if amount_match is None:
+            amount_match = next(
+                (m for m in re.finditer(number, query)
+                 if age_match is None or m.start(1) != age_match.start(1)),
+                None,
+            )
+        if amount_match is None:
+            # Explicit message instead of the AttributeError text from None.group().
+            return "Error: Could not extract transaction amount."
+        amount = float(amount_match.group(1).replace(',', ''))
+        text = query.lower()
+        trans_type = 'Credit' if 'credit' in text else 'Debit'
+        # Best fuzzy match among the cities the encoder was fitted on.
+        city = process.extractOne(query, le_city.classes_)[0]
+        # No age in the query: fall back to the median age of the dataset.
+        age = int(age_match.group(1)) if age_match else data['Age'].median()
+        income = 'Low' if 'low' in text else ('High' if 'high' in text else 'Medium')
+        return predict_fraud(amount, trans_type, city, age, income)
     except Exception as e:
         return f"Error processing query: {str(e)}"
+# Visualization function
+def create_plot(choice):
+    """Draw one of the fraud-insight charts for the Gradio plot component.
+
+    Args:
+        choice: "Fraud by City", "Fraud by Income" or "Transaction Patterns".
+            Any other value yields an empty set of axes.
+
+    Returns:
+        ``(figure, "")`` on success, or ``(None, "Error generating plot: ...")``
+        if anything raises while drawing.
+    """
+    try:
+        # Build the figure directly instead of through pyplot: pyplot keeps a
+        # global reference to every figure it creates, so a long-running app
+        # would accumulate one open figure per request.
+        from matplotlib.figure import Figure
+
+        fig = Figure(figsize=(10, 6))
+        ax = fig.subplots()
+        fraud_data = data[data['Fraud'] == 1]
+        # choice -> (column to count, title, x-axis label, tick-label rotation)
+        bar_charts = {
+            "Fraud by City": ('City', 'Fraud Cases by City', 'City', 45),
+            "Fraud by Income": ('Income', 'Fraud Cases by Income Level', 'Income Level', 0),
+        }
+        if choice in bar_charts:
+            column, title, xlabel, rotation = bar_charts[choice]
+            counts = fraud_data[column].value_counts()
+            ax.set_title(title)
+            if counts.empty:
+                ax.text(0.5, 0.5, 'No fraud data available',
+                        ha='center', va='center')
+            else:
+                ax.bar(counts.index, counts.values)
+                ax.set_xlabel(xlabel)
+                ax.set_ylabel('Count')
+                if rotation:
+                    ax.tick_params(axis='x', labelrotation=rotation)
+        elif choice == "Transaction Patterns":
+            legit_data = data[data['Fraud'] == 0]
+            ax.set_title('Transaction Amount vs Age')
+            if fraud_data.empty:
+                ax.text(0.5, 0.5, 'No fraud data available',
+                        ha='center', va='center')
+            else:
+                ax.scatter(legit_data['Amount'], legit_data['Age'],
+                          alpha=0.3, label='Legitimate')
+                ax.scatter(fraud_data['Amount'], fraud_data['Age'],
+                          color='red', alpha=0.5, label='Fraud')
+                ax.set_xlabel('Amount')
+                ax.set_ylabel('Age')
+                ax.legend()
+        fig.tight_layout()
+        return fig, ""
+    except Exception as e:
+        # Nothing to close: the figure was never registered with pyplot.
+        return None, f"Error generating plot: {str(e)}"
 # Gradio Interface
+# Four-tab UI: free-text query, manual form, charts, and the raw table.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🕵️ Banking Fraud Detection System")
+    with gr.Tab("💬 NLP Query"):
+        gr.Markdown("## Analyze Transactions with Natural Language")
+        with gr.Row():
+            nl_input = gr.Textbox(label="Describe transaction", placeholder="e.g., 'Credit of $6000 in New York for a 26-year-old with low income'")
+            nl_btn = gr.Button("Analyze", variant="primary")
+        nl_output = gr.Textbox(label="Analysis Result", lines=4)
         gr.Examples(
             examples=[
+                "$8000 credit in Chicago for 45-year-old with medium income",
+                "Verify $300 debit in Phoenix for 60-year-old high income client"
             ],
             inputs=nl_input
         )
+        nl_btn.click(process_nl_query, nl_input, nl_output)
+        # Keep Enter-to-submit working as well as the button.
+        nl_input.submit(process_nl_query, nl_input, nl_output)
+    with gr.Tab("📝 Manual Input"):
+        gr.Markdown("## Manual Transaction Analysis")
+        with gr.Row():
+            amount = gr.Number(label="Amount", minimum=0)
+            trans_type = gr.Dropdown(["Credit", "Debit"], label="Type")
+        with gr.Row():
+            # Choices come from the fitted encoders, so the form can only
+            # submit labels the encoders have seen.
+            city = gr.Dropdown(le_city.classes_.tolist(), label="City")
+            age = gr.Number(label="Age", minimum=18)
+        income = gr.Dropdown(le_income.classes_.tolist(), label="Income Level")
+        manual_btn = gr.Button("Analyze", variant="primary")
+        manual_output = gr.Textbox(label="Analysis Result", lines=4)
+        # Input order must match predict_fraud(amount, trans_type, city, age, income).
+        manual_btn.click(predict_fraud, [amount, trans_type, city, age, income], manual_output)
+    with gr.Tab("📊 Data Insights"):
+        gr.Markdown("## Fraud Pattern Visualization")
+        with gr.Row():
+            plot_choice = gr.Radio(
+                ["Fraud by City", "Fraud by Income", "Transaction Patterns"],
+                label="Select Visualization",
+                value="Fraud by City"
+            )
+        with gr.Row():
+            plot_output = gr.Plot()
+            # NOTE(review): hidden, so messages from create_plot are never shown.
+            error_output = gr.Textbox(label="Error Message", visible=False)
+        plot_choice.change(
+            fn=create_plot,
+            inputs=plot_choice,
+            outputs=[plot_output, error_output]
+        )
+        # Draw the preselected chart once when the page loads; `change` alone
+        # only fires after the user picks a different option.
+        demo.load(
+            fn=create_plot,
+            inputs=plot_choice,
+            outputs=[plot_output, error_output]
+        )
+    with gr.Tab("📁 Raw Data"):
+        gr.Markdown("## Complete Transaction Dataset")
+        gr.DataFrame(data)
 demo.launch()