Spaces:

EAV123
/

SQL_Injection_Detection

Sleeping

App Files Files Community

EAV123 committed on Apr 22

Commit

81c8729

verified ·

1 Parent(s): 6ccd914

Upload 10 files

Browse files

Files changed (10) hide show

app.py +542 -0
cnn_model.h5 +3 -0
lstm_model.h5 +3 -0
requirements.txt +4 -0
rf_model.pkl +3 -0
sql_injection_cnn.h5 +3 -0
sql_tokenizer.pkl +3 -0
svm_model.pkl +3 -0
tfidf_vectorizer.pkl +3 -0
tokenizer.pkl +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,542 @@

+import streamlit as st
+import tensorflow as tf
+from tensorflow.keras.models import load_model
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+import pickle
+import re
+import time
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+# Load models and preprocessing components
+@st.cache_resource
+def load_components():
+    """
+    Load every model and preprocessing artifact the app uses from the working directory.
+    Returns a dict with the keys 'cnn_model', 'lstm_model', 'rf_model', 'svm_model',
+    'tokenizer', 'max_sequence_length' and 'tfidf_vectorizer'.
+    Any exception raised while reading a file propagates to the caller.
+    """
+    def _unpickle(path):
+        # NOTE: unpickling can execute arbitrary code; these files ship with the app
+        with open(path, 'rb') as handle:
+            return pickle.load(handle)
+    # Keras models saved in HDF5 format
+    cnn = load_model('cnn_model.h5')
+    lstm = load_model('lstm_model.h5')
+    # Pickled classical models
+    random_forest = _unpickle('rf_model.pkl')
+    svm = _unpickle('svm_model.pkl')
+    # The tokenizer pickle is a dict bundling the tokenizer with its sequence length
+    tokenizer_bundle = _unpickle('sql_tokenizer.pkl')
+    tfidf = _unpickle('tfidf_vectorizer.pkl')
+    return dict(
+        cnn_model=cnn,
+        lstm_model=lstm,
+        rf_model=random_forest,
+        svm_model=svm,
+        tokenizer=tokenizer_bundle['tokenizer'],
+        max_sequence_length=tokenizer_bundle['max_sequence_length'],
+        tfidf_vectorizer=tfidf,
+    )
+# Try to load all components
+# A failure is kept as text instead of being raised, so the page can still render
+# and report it; `components` stays None in that case.
+components, model_loading_error = None, None
+try:
+    components = load_components()
+except Exception as load_failure:
+    model_loading_error = str(load_failure)
+# Preprocess functions
+def preprocess_query_for_deep_learning(query, tokenizer, max_sequence_length):
+    """
+    Turn one query string into a padded sequence for the deep learning models.
+    The query is wrapped in a one-element list, encoded with
+    tokenizer.texts_to_sequences, then padded at the end ('post') to
+    max_sequence_length by pad_sequences. The padded result is returned.
+    """
+    return pad_sequences(
+        tokenizer.texts_to_sequences([query]),
+        maxlen=max_sequence_length,
+        padding='post',
+    )
+def preprocess_query_for_traditional_ml(query, tfidf_vectorizer):
+    """
+    Vectorize one query string for the traditional ML models.
+    The query is passed to tfidf_vectorizer.transform as a one-element list
+    and whatever transform returns is handed back unchanged.
+    """
+    single_query_batch = [query]
+    features = tfidf_vectorizer.transform(single_query_batch)
+    return features
+# Regex patterns treated as signs of SQL injection.
+# All are case-insensitive ((?i)) and are applied with re.search, so each may
+# match anywhere in the query unless it carries its own anchor.
+SQL_INJECTION_PATTERNS = [
+    # A single quote followed, anywhere later in the text, by a `--` comment marker
+    r"(?i)'.*--",
+    # Quote, then OR/AND, then a token=token comparison (classic injection pattern)
+    r"(?i)'\s*(OR|AND)\s*['\d\w]+=\s*['\d\w]+",
+    # `--` comment marker at the very start of the input or right after whitespace
+    r"(?i)(\s|^)--",
+    # Quote, later a semicolon, later a `--` marker (stacked statement with trailing comment)
+    r"(?i)'.*;.*--",
+    # UNION-based injections: quote followed by UNION [ALL] SELECT
+    r"(?i)'\s*UNION\s+(ALL\s+)?SELECT",
+    # Time-delay attacks: quote, semicolon, WAITFOR DELAY
+    r"(?i)'\s*;\s*WAITFOR\s+DELAY",
+    # DROP/ALTER attacks: quote, semicolon, DROP or ALTER
+    r"(?i)'\s*;\s*(DROP|ALTER)",
+    # Quote, OR, then a comparison of two optionally-quoted numbers
+    # (the two numbers are not required to be equal)
+    r"(?i)'\s*OR\s*'?\d+'?\s*=\s*'?\d+'?",
+    # Quote, OR, then two quoted numbers joined by `=` with no spaces, e.g. '1'='1'
+    r"(?i)'\s*OR\s*(['\"]\d+['\"])=(['\"]\d+['\"])",
+    # Batch queries: a semicolon followed by another statement keyword (no quote required)
+    r"(?i);\s*(SELECT|INSERT|UPDATE|DELETE|DROP)",
+    # CAST attacks: any CAST( ... AS ... ) expression
+    r"(?i)CAST\s*\(.+AS\s+.+\)",
+    # Quote, semicolon, then EXEC or EXECUTE and the rest of the line
+    r"(?i)'\s*;\s*(EXEC|EXECUTE).*",
+]
+# Allow-list of query shapes that should not trigger false positives.
+# Each pattern is anchored with ^ and $, so the entire query must fit the shape.
+SAFE_SQL_PATTERNS = [
+    # SELECT <columns> FROM <one table>, with an optional WHERE clause limited to
+    # word characters, whitespace, `=`, `<`, `>` and single quotes
+    r"(?i)^SELECT\s+[\w\d\s,*]+\s+FROM\s+[\w\d]+(\s+WHERE\s+[\w\d\s=<>']+)?$",
+    # INSERT INTO <table> (<columns>) VALUES (<values>); neither list may contain `)`
+    r"(?i)^INSERT\s+INTO\s+[\w\d]+\s*\([^)]+\)\s*VALUES\s*\([^)]+\)$",
+    # UPDATE <table> SET <assignments>, with the same optional restricted WHERE clause
+    r"(?i)^UPDATE\s+[\w\d]+\s+SET\s+[\w\d\s=',]+(\s+WHERE\s+[\w\d\s=<>']+)?$",
+]
+# Rule-based detection function
+def detect_sql_injection_with_regex(query):
+    """
+    Run the rule-based check and return an (is_malicious, matched_text) tuple.
+    If the stripped query matches any entry of SAFE_SQL_PATTERNS the result is
+    (False, None) and the injection patterns are not consulted. Otherwise the
+    first entry of SQL_INJECTION_PATTERNS found in the raw query yields
+    (True, <matched substring>); with no hit the result is (False, None).
+    """
+    trimmed = query.strip()
+    # The allow-list takes precedence over every injection pattern.
+    # NOTE(review): the safe SELECT/UPDATE shapes admit quotes and `=` in the WHERE
+    # clause, so e.g. "SELECT * FROM users WHERE username='' OR '1'='1'" is
+    # reported as safe here - confirm this precedence is intended.
+    if any(re.search(safe_shape, trimmed) for safe_shape in SAFE_SQL_PATTERNS):
+        return False, None
+    # Patterns are tried in list order; only the first hit is reported
+    hits = (re.search(signature, query) for signature in SQL_INJECTION_PATTERNS)
+    first_hit = next((hit for hit in hits if hit), None)
+    if first_hit is None:
+        return False, None
+    return True, first_hit.group(0)
+# Ensemble prediction function
+def predict_with_ensemble(query, components):
+    """
+    Score the query with all four models and combine their votes.
+    Returns a dict holding the RF and SVM labels ('rf', 'svm'), the CNN and LSTM
+    label plus raw probability ('cnn', 'lstm'), the majority-vote label
+    ('ensemble') and the number of votes per label ('vote_count').
+    """
+    # Random Forest and SVM share the same TF-IDF features
+    tfidf_features = preprocess_query_for_traditional_ml(query, components['tfidf_vectorizer'])
+    rf_vote = int(components['rf_model'].predict(tfidf_features)[0])
+    svm_vote = int(components['svm_model'].predict(tfidf_features)[0])
+    # CNN and LSTM share the same padded token sequence
+    padded_sequence = preprocess_query_for_deep_learning(
+        query,
+        components['tokenizer'],
+        components['max_sequence_length'],
+    )
+    # Each network's first output value is thresholded at 0.5 to obtain its vote
+    cnn_score = components['cnn_model'].predict(padded_sequence)[0][0]
+    cnn_vote = int(cnn_score > 0.5)
+    lstm_score = components['lstm_model'].predict(padded_sequence)[0][0]
+    lstm_vote = int(lstm_score > 0.5)
+    ballots = [rf_vote, svm_vote, cnn_vote, lstm_vote]
+    # argmax over the tally picks the most common label; on a 2-2 split it
+    # returns label 0, because argmax reports the first maximum it finds
+    majority_label = np.bincount(ballots).argmax()
+    tally = {label: ballots.count(label) for label in (0, 1)}
+    return {
+        'rf': rf_vote,
+        'svm': svm_vote,
+        'cnn': {'prediction': cnn_vote, 'probability': float(cnn_score)},
+        'lstm': {'prediction': lstm_vote, 'probability': float(lstm_score)},
+        'ensemble': int(majority_label),
+        'vote_count': tally
+    }
+# Initialize session state for UI flow control
+# analysis_stage values -> 0: not started, 1: regex done, 2: ensemble done
+for state_key, initial_value in (
+    ('analysis_stage', 0),
+    ('regex_result', None),
+    ('ensemble_result', None),
+):
+    # Only seed keys that are missing, so values survive script reruns
+    if state_key not in st.session_state:
+        st.session_state[state_key] = initial_value
+# App title and description
+st.title("🛡️ SQL Injection Detection")
+st.markdown("""
+This application uses a multi-layered approach to detect potentially malicious SQL queries:
+1. **Rule-based detection** using improved regex patterns
+2. **Ensemble learning** with majority voting from 4 models:
+   - Random Forest
+   - Support Vector Machine
+   - Convolutional Neural Network
+   - Long Short-Term Memory Network
+Enter a query below or select from the examples to begin analysis.
+""")
+# Display warning if models couldn't be loaded
+if model_loading_error:
+    st.warning(f"⚠️ Some models could not be loaded. The application will only use rule-based detection. Error: {model_loading_error}")
+# Example queries in a dropdown
+st.subheader("Select an Example or Enter Your Own Query")
+example_categories = {
+    "Benign SQL Queries": [
+    "SELECT * FROM users WHERE username='admin'",
+    "SELECT id, name, price FROM products WHERE category_id=5",
+    "SELECT COUNT(*) FROM orders WHERE date > '2023-01-01'",
+    "INSERT INTO logs (user_id, action) VALUES (42, 'login')",
+    "UPDATE customers SET last_login='2023-06-15' WHERE id=101",
+    "DELETE FROM sessions WHERE last_activity < '2023-01-01'",
+    "SELECT email FROM subscribers WHERE active=1",
+    "INSERT INTO feedback (user_id, message) VALUES (87, 'Great service!')",
+    "UPDATE inventory SET stock = stock - 1 WHERE product_id = 300",
+    "SELECT name FROM employees WHERE department = 'Sales'",
+    "SELECT AVG(rating) FROM reviews WHERE product_id = 55",
+    "INSERT INTO audit_log (timestamp, event) VALUES (CURRENT_TIMESTAMP, 'update')",
+    "SELECT * FROM appointments WHERE doctor_id = 10 AND status = 'confirmed'",
+    "UPDATE settings SET value='dark' WHERE key='theme'",
+    "SELECT DISTINCT city FROM customers WHERE country='USA'",
+    "DELETE FROM cart_items WHERE user_id=12 AND product_id=78",
+    "SELECT MAX(salary) FROM employees WHERE role='manager'",
+    "INSERT INTO payments (user_id, amount, method) VALUES (33, 99.99, 'credit')",
+    "UPDATE products SET price = price * 1.1 WHERE category_id = 7",
+    "SELECT * FROM messages WHERE sender_id = 5 AND is_read = 0"
+    ],
+    "Malicious SQL Queries": [
+    "' OR 1=1 --",
+    "admin'; DROP TABLE users; --",
+    "SELECT * FROM users WHERE username='' UNION SELECT username,password FROM admin_users --",
+    "'; WAITFOR DELAY '0:0:10' --",
+    "admin' OR '1'='1",
+    "' OR 'a'='a",
+    "' OR 1=1#",
+    "' OR 1=1/*",
+    "admin'--",
+    "'; EXEC xp_cmdshell('dir'); --",
+    "' OR EXISTS(SELECT * FROM users WHERE username = 'admin') --",
+    "1; DROP TABLE sessions --",
+    "'; SHUTDOWN --",
+    "' OR SLEEP(5) --",
+    "' AND 1=(SELECT COUNT(*) FROM users) --",
+    "admin' AND SUBSTRING(password, 1, 1) = 'a' --",
+    "' UNION ALL SELECT NULL,NULL,NULL --",
+    "0' OR 1=1 ORDER BY 1 --",
+    "1' AND (SELECT COUNT(*) FROM users) > 0 --",
+    "' OR (SELECT ASCII(SUBSTRING(password,1,1)) FROM users WHERE username='admin') > 64 --"
+    ]
+}
+# First create category selection
+category = st.selectbox(
+    "Choose query category:",
+    options=list(example_categories.keys()),
+    key="category"
+)
+# Then show examples from selected category
+example = st.selectbox(
+    "Select an example:",
+    options=example_categories[category],
+    key="example"
+)
+# Allow user to use the selected example or enter their own
+query_source = st.radio(
+    "Query source:",
+    ["Use selected example", "Enter my own query"],
+    key="query_source"
+)
+if query_source == "Enter my own query":
+    query = st.text_area(
+        "Enter SQL Query:",
+        height=100,
+        placeholder="Type your SQL query here..."
+    )
+else:
+    query = example
+    st.code(query, language="sql")
+# Analysis process
+# Pressing "Start Analysis" runs the rule-based check and stores its outcome in
+# session state; the section below then renders that stored outcome on every rerun.
+if st.button("Start Analysis"):
+    if query:
+        # Reset analysis state: back to stage 1 and drop any ensemble verdict
+        # left over from a previously analyzed query
+        st.session_state.analysis_stage = 1
+        st.session_state.ensemble_result = None
+        # Remember exactly which text was analyzed. The script is re-executed on
+        # every widget interaction, so `query` may hold different text by the
+        # time the ensemble button is pressed.
+        st.session_state.analyzed_query = query
+        # Step 1: Rule-based detection
+        with st.spinner("Running rule-based detection..."):
+            time.sleep(0.5)  # Simulate processing time
+            is_malicious, matched_pattern = detect_sql_injection_with_regex(query)
+            st.session_state.regex_result = (is_malicious, matched_pattern)
+    else:
+        # Tell the user why nothing happened instead of ignoring the click
+        st.warning("Please enter a SQL query before starting the analysis.")
+# If we have completed the regex analysis
+if st.session_state.analysis_stage >= 1 and st.session_state.regex_result is not None:
+    is_malicious, matched_pattern = st.session_state.regex_result
+    st.subheader("Step 1: Rule-Based Detection")
+    if is_malicious:
+        st.error("🚨 SQL Injection Detected (Rule-Based)!")
+        # matched_pattern holds the substring of the query that the rule matched
+        st.warning(f"Matched pattern: `{matched_pattern}`")
+        # Show details in expander
+        with st.expander("Rule-Based Detection Details"):
+            st.markdown("""
+            **What was detected:**
+            - The query matched one or more known SQL injection patterns
+            - This type of pattern is commonly used in SQL injection attacks
+            - Review the query for security implications
+            """)
+            st.markdown("**Common SQL injection techniques detected:**")
+            st.markdown("""
+            - Comment sequences (`--`) after quotes
+            - Always true conditions (`OR 1=1`)
+            - Union-based injections
+            - SQL command injections
+            """)
+    else:
+        st.success("✅ No SQL injection patterns detected using rules")
+        with st.expander("Rule-Based Detection Details"):
+            st.markdown("""
+            **Analysis Details:**
+            - The query did not match any known SQL injection patterns
+            - The structure appears to be standard SQL syntax
+            - No suspicious patterns were identified
+            """)
+    # Ask if user wants to proceed with ensemble detection
+    proceed = st.radio(
+        "Would you like to proceed with ensemble model detection?",
+        ["Yes", "No"],
+        index=0,  # Default to Yes
+        key="proceed"
+    )
+    # Check if models are loaded before allowing ensemble analysis
+    if proceed == "Yes" and not model_loading_error:
+        if st.button("Run Ensemble Analysis"):
+            st.session_state.analysis_stage = 2
+            with st.spinner("Running ensemble models..."):
+                time.sleep(1)  # Simulate processing time
+                # Score the same text the rule-based step judged, so both verdicts
+                # describe one query; fall back to the current input if no
+                # analyzed text was recorded
+                analyzed_query = st.session_state.get("analyzed_query", query)
+                ensemble_results = predict_with_ensemble(analyzed_query, components)
+                st.session_state.ensemble_result = ensemble_results
+    elif proceed == "Yes" and model_loading_error:
+        st.error("Cannot run ensemble analysis because models failed to load.")
+# If we have completed the ensemble analysis
+if st.session_state.analysis_stage >= 2 and st.session_state.ensemble_result is not None:
+    results = st.session_state.ensemble_result
+    st.subheader("Step 2: Ensemble Model Detection")
+    # Create a visual representation of voting
+    vote_benign = results['vote_count'][0]
+    vote_malicious = results['vote_count'][1]
+    st.markdown(f"### Model Votes")
+    # Create columns for the voting visualization
+    col1, col2 = st.columns(2)
+    with col1:
+        st.metric("Safe Votes", vote_benign)
+    with col2:
+        st.metric("Malicious Votes", vote_malicious)
+    # Create a progress bar to visualize the voting ratio
+    vote_ratio = vote_malicious / (vote_benign + vote_malicious)
+    st.progress(vote_ratio, text=f"Malicious vote ratio: {vote_ratio*100:.0f}%")
+    # Display individual model results
+    st.markdown("### Individual Model Results")
+    model_cols = st.columns(4)
+    with model_cols[0]:
+        st.markdown("**Random Forest**")
+        if results['rf'] == 1:
+            st.error("⚠️ Malicious")
+        else:
+            st.success("✅ Safe")
+    with model_cols[1]:
+        st.markdown("**SVM**")
+        if results['svm'] == 1:
+            st.error("⚠️ Malicious")
+        else:
+            st.success("✅ Safe")
+    with model_cols[2]:
+        st.markdown("**CNN**")
+        cnn_prob = results['cnn']['probability'] * 100
+        if results['cnn']['prediction'] == 1:
+            st.error(f"⚠️ Malicious ({cnn_prob:.1f}%)")
+        else:
+            st.success(f"✅ Safe ({100-cnn_prob:.1f}%)")
+    with model_cols[3]:
+        st.markdown("**LSTM**")
+        lstm_prob = results['lstm']['probability'] * 100
+        if results['lstm']['prediction'] == 1:
+            st.error(f"⚠️ Malicious ({lstm_prob:.1f}%)")
+        else:
+            st.success(f"✅ Safe ({100-lstm_prob:.1f}%)")
+    # Final ensemble verdict
+    st.markdown("### Ensemble Verdict")
+    if results['ensemble'] == 1:
+        st.error("🚨 SQL Injection Detected by Majority Vote!")
+    else:
+        st.success("✅ Query deemed safe by majority vote")
+    # Explanation in expander
+    with st.expander("Ensemble Detection Details"):
+        st.markdown("""
+        **How ensemble voting works:**
+        - Each model casts a vote (0 for safe, 1 for malicious)
+        - The final decision is based on majority vote
+        - This approach combines the strengths of different model architectures
+        - More robust than any single model alone
+        """)
+        if results['ensemble'] == 1:
+            st.markdown(f"""
+            **Why was this flagged:**
+            - {vote_malicious} out of 4 models identified this query as potentially malicious
+            - The majority vote indicates suspicious patterns
+            - This query should be carefully reviewed before execution
+            """)
+        else:
+            st.markdown(f"""
+            **Why was this considered safe:**
+            - {vote_benign} out of 4 models identified this query as likely safe
+            - The majority vote indicates standard SQL patterns
+            - No significant red flags were detected in the ensemble
+            """)
+    # Final verdict combining both approaches
+    st.subheader("Final Analysis")
+    is_malicious_regex, _ = st.session_state.regex_result
+    is_malicious_ensemble = results['ensemble'] == 1
+    if is_malicious_regex or is_malicious_ensemble:
+        st.error("⚠️ This query appears to contain SQL injection patterns. Review carefully before executing.")
+    else:
+        st.success("✅ This query appears safe based on both rule-based and ensemble detection.")
+    st.info("ℹ️ Remember: Always use parameterized queries and proper input validation in production systems.")
+    # Reset button: clear all stored analysis state and redraw the page from stage 0
+    if st.button("Analyze Another Query"):
+        st.session_state.analysis_stage = 0
+        st.session_state.regex_result = None
+        st.session_state.ensemble_result = None
+        # st.experimental_rerun() is deprecated in favor of st.rerun() and is not
+        # available in newer Streamlit releases; use the new name when present
+        # and fall back to the legacy one on older installs.
+        rerun = getattr(st, "rerun", None) or st.experimental_rerun
+        rerun()
+# Sidebar with additional info
+with st.sidebar:
+    st.header("About This App")
+    st.markdown("""
+    ### Multi-Layer Detection Process
+    1. **Rule-Based Detection**
+       - Fast, pattern-matching approach
+       - Uses improved regex to identify SQL injection patterns
+       - Reduces false positives with safe pattern recognition
+    2. **Ensemble Detection**
+       - Combines 4 different machine learning models:
+         - Random Forest
+         - Support Vector Machine (SVM)
+         - Convolutional Neural Network (CNN)
+         - Long Short-Term Memory Network (LSTM)
+       - Final decision by majority voting
+    """)
+    st.markdown("### Machine Learning Architecture")
+    st.code("""
+    # Traditional ML
+    - Random Forest (n_estimators=100)
+    - SVM (kernel='linear')
+    # CNN Architecture
+    Sequential([
+        Embedding(input_dim=10000, output_dim=128),
+        Conv1D(filters=64, kernel_size=3, activation='relu'),
+        MaxPooling1D(pool_size=2),
+        Dropout(0.5),
+        Conv1D(filters=128, kernel_size=3, activation='relu'),
+        MaxPooling1D(pool_size=2),
+        Flatten(),
+        Dense(64, activation='relu'),
+        Dropout(0.5),
+        Dense(1, activation='sigmoid')
+    ])
+    # LSTM Architecture
+    Sequential([
+        Embedding(input_dim=10000, output_dim=128),
+        Bidirectional(LSTM(64, return_sequences=True)),
+        Dropout(0.5),
+        Bidirectional(LSTM(32)),
+        Dropout(0.5),
+        Dense(32, activation='relu'),
+        Dense(1, activation='sigmoid')
+    ])
+    """)
+    st.markdown("### How It Works")
+    st.markdown("""
+    1. **Step 1:** Rule-based patterns scan for known SQL injection techniques
+    2. **Step 2:** Ensemble of 4 models evaluates the query structure
+    3. **Final Analysis:** Combined verdict from both approaches
+    """)
+    st.markdown("---")
+    st.warning("**Note:** This is a demonstration tool, not a replacement for proper security measures.")
+# Footer
+st.markdown("---")
+st.markdown("""
+<style>
+.footer {
+    position: fixed;
+    left: 0;
+    bottom: 0;
+    width: 100%;
+    background-color: white;
+    color: black;
+    text-align: center;
+    padding: 10px;
+    border-top: 1px solid #e5e5e5;
+}
+</style>
+<div class="footer">
+<p>Developed with ❤️ using Streamlit | SQL Injection Detection System</p>
+</div>
+""", unsafe_allow_html=True)

cnn_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:782470a371d9c9464a7a59217351a2e2b4800d0149714d84bb3d4d946050698b
+size 18261328

lstm_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:043aaf51992b65a67fcd4f63d26c065b9a53a5a0ff928e9266f4ef339e742347
+size 17135816

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+pandas==2.1.4
+numpy==1.26.4
+tensorflow==2.17.0
+scikit-learn==1.2.2

rf_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:83d54671b38025d44fd595dd48f8cb8c1e0f6527b99d719389cffdfa7ee99de6
+size 7197896

sql_injection_cnn.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01e9a8f7523ca79a470358cca96b74cb97bd99397b4dec3ebe0f5d04dc0b6380
+size 18105704

sql_tokenizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:24843be9335666b16e01e6b1063f488e08fded636e13374c5694213d800b3fc1
+size 1116870

svm_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fe95f6dfd1704ae1cafa9a91c2768b7933a482520e2cfb887690afdc8f9f9282
+size 234315

tfidf_vectorizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:80bf747b75db956d1d1dd7b00ff2c29dbb2fe935272d98499376a12a77613b53
+size 2583307

tokenizer.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1c917ea2e95e0038bb6f4d7de90e3d6164f3458a42e357f7487e27aa9d0a6e9c
+size 951873