Spaces:
Running
Running
upload malicious url python code
Browse files- pages/malicious_url.py +98 -0
pages/malicious_url.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# pages/malicious_url.py
|
2 |
+
|
3 |
+
import streamlit as st
|
4 |
+
import numpy as np
|
5 |
+
import pandas as pd
|
6 |
+
import tensorflow as tf
|
7 |
+
from urllib.parse import urlparse
|
8 |
+
import re
|
9 |
+
import joblib
|
10 |
+
|
11 |
+
@st.cache_resource
def load_model_and_scaler():
    """Load the trained Keras model and the fitted feature scaler from disk.

    Both artifacts are read from paths relative to the current working
    directory, so the app has to be started from the directory that
    contains ``models/``.

    Returns:
        A ``(model, scaler)`` tuple: the object returned by
        ``tf.keras.models.load_model`` and the object returned by
        ``joblib.load``.
    """
    model = tf.keras.models.load_model("models/malicious_url_model.h5")
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code. Only ship a scaler.pkl that comes from a trusted build.
    scaler = joblib.load("models/scaler.pkl")
    return model, scaler
|
16 |
+
|
17 |
+
# Load both artifacts when the module is imported; the loader above is
# wrapped in st.cache_resource. app() reads these two module-level names.
model, scaler = load_model_and_scaler()
|
18 |
+
|
19 |
+
# Dotted-quad pattern behind the ``hostname_is_ip`` flag; compiled once.
_IPV4_HOSTNAME_RE = re.compile(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$')

# (feature name, character) pairs, in the order the count features are emitted.
_CHAR_COUNT_FEATURES = (
    ('num_dots', '.'),
    ('num_hyphens', '-'),
    ('num_at', '@'),
    ('num_question', '?'),
    ('num_ampersand', '&'),
    ('num_equals', '='),
    ('num_exclamation', '!'),
    ('num_slash', '/'),
    ('num_plus', '+'),
    ('num_asterisk', '*'),
    ('num_underscore', '_'),
    ('num_hash', '#'),
    ('num_dollar', '$'),
    ('num_percent', '%'),
)

# Every feature name, in emission order; used to build the all-zero fallback
# so it can never drift from the main path.
_FEATURE_NAMES = (
    'url_length', 'hostname_length', 'path_length', 'query_length',
    'fragment_length',
    *(name for name, _ in _CHAR_COUNT_FEATURES),
    'is_https', 'has_http_in_hostname', 'hostname_is_ip', 'path_depth',
)


def extract_features(url):
    """Turn a URL into a Series of 23 integer lexical features.

    The features are lengths of the URL and of its parsed components,
    occurrence counts of selected punctuation characters over the whole
    URL string, and a few 0/1 flags (https scheme, "http" inside the
    hostname, hostname shaped like a dotted-quad IPv4 address), plus
    ``path_depth``.

    Args:
        url: The URL to analyse. It is converted with ``str()`` first, so
            non-string values are accepted.

    Returns:
        A ``pandas.Series`` indexed by feature name. If anything goes wrong
        while parsing (for example ``urlparse`` rejecting a malformed IPv6
        host), every feature is 0.
    """
    try:
        text = str(url)
        parsed = urlparse(text)
        hostname = parsed.hostname or ''
        slash_count = text.count('/')

        features = {
            'url_length': len(text),
            'hostname_length': len(hostname),
            'path_length': len(parsed.path),
            'query_length': len(parsed.query),
            'fragment_length': len(parsed.fragment),
        }
        features.update(
            (name, text.count(char)) for name, char in _CHAR_COUNT_FEATURES
        )
        features['is_https'] = int(parsed.scheme == 'https')
        features['has_http_in_hostname'] = int('http' in hostname)
        features['hostname_is_ip'] = int(
            _IPV4_HOSTNAME_RE.match(hostname) is not None
        )
        # path_depth is every '/' in the URL minus the two of "scheme://",
        # and only for http(s) URLs. Slashes in the query or fragment count
        # too; this definition is kept as-is because the scaler and model
        # consume it.
        has_depth = (
            bool(url)
            and parsed.scheme in ('http', 'https')
            and slash_count > 2
        )
        features['path_depth'] = slash_count - 2 if has_depth else 0
        return pd.Series(features)
    except Exception:
        # Deliberate best-effort: an unparseable URL yields an all-zero
        # feature vector rather than aborting the caller.
        return pd.Series(dict.fromkeys(_FEATURE_NAMES, 0))
|
59 |
+
|
60 |
+
# Feature columns selected, in this order, from the extracted-feature frame
# before it is handed to the scaler.
# NOTE(review): presumably this has to match the column order used when the
# scaler and model were fitted -- confirm against the training code.
X_columns = [
    'url_length',
    'hostname_length',
    'path_length',
    'query_length',
    'fragment_length',
    'num_dots',
    'num_hyphens',
    'num_at',
    'num_question',
    'num_ampersand',
    'num_equals',
    'num_exclamation',
    'num_slash',
    'num_plus',
    'num_asterisk',
    'num_underscore',
    'num_hash',
    'num_dollar',
    'num_percent',
    'is_https',
    'has_http_in_hostname',
    'hostname_is_ip',
    'path_depth',
]
|
68 |
+
|
69 |
+
def app():
    """Render the Malicious URL Detector page.

    Shows a text input and an "Analyze URL" button. When the button is
    pressed with a non-blank URL, the URL is converted to features with
    ``extract_features``, the ``X_columns`` subset is scaled with the
    module-level ``scaler`` and scored with the module-level ``model``.
    A score above 0.5 is reported as malicious, anything else as safe.
    A short footer is rendered at the end.

    NOTE(review): the original indentation of this function was lost, so
    the nesting below (result shown inside the button branch, footer at
    function level) is a reconstruction -- confirm against the real file.
    """
    # NOTE(review): the leading "π" in the title, input label and button
    # label looks like a mis-decoded emoji. The original glyph cannot be
    # recovered from what is visible here, so those strings are left as found.
    st.title("π Malicious URL Detector")
    st.markdown("Enter a URL below to check if it's likely malicious.")

    url_input = st.text_input(
        "π Enter a URL:",
        placeholder="e.g., https://example.com",
        help="Type any URL you want to analyze",
    )

    if st.button("π Analyze URL"):
        # Score the same text that is validated, so stray leading or
        # trailing whitespace does not inflate the length-based features.
        url = url_input.strip()
        if not url:
            st.warning("Please enter a valid URL.")
        else:
            with st.spinner("Analyzing..."):
                features = extract_features(url)
                df_new = pd.DataFrame([features])
                # Assign the filled frame instead of calling
                # fillna(inplace=True) on a frame derived by column selection.
                X_new = df_new[X_columns].fillna(-1)
                X_scaled = scaler.transform(X_new)
                prediction = model.predict(X_scaled)
                # Reads the first output of the first (only) row as the
                # score compared against 0.5 below.
                prob = float(prediction[0][0])

            if prob > 0.5:
                st.error(f"⚠️ This URL is likely **malicious**. Confidence: `{prob:.4f}`")
            else:
                st.success(f"✅ This URL appears to be **safe**. Confidence: `{1 - prob:.4f}`")

    st.markdown("---")
    st.markdown("💡 *Model trained on URL-based features like length, special characters, domain patterns, etc.*")
|