# PhishingURLs / app.py
# Uploaded by Atulit23 via huggingface_hub (commit 8401d32, verified)
from urllib.parse import urlparse, urlencode
import ipaddress
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime
import requests
import pickle
import gradio as gr
# Pre-trained XGBoost classifier over the 16 features built by
# featureExtraction(); predicts 0 = legitimate, 1 = phishing.
# NOTE(review): pickle.load is safe here only because the file ships with the
# app — never unpickle untrusted data.
loaded_model = pickle.load(open("XGBoostClassifier1.pickle.dat", "rb"))
# Alternation regex of known URL-shortening services, used by tinyURL().
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
r"tr\.im|link\.zip\.net"
def getDomain(url):
    """Return the network location of *url* with a leading 'www.' stripped.

    Fix: the original used re.match(r"^www.", ...) — the unescaped dot
    matches any character — combined with a global str.replace that removed
    every 'www.' occurrence, corrupting domains like 'www.a.www.b.com'.
    Only the leading prefix is removed now.
    """
    domain = urlparse(url).netloc
    if domain.startswith("www."):
        # Strip only the prefix, not every occurrence.
        domain = domain[4:]
    return domain
def havingIP(url):
    """Return 1 (phishing indicator) when *url* is a bare IPv4/IPv6 literal, else 0.

    Fix: narrowed the bare ``except`` to ``ValueError`` — the only exception
    ``ipaddress.ip_address`` raises for a non-IP string — so unrelated
    errors are no longer silently swallowed.
    """
    try:
        ipaddress.ip_address(url)
    except ValueError:
        # Not a valid IP literal.
        return 0
    return 1
def haveAtSign(url):
    """Return 1 (phishing indicator) when the URL contains '@', else 0."""
    return 1 if "@" in url else 0
def getLength(url):
    """Flag URLs of 54 characters or more as suspicious: 1 if long, else 0."""
    return 0 if len(url) < 54 else 1
def getDepth(url):
    """Count the non-empty '/'-separated segments of the URL's path."""
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
def redirection(url):
    """Return 1 when '//' appears past the scheme separator (index > 7), else 0.

    The last '//' of a normal 'http(s)://' prefix sits at index 5 or 6, so
    any occurrence after index 7 signals an embedded redirect URL.
    """
    return 1 if url.rfind('//') > 7 else 0
def httpDomain(url):
    """Return 1 when the literal substring 'https' appears inside the domain
    part of the URL (a trick to fake a secure look), else 0."""
    domain = urlparse(url).netloc
    return 1 if 'https' in domain else 0
def tinyURL(url):
    """Return 1 when the URL matches a known link-shortening service
    (see the module-level ``shortening_services`` regex), else 0."""
    return 1 if re.search(shortening_services, url) else 0
def prefixSuffix(url):
    """Return 1 (phishing) when the domain contains a hyphen, else 0 (legitimate)."""
    domain = urlparse(url).netloc
    return 1 if '-' in domain else 0
def web_traffic(url):
    """Placeholder for the web-traffic feature.

    The original Alexa rank lookup (data.alexa.com REACH/RANK scrape) was
    commented out upstream — the service is gone — so this feature is a
    constant 0 (legitimate) regardless of *url*.
    """
    return 0
def domainAge(domain_name):
    """Return 1 (phishing indicator) when the WHOIS record shows the domain is
    under 6 months old or the dates are missing/ambiguous, else 0.

    *domain_name* is a WHOIS response exposing ``creation_date`` and
    ``expiration_date``; registrars return datetime, str, list, or None.

    Fix: the bare ``except`` is narrowed to ``(ValueError, TypeError)`` —
    ValueError for an unparseable date string, TypeError when only one of the
    two dates is a string and ``strptime`` receives a non-string partner.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (ValueError, TypeError):
            # Unparseable date (or mixed str/non-str pair): flag as suspicious.
            return 1
    if (expiration_date is None) or (creation_date is None):
        return 1
    if (type(expiration_date) is list) or (type(creation_date) is list):
        # Some registrars return multiple dates; ambiguous, flag it.
        return 1
    # Age in days between registration and expiry; under 6 months is suspicious.
    ageofdomain = abs((expiration_date - creation_date).days)
    return 1 if (ageofdomain / 30) < 6 else 0
def domainEnd(domain_name):
    """Return a flag derived from how soon the WHOIS expiration date falls:
    0 when under 6 months remain, 1 otherwise or when the date is
    missing/unparseable/ambiguous.

    NOTE(review): the 0/1 assignment looks inverted relative to domainAge
    (short remaining life usually signals phishing) — kept as-is because the
    bundled model was trained with this encoding; confirm against training code.

    Fix: the bare ``except`` is narrowed to ``ValueError``, the exception
    ``strptime`` raises for a malformed date string.
    """
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            # Unparseable expiration string: flag it.
            return 1
    if expiration_date is None:
        return 1
    if type(expiration_date) is list:
        # Multiple dates from the registrar: ambiguous, flag it.
        return 1
    today = datetime.now()
    remaining = abs((expiration_date - today).days)
    return 0 if (remaining / 30) < 6 else 1
def iframe(response):
    """Return 0 when the fetched page embeds an iframe/frameBorder marker,
    1 when it does not or when *response* is "" (fetch failed).

    Fix: the original pattern r"[<iframe>|<frameBorder>]" was a character
    CLASS — it matched any single one of those characters ('<', 'i', 'f',
    '|', ...), so virtually every non-empty page scored 0. Replaced with a
    real alternation on the tag/attribute markers.
    """
    if response == "":
        return 1
    if re.findall(r"<iframe|<frameBorder", response.text):
        return 0
    return 1
def mouseOver(response):
    """Return 1 when an inline <script> carries an onmouseover handler
    (status-bar spoofing trick), 1 when *response* is "" (fetch failed),
    else 0."""
    if response == "":
        return 1
    matches = re.findall("<script>.+onmouseover.+</script>", response.text)
    return 1 if matches else 0
def rightClick(response):
    """Return 0 when the page blocks right-click (checks event.button == 2),
    1 when it does not or when *response* is "" (fetch failed)."""
    if response == "":
        return 1
    blocked = re.findall(r"event.button ?== ?2", response.text)
    return 0 if blocked else 1
def forwarding(response):
    """Return 1 when the request was redirected more than twice, 1 when
    *response* is "" (fetch failed), else 0."""
    if response == "":
        return 1
    redirect_count = len(response.history)
    return 0 if redirect_count <= 2 else 1
def featureExtraction(url):
    """Build the 16-element feature vector the XGBoost model expects.

    Feature order must match the training data: 8 URL-based features,
    4 domain-based (WHOIS) features, then 4 HTML/JavaScript features.

    Fixes: ``requests.get`` now carries a timeout so an unresponsive host
    cannot hang the app; the bare ``except`` clauses are narrowed to
    ``except Exception``; ``domain_name`` is pre-bound so it can never be
    referenced unbound.
    """
    # URL-based features (8).
    features = [
        havingIP(url),
        haveAtSign(url),
        getLength(url),
        getDepth(url),
        redirection(url),
        httpDomain(url),
        tinyURL(url),
        prefixSuffix(url),
    ]
    # Domain-based features (4): a failed WHOIS lookup is itself a signal.
    dns = 0
    domain_name = None
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except Exception:
        dns = 1
    features.append(dns)
    features.append(web_traffic(url))
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))
    # HTML & JavaScript based features (4); "" marks "no response" for the
    # downstream feature functions.
    try:
        response = requests.get(url, timeout=10)
    except Exception:
        response = ""
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))
    return features
def index(url):
    """Gradio handler: extract features for *url*, run the classifier, and
    return "Safe" (prediction 0) or "Unsafe" (prediction 1)."""
    features = featureExtraction(url)
    prediction = loaded_model.predict([features])
    # Debug output visible in the Space logs.
    print(features)
    print(prediction)
    return "Safe" if prediction[0] == 0 else "Unsafe"
# --- Gradio UI wiring ---
url_input = gr.Textbox(type="text", label="URL")
result_output = gr.Textbox(type="text", label="Result Dictionary")

interface_image_url = gr.Interface(
    fn=index,
    inputs=[url_input],
    outputs=[result_output],
    title="URL Detection",
    cache_examples=False,
)

# Single-tab app; queue() serializes requests, launch() starts the server.
gr.TabbedInterface(
    [interface_image_url],
    tab_names=['URL inference']
).queue().launch()
# 0 -> Real (legitimate)
# 1 -> Phishing