# NOTE(review): "Spaces: Sleeping" is a Hugging Face Spaces status banner
# captured when this file was exported — it is not part of the program.
from urllib.parse import urlparse, urlencode | |
import ipaddress | |
import re | |
from bs4 import BeautifulSoup | |
import whois | |
import urllib | |
import urllib.request | |
from datetime import datetime | |
import requests | |
import pickle | |
import gradio as gr | |
# Pre-trained XGBoost phishing classifier, loaded once at startup.
# SECURITY: pickle.load executes arbitrary code from the file — only ship
# this app with a model file from a trusted source.
# Use a context manager so the file handle is closed (the original left it open).
with open("XGBoostClassifier1.pickle.dat", "rb") as _model_file:
    loaded_model = pickle.load(_model_file)
# Alternation regex of known URL-shortening hostnames (dots escaped), used by
# tinyURL() via re.search. Several entries appear twice (e.g. bit.ly, goo.gl,
# t.co, ow.ly, is.gd, tr.im, x.co) — redundant but harmless for matching.
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"
def getDomain(url):
    """Return the host part of *url* without a leading 'www.'.

    e.g. 'https://www.example.com/login' -> 'example.com'
    """
    domain = urlparse(url).netloc
    # The original used re.match(r"^www.", ...) with an unescaped dot (which
    # also matched hosts like 'wwwX...') and str.replace, which removed
    # "www." anywhere in the host. Strip only a literal leading "www.".
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain
def havingIP(url):
    """Return 1 if the URL's host is a literal IP address, else 0.

    The original passed the whole URL string to ipaddress.ip_address, which
    can never succeed once a scheme or path is present, so IP-hosted URLs
    like 'http://192.168.1.1/login' were silently scored 0. Check the host
    component instead (falling back to the raw string for scheme-less input).
    """
    host = urlparse(url).hostname or url  # hostname strips port and brackets
    try:
        ipaddress.ip_address(host)
        return 1
    except ValueError:  # not a valid IPv4/IPv6 literal
        return 0
def haveAtSign(url):
    """Return 1 if the URL contains an '@' character (phishing signal), else 0."""
    return 1 if "@" in url else 0
def getLength(url):
    """Return 1 when the URL is 54 characters or longer (suspicious), else 0."""
    return 0 if len(url) < 54 else 1
def getDepth(url):
    """Return the number of non-empty path segments in the URL ('/a/b' -> 2)."""
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
def redirection(url):
    """Return 1 if '//' occurs after the scheme separator (embedded redirect), else 0.

    The '//' of 'http://' ends at index 6 and of 'https://' at index 7, so any
    last occurrence beyond index 7 means an extra '//' later in the URL.
    """
    return 1 if url.rfind('//') > 7 else 0
def httpDomain(url):
    """Return 1 if the literal text 'https' appears inside the domain part, else 0.

    Only the netloc is inspected, so a normal 'https://' scheme does not
    trigger this — hosts like 'https-secure-login.com' do.
    """
    domain = urlparse(url).netloc
    return 1 if 'https' in domain else 0
def tinyURL(url):
    """Return 1 if the URL uses a known link-shortening service, else 0."""
    return 1 if re.search(shortening_services, url) else 0
def prefixSuffix(url):
    """Return 1 when the domain contains '-' (common in phishing hosts), else 0."""
    hyphen_in_domain = '-' in urlparse(url).netloc
    return 1 if hyphen_in_domain else 0
def web_traffic(url):
    """Web-traffic feature, currently disabled — always returns 0 (legitimate).

    The original implementation looked up the Alexa rank of *url* via
    http://data.alexa.com (see repository history); that service has been
    retired, so the lookup is disabled and the feature is a constant.
    The parameter is kept so the featureExtraction call site is unchanged.
    """
    return 0
def domainAge(domain_name):
    """Domain-age feature: 1 (phishing) if the WHOIS record is unusable or the
    registered lifetime is under 6 months, else 0 (legitimate).

    Args:
        domain_name: a WHOIS result object exposing ``creation_date`` and
            ``expiration_date``; each may be a datetime, an ISO 'YYYY-MM-DD'
            string, a list (multiple registrar answers), or None.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        # Some registrars return dates as strings; parse both. A malformed
        # string raises ValueError; a non-string partner value raises
        # TypeError. (The original used a bare except, which also hid bugs.)
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except (ValueError, TypeError):
            return 1
    if (expiration_date is None) or (creation_date is None):
        return 1
    if isinstance(expiration_date, list) or isinstance(creation_date, list):
        # Multiple dates returned: treated as unusable, same as the original.
        return 1
    # Short-lived registrations (< 6 months) are a classic phishing marker.
    ageofdomain = abs((expiration_date - creation_date).days)
    return 1 if (ageofdomain / 30) < 6 else 0
def domainEnd(domain_name):
    """End-period feature from a WHOIS record.

    Returns 0 if the domain expires within 6 months, 1 otherwise; an
    unusable record (None, list, or unparseable string) yields 1.

    NOTE(review): the 0/1 mapping is inverted relative to domainAge, but it
    is preserved exactly because the deployed model was trained on it.
    """
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:  # malformed date string (original used bare except)
            return 1
    if expiration_date is None:
        return 1
    if isinstance(expiration_date, list):
        return 1
    today = datetime.now()
    remaining = abs((expiration_date - today).days)
    return 0 if (remaining / 30) < 6 else 1
def iframe(response):
    """Iframe feature: 0 if iframe/frameBorder markup is found in the page,
    1 if it is absent or the page could not be fetched.

    Args:
        response: a ``requests.Response``, or "" when the fetch failed.
    """
    if response == "":
        return 1
    # BUG FIX: the original pattern r"[<iframe>|<frameBorder>]" was a
    # character CLASS — it matched any single one of those characters, so
    # virtually every page triggered it. Search for the actual markup
    # (case-insensitive: HTML writes the attribute as 'frameborder').
    if re.search(r"<iframe|frameBorder", response.text, re.IGNORECASE):
        return 0
    else:
        return 1
def mouseOver(response):
    """Return 1 if the page hooks onmouseover inside a <script> block
    (status-bar spoofing trick) or the fetch failed, else 0."""
    if response == "":
        return 1
    hooked = re.findall("<script>.+onmouseover.+</script>", response.text)
    return 1 if hooked else 0
def rightClick(response):
    """Return 0 if the page checks 'event.button == 2' (right-click blocking),
    1 otherwise — including when the fetch failed."""
    if response == "":
        return 1
    blocking = re.findall(r"event.button ?== ?2", response.text)
    return 0 if blocking else 1
def forwarding(response):
    """Return 1 when the request was redirected more than twice, or the fetch
    failed; 0 otherwise."""
    if response == "":
        return 1
    redirect_count = len(response.history)
    return 0 if redirect_count <= 2 else 1
def featureExtraction(url):
    """Build the 16-element feature vector for *url* in the order the model
    was trained on: 8 URL-lexical features, 4 domain features (DNS record,
    web traffic, domain age, domain end), then 4 HTML/JS features from
    fetching the page. Network failures degrade gracefully instead of raising.
    """
    features = []
    # The raw domain string is not a model input.
    # features.append(getDomain(url))
    features.append(havingIP(url))
    features.append(haveAtSign(url))
    features.append(getLength(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(tinyURL(url))
    features.append(prefixSuffix(url))
    #Domain based features (4)
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except:
        # WHOIS lookup failed -> treat as "no DNS record" (phishing signal).
        dns = 1
    features.append(dns)
    features.append(web_traffic(url))
    # domain_name is unbound when the lookup failed; the dns guard keeps
    # these expressions from touching it in that case.
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))
    # HTML & Javascript based features (4)
    try:
        response = requests.get(url)
    except:
        # Unreachable page: "" makes each HTML/JS check return its default (1).
        response = ""
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))
    return features
def index(url):
    """Gradio handler: classify *url* and return "Safe" or "Unsafe".

    Model class 0 is legitimate, class 1 is phishing (see comment at end of
    file). The feature vector and raw prediction are printed for debugging.
    """
    features = featureExtraction(url)
    prediction = loaded_model.predict([features])
    print(features)
    print(prediction)
    if(prediction[0] == 0):
        return "Safe"
    else:
        return "Unsafe"
# --- Gradio UI wiring: one URL textbox in, one classification string out. ---
inputs_image_url = [
    gr.Textbox(type="text", label="URL"),
]
outputs_result_dict = [
    gr.Textbox(type="text", label="Result Dictionary"),
]
interface_image_url = gr.Interface(
    fn=index,
    inputs=inputs_image_url,
    outputs=outputs_result_dict,
    title="URL Detection",
    cache_examples=False,
)
# Launch a single-tab app; queue() enables request queueing, and launch()
# starts the web server (this runs at import time).
gr.TabbedInterface(
    [interface_image_url],
    tab_names=['URL inference']
).queue().launch()
# Model output classes: 0 -> real/legitimate ("Safe"), 1 -> phishing ("Unsafe")