Spaces:
Runtime error
Runtime error
LaurentiuStancioiu
committed on
Upload 4 files
Browse files- .gitattributes +1 -0
- app.py +96 -0
- data_plot.csv +3 -0
- model.joblib +3 -0
- requirements.txt +9 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
data_plot.csv filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dotenv import load_dotenv
|
2 |
+
import os
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import openai
|
6 |
+
from sklearn.manifold import TSNE
|
7 |
+
import joblib
|
8 |
+
import gradio as gr
|
9 |
+
from typing import Optional
|
10 |
+
import altair as alt
|
11 |
+
# Load variables from a .env file (when present) before reading the API key.
load_dotenv()

# OpenAI model name used as the default for every embedding request below.
EMBEDDING_MODEL = "text-embedding-ada-002"

# The key comes from the environment; it is None when OPENAI_API_KEY is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
16 |
+
|
17 |
+
|
18 |
+
def get_embedding(text: str, model=EMBEDDING_MODEL) -> list[float]:
    """Return the OpenAI embedding for a piece of text.

    ``text`` is sent as a single-item batch to ``openai.Embedding.create``
    with the given ``model``; the ``"embedding"`` field of the first entry in
    the response's ``"data"`` list is returned.
    """
    response = openai.Embedding.create(input=[text], model=model)
    first_entry = response["data"][0]
    return first_entry["embedding"]
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
def get_plot(website: Optional[str]) -> alt.Chart:
    """Build an interactive 2-D t-SNE scatter plot that includes ``website``.

    Reads the reference rows from ``data_plot.csv`` (columns used:
    ``embeddings``, ``type``, ``url``), embeds ``website`` with
    ``get_embedding``, appends that vector as the last row, projects all rows
    to two dimensions with t-SNE and returns an Altair chart coloured by
    ``type``. The appended row is labelled ``'Our Data'``.

    Args:
        website: Text (URL) to embed and place on the plot.

    Returns:
        An interactive ``alt.Chart`` with ``type``/``url`` tooltips.
    """
    # Function-scope import so this block does not depend on a file-level
    # ``import ast``.
    from ast import literal_eval

    df = pd.read_csv("data_plot.csv")

    # Each cell of ``embeddings`` is a vector serialised as text. Parse it
    # with ``literal_eval`` rather than ``eval``: ``eval`` would execute any
    # expression found in the CSV, while ``literal_eval`` only accepts Python
    # literals.
    # NOTE(review): assumes every cell is a plain list literal such as
    # "[0.1, -0.2, ...]" -- confirm against how data_plot.csv was written.
    matrix = np.array(df.embeddings.apply(literal_eval).to_list())

    website_embed = np.array(get_embedding(website, model=EMBEDDING_MODEL))
    # The query vector becomes the final row of the matrix.
    matrix = np.append(matrix, website_embed.reshape(1, -1), axis=0)

    tsne = TSNE(
        n_components=2,
        perplexity=50,
        random_state=42,
        init='random',
        learning_rate=200,
    )
    vis_dims = tsne.fit_transform(matrix)

    df_vis = pd.DataFrame(vis_dims, columns=['x', 'y'])
    # ``df`` has one row fewer than ``df_vis``; the last row's label and URL
    # are filled in explicitly just below.
    df_vis['type'] = df['type']
    df_vis["url"] = df["url"]
    last_row = df_vis.index[-1]
    df_vis.loc[last_row, 'type'] = 'Our Data'
    df_vis.loc[last_row, 'url'] = website

    # Fixed colour per category so the query point (black) stands out.
    scale = alt.Scale(
        domain=['benign', 'defacement', 'phishing', 'malware', 'Our Data'],
        range=['red', 'darkorange', 'gold', 'turquoise', 'black'],
    )

    scatter_plot = alt.Chart(df_vis).mark_circle(size=60).encode(
        x='x',
        y='y',
        color=alt.Color('type', scale=scale),
        tooltip=['type', 'url'],
    ).interactive()

    return scatter_plot
|
55 |
+
|
56 |
+
def predict_label(website: Optional[str] = "") -> str:
    """Classify ``website`` and return a human-readable verdict.

    Loads the classifier stored in ``model.joblib``, embeds ``website`` with
    ``get_embedding`` and predicts on that single vector. A predicted label
    of ``"benign"`` yields the "safe" message; any other label yields the
    "malicious" message.
    """
    classifier = joblib.load("model.joblib")

    # The model is given a single-row 2-D array, hence the reshape.
    vector = np.array(get_embedding(website, model=EMBEDDING_MODEL))
    label = classifier.predict(vector.reshape(1, -1))[0]

    if label == "benign":
        return "This website is most probably safe."
    return "This website is most probably malicious."
|
69 |
+
|
70 |
+
#def my_app(website: str):
|
71 |
+
# return (get_plot(website), predict_label(website))
|
72 |
+
|
73 |
+
#get_plot(website = "https://www.youtube.com/watch?v=RiCQzBluTxU")
|
74 |
+
#print(predict_label(website = "https://www.youtube.com/watch?v=RiCQzBluTxU"))
|
75 |
+
|
76 |
+
def gradio_interface(website: Optional[str] = ""):
    """Gradio callback: return ``(prediction_text, plot)`` for ``website``.

    Args:
        website: Text entered by the user; may be ``None`` or empty.

    Returns:
        A 2-tuple matching the interface's ``["text", "plot"]`` outputs.
        For missing, empty or whitespace-only input it is ``("", None)`` so
        both outputs are simply cleared; otherwise it is the result of
        ``predict_label(website)`` and ``get_plot(website)``.
    """
    # Previously this branch fell through to ``return prediction, plot`` with
    # both names unbound, raising UnboundLocalError whenever the input was
    # empty or None. Return empty outputs instead.
    if website is None or not website.strip():
        return "", None

    prediction = predict_label(website)
    plot = get_plot(website)
    return prediction, plot
|
84 |
+
|
85 |
+
# Gradio UI: one text input wired to ``gradio_interface``, whose two return
# values feed the text and plot outputs.
interface = gr.Interface(
    fn=gradio_interface,
    inputs="text",
    outputs=["text", "plot"],
    title="Malicious Website Detector",
    description=(
        "This website comes as a helping tool for those that want to surf "
        "safely on the internet.\n Attention: Not all predictions are true "
        "and this should be taken as a demo for now."
    ),
    live=True,
)

# Start the web app as soon as this script runs.
interface.launch()
|
94 |
+
|
95 |
+
|
96 |
+
|
data_plot.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a59e4896611d72b384678e718268293ea641a09df7b94a737b7f91c57d0729b0
|
3 |
+
size 25454344
|
model.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:79813a1d346c28a9d26676de9a279a0dd8d0401891b553ad8035620339292838
|
3 |
+
size 670236
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
openai==0.26.4
|
2 |
+
pandas==1.5.3
|
3 |
+
plotly==5.13.0
|
4 |
+
scipy==1.10.0
|
5 |
+
scikit-learn==1.2.1
|
6 |
+
pyarrow==11.0.0
|
7 |
+
gradio
|
8 |
+
python-dotenv==1.0.1
|
9 |
+
altair
|