LaurentiuStancioiu committed on
Commit
40840a8
·
verified ·
1 Parent(s): 5481d64

Upload 4 files

Browse files
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. app.py +96 -0
  3. data_plot.csv +3 -0
  4. model.joblib +3 -0
  5. requirements.txt +9 -0
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ data_plot.csv filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import os
3
+ import pandas as pd
4
+ import numpy as np
5
+ import openai
6
+ from sklearn.manifold import TSNE
7
+ import joblib
8
+ import gradio as gr
9
+ from typing import Optional
10
+ import altair as alt
11
# Read a local .env file (if any) so the API key can be supplied there.
load_dotenv()

# Embedding model used for every embedding request made by this app.
EMBEDDING_MODEL = "text-embedding-ada-002"

# The key comes from the environment; it stays None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
16
+
17
+
18
def get_embedding(text: str, model=EMBEDDING_MODEL) -> list[float]:
    """Return the OpenAI embedding of a piece of text.

    Args:
        text: The string to embed.
        model: Name of the OpenAI embedding model to query.

    Returns:
        The ``"embedding"`` entry of the first result in the API response.
    """
    # The endpoint takes a batch of inputs; send a one-element batch and
    # unwrap the single result.
    response = openai.Embedding.create(input=[text], model=model)
    first_result = response["data"][0]
    return first_result["embedding"]
24
+
25
+
26
+
27
def get_plot(website: Optional[str]) -> alt.Chart:
    """Build a 2-D t-SNE scatter plot of the reference data plus ``website``.

    The embeddings stored in ``data_plot.csv`` are loaded, the embedding of
    ``website`` is appended as one extra row, and the combined matrix is
    projected to two dimensions with t-SNE. The extra row is labelled
    ``'Our Data'`` so it can be told apart from the reference points.

    Args:
        website: Text (the app passes a URL) to embed and place on the plot.

    Returns:
        An interactive Altair chart with one circle per row, coloured by
        ``type`` and showing ``type`` and ``url`` in the tooltip.
    """
    # Function-local import so this block does not depend on a file-level
    # import being added.
    from ast import literal_eval

    df = pd.read_csv("data_plot.csv")

    # The ``embeddings`` column holds each vector as text. Parse it with
    # literal_eval rather than eval: it only accepts Python literals, so a
    # tampered data file cannot execute arbitrary code.
    matrix = np.array(df.embeddings.apply(literal_eval).to_list())

    website_embed = np.array(get_embedding(website, model=EMBEDDING_MODEL))
    # Append the query embedding as the last row of the matrix.
    matrix = np.append(matrix, website_embed.reshape(1, -1), axis=0)

    tsne = TSNE(
        n_components=2,
        perplexity=50,
        random_state=42,
        init='random',
        learning_rate=200,
    )
    vis_dims = tsne.fit_transform(matrix)

    df_vis = pd.DataFrame(vis_dims, columns=['x', 'y'])
    # df has one row fewer than df_vis; index alignment leaves the last row
    # empty in both columns until it is filled in just below.
    df_vis['type'] = df['type']
    df_vis["url"] = df["url"]
    last_row = df_vis.index[-1]
    df_vis.loc[last_row, 'type'] = 'Our Data'
    df_vis.loc[last_row, 'url'] = website

    # Fixed colour per category so the query point is always black.
    scale = alt.Scale(
        domain=['benign', 'defacement', 'phishing', 'malware', 'Our Data'],
        range=['red', 'darkorange', 'gold', 'turquoise', 'black'],
    )

    scatter_plot = alt.Chart(df_vis).mark_circle(size=60).encode(
        x='x',
        y='y',
        color=alt.Color('type', scale=scale),
        tooltip=['type', 'url'],
    ).interactive()

    return scatter_plot
55
+
56
def predict_label(website: Optional[str] = "") -> str:
    """Classify ``website`` and return a human-readable verdict.

    The text is embedded with ``get_embedding`` and the resulting vector is
    passed, as a single-row batch, to the model stored in ``model.joblib``.

    Args:
        website: Text (the app passes a URL) to classify.

    Returns:
        A sentence saying the website is probably safe when the predicted
        label is ``"benign"``, and probably malicious for any other label.
    """
    classifier = joblib.load("model.joblib")

    # The model predicts on a 2-D batch, so reshape the vector to one row.
    features = np.array(get_embedding(website, model=EMBEDDING_MODEL))
    predicted_label = classifier.predict(features.reshape(1, -1))[0]

    if predicted_label != "benign":
        return "This website is most probably malicious."
    return "This website is most probably safe."
69
+
70
+ #def my_app(website: str):
71
+ # return (get_plot(website), predict_label(website))
72
+
73
+ #get_plot(website = "https://www.youtube.com/watch?v=RiCQzBluTxU")
74
+ #print(predict_label(website = "https://www.youtube.com/watch?v=RiCQzBluTxU"))
75
+
76
def gradio_interface(website: Optional[str] = "") -> tuple:
    """Gradio callback: return the verdict and the plot for ``website``.

    Args:
        website: Text typed by the user; may be ``None`` or empty.

    Returns:
        A ``(prediction, plot)`` pair. For ``None``, empty or
        whitespace-only input the pair is ``("", None)``, so both outputs
        are cleared and no prediction or plot is computed.
    """
    # Previously an empty input fell through to the return statement with
    # ``prediction`` and ``plot`` unbound, raising UnboundLocalError.
    if website is None:
        return "", None

    # Surrounding whitespace is not part of a URL; drop it so a stray space
    # or pasted newline is neither embedded nor treated as real input.
    website = website.strip()
    if not website:
        return "", None

    prediction = predict_label(website)
    plot = get_plot(website)
    return prediction, plot
84
+
85
# One text input feeding gradio_interface, which fills a text verdict and a
# plot. live=True asks Gradio to re-run the callback as the input changes
# instead of waiting for a submit.
interface = gr.Interface(
    title="Malicious Website Detector",
    description="This website comes as a helping tool for those that want to surf safely on the internet.\n Attention: Not all predictions are true and this should be taken as a demo for now.",
    fn=gradio_interface,
    inputs="text",
    outputs=["text", "plot"],
    live=True,
)

# Start the web app.
interface.launch()
94
+
95
+
96
+
data_plot.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a59e4896611d72b384678e718268293ea641a09df7b94a737b7f91c57d0729b0
3
+ size 25454344
model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79813a1d346c28a9d26676de9a279a0dd8d0401891b553ad8035620339292838
3
+ size 670236
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ openai==0.26.4
2
+ pandas==1.5.3
3
+ plotly==5.13.0
4
+ scipy==1.10.0
5
+ scikit-learn==1.2.1
6
+ pyarrow==11.0.0
7
+ gradio
8
+ python-dotenv==1.0.1
9
+ altair