Spaces:

LaurentiuStancioiu
/

Malicious_website_detector

Runtime error

App Files Files Community

LaurentiuStancioiu committed on Jan 31, 2024

Commit

40840a8

verified ·

1 Parent(s): 5481d64

Upload 4 files

Browse files

Files changed (5) hide show

.gitattributes +1 -0
app.py +96 -0
data_plot.csv +3 -0
model.joblib +3 -0
requirements.txt +9 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data_plot.csv filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,96 @@

+from dotenv import load_dotenv
+import os
+import pandas as pd
+import numpy as np
+import openai
+from sklearn.manifold import TSNE
+import joblib
+import gradio as gr
+from typing import Optional
+import altair as alt
+load_dotenv()
+#plt.style.use('seaborn-poster')
+EMBEDDING_MODEL = "text-embedding-ada-002"
+openai.api_key = os.getenv("OPENAI_API_KEY")
+def get_embedding(text: str, model=EMBEDDING_MODEL) -> list[float]:
+    """
+    Gets a text as an input and the embedding model used from Openai
+    Returns the embeddings of that blurb of text
+    """
+    return openai.Embedding.create(input=[text], model=model)["data"][0]["embedding"]
+def get_plot(website: Optional[str]) -> alt.Chart:
+    df = pd.read_csv("data_plot.csv")
+    matrix = np.array(df.embeddings.apply(eval).to_list())
+    website_embed = get_embedding(website, model=EMBEDDING_MODEL)
+    website_embed = np.array(website_embed)
+    matrix = np.append(matrix, website_embed.reshape(1, -1), axis=0)
+    tsne = TSNE(n_components=2, perplexity=50, random_state=42, init='random', learning_rate=200)
+    vis_dims = tsne.fit_transform(matrix)
+    df_vis = pd.DataFrame(vis_dims, columns=['x', 'y'])
+    df_vis['type'] = df['type']
+    df_vis["url"] = df["url"]
+    df_vis.loc[df_vis.index[-1], 'type'] = 'Our Data'
+    df_vis.loc[df_vis.index[-1], 'url'] = website
+    # Define color scale
+    scale = alt.Scale(domain=['benign', 'defacement', 'phishing', 'malware', 'Our Data'],
+                      range=['red', 'darkorange', 'gold', 'turquoise', 'black'])
+    # Create the scatter plot
+    scatter_plot = alt.Chart(df_vis).mark_circle(size=60).encode(
+        x='x',
+        y='y',
+        color=alt.Color('type', scale=scale),
+        tooltip=['type', 'url']
+    ).interactive()
+    return scatter_plot
+def predict_label(website: Optional[str] = "") -> str:
+    """
+    It takes the blurb of text and predicts whether it is malicious or not
+    """
+    loaded_model = joblib.load("model.joblib")
+    embedding = get_embedding(website, model = EMBEDDING_MODEL)
+    embedding = np.array(embedding)
+    y_predicted = loaded_model.predict(embedding.reshape(1, -1))
+    if y_predicted[0] == "benign":
+        return "This website is most probably safe."
+    elif y_predicted[0] != "benign":
+        return "This website is most probably malicious."
+#def my_app(website: str):
+#   return (get_plot(website), predict_label(website))
+#get_plot(website = "https://www.youtube.com/watch?v=RiCQzBluTxU")
+#print(predict_label(website = "https://www.youtube.com/watch?v=RiCQzBluTxU"))
+def gradio_interface(website: Optional[str] = ""):
+    if website == "" or website == None:
+        pass
+    else:
+        prediction = predict_label(website)
+        plot = get_plot(website)
+        return prediction, plot
+interface = gr.Interface(
+    fn=gradio_interface,
+    inputs="text",
+    outputs=["text", "plot"],
+    live=True,
+    title="Malicious Website Detector",
+    description="This website comes as a helping tool for those that want to surf safely on the internet.\n Attention: Not all predictions are true and this should be taken as a demo for now."
+)
+interface.launch()

data_plot.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a59e4896611d72b384678e718268293ea641a09df7b94a737b7f91c57d0729b0
+size 25454344

model.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:79813a1d346c28a9d26676de9a279a0dd8d0401891b553ad8035620339292838
+size 670236

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+openai==0.26.4
+pandas==1.5.3
+plotly==5.13.0
+scipy==1.10.0
+scikit-learn==1.2.1
+pyarrow==11.0.0
+gradio
+python-dotenv==1.0.1
+altair