Spaces:

Anvilogic
/

Embedder-Typosquat-Detect

Running

App Files Files Community

chgrdj

anvilogic-mikehart committed on Nov 20, 2024

Commit

dd2fdf9

verified ·

1 Parent(s): 01fd84f

Remove-embedder-choice (#1)

Browse files

- Removing embedding choice (3b4763bdd34dcb4318ee4c5a1a160ecd6d04c59c)
- Fixing language (a14f06ccc149cfaa82b15ab5221a24f86e91725a)
- Merged (cf4c2843e1d3f149f3cc9933d86c7ae49998aca7)

Co-authored-by: Michael Hart <[email protected]>

Files changed (2) hide show

.gitignore +1 -0
app.py +41 -31

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ .venv

app.py CHANGED Viewed

@@ -5,34 +5,44 @@ import numpy as np
 from ast import literal_eval
-st.title("Mining Potential Legitimate Domains from a Typosquatted Domain")
-# model_choice = st.selectbox("Select the embedding model:", ["", "Embedder-typosquat-detect-Canine", "Embedder-typosquat-detect"], index=0)
-model_choice= "Embedder-typosquat-detect-Canine"
-if model_choice:
-    model = SentenceTransformer(f"./{model_choice}")
-    domains_df = pd.read_csv(f'./{model_choice}/domains_embs.csv')
-    domains_df.embedding = domains_df.embedding.apply(literal_eval)
-    corpus_domains = domains_df.domain.to_list()
-    corpus_embeddings = np.stack(domains_df.embedding.values).astype(np.float32)  # Ensure embeddings are float32
-    st.write("Enter a potential typosquatted domain and select the number of top results to retrieve.")
-    domain = st.text_input("Potential Typosquatted Domain")
-    top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)
-    if st.button("Search for Legitimate Domains"):
-        if domain:
-            # Perform Semantic Search
-            query_emb = model.encode(domain).astype(np.float32)  # Ensure query embedding is also float32
-            semantic_res = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
-            ids = [r['corpus_id'] for r in semantic_res]
-            scores = [r['score'] for r in semantic_res]
-            res_df = domains_df.loc[ids, ['domain']].copy()
-            res_df['score'] = scores
-            st.write("Mined Domains:")
-            st.dataframe(res_df)
-        else:
-            st.warning("Please enter a domain to perform the search.")

 from ast import literal_eval
+model_choice = "Embedder-typosquat-detect-Canine"  # local directory name; used below for both the model and domains_embs.csv
+@st.cache_resource  # Streamlit resource cache: the returned model object is reused instead of being rebuilt on each script rerun
+def load_model() -> SentenceTransformer:  # builds the SentenceTransformer from ./<model_choice>; takes no arguments
+    return SentenceTransformer(f"./{model_choice}")
+st.title("Search for the target of typosquat domains with our Domain Embedder")
+st.markdown("This streamlit demonstrates how you can use our domain embedder to find the targets of typosquatted domains. "
+        "Each domain is represented as an vector embedding that can be stored in a vector store for efficient retrieval. "  # NOTE(review): user-facing typo, "an vector" should read "a vector"
+        "The domains you can search for in this application are the top 4k most popular domains, like `google.com`.  "
+        "You can use the domain embedder to create a vector store specifically for the websites **you want to monitor**. "
+        "This can include the services your company uses like Office365, or the websites of your company that may "
+        "become spear phishing targets.")
+model = load_model()
+domains_df = pd.read_csv(f'./{model_choice}/domains_embs.csv')  # NOTE(review): top-level and outside load_model's cache, so presumably re-read on every rerun; confirm and consider caching
+domains_df.embedding = domains_df.embedding.apply(literal_eval)  # each embedding cell is text; literal_eval parses it into a Python literal
+corpus_domains = domains_df.domain.to_list()  # NOTE(review): not referenced again in the lines shown
+corpus_embeddings = np.stack(domains_df.embedding.values).astype(np.float32)  # Stack the per-row embeddings into one array and ensure it is float32
+st.header("Enter a potential typosquatted domain and select the number of top results to retrieve. ")
+domain = st.text_input("Potential Typosquatted Domain")
+top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)  # bounded to 1..50, default 5
+if st.button("Search for Legitimate Domains"):
+    if domain:  # an empty text box falls through to the warning below
+        # Perform semantic search of the query domain against the corpus embeddings
+        query_emb = model.encode(domain).astype(np.float32)  # Ensure query embedding is also float32, matching the corpus dtype
+        semantic_res = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]  # [0]: keep the hit list for the single query
+        ids = [r['corpus_id'] for r in semantic_res]
+        scores = [r['score'] for r in semantic_res]
+        res_df = domains_df.loc[ids, ['domain']].copy()  # .loc is label-based: relies on read_csv's default integer index so corpus_id matches the row label
+        res_df['score'] = scores  # scores are in the same order as ids, so they line up row by row
+        st.write("Mined Domains:")
+        st.dataframe(res_df)
+    else:
+        st.warning("Please enter a domain to perform the search.")