chgrdj anvilogic-mikehart committed on
Commit
dd2fdf9
·
verified ·
1 Parent(s): 01fd84f

Remove-embedder-choice (#1)

Browse files

- Removing embedding choice (3b4763bdd34dcb4318ee4c5a1a160ecd6d04c59c)
- Fixing language (a14f06ccc149cfaa82b15ab5221a24f86e91725a)
- Merged (cf4c2843e1d3f149f3cc9933d86c7ae49998aca7)


Co-authored-by: Michael Hart <[email protected]>

Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +41 -31
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .venv
app.py CHANGED
@@ -5,34 +5,44 @@ import numpy as np
5
  from ast import literal_eval
6
 
7
 
8
- st.title("Mining Potential Legitimate Domains from a Typosquatted Domain")
9
- # model_choice = st.selectbox("Select the embedding model:", ["", "Embedder-typosquat-detect-Canine", "Embedder-typosquat-detect"], index=0)
10
- model_choice= "Embedder-typosquat-detect-Canine"
11
- if model_choice:
12
- model = SentenceTransformer(f"./{model_choice}")
13
-
14
- domains_df = pd.read_csv(f'./{model_choice}/domains_embs.csv')
15
- domains_df.embedding = domains_df.embedding.apply(literal_eval)
16
- corpus_domains = domains_df.domain.to_list()
17
- corpus_embeddings = np.stack(domains_df.embedding.values).astype(np.float32) # Ensure embeddings are float32
18
-
19
- st.write("Enter a potential typosquatted domain and select the number of top results to retrieve.")
20
-
21
- domain = st.text_input("Potential Typosquatted Domain")
22
- top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)
23
-
24
- if st.button("Search for Legitimate Domains"):
25
- if domain:
26
- # Perform Semantic Search
27
- query_emb = model.encode(domain).astype(np.float32) # Ensure query embedding is also float32
28
- semantic_res = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
29
- ids = [r['corpus_id'] for r in semantic_res]
30
- scores = [r['score'] for r in semantic_res]
31
-
32
- res_df = domains_df.loc[ids, ['domain']].copy()
33
- res_df['score'] = scores
34
-
35
- st.write("Mined Domains:")
36
- st.dataframe(res_df)
37
- else:
38
- st.warning("Please enter a domain to perform the search.")
 
 
 
 
 
 
 
 
 
 
 
5
  from ast import literal_eval
6
 
7
 
8
+ model_choice = "Embedder-typosquat-detect-Canine"
9
+
10
+ @st.cache_resource
11
+ def load_model() -> SentenceTransformer:
12
+ return SentenceTransformer(f"./{model_choice}")
13
+
14
+ st.title("Search for the target of typosquat domains with our Domain Embedder")
15
+ st.markdown("This streamlit demonstrates how you can use our domain embedder to find the targets of typosquatted domains. "
16
+ "Each domain is represented as an vector embedding that can be stored in a vector store for efficient retrieval. "
17
+ "The domains you can search for in this application are the top 4k most popular domains, like `google.com`. "
18
+ "You can use the domain embedder to create a vector store specifically for the websites **you want to monitor**. "
19
+ "This can include the services your company uses like Office365, or the websites of your company that may "
20
+ "become spear phishing targets.")
21
+
22
+ model = load_model()
23
+
24
+
25
+ domains_df = pd.read_csv(f'./{model_choice}/domains_embs.csv')
26
+ domains_df.embedding = domains_df.embedding.apply(literal_eval)
27
+ corpus_domains = domains_df.domain.to_list()
28
+ corpus_embeddings = np.stack(domains_df.embedding.values).astype(np.float32) # Ensure embeddings are float32
29
+
30
+ st.header("Enter a potential typosquatted domain and select the number of top results to retrieve. ")
31
+ domain = st.text_input("Potential Typosquatted Domain")
32
+ top_k = st.number_input("Top K Results", min_value=1, max_value=50, value=5, step=1)
33
+
34
+ if st.button("Search for Legitimate Domains"):
35
+ if domain:
36
+ # Perform Semantic Search
37
+ query_emb = model.encode(domain).astype(np.float32) # Ensure query embedding is also float32
38
+ semantic_res = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
39
+ ids = [r['corpus_id'] for r in semantic_res]
40
+ scores = [r['score'] for r in semantic_res]
41
+
42
+ res_df = domains_df.loc[ids, ['domain']].copy()
43
+ res_df['score'] = scores
44
+
45
+ st.write("Mined Domains:")
46
+ st.dataframe(res_df)
47
+ else:
48
+ st.warning("Please enter a domain to perform the search.")