chgrdj committed on
Commit
3b25244
·
verified ·
1 Parent(s): 21a6064

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -7
app.py CHANGED
@@ -1,11 +1,38 @@
1
  import streamlit as st
2
- from sentence_transformers import CrossEncoder
 
 
 
3
 
4
- model_name = "./"
5
- model = CrossEncoder(model_name)
 
6
 
7
- st.title("Mining potential legit domain from a typosquatted domain")
8
- st.write("Enter a potential typosquatted domain.")
 
 
 
9
 
10
- domain = st.text_input("potential typosquatted domain")
11
- st.write("Mined domains")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ from sentence_transformers import SentenceTransformer, util
3
+ import pandas as pd
4
+ import numpy as np
5
+ from ast import literal_eval
6
 
7
+ # Load the model
8
+ model_name = "./Embedder-typosquat"
9
+ model = SentenceTransformer(model_name)
10
 
11
+ # Load the domains and embeddings
12
+ domains_df = pd.read_csv('domains_embs.csv')
13
+ domains_df.embedding = domains_df.embedding.apply(literal_eval)
14
+ corpus_domains = domains_df.domain.to_list()
15
+ corpus_embeddings = np.stack(domains_df.embedding.values)
16
 
17
+ # Streamlit App
18
+ st.title("Mining Potential Legitimate Domains from a Typosquatted Domain")
19
+ st.write("Enter a potential typosquatted domain and select the number of top results to retrieve.")
20
+
21
+ # User Inputs
22
+ domain = st.text_input("Potential Typosquatted Domain")
23
+ top_k = st.number_input("Top K Results", min_value=1, max_value=len(corpus_domains), value=5, step=1)
24
+
25
+ # Perform Semantic Search
26
+ if domain:
27
+ query_emb = model.encode(domain)
28
+ semantic_res = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
29
+ ids = [r['corpus_id'] for r in semantic_res]
30
+ scores = [r['score'] for r in semantic_res]
31
+
32
+ # Create a DataFrame for the results
33
+ res_df = domains_df.iloc[ids].copy()
34
+ res_df['score'] = scores
35
+
36
+ # Display the result DataFrame
37
+ st.write("Mined Domains:")
38
+ st.dataframe(res_df)