Mohzen321 committed on
Commit
6d1dc98
·
verified ·
1 Parent(s): f69939b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -30
app.py CHANGED
@@ -2,58 +2,88 @@ import streamlit as st
2
  from transformers import pipeline
3
  import re
4
 
5
- # تحميل النموذج
6
  classifier = pipeline("zero-shot-classification", model="cross-encoder/nli-distilroberta-base")
7
 
8
  # عنوان التطبيق
9
- st.title("URL Analysis App")
 
 
 
10
 
11
  # إدخال الملف النصي
12
- uploaded_file = st.file_uploader("Upload a text file containing URLs", type=["txt"])
13
 
14
  if uploaded_file is not None:
15
  # قراءة الملف النصي
16
  content = uploaded_file.read().decode("utf-8")
17
- urls = [line.strip() for line in content.splitlines() if line.strip()]
 
 
 
18
 
19
  # قوائم لتخزين النتائج
 
 
 
 
 
 
20
  parameters = []
21
  domains = []
22
  full_page_types = []
23
  file_extensions = []
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # دالة تحليل الروابط
26
  def analyze_urls(urls):
27
  for url in urls:
28
  # استخراج الباراميترات باستخدام RegEx
29
- params = re.findall(r'(\w+)=', url)
30
  parameters.extend(params)
31
 
32
- # استخtraction نطاقات (.com, .uk, .au)
33
- domain_match = re.search(r'\.([a-zA-Z]+)', url)
34
  if domain_match:
35
  domain = domain_match.group(1)
36
  if domain not in domains:
37
  domains.append(domain)
38
 
39
- # استخراج أنماط الصفحات الكاملة (product_detail.php?, viewtopic.php?)
40
  page_type_match = re.search(r'(\w+\.[a-z]+)\?', url)
41
  if page_type_match:
42
  page_type = page_type_match.group(1)
43
  if page_type not in full_page_types:
44
  full_page_types.append(page_type)
45
 
46
- # استخراج الصيغ (php, phtml, asp) بدون علامات الاستفهام
47
- extension_match = re.search(r'(\w+\.[a-z]+)(\?|$)', url)
48
  if extension_match:
49
- extension = extension_match.group(1).split('?')[0]
50
  if extension not in file_extensions:
51
  file_extensions.append(extension)
52
 
53
  # زر البدء
54
  if st.button("Start"):
55
- # تحليل الروابط
56
- analyze_urls(urls)
 
 
57
 
58
  # إزالة التكرارات من القوائم
59
  parameters = list(set(parameters))
@@ -61,22 +91,48 @@ if uploaded_file is not None:
61
  full_page_types = list(set(full_page_types))
62
  file_extensions = list(set(file_extensions))
63
 
64
- # عرض النتائج
65
- st.header("Parameters")
66
- st.text_area("Copy the parameters here:", value="\n".join(parameters), height=200, key="parameters")
67
- st.button("Copy Parameters", on_click=lambda: st.clipboard.copy("\n".join(parameters)))
68
-
69
- st.header("Domains")
70
- st.text_area("Copy the domains here:", value="\n".join(domains), height=200, key="domains")
71
- st.button("Copy Domains", on_click=lambda: st.clipboard.copy("\n".join(domains)))
72
-
73
- st.header("Full PageType")
74
- st.text_area("Copy the full page types here:", value="\n".join(full_page_types), height=200, key="full_page_types")
75
- st.button("Copy Full PageTypes", on_click=lambda: st.clipboard.copy("\n".join(full_page_types)))
76
-
77
- st.header("File Extensions")
78
- st.text_area("Copy the file extensions here:", value="\n".join(file_extensions), height=200, key="file_extensions")
79
- st.button("Copy File Extensions", on_click=lambda: st.clipboard.copy("\n".join(file_extensions)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  else:
82
- st.warning("Please upload a text file containing URLs to start analysis.")
 
2
from transformers import pipeline
import re

# Zero-shot classifier used to bucket uploaded keywords into categories.
# NOTE(review): model load happens at import time on every Streamlit rerun;
# presumably acceptable here — confirm, or wrap in @st.cache_resource.
classifier = pipeline(
    "zero-shot-classification",
    model="cross-encoder/nli-distilroberta-base",
)

# Page title.
st.title("Keyword & URL Analysis App")

# Let the user pick which analysis to run on the uploaded file.
operation = st.radio("Choose an operation:", ["Filter Keywords", "Analyze URLs"])
13
 
14
# File input: one keyword or URL per line.
uploaded_file = st.file_uploader("Upload a text file", type=["txt"])

if uploaded_file is not None:
    # Decode the upload as UTF-8 and keep non-blank, stripped lines.
    content = uploaded_file.read().decode("utf-8")
    items = [line.strip() for line in content.splitlines() if line.strip()]

    # Candidate labels for zero-shot keyword classification.
    categories = ["shop", "game", "stream"]

    # Buckets for keyword-classification results.
    shopping_items = []
    gaming_items = []
    streaming_items = []
    unknown_items = []

    # Buckets for URL-analysis results.
    parameters = []
    domains = []
    full_page_types = []
    file_extensions = []

    def classify_keywords(items, categories):
        """Classify each item into one of `categories` via the zero-shot
        classifier, appending to the module-level buckets. Items whose best
        label scores <= 0.5 go to `unknown_items`."""
        results = classifier(items, categories)
        # The HF pipeline returns a bare dict (not a list) for a single
        # input sequence; normalize so enumerate() below is safe.
        if isinstance(results, dict):
            results = [results]
        buckets = {
            "shop": shopping_items,
            "game": gaming_items,
            "stream": streaming_items,
        }
        for i, result in enumerate(results):
            best_category = result["labels"][0]
            score = result["scores"][0]
            if score > 0.5 and best_category in buckets:
                buckets[best_category].append(items[i])
            else:
                unknown_items.append(items[i])

    def analyze_urls(urls):
        """Extract query-parameter names, top-level domains, full page
        patterns (e.g. product_detail.php?) and file extensions from each
        URL into the module-level buckets."""
        for url in urls:
            # Query-parameter names: the word immediately before each '='.
            params = re.findall(r'(\w+)=', url)
            parameters.extend(params)

            # Top-level domain. Isolate the host part first so URLs with a
            # path or query string (e.g. http://example.com/x.php?id=1)
            # still yield their TLD — the previous `\.([a-zA-Z]{2,})$`
            # only matched when the TLD ended the whole string.
            host_match = re.search(r'^(?:[a-zA-Z][a-zA-Z0-9+.-]*://)?([^/?#]+)', url)
            if host_match:
                host = host_match.group(1).split(':', 1)[0]  # drop any :port
                tld_match = re.search(r'\.([a-zA-Z]{2,})$', host)
                if tld_match:
                    domain = tld_match.group(1)
                    if domain not in domains:
                        domains.append(domain)

            # Full page pattern, e.g. "product_detail.php?" (captured
            # without the trailing '?').
            page_type_match = re.search(r'(\w+\.[a-z]+)\?', url)
            if page_type_match:
                page_type = page_type_match.group(1)
                if page_type not in full_page_types:
                    full_page_types.append(page_type)

            # Bare file extension (php, phtml, asp, ...) before '?' or
            # end of string.
            extension_match = re.search(r'\.([a-z]+)(\?|$)', url)
            if extension_match:
                extension = extension_match.group(1)
                if extension not in file_extensions:
                    file_extensions.append(extension)

    # Run the chosen analysis and show its results.
    if st.button("Start"):
        if operation == "Filter Keywords":
            classify_keywords(items, categories)
        elif operation == "Analyze URLs":
            analyze_urls(items)

        # De-duplicate URL-analysis results (order is not preserved).
        parameters = list(set(parameters))
        domains = list(set(domains))
        full_page_types = list(set(full_page_types))
        file_extensions = list(set(file_extensions))

        def export_results(key, filename):
            """Write the text currently shown in widget `key` to `filename`.

            st.session_state[key] holds the text_area's value, which is a
            single string — write it as-is. The previous
            `"\n".join(st.session_state[key])` iterated the string
            character-by-character, inserting a newline after every char.
            """
            with open(filename, "w", encoding="utf-8") as f:
                f.write(st.session_state.get(key, ""))
            # Restored the {filename} placeholder that was missing from
            # the success message.
            st.success(f"Results exported to {filename}")

        if operation == "Filter Keywords":
            st.header("Shopping Keywords")
            st.text_area("Copy the shopping keywords here:", value="\n".join(shopping_items), height=200, key="shopping")
            st.button("Export Shopping Keywords", on_click=export_results, args=("shopping", "shopping_keywords.txt"))

            st.header("Gaming Keywords")
            st.text_area("Copy the gaming keywords here:", value="\n".join(gaming_items), height=200, key="gaming")
            st.button("Export Gaming Keywords", on_click=export_results, args=("gaming", "gaming_keywords.txt"))

            st.header("Streaming Keywords")
            st.text_area("Copy the streaming keywords here:", value="\n".join(streaming_items), height=200, key="streaming")
            st.button("Export Streaming Keywords", on_click=export_results, args=("streaming", "streaming_keywords.txt"))

            st.header("Unknown Keywords")
            st.text_area("Copy the unknown keywords here:", value="\n".join(unknown_items), height=200, key="unknown")
            st.button("Export Unknown Keywords", on_click=export_results, args=("unknown", "unknown_keywords.txt"))

        elif operation == "Analyze URLs":
            st.header("Parameters")
            st.text_area("Copy the parameters here:", value="\n".join(parameters), height=200, key="parameters")
            st.button("Export Parameters", on_click=export_results, args=("parameters", "parameters.txt"))

            st.header("Domains")
            st.text_area("Copy the domains here:", value="\n".join(domains), height=200, key="domains")
            st.button("Export Domains", on_click=export_results, args=("domains", "domains.txt"))

            st.header("Full PageType")
            st.text_area("Copy the full page types here:", value="\n".join(full_page_types), height=200, key="full_page_types")
            st.button("Export Full PageTypes", on_click=export_results, args=("full_page_types", "full_page_types.txt"))

            st.header("File Extensions")
            st.text_area("Copy the file extensions here:", value="\n".join(file_extensions), height=200, key="file_extensions")
            st.button("Export File Extensions", on_click=export_results, args=("file_extensions", "file_extensions.txt"))

else:
    st.warning("Please upload a text file to start analysis.")