th1enq committed on
Commit
619e0de
·
1 Parent(s): d82d422

remove xgboost

Browse files
app.py CHANGED
@@ -10,24 +10,7 @@ from bs4 import BeautifulSoup
10
  import time
11
  import joblib
12
 
13
- # Try to import XGBoost wrapper, handle gracefully if not available
14
- try:
15
- from xgboost_wrapper import xgboost_detector
16
- XGBOOST_AVAILABLE = True
17
- except Exception as e:
18
- print(f"XGBoost wrapper not available: {e}")
19
- XGBOOST_AVAILABLE = False
20
- # Create a dummy detector
21
- class DummyDetector:
22
- def __init__(self):
23
- self.available = False
24
- def predict_combined(self, *args, **kwargs):
25
- return None
26
- def predict_url(self, *args, **kwargs):
27
- return None
28
- def predict_html(self, *args, **kwargs):
29
- return None
30
- xgboost_detector = DummyDetector()
31
 
32
  # --- import your architecture ---
33
  # Make sure this file is in the repo (e.g., models/deberta_lstm_classifier.py)
@@ -170,9 +153,6 @@ def predict_fn(text: str):
170
  # Get prediction for HTML content
171
  html_probs, html_tokens, html_has_attention, html_attention = predict_single_text(html_content, "HTML")
172
 
173
- # Get XGBoost predictions
174
- xgb_result = xgboost_detector.predict_combined(url, html_content)
175
-
176
  # Combine predictions
177
  combined_probs = combine_predictions(url_probs, html_probs)
178
 
@@ -185,12 +165,6 @@ def predict_fn(text: str):
185
  analysis_type = "Combined URL + HTML Analysis"
186
  fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
187
 
188
- # Add XGBoost analysis if available
189
- if xgb_result:
190
- analysis_type += " + XGBoost"
191
- xgb_info = f" | XGBoost: {'Phishing' if xgb_result['is_phishing'] else 'Benign'} ({xgb_result['probability'][1]:.1%})"
192
- fetch_status += xgb_info
193
-
194
  else:
195
  # Fallback to URL-only analysis
196
  probs = url_probs
@@ -198,17 +172,8 @@ def predict_fn(text: str):
198
  has_attention = url_has_attention
199
  attention_weights = url_attention
200
 
201
- # Get XGBoost URL prediction
202
- xgb_result = xgboost_detector.predict_url(url)
203
-
204
  analysis_type = "URL-only Analysis"
205
  fetch_status = f"⚠️ Could not fetch HTML content: {status}"
206
-
207
- # Add XGBoost analysis if available
208
- if xgb_result:
209
- analysis_type += " + XGBoost"
210
- xgb_info = f" | XGBoost: {'Phishing' if xgb_result['is_phishing'] else 'Benign'} ({xgb_result['probability'][1]:.1%})"
211
- fetch_status += xgb_info
212
  else:
213
  # Process as regular text
214
  probs, tokens, has_attention, attention_weights = predict_single_text(text, "text")
@@ -367,7 +332,7 @@ def predict_fn(text: str):
367
  return prediction_result, detailed_analysis
368
 
369
  # --------- BERT Model Functions ----------
370
- def predict_bert_single_text(text):
371
  """Predict for a single text input using BERT."""
372
  # Tokenize
373
  inputs = bert_tokenizer(
@@ -381,94 +346,77 @@ def predict_bert_single_text(text):
381
  inputs = {k: v.to(device) for k, v in inputs.items()}
382
 
383
  with torch.no_grad():
384
- logits = bert_model(**inputs).logits
 
385
 
386
  probs = F.softmax(logits, dim=-1).squeeze(0).tolist()
387
 
388
- return probs
389
-
390
- def predict_bert_interface_fn(text: str):
391
- """Gradio interface function for BERT model."""
392
- if not text or not text.strip():
393
- return {"error": "Please enter a URL or text."}, ""
394
-
395
- probs = predict_bert_single_text(text)
396
 
397
- # Create detailed analysis
398
- predicted_class = "phishing" if probs[1] > probs[0] else "benign"
399
- confidence = max(probs)
 
 
 
 
 
 
 
400
 
401
- detailed_analysis = f"""
402
- <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
403
- <div style="background: linear-gradient(135deg, {'#8b0000' if predicted_class == 'phishing' else '#006400'} 0%, {'#dc143c' if predicted_class == 'phishing' else '#228b22'} 100%); padding: 25px; border-radius: 20px; color: white; text-align: center; margin-bottom: 20px; box-shadow: 0 8px 32px rgba(0,0,0,0.5); border: 2px solid {'#ff4444' if predicted_class == 'phishing' else '#44ff44'};">
404
- <h2 style="margin: 0 0 10px 0; font-size: 28px; color: white;">🔍 BERT Model Analysis</h2>
405
- <div style="font-size: 36px; font-weight: bold; margin: 10px 0; color: white;">
406
- {predicted_class.upper()}
407
- </div>
408
- <div style="font-size: 18px, color: #f0f0f0;">
409
- Confidence: {confidence:.1%}
410
- </div>
411
- </div>
412
- <div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
413
- <h3 style="color: #ffffff; margin-bottom: 15px;"> Prediction Confidence</h3>
414
- <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
415
- <span style="font-weight: bold; color: #ff4444;">Phishing</span>
416
- <span style="font-weight: bold; color: #44ff44;">Benign</span>
417
- </div>
418
- <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
419
- <div style="width: {probs[1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
420
- {probs[1]:.1%}
421
- </div>
422
- </div>
423
- <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
424
- Benign: {probs[0]:.1%}
425
- </div>
426
- </div>
427
- </div>
428
- """
429
- # Build label->prob mapping for Gradio Label output
430
- if len(LABELS) == len(probs):
431
- prediction_result = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
432
- else:
433
- prediction_result = {f"class_{i}": float(p) for i, p in enumerate(probs)}
434
-
435
- return prediction_result, detailed_analysis
436
 
437
- # --------- XGBoost Interface Function ----------
438
- def predict_xgboost_interface_fn(text: str):
439
- """Gradio interface function for XGBoost models."""
440
  if not text or not text.strip():
441
  return {"error": "Please enter a URL or text."}, ""
442
 
443
- if not xgboost_detector.available:
444
- return {"benign": 0.5, "phishing": 0.5}, "XGBoost models are not properly loaded."
445
-
446
  # Check if input is URL
447
  if is_url(text.strip()):
 
448
  url = text.strip()
449
 
 
 
 
450
  # Try to fetch HTML content
451
  html_content, status = fetch_html_content(url)
452
 
453
  if html_content:
454
- result = xgboost_detector.predict_combined(url, html_content)
455
- analysis_type = "Combined URL + HTML XGBoost Analysis"
 
 
 
 
 
 
 
 
 
 
 
456
  fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
 
457
  else:
458
- result = xgboost_detector.predict_url(url)
459
- analysis_type = "URL-only XGBoost Analysis"
 
 
 
 
 
460
  fetch_status = f"⚠️ Could not fetch HTML content: {status}"
461
  else:
462
- # For text input, treat as HTML content
463
- result = xgboost_detector.predict_html(text)
464
- analysis_type = "HTML Content XGBoost Analysis"
465
  fetch_status = ""
466
 
467
- if not result:
468
- return {"benign": 0.5, "phishing": 0.5}, "Failed to get prediction from XGBoost models."
469
-
470
- predicted_class = "phishing" if result['is_phishing'] else "benign"
471
- confidence = max(result['probability'])
472
 
473
  detailed_analysis = f"""
474
  <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
@@ -480,6 +428,9 @@ def predict_xgboost_interface_fn(text: str):
480
  <div style="font-size: 18px; color: #f0f0f0;">
481
  Confidence: {confidence:.1%}
482
  </div>
 
 
 
483
  </div>
484
  """
485
 
@@ -490,59 +441,132 @@ def predict_xgboost_interface_fn(text: str):
490
  </div>
491
  """
492
 
493
- # Show detailed XGBoost results
494
- detailed_analysis += f"""
495
- <div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
496
- <h3 style="color: #ffffff; margin-bottom: 15px;">🎯 XGBoost Prediction Confidence</h3>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
497
  <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
498
  <span style="font-weight: bold; color: #ff4444;">Phishing</span>
499
  <span style="font-weight: bold; color: #44ff44;">Benign</span>
500
  </div>
501
  <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
502
- <div style="width: {result['probability'][1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
503
- {result['probability'][1]:.1%}
504
  </div>
505
  </div>
506
  <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
507
- Benign: {result['probability'][0]:.1%}
508
  </div>
509
  </div>
510
  """
511
-
512
- # Show component analysis if available
513
- if 'url_result' in result and 'html_result' in result:
514
  detailed_analysis += f"""
515
  <div style="background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
516
- <h3 style="margin: 0 0 15px 0; color: white;">🔬 Component Analysis</h3>
517
- <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px;">
518
- <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
519
- <div style="font-size: 18px; font-weight: bold; color: white;">URL Analysis</div>
520
- <div style="font-size: 24px; font-weight: bold; color: {'#ff6666' if result['url_result']['is_phishing'] else '#66ff66'};">
521
- {'Phishing' if result['url_result']['is_phishing'] else 'Benign'}
522
- </div>
523
- <div style="font-size: 14px; color: #e0e0e0;">{result['url_result']['probability'][1]:.1%} phishing</div>
524
  </div>
525
- <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
526
- <div style="font-size: 18px; font-weight: bold; color: white;">HTML Analysis</div>
527
- <div style="font-size: 24px; font-weight: bold; color: {'#ff6666' if result['html_result']['is_phishing'] else '#66ff66'};">
528
- {'Phishing' if result['html_result']['is_phishing'] else 'Benign'}
529
- </div>
530
- <div style="font-size: 14px; color: #e0e0e0;">{result['html_result']['probability'][1]:.1%} phishing</div>
531
  </div>
 
 
 
 
 
 
 
 
 
 
 
532
  </div>
533
  </div>
 
 
 
 
 
534
  """
535
 
536
  detailed_analysis += "</div>"
537
 
538
  # Build label->prob mapping for Gradio Label output
539
- if len(LABELS) == len(result['probability']):
540
- prediction_result = {LABELS[i]: float(result['probability'][i]) for i in range(len(LABELS))}
541
  else:
542
- prediction_result = {f"class_{i}": float(p) for i, p in enumerate(result['probability'])}
543
 
544
  return prediction_result, detailed_analysis
545
 
 
 
546
  # --------- Gradio UI ----------
547
  deberta_interface = gr.Interface(
548
  fn=predict_fn,
@@ -613,44 +637,70 @@ bert_interface = gr.Interface(
613
  inputs=gr.Textbox(label="URL or text", placeholder="Example: http://suspicious-site.example or paste any text"),
614
  outputs=[
615
  gr.Label(label="Prediction result"),
616
- gr.Markdown(label="Detailed analysis")
617
  ],
618
  title="Phishing Detector (BERT)",
619
- description="Enter a URL or text for analysis using the BERT model.",
620
- examples=[
621
- ["http://rendmoiunserviceeee.com"],
622
- ["https://www.google.com"],
623
- ["Dear customer, your account has been suspended. Click here to verify your identity immediately."],
624
- ]
625
- )
626
-
627
- xgboost_interface = gr.Interface(
628
- fn=predict_xgboost_interface_fn,
629
- inputs=gr.Textbox(label="URL or HTML content", placeholder="Example: http://suspicious-site.example or paste HTML content"),
630
- outputs=[
631
- gr.Label(label="Prediction result"),
632
- gr.Markdown(label="Detailed analysis")
633
- ],
634
- title="Phishing Detector (XGBoost)",
635
  description="""
636
- Enter a URL or HTML content for analysis using XGBoost models.
637
  **Features:**
638
- - **URL Feature Analysis**: Extracts 30+ features from URL structure
639
- - **HTML Feature Analysis**: Extracts 43+ features from HTML content
640
- - **Combined Analysis**: For URLs, combines both URL and HTML features
641
- - **Fast Prediction**: Traditional ML approach for quick results
 
 
 
 
 
 
 
 
642
  """,
643
  examples=[
644
  ["http://rendmoiunserviceeee.com"],
645
  ["https://www.google.com"],
 
 
646
  ["http://paypaI-security-update.net/login"],
647
- ["<html><head><title>Urgent Security Alert</title></head><body><form><input type='password'></form></body></html>"],
648
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
649
  )
650
 
 
 
651
  demo = gr.TabbedInterface(
652
- [deberta_interface, bert_interface, xgboost_interface],
653
- ["DeBERTa + LSTM", "BERT", "XGBoost"]
654
  )
655
 
656
  if __name__ == "__main__":
 
10
  import time
11
  import joblib
12
 
13
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # --- import your architecture ---
16
  # Make sure this file is in the repo (e.g., models/deberta_lstm_classifier.py)
 
153
  # Get prediction for HTML content
154
  html_probs, html_tokens, html_has_attention, html_attention = predict_single_text(html_content, "HTML")
155
 
 
 
 
156
  # Combine predictions
157
  combined_probs = combine_predictions(url_probs, html_probs)
158
 
 
165
  analysis_type = "Combined URL + HTML Analysis"
166
  fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
167
 
 
 
 
 
 
 
168
  else:
169
  # Fallback to URL-only analysis
170
  probs = url_probs
 
172
  has_attention = url_has_attention
173
  attention_weights = url_attention
174
 
 
 
 
175
  analysis_type = "URL-only Analysis"
176
  fetch_status = f"⚠️ Could not fetch HTML content: {status}"
 
 
 
 
 
 
177
  else:
178
  # Process as regular text
179
  probs, tokens, has_attention, attention_weights = predict_single_text(text, "text")
 
332
  return prediction_result, detailed_analysis
333
 
334
  # --------- BERT Model Functions ----------
335
+ def predict_bert_single_text(text, text_type="text"):
336
  """Predict for a single text input using BERT."""
337
  # Tokenize
338
  inputs = bert_tokenizer(
 
346
  inputs = {k: v.to(device) for k, v in inputs.items()}
347
 
348
  with torch.no_grad():
349
+ outputs = bert_model(**inputs, output_attentions=True)
350
+ logits = outputs.logits
351
 
352
  probs = F.softmax(logits, dim=-1).squeeze(0).tolist()
353
 
354
+ # Get tokens for visualization
355
+ tokens = bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0).tolist())
 
 
 
 
 
 
356
 
357
+ # Get attention weights (last layer, averaged across heads, as an approximation)
358
+ attention_weights = None
359
+ has_attention = False
360
+ if hasattr(outputs, 'attentions') and outputs.attentions is not None:
361
+ # Average attention across all heads in the last layer
362
+ last_layer_attention = outputs.attentions[-1] # Last layer
363
+ attention_weights = last_layer_attention.mean(dim=1).squeeze(0) # Average across heads
364
+ # Use attention to [CLS] token as importance scores
365
+ attention_weights = attention_weights[0] # [CLS] token attention to all tokens
366
+ has_attention = True
367
 
368
+ return probs, tokens, has_attention, attention_weights
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
369
 
370
+ def predict_bert_interface_fn(text: str):
371
+ """Gradio interface function for BERT model."""
 
372
  if not text or not text.strip():
373
  return {"error": "Please enter a URL or text."}, ""
374
 
 
 
 
375
  # Check if input is URL
376
  if is_url(text.strip()):
377
+ # Process URL
378
  url = text.strip()
379
 
380
+ # Get prediction for URL itself
381
+ url_probs, url_tokens, url_has_attention, url_attention = predict_bert_single_text(url, "URL")
382
+
383
  # Try to fetch HTML content
384
  html_content, status = fetch_html_content(url)
385
 
386
  if html_content:
387
+ # Get prediction for HTML content
388
+ html_probs, html_tokens, html_has_attention, html_attention = predict_bert_single_text(html_content, "HTML")
389
+
390
+ # Combine predictions
391
+ combined_probs = combine_predictions(url_probs, html_probs)
392
+
393
+ # Use combined probabilities but show analysis for both
394
+ probs = combined_probs
395
+ tokens = url_tokens + ["[SEP]"] + html_tokens[:50] # Limit HTML tokens for display
396
+ has_attention = url_has_attention or html_has_attention
397
+ attention_weights = url_attention if url_has_attention else html_attention
398
+
399
+ analysis_type = "Combined URL + HTML BERT Analysis"
400
  fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
401
+
402
  else:
403
+ # Fallback to URL-only analysis
404
+ probs = url_probs
405
+ tokens = url_tokens
406
+ has_attention = url_has_attention
407
+ attention_weights = url_attention
408
+
409
+ analysis_type = "URL-only BERT Analysis"
410
  fetch_status = f"⚠️ Could not fetch HTML content: {status}"
411
  else:
412
+ # Process as regular text
413
+ probs, tokens, has_attention, attention_weights = predict_bert_single_text(text, "text")
414
+ analysis_type = "BERT Text Analysis"
415
  fetch_status = ""
416
 
417
+ # Create detailed analysis
418
+ predicted_class = "phishing" if probs[1] > probs[0] else "benign"
419
+ confidence = max(probs)
 
 
420
 
421
  detailed_analysis = f"""
422
  <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
 
428
  <div style="font-size: 18px; color: #f0f0f0;">
429
  Confidence: {confidence:.1%}
430
  </div>
431
+ <div style="margin-top: 15px; font-size: 14px; color: #e0e0e0;">
432
+ {'This appears to be a phishing attempt!' if predicted_class == 'phishing' else '✅ This appears to be legitimate content.'}
433
+ </div>
434
  </div>
435
  """
436
 
 
441
  </div>
442
  """
443
 
444
+ if has_attention and attention_weights is not None:
445
+ attention_scores = attention_weights.squeeze(0).tolist() if attention_weights.dim() > 1 else attention_weights.tolist()
446
+
447
+ token_analysis = []
448
+ for i, (token, score) in enumerate(zip(tokens, attention_scores)):
449
+ # More lenient filtering - include more tokens for text analysis
450
+ if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>'] and len(token.strip()) > 0 and score > 0.005:
451
+ clean_token = token.replace('▁', '').replace('Ġ', '').strip() # Handle different tokenizer prefixes
452
+ if clean_token: # Only add if token has content after cleaning
453
+ token_analysis.append({
454
+ 'token': clean_token,
455
+ 'importance': score,
456
+ 'position': i
457
+ })
458
+
459
+ # Sort by importance
460
+ token_analysis.sort(key=lambda x: x['importance'], reverse=True)
461
+
462
+ detailed_analysis += f"""
463
+ ## Top important tokens:
464
+ <div style="background: #2d2d2d; padding: 15px; border-radius: 10px; margin: 15px 0; border-left: 4px solid #4caf50; color: #e0e0e0;">
465
+ <strong>Analysis Info:</strong> Found {len(token_analysis)} important tokens out of {len(tokens)} total tokens
466
+ </div>
467
+ <div style="font-family: Arial, sans-serif;">
468
+ """
469
+
470
+ for i, token_info in enumerate(token_analysis[:10]): # Top 10 tokens
471
+ bar_width = int(token_info['importance'] * 100)
472
+ color = "#ff4444" if predicted_class == "phishing" else "#44ff44"
473
+
474
+ detailed_analysis += f"""
475
+ <div style="margin: 8px 0; display: flex; align-items: center; background: #2d2d2d; padding: 8px; border-radius: 8px; border-left: 4px solid {color};">
476
+ <div style="width: 30px; text-align: right; margin-right: 10px; font-weight: bold; color: #ffffff;">
477
+ {i+1}.
478
+ </div>
479
+ <div style="width: 120px; margin-right: 10px; font-weight: bold; color: #e0e0e0; text-align: right;">
480
+ {token_info['token']}
481
+ </div>
482
+ <div style="width: 300px; background-color: #404040; border-radius: 10px; overflow: hidden; margin-right: 10px; border: 1px solid #555;">
483
+ <div style="width: {bar_width}%; background-color: {color}; height: 20px; border-radius: 10px; transition: width 0.3s ease;"></div>
484
+ </div>
485
+ <div style="color: #cccccc; font-size: 12px; font-weight: bold;">
486
+ {token_info['importance']:.1%}
487
+ </div>
488
+ </div>
489
+ """
490
+
491
+ detailed_analysis += "</div>\n"
492
+
493
+ detailed_analysis += f"""
494
+ ## Detailed analysis:
495
+ <div style="font-family: Arial, sans-serif; background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
496
+ <h3 style="margin: 0 0 15px 0; color: white;">Statistical Overview</h3>
497
+ <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px;">
498
+ <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
499
+ <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])}</div>
500
+ <div style="font-size: 14px; color: #e0e0e0;">Total tokens</div>
501
+ </div>
502
+ <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
503
+ <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in token_analysis if t['importance'] > 0.05])}</div>
504
+ <div style="font-size: 14px; color: #e0e0e0;">High impact tokens (>5%)</div>
505
+ </div>
506
+ </div>
507
+ </div>
508
+ <div style="font-family: Arial, sans-serif; margin: 15px 0; background: #2d2d2d; padding: 20px; border-radius: 15px; border: 1px solid #555;">
509
+ <h3 style="color: #ffffff; margin-bottom: 15px;"> Prediction Confidence</h3>
510
  <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
511
  <span style="font-weight: bold; color: #ff4444;">Phishing</span>
512
  <span style="font-weight: bold; color: #44ff44;">Benign</span>
513
  </div>
514
  <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
515
+ <div style="width: {probs[1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
516
+ {probs[1]:.1%}
517
  </div>
518
  </div>
519
  <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
520
+ Benign: {probs[0]:.1%}
521
  </div>
522
  </div>
523
  """
524
+ else:
525
+ # Fallback analysis without attention weights
 
526
  detailed_analysis += f"""
527
  <div style="background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
528
+ <h3 style="margin: 0 0 15px 0; color: white;">Basic Analysis</h3>
529
+ <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 15px;">
530
+ <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
531
+ <div style="font-size: 24px; font-weight: bold; color: white;">{probs[1]:.1%}</div>
532
+ <div style="font-size: 14px; color: #e0e0e0;">Phishing</div>
 
 
 
533
  </div>
534
+ <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
535
+ <div style="font-size: 24px; font-weight: bold; color: white;">{probs[0]:.1%}</div>
536
+ <div style="font-size: 14px; color: #e0e0e0;">Benign</div>
 
 
 
537
  </div>
538
+ <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
539
+ <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])}</div>
540
+ <div style="font-size: 14px; color: #e0e0e0;">Tokens</div>
541
+ </div>
542
+ </div>
543
+ </div>
544
+ <div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
545
+ <h3 style="color: #ffffff; margin: 0 0 15px 0;">🔤 Tokens in text:</h3>
546
+ <div style="display: flex; flex-wrap: wrap; gap: 8px;">""" + ''.join([f'<span style="background: #404040; color: #64b5f6; padding: 4px 8px; border-radius: 15px; font-size: 12px; border: 1px solid #666;">{token.replace("▁", "")}</span>' for token in tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]) + f"""</div>
547
+ <div style="margin-top: 15px; padding: 10px; background: #3d2914; border-radius: 8px; border-left: 4px solid #ff9800;">
548
+ <strong style="color: #ffcc02;">Debug info:</strong> <span style="color: #e0e0e0;">Found {len(tokens)} total tokens, {len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])} content tokens</span>
549
  </div>
550
  </div>
551
+ <div style="background: #3d2914; padding: 15px; border-radius: 10px; border-left: 4px solid #ff9800; margin: 15px 0;">
552
+ <p style="margin: 0; color: #ffcc02; font-size: 14px;">
553
+ <strong>Note:</strong> Detailed attention weights analysis is not available for the current model.
554
+ </p>
555
+ </div>
556
  """
557
 
558
  detailed_analysis += "</div>"
559
 
560
  # Build label->prob mapping for Gradio Label output
561
+ if len(LABELS) == len(probs):
562
+ prediction_result = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
563
  else:
564
+ prediction_result = {f"class_{i}": float(p) for i, p in enumerate(probs)}
565
 
566
  return prediction_result, detailed_analysis
567
 
568
+
569
+
570
  # --------- Gradio UI ----------
571
  deberta_interface = gr.Interface(
572
  fn=predict_fn,
 
637
  inputs=gr.Textbox(label="URL or text", placeholder="Example: http://suspicious-site.example or paste any text"),
638
  outputs=[
639
  gr.Label(label="Prediction result"),
640
+ gr.Markdown(label="Detailed token analysis")
641
  ],
642
  title="Phishing Detector (BERT)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
643
  description="""
644
+ Enter a URL or text for analysis using the BERT model.
645
  **Features:**
646
+ - **URL Analysis**: For URLs, the system will fetch HTML content and combine both URL and content analysis
647
+ - **Combined Prediction**: Uses weighted combination of URL structure and webpage content analysis
648
+ - **Visual Analysis**: Predict phishing/benign probability with visual charts
649
+ - **Token Importance**: Display the most important tokens in classification using attention weights
650
+ - **Detailed Insights**: Comprehensive analysis of the impact of each token
651
+ - **Dark Theme**: Beautiful interface with colorful charts optimized for dark themes
652
+
653
+ **How it works for URLs:**
654
+ 1. Analyze the URL structure itself
655
+ 2. Fetch the webpage HTML content
656
+ 3. Analyze the webpage content
657
+ 4. Combine both results for final prediction (30% URL + 70% content)
658
  """,
659
  examples=[
660
  ["http://rendmoiunserviceeee.com"],
661
  ["https://www.google.com"],
662
+ ["Dear customer, your account has been suspended. Click here to verify your identity immediately."],
663
+ ["https://mail-secure-login-verify.example/path?token=suspicious"],
664
  ["http://paypaI-security-update.net/login"],
665
+ ["Your package has been delivered successfully. Thank you for using our service."],
666
+ ["https://github.com/user/repo"]
667
+ ],
668
+ theme=gr.themes.Soft(),
669
+ css="""
670
+ .gradio-container {
671
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
672
+ background-color: #1e1e1e !important;
673
+ color: #ffffff !important;
674
+ }
675
+ .dark .gradio-container {
676
+ background-color: #1e1e1e !important;
677
+ }
678
+ /* Dark theme for all components */
679
+ .block {
680
+ background-color: #2d2d2d !important;
681
+ border: 1px solid #444 !important;
682
+ }
683
+ .gradio-textbox {
684
+ background-color: #3d3d3d !important;
685
+ color: #ffffff !important;
686
+ border: 1px solid #666 !important;
687
+ }
688
+ .gradio-button {
689
+ background-color: #4a4a4a !important;
690
+ color: #ffffff !important;
691
+ border: 1px solid #666 !important;
692
+ }
693
+ .gradio-button:hover {
694
+ background-color: #5a5a5a !important;
695
+ }
696
+ """
697
  )
698
 
699
+
700
+
701
  demo = gr.TabbedInterface(
702
+ [deberta_interface, bert_interface],
703
+ ["DeBERTa + LSTM", "BERT"]
704
  )
705
 
706
  if __name__ == "__main__":
xgboost/URLFeatureExtraction.py DELETED
@@ -1,382 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
-
3
- # importing required packages for this section
4
- from urllib.parse import urlparse,urlencode
5
- import ipaddress
6
- import re
7
-
8
- """#### **3.1.1. Domain of the URL**
9
- Here, we are just extracting the domain present in the URL. This feature doesn't have much significance in the training. May even be dropped while training the model.
10
- """
11
- '''
12
- # 1.Domain of the URL (Domain)
13
- def getDomain(url):
14
- domain = urlparse(url).netloc
15
- if re.match(r"^www.",domain):
16
- domain = domain.replace("www.","")
17
- return domain'''
18
-
19
- """#### **3.1.2. IP Address in the URL**
20
-
21
- Checks for the presence of an IP address in the URL. URLs may have an IP address instead of a domain name. If an IP address is used as an alternative to the domain name in the URL, we can be sure that someone is trying to steal personal information with this URL.
22
-
23
- If the domain part of URL has IP address, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
24
- """
25
-
26
- # 2.Checks for IP address in URL (Have_IP)
27
- def havingIP(url):
28
- try:
29
- ipaddress.ip_address(url)
30
- ip = 1
31
- except:
32
- ip = 0
33
- return ip
34
-
35
- """#### **3.1.3. "@" Symbol in URL**
36
-
37
- Checks for the presence of '@' symbol in the URL. Using “@” symbol in the URL leads the browser to ignore everything preceding the “@” symbol and the real address often follows the “@” symbol.
38
-
39
- If the URL has '@' symbol, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
40
- """
41
-
42
- # 3.Checks the presence of @ in URL (Have_At)
43
- def haveAtSign(url):
44
- if "@" in url:
45
- at = 1
46
- else:
47
- at = 0
48
- return at
49
-
50
- """#### **3.1.4. Length of URL**
51
-
52
- Computes the length of the URL. Phishers can use a long URL to hide the doubtful part in the address bar. In this project, if the length of the URL is greater than or equal to 54 characters, the URL is classified as phishing; otherwise it is classified as legitimate.
53
-
54
- If the length of URL >= 54 , the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
55
- """
56
-
57
- # 4.Finding the length of URL and categorizing (URL_Length)
58
- def getLength(url):
59
- if len(url) < 54:
60
- length = 0
61
- else:
62
- length = 1
63
- return length
64
-
65
- """#### **3.1.5. Depth of URL**
66
-
67
- Computes the depth of the URL. This feature calculates the number of sub pages in the given url based on the '/'.
68
-
69
- The value of feature is a numerical based on the URL.
70
- """
71
-
72
- # 5.Gives number of '/' in URL (URL_Depth)
73
- def getDepth(url):
74
- s = urlparse(url).path.split('/')
75
- depth = 0
76
- for j in range(len(s)):
77
- if len(s[j]) != 0:
78
- depth = depth+1
79
- return depth
80
-
81
- """#### **3.1.6. Redirection "//" in URL**
82
-
83
- Checks the presence of "//" in the URL. The existence of “//” within the URL path means that the user will be redirected to another website. The location of the “//” in URL is computed. We find that if the URL starts with “HTTP”, that means the “//” should appear in the sixth position. However, if the URL employs “HTTPS” then the “//” should appear in seventh position.
84
-
85
- If the "//" is anywhere in the URL apart from after the protocol, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
86
- """
87
-
88
- # 6.Checking for redirection '//' in the url (Redirection)
89
- def redirection(url):
90
- pos = url.rfind('//')
91
- if pos > 6:
92
- if pos > 7:
93
- return 1
94
- else:
95
- return 0
96
- else:
97
- return 0
98
-
99
- """#### **3.1.7. "http/https" in Domain name**
100
-
101
- Checks for the presence of "http/https" in the domain part of the URL. The phishers may add the “HTTPS” token to the domain part of a URL in order to trick users.
102
-
103
- If the URL has "http/https" in the domain part, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
104
- """
105
-
106
- # 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
107
- def httpDomain(url):
108
- domain = urlparse(url).netloc
109
- if 'https' in domain:
110
- return 1
111
- else:
112
- return 0
113
-
114
- """#### **3.1.8. Using URL Shortening Services “TinyURL”**
115
-
116
- URL shortening is a method on the “World Wide Web” in which a URL may be made considerably smaller in length and still lead to the required webpage. This is accomplished by means of an “HTTP Redirect” on a domain name that is short, which links to the webpage that has a long URL.
117
-
118
- If the URL is using Shortening Services, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
119
- """
120
-
121
- #listing shortening services
122
- shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
123
- r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
124
- r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
125
- r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
126
- r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
127
- r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
128
- r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
129
- r"tr\.im|link\.zip\.net"
130
-
131
- # 8. Checking for Shortening Services in URL (Tiny_URL)
132
- def tinyURL(url):
133
- match=re.search(shortening_services,url)
134
- if match:
135
- return 1
136
- else:
137
- return 0
138
-
139
- """#### **3.1.9. Prefix or Suffix "-" in Domain**
140
-
141
- Checking the presence of '-' in the domain part of URL. The dash symbol is rarely used in legitimate URLs. Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that users feel that they are dealing with a legitimate webpage.
142
-
143
- If the URL has '-' symbol in the domain part of the URL, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
144
- """
145
-
146
- # 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
147
- def prefixSuffix(url):
148
- if '-' in urlparse(url).netloc:
149
- return 1 # phishing
150
- else:
151
- return 0 # legitimate
152
-
153
- """### **3.2. Domain Based Features:**
154
-
155
- Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.
156
-
157
- * DNS Record
158
- * Website Traffic
159
- * Age of Domain
160
- * End Period of Domain
161
-
162
- Each of these features is explained and coded below:
163
- """
164
-
165
- #!pip install python-whois
166
-
167
- # importing required packages for this section
168
- import re
169
- from bs4 import BeautifulSoup
170
- #import whois
171
- import urllib
172
- import urllib.request
173
- from datetime import datetime
174
-
175
- """#### **3.2.1. DNS Record**
176
-
177
- For phishing websites, either the claimed identity is not recognized by the WHOIS database or no records are found for the hostname.
178
- If the DNS record is empty or not found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
179
- """
180
-
181
- # 11.DNS Record availability (DNS_Record)
182
- # obtained in the featureExtraction function itself
183
-
184
- """#### **3.2.2. Web Traffic**
185
-
186
- This feature measures the popularity of the website by determining the number of visitors and the number of pages they visit. However, since phishing websites live for a short period of time, they may not be recognized by the Alexa database (Alexa the Web Information Company., 1996). By reviewing our dataset, we find that in worst scenarios, legitimate websites ranked among the top 100,000. Furthermore, if the domain has no traffic or is not recognized by the Alexa database, it is classified as “Phishing”.
187
-
188
- If the rank of the domain < 100000, the value of this feature is 1 (phishing) else 0 (legitimate).
189
- """
190
-
191
- # 12.Web traffic (Web_Traffic)
192
- def web_traffic(url):
193
- try:
194
- #Filling the whitespaces in the URL if any
195
- url = urllib.parse.quote(url)
196
- rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
197
- "REACH")['RANK']
198
- rank = int(rank)
199
- except TypeError:
200
- return 1
201
- if rank <100000:
202
- return 1
203
- else:
204
- return 0
205
-
206
- """#### **3.2.3. Age of Domain**
207
-
208
- This feature can be extracted from the WHOIS database. Most phishing websites live for a short period of time. The minimum age of the legitimate domain is considered to be 12 months for this project. Age here is simply the difference between the creation and expiration times.
209
-
210
- If the age of the domain is less than 6 months (the threshold the code below applies), the value of this feature is 1 (phishing) else 0 (legitimate).
211
- """
212
-
213
- # 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
214
- def domainAge(domain_name):
215
- creation_date = domain_name.creation_date
216
- expiration_date = domain_name.expiration_date
217
- if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
218
- try:
219
- creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
220
- expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
221
- except:
222
- return 1
223
- if ((expiration_date is None) or (creation_date is None)):
224
- return 1
225
- elif ((type(expiration_date) is list) or (type(creation_date) is list)):
226
- return 1
227
- else:
228
- ageofdomain = abs((expiration_date - creation_date).days)
229
- if ((ageofdomain/30) < 6):
230
- age = 1
231
- else:
232
- age = 0
233
- return age
234
-
235
- """#### **3.2.4. End Period of Domain**
236
-
237
- This feature can be extracted from the WHOIS database. For this feature, the remaining domain time is calculated by finding the difference between expiration time & current time. The end period considered for the legitimate domain is 6 months or less for this project.
238
-
239
- If end period of domain > 6 months, the value of this feature is 1 (phishing) else 0 (legitimate).
240
- """
241
-
242
- # 14.End time of domain: The difference between termination time and current time (Domain_End)
243
- def domainEnd(domain_name):
244
- expiration_date = domain_name.expiration_date
245
- if isinstance(expiration_date,str):
246
- try:
247
- expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
248
- except:
249
- return 1
250
- if (expiration_date is None):
251
- return 1
252
- elif (type(expiration_date) is list):
253
- return 1
254
- else:
255
- today = datetime.now()
256
- end = abs((expiration_date - today).days)
257
- if ((end/30) < 6):
258
- end = 0
259
- else:
260
- end = 1
261
- return end
262
-
263
- """## **3.3. HTML and JavaScript based Features**
264
-
265
- Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.
266
-
267
- * IFrame Redirection
268
- * Status Bar Customization
269
- * Disabling Right Click
270
- * Website Forwarding
271
-
272
- Each of these features is explained and coded below:
273
- """
274
-
275
- # importing required packages for this section
276
- import requests
277
-
278
- """### **3.3.1. IFrame Redirection**
279
-
280
- IFrame is an HTML tag used to display an additional webpage into one that is currently shown. Phishers can make use of the “iframe” tag and make it invisible i.e. without frame borders. In this regard, phishers make use of the “frameBorder” attribute which causes the browser to render a visual delineation.
281
-
282
- If the iframe is empty or response is not found, then the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
283
- """
284
-
285
- # 15. IFrame Redirection (iFrame)
286
- def iframe(response):
287
- if response == "":
288
- return 1
289
- else:
290
- if re.findall(r"[<iframe>|<frameBorder>]", response.text):
291
- return 0
292
- else:
293
- return 1
294
-
295
- """### **3.3.2. Status Bar Customization**
296
-
297
- Phishers may use JavaScript to show a fake URL in the status bar to users. To extract this feature, we must dig-out the webpage source code, particularly the “onMouseOver” event, and check if it makes any changes on the status bar
298
-
299
- If the response is empty or onmouseover is found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
300
- """
301
-
302
- # 16.Checks the effect of mouse over on status bar (Mouse_Over)
303
- def mouseOver(response):
304
- if response == "" :
305
- return 1
306
- else:
307
- if re.findall("<script>.+onmouseover.+</script>", response.text):
308
- return 1
309
- else:
310
- return 0
311
-
312
- """### **3.3.3. Disabling Right Click**
313
-
314
- Phishers use JavaScript to disable the right-click function, so that users cannot view and save the webpage source code. This feature is treated exactly as “Using onMouseOver to hide the Link”. Nonetheless, for this feature, we will search for event “event.button==2” in the webpage source code and check if the right click is disabled.
315
-
316
- If the response is empty or “event.button==2” is not found, then the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
317
- """
318
-
319
- # 17.Checks the status of the right click attribute (Right_Click)
320
- def rightClick(response):
321
- if response == "":
322
- return 1
323
- else:
324
- if re.findall(r"event.button ?== ?2", response.text):
325
- return 0
326
- else:
327
- return 1
328
-
329
- """### **3.3.4. Website Forwarding**
330
- The fine line that distinguishes phishing websites from legitimate ones is how many times a website has been redirected. In our dataset, we find that legitimate websites have been redirected one time max. On the other hand, phishing websites containing this feature have been redirected at least 4 times.
331
- """
332
-
333
- # 18.Checks the number of forwardings (Web_Forwards)
334
- def forwarding(response):
335
- if response == "":
336
- return 1
337
- else:
338
- if len(response.history) <= 2:
339
- return 0
340
- else:
341
- return 1
342
-
343
- """## **4. Computing URL Features**
344
-
345
- Create a list and a function that calls the other functions and stores all the features of the URL in the list. We will extract the features of each URL and append to this list.
346
- """
347
-
348
- #Function to extract features
349
- def featureExtraction(url):
350
-
351
- features = []
352
- #Address bar based features (10)
353
- #features.append(getDomain(url))
354
- features.append(havingIP(url))
355
- features.append(haveAtSign(url))
356
- features.append(getLength(url))
357
- features.append(getDepth(url))
358
- features.append(redirection(url))
359
- features.append(httpDomain(url))
360
- features.append(tinyURL(url))
361
- features.append(prefixSuffix(url))
362
-
363
- # #Domain based features (4)
364
- # dns = 0
365
- # try:
366
- # domain_name = whois.whois(urlparse(url).netloc)
367
- # except:
368
- # dns = 1
369
-
370
- # features.append(dns)
371
- # features.append(web_traffic(url))
372
- # features.append(1 if dns == 1 else domainAge(domain_name))
373
- # features.append(1 if dns == 1 else domainEnd(domain_name))
374
-
375
-
376
-
377
- return features
378
-
379
- #converting the list to dataframe
380
- feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection',
381
- 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'Label']
382
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
xgboost/__init__.py DELETED
File without changes
xgboost/features.py DELETED
@@ -1,347 +0,0 @@
1
- """
2
- from bs4 import BeautifulSoup
3
-
4
-
5
- with open("mini_dataset/6.html") as f:
6
- test = f.read()
7
-
8
- soup = BeautifulSoup(test, "html.parser")
9
- """
10
-
11
-
12
- # has_title
13
- def has_title(soup):
14
- if soup.title is None:
15
- return 0
16
- if len(soup.title.text) > 0:
17
- return 1
18
- else:
19
- return 0
20
-
21
-
22
- # has_input
23
- def has_input(soup):
24
- if len(soup.find_all("input")):
25
- return 1
26
- else:
27
- return 0
28
-
29
-
30
- # has_button
31
- def has_button(soup):
32
- if len(soup.find_all("button")) > 0:
33
- return 1
34
- else:
35
- return 0
36
-
37
-
38
- # has_image
39
- def has_image(soup):
40
- if len(soup.find_all("image")) == 0:
41
- return 0
42
- else:
43
- return 1
44
-
45
-
46
- # has_submit
47
- def has_submit(soup):
48
- for button in soup.find_all("input"):
49
- if button.get("type") == "submit":
50
- return 1
51
- else:
52
- pass
53
- return 0
54
-
55
-
56
- # has_link
57
- def has_link(soup):
58
- if len(soup.find_all("link")) > 0:
59
- return 1
60
- else:
61
- return 0
62
-
63
-
64
- # has_password
65
- def has_password(soup):
66
- for input in soup.find_all("input"):
67
- if (input.get("type") or input.get("name") or input.get("id")) == "password":
68
- return 1
69
- else:
70
- pass
71
- return 0
72
-
73
-
74
- # has_email_input
75
- def has_email_input(soup):
76
- for input in soup.find_all("input"):
77
- if (input.get("type") or input.get("id") or input.get("name")) == "email":
78
- return 1
79
- else:
80
- pass
81
- return 0
82
-
83
-
84
- # has_hidden_element
85
- def has_hidden_element(soup):
86
- for input in soup.find_all("input"):
87
- if input.get("type") == "hidden":
88
- return 1
89
- else:
90
- pass
91
- return 0
92
-
93
-
94
- # has_audio
95
- def has_audio(soup):
96
- if len(soup.find_all("audio")) > 0:
97
- return 1
98
- else:
99
- return 0
100
-
101
-
102
- # has_video
103
- def has_video(soup):
104
- if len(soup.find_all("video")) > 0:
105
- return 1
106
- else:
107
- return 0
108
-
109
-
110
- # number_of_inputs
111
- def number_of_inputs(soup):
112
- return len(soup.find_all("input"))
113
-
114
-
115
- # number_of_buttons
116
- def number_of_buttons(soup):
117
- return len(soup.find_all("button"))
118
-
119
-
120
- # number_of_images
121
- def number_of_images(soup):
122
- image_tags = len(soup.find_all("image"))
123
- count = 0
124
- for meta in soup.find_all("meta"):
125
- if meta.get("type") or meta.get("name") == "image":
126
- count += 1
127
- return image_tags + count
128
-
129
-
130
- # number_of_option
131
- def number_of_option(soup):
132
- return len(soup.find_all("option"))
133
-
134
-
135
- # number_of_list
136
- def number_of_list(soup):
137
- return len(soup.find_all("li"))
138
-
139
-
140
- # number_of_TH
141
- def number_of_TH(soup):
142
- return len(soup.find_all("th"))
143
-
144
-
145
- # number_of_TR
146
- def number_of_TR(soup):
147
- return len(soup.find_all("tr"))
148
-
149
-
150
- # number_of_href
151
- def number_of_href(soup):
152
- count = 0
153
- for link in soup.find_all("link"):
154
- if link.get("href"):
155
- count += 1
156
- return count
157
-
158
-
159
- # number_of_paragraph
160
- def number_of_paragraph(soup):
161
- return len(soup.find_all("p"))
162
-
163
-
164
- # number_of_script
165
- def number_of_script(soup):
166
- return len(soup.find_all("script"))
167
-
168
-
169
- # length_of_title
170
- def length_of_title(soup):
171
- if soup.title == None:
172
- return 0
173
- return len(soup.title.text)
174
-
175
-
176
- """
177
- print("has_title --> ", has_title(soup))
178
- print("has_input --> ", has_input(soup))
179
- print("has_button --> ", has_button(soup))
180
- print("has_image --> ", has_image(soup))
181
- print("has_submit --> ", has_submit(soup))
182
- print("has_link --> ", has_link(soup))
183
- print("has_password --> ", has_password(soup))
184
- print("has_email_input --> ", has_email_input(soup))
185
- print("has_hidden_element --> ", has_hidden_element(soup))
186
- print("has_audio --> ", has_audio(soup))
187
- print("has_video --> ", has_video(soup))
188
- print("number_of_inputs --> ", number_of_inputs(soup))
189
- print("number_of_buttons --> ", number_of_buttons(soup))
190
- print("number_of_images --> ", number_of_images(soup))
191
- print("number_of_option --> ", number_of_option(soup))
192
- print("number_of_list --> ", number_of_list(soup))
193
- print("number_of_TH --> ", number_of_TH(soup))
194
- print("number_of_TR --> ", number_of_TR(soup))
195
- print("number_of_href --> ", number_of_href(soup))
196
- print("number_of_paragraph --> ", number_of_paragraph(soup))
197
- print("number_of_script --> ", number_of_script(soup))
198
- print("length_of_title --> ", length_of_title(soup))
199
-
200
- """
201
-
202
-
203
- # has h1
204
- def has_h1(soup):
205
- if len(soup.find_all("h1")) > 0:
206
- return 1
207
- else:
208
- return 0
209
-
210
-
211
- # has h2
212
- def has_h2(soup):
213
- if len(soup.find_all("h2")) > 0:
214
- return 1
215
- else:
216
- return 0
217
-
218
-
219
- # has h3
220
- def has_h3(soup):
221
- if len(soup.find_all("h3")) > 0:
222
- return 1
223
- else:
224
- return 0
225
-
226
-
227
- # length of text
228
- def length_of_text(soup):
229
- return len(soup.get_text())
230
-
231
-
232
- # number of clickable button
233
- def number_of_clickable_button(soup):
234
- count = 0
235
- for button in soup.find_all("button"):
236
- if button.get("type") == "button":
237
- count += 1
238
- return count
239
-
240
-
241
- # number of a
242
- def number_of_a(soup):
243
- return len(soup.find_all("a"))
244
-
245
-
246
- # number of img
247
- def number_of_img(soup):
248
- return len(soup.find_all("img"))
249
-
250
-
251
- # number of div class
252
- def number_of_div(soup):
253
- return len(soup.find_all("div"))
254
-
255
-
256
- # number of figures
257
- def number_of_figure(soup):
258
- return len(soup.find_all("figure"))
259
-
260
-
261
- # has footer
262
- def has_footer(soup):
263
- if len(soup.find_all("footer")) > 0:
264
- return 1
265
- else:
266
- return 0
267
-
268
-
269
- # has form
270
- def has_form(soup):
271
- if len(soup.find_all("form")) > 0:
272
- return 1
273
- else:
274
- return 0
275
-
276
-
277
- # has textarea
278
- def has_text_area(soup):
279
- if len(soup.find_all("textarea")) > 0:
280
- return 1
281
- else:
282
- return 0
283
-
284
-
285
- # has iframe
286
- def has_iframe(soup):
287
- if len(soup.find_all("iframe")) > 0:
288
- return 1
289
- else:
290
- return 0
291
-
292
-
293
- # has text input
294
- def has_text_input(soup):
295
- for input in soup.find_all("input"):
296
- if input.get("type") == "text":
297
- return 1
298
- return 0
299
-
300
-
301
- # number of meta
302
- def number_of_meta(soup):
303
- return len(soup.find_all("meta"))
304
-
305
-
306
- # has nav
307
- def has_nav(soup):
308
- if len(soup.find_all("nav")) > 0:
309
- return 1
310
- else:
311
- return 0
312
-
313
-
314
- # has object
315
- def has_object(soup):
316
- if len(soup.find_all("object")) > 0:
317
- return 1
318
- else:
319
- return 0
320
-
321
-
322
- # has picture
323
- def has_picture(soup):
324
- if len(soup.find_all("picture")) > 0:
325
- return 1
326
- else:
327
- return 0
328
-
329
-
330
- # number of sources
331
- def number_of_sources(soup):
332
- return len(soup.find_all("source"))
333
-
334
-
335
- # number of span
336
- def number_of_span(soup):
337
- return len(soup.find_all("span"))
338
-
339
-
340
- # number of table
341
- def number_of_table(soup):
342
- return len(soup.find_all("table"))
343
-
344
-
345
-
346
-
347
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
xgboost_wrapper.py DELETED
@@ -1,246 +0,0 @@
1
- """
2
- XGBoost Model Wrapper
3
- This module provides a safe wrapper around the XGBoost models for phishing detection.
4
- Loads models from Hugging Face Hub: th1enq/xgboost_checkpoint
5
- """
6
-
7
- import os
8
- import sys
9
- import joblib
10
- import pickle
11
- from bs4 import BeautifulSoup
12
- from huggingface_hub import hf_hub_download
13
-
14
- # Add xgboost directory to path for feature extraction modules
15
- xgboost_dir = os.path.join(os.path.dirname(__file__), 'xgboost')
16
- sys.path.append(xgboost_dir)
17
-
18
- try:
19
- import features as fe
20
- from URLFeatureExtraction import featureExtraction
21
- XGBOOST_AVAILABLE = True
22
- except ImportError as e:
23
- print(f"XGBoost modules not available: {e}")
24
- XGBOOST_AVAILABLE = False
25
-
26
- def load_model_from_hub(repo_id, filename):
27
- """Load model from Hugging Face Hub"""
28
- try:
29
- # Download model from Hugging Face Hub
30
- model_path = hf_hub_download(repo_id=repo_id, filename=filename)
31
-
32
- # Try different loading methods to handle version compatibility
33
- try:
34
- return joblib.load(model_path)
35
- except Exception as e1:
36
- try:
37
- # Try with pickle
38
- with open(model_path, 'rb') as f:
39
- return pickle.load(f)
40
- except Exception as e2:
41
- print(f"Failed to load model {filename} from {repo_id}: {e1}")
42
- print(f"Pickle fallback failed: {e2}")
43
- return None
44
-
45
- except Exception as e:
46
- print(f"Failed to download model {filename} from {repo_id}: {e}")
47
- return None
48
-
49
- def load_model_safe(model_path):
50
- """Safely load a local model, handling version compatibility issues"""
51
- try:
52
- # Try loading with joblib first
53
- return joblib.load(model_path)
54
- except Exception as e1:
55
- try:
56
- # Try loading with pickle
57
- with open(model_path, 'rb') as f:
58
- return pickle.load(f)
59
- except Exception as e2:
60
- print(f"Failed to load model {model_path}")
61
- print(f"Joblib error: {e1}")
62
- print(f"Pickle error: {e2}")
63
- return None
64
-
65
- def extract_features_from_html(html_content):
66
- """Extract features from HTML content for phishing detection"""
67
- if not XGBOOST_AVAILABLE:
68
- return None
69
-
70
- try:
71
- soup = BeautifulSoup(html_content, "html.parser")
72
-
73
- features = [
74
- fe.has_title(soup),
75
- fe.has_input(soup),
76
- fe.has_button(soup),
77
- fe.has_image(soup),
78
- fe.has_submit(soup),
79
- fe.has_link(soup),
80
- fe.has_password(soup),
81
- fe.has_email_input(soup),
82
- fe.has_hidden_element(soup),
83
- fe.has_audio(soup),
84
- fe.has_video(soup),
85
- fe.number_of_inputs(soup),
86
- fe.number_of_buttons(soup),
87
- fe.number_of_images(soup),
88
- fe.number_of_option(soup),
89
- fe.number_of_list(soup),
90
- fe.number_of_TH(soup),
91
- fe.number_of_TR(soup),
92
- fe.number_of_href(soup),
93
- fe.number_of_paragraph(soup),
94
- fe.number_of_script(soup),
95
- fe.length_of_title(soup),
96
- fe.has_h1(soup),
97
- fe.has_h2(soup),
98
- fe.has_h3(soup),
99
- fe.length_of_text(soup),
100
- fe.number_of_clickable_button(soup),
101
- fe.number_of_a(soup),
102
- fe.number_of_img(soup),
103
- fe.number_of_div(soup),
104
- fe.number_of_figure(soup),
105
- fe.has_footer(soup),
106
- fe.has_form(soup),
107
- fe.has_text_area(soup),
108
- fe.has_iframe(soup),
109
- fe.has_text_input(soup),
110
- fe.number_of_meta(soup),
111
- fe.has_nav(soup),
112
- fe.has_object(soup),
113
- fe.has_picture(soup),
114
- fe.number_of_sources(soup),
115
- fe.number_of_span(soup),
116
- fe.number_of_table(soup)
117
- ]
118
-
119
- return features
120
- except Exception as e:
121
- print(f"Error extracting HTML features: {e}")
122
- return [0] * 43
123
-
124
- def extract_features_from_url(url):
125
- """Extract features from URL for phishing detection"""
126
- if not XGBOOST_AVAILABLE:
127
- return None
128
-
129
- try:
130
- return featureExtraction(url)
131
- except Exception as e:
132
- print(f"Error extracting URL features: {e}")
133
- return None
134
-
135
- class XGBoostPhishingDetector:
136
- def __init__(self):
137
- self.html_model = None
138
- self.url_model = None
139
- self.available = XGBOOST_AVAILABLE
140
-
141
- if self.available:
142
- self._load_models()
143
-
144
- def _load_models(self):
145
- """Load the XGBoost models from Hugging Face Hub"""
146
- repo_id = "th1enq/xgboost_checkpoint"
147
-
148
- # Try to load from Hugging Face Hub first
149
- print("🔄 Loading XGBoost models from Hugging Face Hub...")
150
-
151
- self.html_model = load_model_from_hub(repo_id, 'xgboost_html.joblib')
152
- if self.html_model:
153
- print("✅ HTML XGBoost model loaded from Hugging Face Hub")
154
- else:
155
- print("❌ Failed to load HTML XGBoost model from Hugging Face Hub")
156
- # Fallback to local file
157
- html_model_path = os.path.join(xgboost_dir, 'xgboost_html.joblib')
158
- if os.path.exists(html_model_path):
159
- self.html_model = load_model_safe(html_model_path)
160
- print("✅ HTML XGBoost model loaded from local file")
161
-
162
- self.url_model = load_model_from_hub(repo_id, 'xgboost_url.joblib')
163
- if self.url_model:
164
- print("✅ URL XGBoost model loaded from Hugging Face Hub")
165
- else:
166
- print("❌ Failed to load URL XGBoost model from Hugging Face Hub")
167
- # Fallback to local file
168
- url_model_path = os.path.join(xgboost_dir, 'xgboost_url.joblib')
169
- if os.path.exists(url_model_path):
170
- self.url_model = load_model_safe(url_model_path)
171
- print("✅ URL XGBoost model loaded from local file")
172
-
173
- def predict_html(self, html_content):
174
- """Predict phishing from HTML content"""
175
- if not self.available or not self.html_model:
176
- return None
177
-
178
- features = extract_features_from_html(html_content)
179
- if features is None:
180
- return None
181
-
182
- try:
183
- prediction = self.html_model.predict([features])[0]
184
- probability = self.html_model.predict_proba([features])[0] if hasattr(self.html_model, 'predict_proba') else [1-prediction, prediction]
185
- return {
186
- 'prediction': int(prediction),
187
- 'probability': probability,
188
- 'is_phishing': prediction == 1
189
- }
190
- except Exception as e:
191
- print(f"Error predicting HTML: {e}")
192
- return None
193
-
194
- def predict_url(self, url):
195
- """Predict phishing from URL"""
196
- if not self.available or not self.url_model:
197
- return None
198
-
199
- features = extract_features_from_url(url)
200
- if features is None:
201
- return None
202
-
203
- try:
204
- prediction = self.url_model.predict([features])[0]
205
- probability = self.url_model.predict_proba([features])[0] if hasattr(self.url_model, 'predict_proba') else [1-prediction, prediction]
206
- return {
207
- 'prediction': int(prediction),
208
- 'probability': probability,
209
- 'is_phishing': prediction == 1
210
- }
211
- except Exception as e:
212
- print(f"Error predicting URL: {e}")
213
- return None
214
-
215
- def predict_combined(self, url, html_content=None, url_weight=0.3, html_weight=0.7):
216
- """Predict using both URL and HTML analysis"""
217
- url_result = self.predict_url(url)
218
- html_result = None
219
-
220
- if html_content:
221
- html_result = self.predict_html(html_content)
222
-
223
- if url_result and html_result:
224
- # Combine predictions
225
- combined_prob = [
226
- url_weight * url_result['probability'][0] + html_weight * html_result['probability'][0],
227
- url_weight * url_result['probability'][1] + html_weight * html_result['probability'][1]
228
- ]
229
- combined_prediction = 1 if combined_prob[1] > combined_prob[0] else 0
230
-
231
- return {
232
- 'prediction': combined_prediction,
233
- 'probability': combined_prob,
234
- 'is_phishing': combined_prediction == 1,
235
- 'url_result': url_result,
236
- 'html_result': html_result
237
- }
238
- elif url_result:
239
- return url_result
240
- elif html_result:
241
- return html_result
242
- else:
243
- return None
244
-
245
- # Global instance
246
- xgboost_detector = XGBoostPhishingDetector()