Phishing-Detector

Sleeping

App Files Files Community

th1enq committed 20 days ago

Commit

619e0de

1 Parent(s): d82d422

remove xgboost

Browse files

Files changed (5) hide show

app.py +205 -155
xgboost/URLFeatureExtraction.py +0 -382
xgboost/__init__.py +0 -0
xgboost/features.py +0 -347
xgboost_wrapper.py +0 -246

app.py CHANGED Viewed

@@ -10,24 +10,7 @@ from bs4 import BeautifulSoup
 import time
 import joblib
-# Try to import XGBoost wrapper, handle gracefully if not available
-try:
-    from xgboost_wrapper import xgboost_detector
-    XGBOOST_AVAILABLE = True
-except Exception as e:
-    print(f"XGBoost wrapper not available: {e}")
-    XGBOOST_AVAILABLE = False
-    # Create a dummy detector
-    class DummyDetector:
-        def __init__(self):
-            self.available = False
-        def predict_combined(self, *args, **kwargs):
-            return None
-        def predict_url(self, *args, **kwargs):
-            return None
-        def predict_html(self, *args, **kwargs):
-            return None
-    xgboost_detector = DummyDetector()
 # --- import your architecture ---
 # Make sure this file is in the repo (e.g., models/deberta_lstm_classifier.py)
@@ -170,9 +153,6 @@ def predict_fn(text: str):
             # Get prediction for HTML content
             html_probs, html_tokens, html_has_attention, html_attention = predict_single_text(html_content, "HTML")
-            # Get XGBoost predictions
-            xgb_result = xgboost_detector.predict_combined(url, html_content)
             # Combine predictions
             combined_probs = combine_predictions(url_probs, html_probs)
@@ -185,12 +165,6 @@ def predict_fn(text: str):
             analysis_type = "Combined URL + HTML Analysis"
             fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
-            # Add XGBoost analysis if available
-            if xgb_result:
-                analysis_type += " + XGBoost"
-                xgb_info = f" | XGBoost: {'Phishing' if xgb_result['is_phishing'] else 'Benign'} ({xgb_result['probability'][1]:.1%})"
-                fetch_status += xgb_info
         else:
             # Fallback to URL-only analysis
             probs = url_probs
@@ -198,17 +172,8 @@ def predict_fn(text: str):
             has_attention = url_has_attention
             attention_weights = url_attention
-            # Get XGBoost URL prediction
-            xgb_result = xgboost_detector.predict_url(url)
             analysis_type = "URL-only Analysis"
             fetch_status = f"⚠️ Could not fetch HTML content: {status}"
-            # Add XGBoost analysis if available
-            if xgb_result:
-                analysis_type += " + XGBoost"
-                xgb_info = f" | XGBoost: {'Phishing' if xgb_result['is_phishing'] else 'Benign'} ({xgb_result['probability'][1]:.1%})"
-                fetch_status += xgb_info
     else:
         # Process as regular text
         probs, tokens, has_attention, attention_weights = predict_single_text(text, "text")
@@ -367,7 +332,7 @@ def predict_fn(text: str):
     return prediction_result, detailed_analysis
 # --------- BERT Model Functions ----------
-def predict_bert_single_text(text):
     """Predict for a single text input using BERT."""
     # Tokenize
     inputs = bert_tokenizer(
@@ -381,94 +346,77 @@ def predict_bert_single_text(text):
     inputs = {k: v.to(device) for k, v in inputs.items()}
     with torch.no_grad():
-        logits = bert_model(**inputs).logits
     probs = F.softmax(logits, dim=-1).squeeze(0).tolist()
-    return probs
-def predict_bert_interface_fn(text: str):
-    """Gradio interface function for BERT model."""
-    if not text or not text.strip():
-        return {"error": "Please enter a URL or text."}, ""
-    probs = predict_bert_single_text(text)
-    # Create detailed analysis
-    predicted_class = "phishing" if probs[1] > probs[0] else "benign"
-    confidence = max(probs)
-    detailed_analysis = f"""
-<div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
-<div style="background: linear-gradient(135deg, {'#8b0000' if predicted_class == 'phishing' else '#006400'} 0%, {'#dc143c' if predicted_class == 'phishing' else '#228b22'} 100%); padding: 25px; border-radius: 20px; color: white; text-align: center; margin-bottom: 20px; box-shadow: 0 8px 32px rgba(0,0,0,0.5); border: 2px solid {'#ff4444' if predicted_class == 'phishing' else '#44ff44'};">
-    <h2 style="margin: 0 0 10px 0; font-size: 28px; color: white;">🔍 BERT Model Analysis</h2>
-    <div style="font-size: 36px; font-weight: bold; margin: 10px 0; color: white;">
-        {predicted_class.upper()}
-    </div>
-    <div style="font-size: 18px, color: #f0f0f0;">
-        Confidence: {confidence:.1%}
-    </div>
-</div>
-<div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
-    <h3 style="color: #ffffff; margin-bottom: 15px;"> Prediction Confidence</h3>
-    <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
-        <span style="font-weight: bold; color: #ff4444;">Phishing</span>
-        <span style="font-weight: bold; color: #44ff44;">Benign</span>
-    </div>
-    <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
-        <div style="width: {probs[1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
-            {probs[1]:.1%}
-        </div>
-    </div>
-    <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
-        Benign: {probs[0]:.1%}
-    </div>
-</div>
-</div>
-"""
-    # Build label->prob mapping for Gradio Label output
-    if len(LABELS) == len(probs):
-        prediction_result = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
-    else:
-        prediction_result = {f"class_{i}": float(p) for i, p in enumerate(probs)}
-    return prediction_result, detailed_analysis
-# --------- XGBoost Interface Function ----------
-def predict_xgboost_interface_fn(text: str):
-    """Gradio interface function for XGBoost models."""
     if not text or not text.strip():
         return {"error": "Please enter a URL or text."}, ""
-    if not xgboost_detector.available:
-        return {"benign": 0.5, "phishing": 0.5}, "XGBoost models are not properly loaded."
     # Check if input is URL
     if is_url(text.strip()):
         url = text.strip()
         # Try to fetch HTML content
         html_content, status = fetch_html_content(url)
         if html_content:
-            result = xgboost_detector.predict_combined(url, html_content)
-            analysis_type = "Combined URL + HTML XGBoost Analysis"
             fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
         else:
-            result = xgboost_detector.predict_url(url)
-            analysis_type = "URL-only XGBoost Analysis"
             fetch_status = f"⚠️ Could not fetch HTML content: {status}"
     else:
-        # For text input, treat as HTML content
-        result = xgboost_detector.predict_html(text)
-        analysis_type = "HTML Content XGBoost Analysis"
         fetch_status = ""
-    if not result:
-        return {"benign": 0.5, "phishing": 0.5}, "Failed to get prediction from XGBoost models."
-    predicted_class = "phishing" if result['is_phishing'] else "benign"
-    confidence = max(result['probability'])
     detailed_analysis = f"""
 <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
@@ -480,6 +428,9 @@ def predict_xgboost_interface_fn(text: str):
     <div style="font-size: 18px; color: #f0f0f0;">
         Confidence: {confidence:.1%}
     </div>
 </div>
 """
@@ -490,59 +441,132 @@ def predict_xgboost_interface_fn(text: str):
 </div>
 """
-    # Show detailed XGBoost results
-    detailed_analysis += f"""
-<div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
-    <h3 style="color: #ffffff; margin-bottom: 15px;">🎯 XGBoost Prediction Confidence</h3>
     <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
         <span style="font-weight: bold; color: #ff4444;">Phishing</span>
         <span style="font-weight: bold; color: #44ff44;">Benign</span>
     </div>
     <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
-        <div style="width: {result['probability'][1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
-            {result['probability'][1]:.1%}
         </div>
     </div>
     <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
-        Benign: {result['probability'][0]:.1%}
     </div>
 </div>
 """
-    # Show component analysis if available
-    if 'url_result' in result and 'html_result' in result:
         detailed_analysis += f"""
 <div style="background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
-    <h3 style="margin: 0 0 15px 0; color: white;">🔬 Component Analysis</h3>
-    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px;">
-        <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
-            <div style="font-size: 18px; font-weight: bold; color: white;">URL Analysis</div>
-            <div style="font-size: 24px; font-weight: bold; color: {'#ff6666' if result['url_result']['is_phishing'] else '#66ff66'};">
-                {'Phishing' if result['url_result']['is_phishing'] else 'Benign'}
-            </div>
-            <div style="font-size: 14px; color: #e0e0e0;">{result['url_result']['probability'][1]:.1%} phishing</div>
         </div>
-        <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
-            <div style="font-size: 18px; font-weight: bold; color: white;">HTML Analysis</div>
-            <div style="font-size: 24px; font-weight: bold; color: {'#ff6666' if result['html_result']['is_phishing'] else '#66ff66'};">
-                {'Phishing' if result['html_result']['is_phishing'] else 'Benign'}
-            </div>
-            <div style="font-size: 14px; color: #e0e0e0;">{result['html_result']['probability'][1]:.1%} phishing</div>
         </div>
     </div>
 </div>
 """
     detailed_analysis += "</div>"
     # Build label->prob mapping for Gradio Label output
-    if len(LABELS) == len(result['probability']):
-        prediction_result = {LABELS[i]: float(result['probability'][i]) for i in range(len(LABELS))}
     else:
-        prediction_result = {f"class_{i}": float(p) for i, p in enumerate(result['probability'])}
     return prediction_result, detailed_analysis
 # --------- Gradio UI ----------
 deberta_interface = gr.Interface(
     fn=predict_fn,
@@ -613,44 +637,70 @@ bert_interface = gr.Interface(
     inputs=gr.Textbox(label="URL or text", placeholder="Example: http://suspicious-site.example or paste any text"),
     outputs=[
         gr.Label(label="Prediction result"),
-        gr.Markdown(label="Detailed analysis")
     ],
     title="Phishing Detector (BERT)",
-    description="Enter a URL or text for analysis using the BERT model.",
-    examples=[
-        ["http://rendmoiunserviceeee.com"],
-        ["https://www.google.com"],
-        ["Dear customer, your account has been suspended. Click here to verify your identity immediately."],
-    ]
-)
-xgboost_interface = gr.Interface(
-    fn=predict_xgboost_interface_fn,
-    inputs=gr.Textbox(label="URL or HTML content", placeholder="Example: http://suspicious-site.example or paste HTML content"),
-    outputs=[
-        gr.Label(label="Prediction result"),
-        gr.Markdown(label="Detailed analysis")
-    ],
-    title="Phishing Detector (XGBoost)",
     description="""
-    Enter a URL or HTML content for analysis using XGBoost models.
     **Features:**
-    - **URL Feature Analysis**: Extracts 30+ features from URL structure
-    - **HTML Feature Analysis**: Extracts 43+ features from HTML content
-    - **Combined Analysis**: For URLs, combines both URL and HTML features
-    - **Fast Prediction**: Traditional ML approach for quick results
     """,
     examples=[
         ["http://rendmoiunserviceeee.com"],
         ["https://www.google.com"],
         ["http://paypaI-security-update.net/login"],
-        ["<html><head><title>Urgent Security Alert</title></head><body><form><input type='password'></form></body></html>"],
-    ]
 )
 demo = gr.TabbedInterface(
-    [deberta_interface, bert_interface, xgboost_interface],
-    ["DeBERTa + LSTM", "BERT", "XGBoost"]
 )
 if __name__ == "__main__":

 import time
 import joblib
 # --- import your architecture ---
 # Make sure this file is in the repo (e.g., models/deberta_lstm_classifier.py)
             # Get prediction for HTML content
             html_probs, html_tokens, html_has_attention, html_attention = predict_single_text(html_content, "HTML")
             # Combine predictions
             combined_probs = combine_predictions(url_probs, html_probs)
             analysis_type = "Combined URL + HTML Analysis"
             fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
         else:
             # Fallback to URL-only analysis
             probs = url_probs
             has_attention = url_has_attention
             attention_weights = url_attention
             analysis_type = "URL-only Analysis"
             fetch_status = f"⚠️ Could not fetch HTML content: {status}"
     else:
         # Process as regular text
         probs, tokens, has_attention, attention_weights = predict_single_text(text, "text")
     return prediction_result, detailed_analysis
 # --------- BERT Model Functions ----------
+def predict_bert_single_text(text, text_type="text"):
     """Predict for a single text input using BERT."""
     # Tokenize
     inputs = bert_tokenizer(
     inputs = {k: v.to(device) for k, v in inputs.items()}
     with torch.no_grad():
+        outputs = bert_model(**inputs, output_attentions=True)
+        logits = outputs.logits
     probs = F.softmax(logits, dim=-1).squeeze(0).tolist()
+    # Get tokens for visualization
+    tokens = bert_tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze(0).tolist())
+    # Get attention weights (last layer, averaged across all heads, as an approximation of token importance)
+    attention_weights = None
+    has_attention = False
+    if hasattr(outputs, 'attentions') and outputs.attentions is not None:
+        # Average attention across all heads in the last layer
+        last_layer_attention = outputs.attentions[-1]  # Last layer
+        attention_weights = last_layer_attention.mean(dim=1).squeeze(0)  # Average across heads
+        # Use attention to [CLS] token as importance scores
+        attention_weights = attention_weights[0]  # [CLS] token attention to all tokens
+        has_attention = True
+    return probs, tokens, has_attention, attention_weights
+def predict_bert_interface_fn(text: str):
+    """Gradio interface function for BERT model."""
     if not text or not text.strip():
         return {"error": "Please enter a URL or text."}, ""
     # Check if input is URL
     if is_url(text.strip()):
+        # Process URL
         url = text.strip()
+        # Get prediction for URL itself
+        url_probs, url_tokens, url_has_attention, url_attention = predict_bert_single_text(url, "URL")
         # Try to fetch HTML content
         html_content, status = fetch_html_content(url)
         if html_content:
+            # Get prediction for HTML content
+            html_probs, html_tokens, html_has_attention, html_attention = predict_bert_single_text(html_content, "HTML")
+            # Combine predictions
+            combined_probs = combine_predictions(url_probs, html_probs)
+            # Use combined probabilities but show analysis for both
+            probs = combined_probs
+            tokens = url_tokens + ["[SEP]"] + html_tokens[:50]  # Limit HTML tokens for display
+            has_attention = url_has_attention or html_has_attention
+            attention_weights = url_attention if url_has_attention else html_attention
+            analysis_type = "Combined URL + HTML BERT Analysis"
             fetch_status = f"✅ Successfully fetched HTML content (Status: {status})"
         else:
+            # Fallback to URL-only analysis
+            probs = url_probs
+            tokens = url_tokens
+            has_attention = url_has_attention
+            attention_weights = url_attention
+            analysis_type = "URL-only BERT Analysis"
             fetch_status = f"⚠️ Could not fetch HTML content: {status}"
     else:
+        # Process as regular text
+        probs, tokens, has_attention, attention_weights = predict_bert_single_text(text, "text")
+        analysis_type = "BERT Text Analysis"
         fetch_status = ""
+    # Create detailed analysis
+    predicted_class = "phishing" if probs[1] > probs[0] else "benign"
+    confidence = max(probs)
     detailed_analysis = f"""
 <div style="font-family: Arial, sans-serif; max-width: 800px; margin: 0 auto; background: #1e1e1e; padding: 20px; border-radius: 15px;">
     <div style="font-size: 18px; color: #f0f0f0;">
         Confidence: {confidence:.1%}
     </div>
+    <div style="margin-top: 15px; font-size: 14px; color: #e0e0e0;">
+        {'This appears to be a phishing attempt!' if predicted_class == 'phishing' else '✅ This appears to be legitimate content.'}
+    </div>
 </div>
 """
 </div>
 """
+    if has_attention and attention_weights is not None:
+        attention_scores = attention_weights.squeeze(0).tolist() if attention_weights.dim() > 1 else attention_weights.tolist()
+        token_analysis = []
+        for i, (token, score) in enumerate(zip(tokens, attention_scores)):
+            # More lenient filtering - include more tokens for text analysis
+            if token not in ['[CLS]', '[SEP]', '[PAD]', '<s>', '</s>'] and len(token.strip()) > 0 and score > 0.005:
+                clean_token = token.replace('▁', '').replace('Ġ', '').strip()  # Handle different tokenizer prefixes
+                if clean_token:  # Only add if token has content after cleaning
+                    token_analysis.append({
+                        'token': clean_token,
+                        'importance': score,
+                        'position': i
+                    })
+        # Sort by importance
+        token_analysis.sort(key=lambda x: x['importance'], reverse=True)
+        detailed_analysis += f"""
+## Top important tokens:
+<div style="background: #2d2d2d; padding: 15px; border-radius: 10px; margin: 15px 0; border-left: 4px solid #4caf50; color: #e0e0e0;">
+    <strong>Analysis Info:</strong> Found {len(token_analysis)} important tokens out of {len(tokens)} total tokens
+</div>
+<div style="font-family: Arial, sans-serif;">
+"""
+        for i, token_info in enumerate(token_analysis[:10]):  # Top 10 tokens
+            bar_width = int(token_info['importance'] * 100)
+            color = "#ff4444" if predicted_class == "phishing" else "#44ff44"
+            detailed_analysis += f"""
+<div style="margin: 8px 0; display: flex; align-items: center; background: #2d2d2d; padding: 8px; border-radius: 8px; border-left: 4px solid {color};">
+    <div style="width: 30px; text-align: right; margin-right: 10px; font-weight: bold; color: #ffffff;">
+        {i+1}.
+    </div>
+    <div style="width: 120px; margin-right: 10px; font-weight: bold; color: #e0e0e0; text-align: right;">
+        {token_info['token']}
+    </div>
+    <div style="width: 300px; background-color: #404040; border-radius: 10px; overflow: hidden; margin-right: 10px; border: 1px solid #555;">
+        <div style="width: {bar_width}%; background-color: {color}; height: 20px; border-radius: 10px; transition: width 0.3s ease;"></div>
+    </div>
+    <div style="color: #cccccc; font-size: 12px; font-weight: bold;">
+        {token_info['importance']:.1%}
+    </div>
+</div>
+"""
+        detailed_analysis += "</div>\n"
+        detailed_analysis += f"""
+## Detailed analysis:
+<div style="font-family: Arial, sans-serif; background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
+    <h3 style="margin: 0 0 15px 0; color: white;">Statistical Overview</h3>
+    <div style="display: grid; grid-template-columns: repeat(2, 1fr); gap: 15px;">
+        <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
+            <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])}</div>
+            <div style="font-size: 14px; color: #e0e0e0;">Total tokens</div>
+        </div>
+        <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; border: 1px solid rgba(255,255,255,0.2);">
+            <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in token_analysis if t['importance'] > 0.05])}</div>
+            <div style="font-size: 14px; color: #e0e0e0;">High impact tokens (>5%)</div>
+        </div>
+    </div>
+</div>
+<div style="font-family: Arial, sans-serif; margin: 15px 0; background: #2d2d2d; padding: 20px; border-radius: 15px; border: 1px solid #555;">
+    <h3 style="color: #ffffff; margin-bottom: 15px;"> Prediction Confidence</h3>
     <div style="display: flex; justify-content: space-between; margin-bottom: 10px;">
         <span style="font-weight: bold; color: #ff4444;">Phishing</span>
         <span style="font-weight: bold; color: #44ff44;">Benign</span>
     </div>
     <div style="width: 100%; background-color: #404040; border-radius: 25px; overflow: hidden; height: 30px; border: 1px solid #666;">
+        <div style="width: {probs[1]*100:.1f}%; background: linear-gradient(90deg, #ff4444 0%, #ff6666 100%); height: 100%; display: flex; align-items: center; justify-content: center; color: white; font-weight: bold; font-size: 14px;">
+            {probs[1]:.1%}
         </div>
     </div>
     <div style="margin-top: 10px; text-align: center; color: #cccccc; font-size: 14px;">
+        Benign: {probs[0]:.1%}
     </div>
 </div>
 """
+    else:
+        # Fallback analysis without attention weights
         detailed_analysis += f"""
 <div style="background: linear-gradient(135deg, #1a237e 0%, #3949ab 100%); padding: 20px; border-radius: 15px; color: white; margin: 15px 0; border: 2px solid #3f51b5;">
+    <h3 style="margin: 0 0 15px 0; color: white;">Basic Analysis</h3>
+    <div style="display: grid; grid-template-columns: repeat(3, 1fr); gap: 15px;">
+        <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
+            <div style="font-size: 24px; font-weight: bold; color: white;">{probs[1]:.1%}</div>
+            <div style="font-size: 14px; color: #e0e0e0;">Phishing</div>
         </div>
+        <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
+            <div style="font-size: 24px; font-weight: bold; color: white;">{probs[0]:.1%}</div>
+            <div style="font-size: 14px; color: #e0e0e0;">Benign</div>
         </div>
+        <div style="background: rgba(255,255,255,0.1); padding: 15px; border-radius: 10px; text-align: center; border: 1px solid rgba(255,255,255,0.2);">
+            <div style="font-size: 24px; font-weight: bold; color: white;">{len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])}</div>
+            <div style="font-size: 14px; color: #e0e0e0;">Tokens</div>
+        </div>
+    </div>
+</div>
+<div style="background: #2d2d2d; padding: 20px; border-radius: 15px; margin: 15px 0; border: 1px solid #555;">
+    <h3 style="color: #ffffff; margin: 0 0 15px 0;">🔤 Tokens in text:</h3>
+    <div style="display: flex; flex-wrap: wrap; gap: 8px;">""" + ''.join([f'<span style="background: #404040; color: #64b5f6; padding: 4px 8px; border-radius: 15px; font-size: 12px; border: 1px solid #666;">{token.replace("▁", "")}</span>' for token in tokens if token not in ['[CLS]', '[SEP]', '[PAD]']]) + f"""</div>
+    <div style="margin-top: 15px; padding: 10px; background: #3d2914; border-radius: 8px; border-left: 4px solid #ff9800;">
+        <strong style="color: #ffcc02;">Debug info:</strong> <span style="color: #e0e0e0;">Found {len(tokens)} total tokens, {len([t for t in tokens if t not in ['[CLS]', '[SEP]', '[PAD]']])} content tokens</span>
     </div>
 </div>
+<div style="background: #3d2914; padding: 15px; border-radius: 10px; border-left: 4px solid #ff9800; margin: 15px 0;">
+    <p style="margin: 0; color: #ffcc02; font-size: 14px;">
+        <strong>Note:</strong> Detailed attention weights analysis is not available for the current model.
+    </p>
+</div>
 """
     detailed_analysis += "</div>"
     # Build label->prob mapping for Gradio Label output
+    if len(LABELS) == len(probs):
+        prediction_result = {LABELS[i]: float(probs[i]) for i in range(len(LABELS))}
     else:
+        prediction_result = {f"class_{i}": float(p) for i, p in enumerate(probs)}
     return prediction_result, detailed_analysis
 # --------- Gradio UI ----------
 deberta_interface = gr.Interface(
     fn=predict_fn,
     inputs=gr.Textbox(label="URL or text", placeholder="Example: http://suspicious-site.example or paste any text"),
     outputs=[
         gr.Label(label="Prediction result"),
+        gr.Markdown(label="Detailed token analysis")
     ],
     title="Phishing Detector (BERT)",
     description="""
+    Enter a URL or text for analysis using the BERT model.
     **Features:**
+    - **URL Analysis**: For URLs, the system will fetch HTML content and combine both URL and content analysis
+    - **Combined Prediction**: Uses weighted combination of URL structure and webpage content analysis
+    - **Visual Analysis**: Predict phishing/benign probability with visual charts
+    - **Token Importance**: Display the most important tokens in classification using attention weights
+    - **Detailed Insights**: Comprehensive analysis of the impact of each token
+    - **Dark Theme**: Beautiful interface with colorful charts optimized for dark themes
+    **How it works for URLs:**
+    1. Analyze the URL structure itself
+    2. Fetch the webpage HTML content
+    3. Analyze the webpage content
+    4. Combine both results for final prediction (30% URL + 70% content)
     """,
     examples=[
         ["http://rendmoiunserviceeee.com"],
         ["https://www.google.com"],
+        ["Dear customer, your account has been suspended. Click here to verify your identity immediately."],
+        ["https://mail-secure-login-verify.example/path?token=suspicious"],
         ["http://paypaI-security-update.net/login"],
+        ["Your package has been delivered successfully. Thank you for using our service."],
+        ["https://github.com/user/repo"]
+    ],
+    theme=gr.themes.Soft(),
+    css="""
+    .gradio-container {
+        font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+        background-color: #1e1e1e !important;
+        color: #ffffff !important;
+    }
+    .dark .gradio-container {
+        background-color: #1e1e1e !important;
+    }
+    /* Dark theme for all components */
+    .block {
+        background-color: #2d2d2d !important;
+        border: 1px solid #444 !important;
+    }
+    .gradio-textbox {
+        background-color: #3d3d3d !important;
+        color: #ffffff !important;
+        border: 1px solid #666 !important;
+    }
+    .gradio-button {
+        background-color: #4a4a4a !important;
+        color: #ffffff !important;
+        border: 1px solid #666 !important;
+    }
+    .gradio-button:hover {
+        background-color: #5a5a5a !important;
+    }
+    """
 )
 demo = gr.TabbedInterface(
+    [deberta_interface, bert_interface],
+    ["DeBERTa + LSTM", "BERT"]
 )
 if __name__ == "__main__":

xgboost/URLFeatureExtraction.py DELETED Viewed

@@ -1,382 +0,0 @@
-# -*- coding: utf-8 -*-
-# importing required packages for this section
-from urllib.parse import urlparse,urlencode
-import ipaddress
-import re
-"""#### **3.1.1. Domain of the URL**
-Here, we are just extracting the domain present in the URL. This feature doesn't have much significance in the training. May even be dropped while training the model.
-"""
-'''
-# 1.Domain of the URL (Domain)
-def getDomain(url):
-  domain = urlparse(url).netloc
-  if re.match(r"^www.",domain):
-	       domain = domain.replace("www.","")
-  return domain'''
-"""#### **3.1.2. IP Address in the URL**
-Checks for the presence of IP address in the URL. URLs may have IP address instead of domain name. If an IP address is used as an alternative of the domain name in the URL, we can be sure that someone is trying to steal personal information with this URL.
-If the domain part of URL has IP address, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 2.Checks for IP address in URL (Have_IP)
-def havingIP(url):
-  """Return 1 if the URL's host is a literal IP address, otherwise 0.
-
-  The whole string is tried first (so a bare address such as "10.0.0.1"
-  still counts), then the hostname parsed out of the URL. Previously only
-  the whole string was tried, which can never succeed for a real URL.
-  NOTE(review): this changes the feature value for IP-based URLs; re-check
-  any model trained on the old extraction.
-  """
-  candidates = [url]
-  if isinstance(url, str):
-    try:
-      host = urlparse(url).hostname
-    except ValueError:
-      # urlparse rejects malformed bracketed (IPv6-style) hosts
-      host = None
-    if host:
-      candidates.append(host)
-  for candidate in candidates:
-    try:
-      ipaddress.ip_address(candidate)
-    except ValueError:
-      continue
-    return 1
-  return 0
-"""#### **3.1.3. "@" Symbol in URL**
-Checks for the presence of '@' symbol in the URL. Using “@” symbol in the URL leads the browser to ignore everything preceding the “@” symbol and the real address often follows the “@” symbol.
-If the URL has '@' symbol, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 3.Checks the presence of @ in URL (Have_At)
-def haveAtSign(url):
-  """Return 1 when the URL contains an "@" character, otherwise 0."""
-  return 1 if "@" in url else 0
-"""#### **3.1.4. Length of URL**
-Computes the length of the URL. Phishers can use a long URL to hide the doubtful part in the address bar. In this project, if the length of the URL is greater than or equal to 54 characters then the URL is classified as phishing, otherwise legitimate.
-If the length of URL >= 54 , the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 4.Finding the length of URL and categorizing (URL_Length)
-def getLength(url):
-  """Return 1 for URLs of 54 characters or more, 0 for shorter ones."""
-  return 0 if len(url) < 54 else 1
-"""#### **3.1.5. Depth of URL**
-Computes the depth of the URL. This feature calculates the number of sub pages in the given url based on the '/'.
-The value of feature is a numerical based on the URL.
-"""
-# 5.Gives number of '/' in URL (URL_Depth)
-def getDepth(url):
-  """Return the number of non-empty '/'-separated segments in the URL path."""
-  segments = urlparse(url).path.split('/')
-  # Empty segments (leading, trailing or doubled slashes) do not count
-  return sum(1 for segment in segments if len(segment) != 0)
-"""#### **3.1.6. Redirection "//" in URL**
-Checks the presence of "//" in the URL. The existence of “//” within the URL path means that the user will be redirected to another website. The location of the “//” in URL is computed. We find that if the URL starts with “HTTP”, that means the “//” should appear in the sixth position. However, if the URL employs “HTTPS” then the “//” should appear in seventh position.
-If the "//" is anywhere in the URL apart from after the protocol, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 6.Checking for redirection '//' in the url (Redirection)
-def redirection(url):
-  """Return 1 if the last "//" in the URL starts after index 7, otherwise 0.
-
-  The "//" after "http:" starts at index 5 and after "https:" at index 6,
-  so a later occurrence lies beyond the scheme separator.
-  """
-  last_double_slash = url.rfind('//')
-  return 1 if last_double_slash > 7 else 0
-"""#### **3.1.7. "http/https" in Domain name**
-Checks for the presence of "http/https" in the domain part of the URL. The phishers may add the “HTTPS” token to the domain part of a URL in order to trick users.
-If the URL has "http/https" in the domain part, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
-def httpDomain(url):
-  """Return 1 if the substring 'https' occurs in the URL's network location, else 0."""
-  netloc = urlparse(url).netloc
-  return 1 if 'https' in netloc else 0
-"""#### **3.1.8. Using URL Shortening Services “TinyURL”**
-URL shortening is a method on the “World Wide Web” in which a URL may be made considerably smaller in length and still lead to the required webpage. This is accomplished by means of an “HTTP Redirect” on a domain name that is short, which links to the webpage that has a long URL.
-If the URL is using Shortening Services, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-#listing shortening services
-# Regex alternation of known URL-shortener host names, built from adjacent raw strings.
-# Several alternatives are listed twice (bit\.ly, goo\.gl, ow\.ly, t\.co, is\.gd, x\.co,
-# tr\.im); the duplicates do not change what the pattern matches.
-# NOTE(review): the alternatives are unanchored, so a short one such as t\.co also
-# matches inside longer host names (e.g. "microsoft.com") when used with re.search.
-shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
-                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
-                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
-                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
-                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
-                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
-                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
-                      r"tr\.im|link\.zip\.net"
-# 8. Checking for Shortening Services in URL (Tiny_URL)
-def tinyURL(url):
-    """Return 1 if any pattern in ``shortening_services`` occurs anywhere in the URL, else 0."""
-    return 1 if re.search(shortening_services, url) else 0
-"""#### **3.1.9. Prefix or Suffix "-" in Domain**
-Checking the presence of '-' in the domain part of URL. The dash symbol is rarely used in legitimate URLs. Phishers tend to add prefixes or suffixes separated by (-) to the domain name so that users feel that they are dealing with a legitimate webpage.
-If the URL has '-' symbol in the domain part of the URL, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
-def prefixSuffix(url):
-    """Return 1 (phishing) if the URL's network location contains '-', else 0 (legitimate)."""
-    netloc = urlparse(url).netloc
-    return 1 if '-' in netloc else 0
-"""### **3.2. Domain Based Features:**
-Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.
-*   DNS Record
-*   Website Traffic
-*   Age of Domain
-*   End Period of Domain
-Each of these features are explained and the coded below:
-"""
-#!pip install python-whois
-# importing required packages for this section
-import re
-from bs4 import BeautifulSoup
-#import whois
-import urllib
-import urllib.request
-from datetime import datetime
-"""#### **3.2.1. DNS Record**
-For phishing websites, either the claimed identity is not recognized by the WHOIS database or no records are found for the hostname.
-If the DNS record is empty or not found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 11.DNS Record availability (DNS_Record)
-# obtained in the featureExtraction function itself
-"""#### **3.2.2. Web Traffic**
-This feature measures the popularity of the website by determining the number of visitors and the number of pages they visit. However, since phishing websites live for a short period of time, they may not be recognized by the Alexa database (Alexa the Web Information Company., 1996). By reviewing our dataset, we find that in worst scenarios, legitimate websites ranked among the top 100,000. Furthermore, if the domain has no traffic or is not recognized by the Alexa database, it is classified as “Phishing”.
-If the rank of the domain < 100000, the value of this feature is 1 (phishing) else 0 (legitimate).
-"""
-# 12.Web traffic (Web_Traffic)
-def web_traffic(url, timeout=10):
-  """Look up the Alexa reach rank of *url* and turn it into a 0/1 feature.
-
-  Returns 1 when the rank is below 100000 or when no rank can be obtained
-  (request failure, missing REACH element or RANK attribute, non-numeric
-  rank); returns 0 otherwise. *timeout* is the request timeout in seconds.
-  NOTE(review): the accompanying text says legitimate sites rank within the
-  top 100,000, yet a rank below 100000 is scored 1 (phishing) here — confirm
-  the intended direction before relying on this feature.
-  NOTE(review): the Alexa data endpoint may no longer be served — confirm.
-  """
-  try:
-    # Percent-encode whitespace and other unsafe characters in the URL
-    url = urllib.parse.quote(url)
-    # Close the HTTP response deterministically instead of leaking it
-    with urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url,
-                                timeout=timeout) as reply:
-      payload = reply.read()
-    rank = int(BeautifulSoup(payload, "xml").find("REACH")['RANK'])
-  except (TypeError, KeyError, ValueError, OSError):
-    # OSError covers urllib's URLError/HTTPError and socket timeouts;
-    # an unranked or unreachable site is treated as phishing.
-    return 1
-  if rank < 100000:
-    return 1
-  return 0
-"""#### **3.2.3. Age of Domain**
-This feature can be extracted from the WHOIS database. Most phishing websites live for a short period of time. The minimum age of a legitimate domain is taken to be 6 months in the code below. Age here is the difference between the creation and expiration times.
-If the age of the domain is < 6 months, the value of this feature is 1 (phishing), else 0 (legitimate).
-"""
-# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
-def domainAge(domain_name):
-  """Score the registration span of a WHOIS record as a 0/1 feature.
-
-  *domain_name* must expose ``creation_date`` and ``expiration_date``.
-  Returns 1 when either date is missing, is a list, or is a string that is
-  not in YYYY-MM-DD form, and when the span between the two dates is
-  shorter than 6 months (counted as 30-day months). Returns 0 otherwise.
-  Each date is parsed on its own, so a string paired with a datetime is
-  now evaluated instead of being scored 1 outright.
-  """
-  def as_datetime(value):
-    # WHOIS dates may arrive as 'YYYY-MM-DD' strings; other values pass through
-    if isinstance(value, str):
-      return datetime.strptime(value, '%Y-%m-%d')
-    return value
-  try:
-    creation_date = as_datetime(domain_name.creation_date)
-    expiration_date = as_datetime(domain_name.expiration_date)
-  except ValueError:
-    return 1
-  if creation_date is None or expiration_date is None:
-    return 1
-  if isinstance(creation_date, list) or isinstance(expiration_date, list):
-    return 1
-  ageofdomain = abs((expiration_date - creation_date).days)
-  return 1 if (ageofdomain / 30) < 6 else 0
-"""#### **3.2.4. End Period of Domain**
-This feature can be extracted from the WHOIS database. For this feature, the remaining domain time is calculated by finding the difference between the expiration time and the current time. The end period considered for a legitimate domain is 6 months or less for this project.
-If the end period of the domain is > 6 months, the value of this feature is 1 (phishing), else 0 (legitimate).
-"""
-# 14.End time of domain: The difference between termination time and current time (Domain_End)
-def domainEnd(domain_name):
-  """Score the time between now and a WHOIS expiration date as a 0/1 feature.
-
-  *domain_name* must expose ``expiration_date``. Returns 1 when the date is
-  missing, is a list, or is a string not in YYYY-MM-DD form. Otherwise
-  returns 0 when the distance to the expiration date is under 6 months
-  (30-day months) and 1 when it is longer.
-  NOTE(review): abs() treats an already-expired date like a future one —
-  confirm that is intended.
-  """
-  expiration_date = domain_name.expiration_date
-  if isinstance(expiration_date, str):
-    try:
-      expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
-    except ValueError:
-      # Unparseable date string
-      return 1
-  if expiration_date is None or isinstance(expiration_date, list):
-    return 1
-  days_to_expiry = abs((expiration_date - datetime.now()).days)
-  return 0 if (days_to_expiry / 30) < 6 else 1
-"""## **3.3. HTML and JavaScript based Features**
-Many features can be extracted that come under this category. Out of them, below mentioned were considered for this project.
-*   IFrame Redirection
-*   Status Bar Customization
-*   Disabling Right Click
-*   Website Forwarding
-Each of these features are explained and the coded below:
-"""
-# importing required packages for this section
-import requests
-"""### **3.3.1. IFrame Redirection**
-IFrame is an HTML tag used to display an additional webpage into one that is currently shown. Phishers can make use of the “iframe” tag and make it invisible i.e. without frame borders. In this regard, phishers make use of the “frameBorder” attribute which causes the browser to render a visual delineation.
-If the iframe is empty or the response is not found, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 15. IFrame Redirection (iFrame)
-def iframe(response):
-  """Return 1 for an empty response or when the pattern finds nothing in its text, else 0.
-
-  NOTE(review): the pattern is a character class, so it matches any single
-  one of the characters listed between the brackets rather than the tags
-  "<iframe>" / "<frameBorder>" — confirm the intended expression.
-  """
-  if response == "":
-    return 1
-  found = re.search(r"[<iframe>|<frameBorder>]", response.text)
-  return 0 if found else 1
-"""### **3.3.2. Status Bar Customization**
-Phishers may use JavaScript to show a fake URL in the status bar to users. To extract this feature, we must dig-out the webpage source code, particularly the “onMouseOver” event, and check if it makes any changes on the status bar
-If the response is empty or onmouseover is found then, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 16.Checks the effect of mouse over on status bar (Mouse_Over)
-def mouseOver(response):
-  """Return 1 for an empty response or when a <script> element mentions onmouseover, else 0.
-
-  The pattern needs at least one character on each side of "onmouseover"
-  and, since "." does not match newlines, the whole element on one line.
-  """
-  if response == "":
-    return 1
-  hit = re.search("<script>.+onmouseover.+</script>", response.text)
-  return 1 if hit else 0
-"""### **3.3.3. Disabling Right Click**
-Phishers use JavaScript to disable the right-click function, so that users cannot view and save the webpage source code. This feature is treated exactly as “Using onMouseOver to hide the Link”. Nonetheless, for this feature, we will search for event “event.button==2” in the webpage source code and check if the right click is disabled.
-If the response is empty or the "event.button==2" check is not found, the value assigned to this feature is 1 (phishing) or else 0 (legitimate).
-"""
-# 17.Checks the status of the right click attribute (Right_Click)
-def rightClick(response):
-  """Return 0 when the response text contains an "event.button == 2" check, else 1.
-
-  An empty response also yields 1.
-  NOTE(review): a page that does test for the right mouse button is scored
-  0 (legitimate) — confirm this direction is intended.
-  """
-  if response == "":
-    return 1
-  hit = re.search(r"event.button ?== ?2", response.text)
-  return 0 if hit else 1
-"""### **3.3.4. Website Forwarding**
-The fine line that distinguishes phishing websites from legitimate ones is how many times a website has been redirected. In our dataset, we find that legitimate websites have been redirected one time max. On the other hand, phishing websites containing this feature have been redirected at least 4 times.
-"""
-# 18.Checks the number of forwardings (Web_Forwards)
-def forwarding(response):
-  """Return 1 for an empty response or more than two entries in ``response.history``, else 0."""
-  if response == "":
-    return 1
-  hops = len(response.history)
-  return 0 if hops <= 2 else 1
-"""## **4. Computing URL Features**
-Create a list and a function that calls the other functions and stores all the features of the URL in the list. We will extract the features of each URL and append to this list.
-"""
-#Function to extract features
-def featureExtraction(url):
-  """Return the eight address-bar feature values for *url* as a list.
-
-  The order is: havingIP, haveAtSign, getLength, getDepth, redirection,
-  httpDomain, tinyURL, prefixSuffix. The domain-based features are disabled.
-  """
-  #Address bar based features (getDomain is not part of the vector)
-  address_bar_checks = (havingIP, haveAtSign, getLength, getDepth,
-                        redirection, httpDomain, tinyURL, prefixSuffix)
-  features = [check(url) for check in address_bar_checks]
-  # #Domain based features (4)
-  # dns = 0
-  # try:
-  #   domain_name = whois.whois(urlparse(url).netloc)
-  # except:
-  #   dns = 1
-  # features.append(dns)
-  # features.append(web_traffic(url))
-  # features.append(1 if dns == 1 else domainAge(domain_name))
-  # features.append(1 if dns == 1 else domainEnd(domain_name))
-  return features
-#converting the list to dataframe
-# Column names for a dataframe of extracted features.
-# NOTE(review): the list has ten names, including 'Domain' and 'Label', while
-# featureExtraction() returns eight values (getDomain is commented out and no label
-# is produced) — confirm how the extra columns are filled.
-feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection',
-                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'Label']

xgboost/__init__.py DELETED Viewed

File without changes

xgboost/features.py DELETED Viewed

@@ -1,347 +0,0 @@
-"""
-from bs4 import BeautifulSoup
-with open("mini_dataset/6.html") as f:
-    test = f.read()
-soup = BeautifulSoup(test, "html.parser")
-"""
-# has_title
-def has_title(soup):
-    """Return 1 if the document has a <title> with non-empty text, else 0."""
-    title = soup.title
-    if title is not None and len(title.text) > 0:
-        return 1
-    return 0
-# has_input
-def has_input(soup):
-    """Return 1 if the document contains at least one <input> element, else 0."""
-    return 1 if len(soup.find_all("input")) else 0
-# has_button
-def has_button(soup):
-    """Return 1 if the document contains at least one <button> element, else 0."""
-    buttons = soup.find_all("button")
-    return 1 if len(buttons) > 0 else 0
-# has_image
-def has_image(soup):
-    """Return 1 if the document contains at least one <image> element, else 0.
-
-    NOTE(review): HTML pictures normally use <img>; this looks for <image>
-    tags — confirm that is intended.
-    """
-    images = soup.find_all("image")
-    return 1 if len(images) != 0 else 0
-# has_submit
-def has_submit(soup):
-    """Return 1 if any <input> element has type="submit", else 0."""
-    fields = soup.find_all("input")
-    return 1 if any(field.get("type") == "submit" for field in fields) else 0
-# has_link
-def has_link(soup):
-    """Return 1 if the document contains at least one <link> element, else 0."""
-    links = soup.find_all("link")
-    return 1 if len(links) > 0 else 0
-# has_password
-def has_password(soup):
-    """Return 1 if any <input> has type, name or id equal to "password", else 0.
-
-    The three attributes are compared individually. The former
-    ``(type or name or id) == "password"`` test only looked at the first
-    non-empty attribute, so e.g. type="text" name="password" was missed.
-    NOTE(review): this changes feature values; re-check any model trained on
-    the old extraction.
-    """
-    for field in soup.find_all("input"):
-        if "password" in (field.get("type"), field.get("name"), field.get("id")):
-            return 1
-    return 0
-# has_email_input
-def has_email_input(soup):
-    """Return 1 if any <input> has type, id or name equal to "email", else 0.
-
-    The three attributes are compared individually. The former
-    ``(type or id or name) == "email"`` test only looked at the first
-    non-empty attribute, so e.g. type="text" name="email" was missed.
-    NOTE(review): this changes feature values; re-check any model trained on
-    the old extraction.
-    """
-    for field in soup.find_all("input"):
-        if "email" in (field.get("type"), field.get("id"), field.get("name")):
-            return 1
-    return 0
-# has_hidden_element
-def has_hidden_element(soup):
-    """Return 1 if any <input> element has type="hidden", else 0."""
-    fields = soup.find_all("input")
-    return 1 if any(field.get("type") == "hidden" for field in fields) else 0
-# has_audio
-def has_audio(soup):
-    """Return 1 if the document contains at least one <audio> element, else 0."""
-    clips = soup.find_all("audio")
-    return 1 if len(clips) > 0 else 0
-# has_video
-def has_video(soup):
-    """Return 1 if the document contains at least one <video> element, else 0."""
-    clips = soup.find_all("video")
-    return 1 if len(clips) > 0 else 0
-# number_of_inputs
-def number_of_inputs(soup):
-    """Return how many <input> elements the document contains."""
-    fields = soup.find_all("input")
-    return len(fields)
-# number_of_buttons
-def number_of_buttons(soup):
-    """Return how many <button> elements the document contains."""
-    buttons = soup.find_all("button")
-    return len(buttons)
-# number_of_images
-def number_of_images(soup):
-    """Return the number of <image> elements plus <meta> tags marked as images.
-
-    A <meta> tag counts when its type or its name equals "image". The
-    former test, ``meta.get("type") or meta.get("name") == "image"``,
-    counted every <meta> that had any type attribute at all because ``==``
-    binds tighter than ``or``.
-    NOTE(review): this changes feature values; re-check any model trained on
-    the old extraction.
-    """
-    image_tags = len(soup.find_all("image"))
-    meta_images = sum(
-        1
-        for meta in soup.find_all("meta")
-        if "image" in (meta.get("type"), meta.get("name"))
-    )
-    return image_tags + meta_images
-# number_of_option
-def number_of_option(soup):
-    """Return how many <option> elements the document contains."""
-    options = soup.find_all("option")
-    return len(options)
-# number_of_list
-def number_of_list(soup):
-    """Return how many <li> elements the document contains."""
-    items = soup.find_all("li")
-    return len(items)
-# number_of_TH
-def number_of_TH(soup):
-    """Return how many <th> elements the document contains."""
-    header_cells = soup.find_all("th")
-    return len(header_cells)
-# number_of_TR
-def number_of_TR(soup):
-    """Return how many <tr> elements the document contains."""
-    rows = soup.find_all("tr")
-    return len(rows)
-# number_of_href
-def number_of_href(soup):
-    """Return how many <link> elements carry a non-empty href attribute."""
-    return sum(1 for link in soup.find_all("link") if link.get("href"))
-# number_of_paragraph
-def number_of_paragraph(soup):
-    """Return how many <p> elements the document contains."""
-    paragraphs = soup.find_all("p")
-    return len(paragraphs)
-# number_of_script
-def number_of_script(soup):
-    """Return how many <script> elements the document contains."""
-    scripts = soup.find_all("script")
-    return len(scripts)
-# length_of_title
-def length_of_title(soup):
-    """Return the length of the <title> text, or 0 when there is no <title>."""
-    title = soup.title
-    # Identity test: None is a singleton, so "is" is the idiomatic check
-    if title is None:
-        return 0
-    return len(title.text)
-"""
-print("has_title --> ", has_title(soup))
-print("has_input --> ", has_input(soup))
-print("has_button --> ", has_button(soup))
-print("has_image --> ", has_image(soup))
-print("has_submit --> ", has_submit(soup))
-print("has_link --> ", has_link(soup))
-print("has_password --> ", has_password(soup))
-print("has_email_input --> ", has_email_input(soup))
-print("has_hidden_element --> ", has_hidden_element(soup))
-print("has_audio --> ", has_audio(soup))
-print("has_video --> ", has_video(soup))
-print("number_of_inputs --> ", number_of_inputs(soup))
-print("number_of_buttons --> ", number_of_buttons(soup))
-print("number_of_images --> ", number_of_images(soup))
-print("number_of_option --> ", number_of_option(soup))
-print("number_of_list --> ", number_of_list(soup))
-print("number_of_TH --> ", number_of_TH(soup))
-print("number_of_TR --> ", number_of_TR(soup))
-print("number_of_href --> ", number_of_href(soup))
-print("number_of_paragraph --> ", number_of_paragraph(soup))
-print("number_of_script --> ", number_of_script(soup))
-print("length_of_title --> ", length_of_title(soup))
-"""
-# has h1
-def has_h1(soup):
-    """Return 1 if the document contains at least one <h1> element, else 0."""
-    headings = soup.find_all("h1")
-    return 1 if len(headings) > 0 else 0
-# has h2
-def has_h2(soup):
-    """Return 1 if the document contains at least one <h2> element, else 0."""
-    headings = soup.find_all("h2")
-    return 1 if len(headings) > 0 else 0
-# has h3
-def has_h3(soup):
-    """Return 1 if the document contains at least one <h3> element, else 0."""
-    headings = soup.find_all("h3")
-    return 1 if len(headings) > 0 else 0
-# length of text
-def length_of_text(soup):
-    """Return the length of the string produced by ``soup.get_text()``."""
-    text = soup.get_text()
-    return len(text)
-# number of clickable button
-def number_of_clickable_button(soup):
-    """Return how many <button> elements have type="button"."""
-    return sum(1 for button in soup.find_all("button") if button.get("type") == "button")
-# number of a
-def number_of_a(soup):
-    """Return how many <a> elements the document contains."""
-    anchors = soup.find_all("a")
-    return len(anchors)
-# number of img
-def number_of_img(soup):
-    """Return how many <img> elements the document contains."""
-    pictures = soup.find_all("img")
-    return len(pictures)
-# number of div class
-def number_of_div(soup):
-    """Return how many <div> elements the document contains."""
-    divisions = soup.find_all("div")
-    return len(divisions)
-# number of figures
-def number_of_figure(soup):
-    """Return how many <figure> elements the document contains."""
-    figures = soup.find_all("figure")
-    return len(figures)
-# has footer
-def has_footer(soup):
-    """Return 1 if the document contains at least one <footer> element, else 0."""
-    footers = soup.find_all("footer")
-    return 1 if len(footers) > 0 else 0
-# has form
-def has_form(soup):
-    """Return 1 if the document contains at least one <form> element, else 0."""
-    forms = soup.find_all("form")
-    return 1 if len(forms) > 0 else 0
-# has textarea
-def has_text_area(soup):
-    """Return 1 if the document contains at least one <textarea> element, else 0."""
-    areas = soup.find_all("textarea")
-    return 1 if len(areas) > 0 else 0
-# has iframe
-def has_iframe(soup):
-    """Return 1 if the document contains at least one <iframe> element, else 0."""
-    frames = soup.find_all("iframe")
-    return 1 if len(frames) > 0 else 0
-# has text input
-def has_text_input(soup):
-    """Return 1 if any <input> element has type="text", else 0."""
-    fields = soup.find_all("input")
-    return 1 if any(field.get("type") == "text" for field in fields) else 0
-# number of meta
-def number_of_meta(soup):
-    """Return how many <meta> elements the document contains."""
-    metas = soup.find_all("meta")
-    return len(metas)
-# has nav
-def has_nav(soup):
-    """Return 1 if the document contains at least one <nav> element, else 0."""
-    navs = soup.find_all("nav")
-    return 1 if len(navs) > 0 else 0
-# has object
-def has_object(soup):
-    """Return 1 if the document contains at least one <object> element, else 0."""
-    embedded = soup.find_all("object")
-    return 1 if len(embedded) > 0 else 0
-# has picture
-def has_picture(soup):
-    """Return 1 if the document contains at least one <picture> element, else 0."""
-    pictures = soup.find_all("picture")
-    return 1 if len(pictures) > 0 else 0
-# number of sources
-def number_of_sources(soup):
-    """Return how many <source> elements the document contains."""
-    sources = soup.find_all("source")
-    return len(sources)
-# number of span
-def number_of_span(soup):
-    """Return how many <span> elements the document contains."""
-    spans = soup.find_all("span")
-    return len(spans)
-# number of table
-def number_of_table(soup):
-    """Return how many <table> elements the document contains."""
-    tables = soup.find_all("table")
-    return len(tables)

xgboost_wrapper.py DELETED Viewed

@@ -1,246 +0,0 @@
-"""
-XGBoost Model Wrapper
-This module provides a safe wrapper around the XGBoost models for phishing detection.
-Loads models from Hugging Face Hub: th1enq/xgboost_checkpoint
-"""
-import os
-import sys
-import joblib
-import pickle
-from bs4 import BeautifulSoup
-from huggingface_hub import hf_hub_download
-# Add the local "xgboost" directory (next to this file) to sys.path so the feature
-# extraction modules inside it can be imported as top-level modules.
-# NOTE(review): a local directory named "xgboost" may shadow the installed xgboost
-# library when this file's folder is on sys.path — confirm.
-xgboost_dir = os.path.join(os.path.dirname(__file__), 'xgboost')
-sys.path.append(xgboost_dir)
-# XGBOOST_AVAILABLE records whether both helper modules imported. Only ImportError is
-# handled here; any other error raised while importing them propagates.
-try:
-    import features as fe
-    from URLFeatureExtraction import featureExtraction
-    XGBOOST_AVAILABLE = True
-except ImportError as e:
-    print(f"XGBoost modules not available: {e}")
-    XGBOOST_AVAILABLE = False
-def load_model_from_hub(repo_id, filename):
-    """Download *filename* from the Hub repo *repo_id* and deserialize it.
-
-    joblib is tried first and plain pickle second. Returns the loaded
-    object, or None (after printing the errors) when the download or both
-    loaders fail.
-    SECURITY(review): joblib/pickle execute code stored in the file; only
-    use this with repositories you trust.
-    """
-    try:
-        local_path = hf_hub_download(repo_id=repo_id, filename=filename)
-    except Exception as download_error:
-        print(f"Failed to download model {filename} from {repo_id}: {download_error}")
-        return None
-    # First attempt: joblib
-    try:
-        return joblib.load(local_path)
-    except Exception as error:
-        joblib_error = error
-    # Second attempt: plain pickle, for files joblib could not read
-    try:
-        with open(local_path, 'rb') as handle:
-            return pickle.load(handle)
-    except Exception as pickle_error:
-        print(f"Failed to load model {filename} from {repo_id}: {joblib_error}")
-        print(f"Pickle fallback failed: {pickle_error}")
-        return None
-def load_model_safe(model_path):
-    """Deserialize a local model file, trying joblib first and pickle second.
-
-    Returns the loaded object, or None (after printing both errors) when
-    neither loader succeeds.
-    SECURITY(review): joblib/pickle execute code stored in the file; only
-    load files you trust.
-    """
-    try:
-        return joblib.load(model_path)
-    except Exception as error:
-        joblib_error = error
-    try:
-        with open(model_path, 'rb') as handle:
-            return pickle.load(handle)
-    except Exception as pickle_error:
-        print(f"Failed to load model {model_path}")
-        print(f"Joblib error: {joblib_error}")
-        print(f"Pickle error: {pickle_error}")
-        return None
-def extract_features_from_html(html_content):
-    """Build the 43-value HTML feature vector for *html_content*.
-
-    Returns None when the feature modules are unavailable, and a list of 43
-    zeros (after printing the error) when parsing or any extractor fails.
-    """
-    if not XGBOOST_AVAILABLE:
-        return None
-    # Extractor names from the ``fe`` module, in feature-vector order
-    extractor_names = (
-        "has_title", "has_input", "has_button", "has_image", "has_submit",
-        "has_link", "has_password", "has_email_input", "has_hidden_element",
-        "has_audio", "has_video", "number_of_inputs", "number_of_buttons",
-        "number_of_images", "number_of_option", "number_of_list",
-        "number_of_TH", "number_of_TR", "number_of_href",
-        "number_of_paragraph", "number_of_script", "length_of_title",
-        "has_h1", "has_h2", "has_h3", "length_of_text",
-        "number_of_clickable_button", "number_of_a", "number_of_img",
-        "number_of_div", "number_of_figure", "has_footer", "has_form",
-        "has_text_area", "has_iframe", "has_text_input", "number_of_meta",
-        "has_nav", "has_object", "has_picture", "number_of_sources",
-        "number_of_span", "number_of_table",
-    )
-    try:
-        soup = BeautifulSoup(html_content, "html.parser")
-        return [getattr(fe, name)(soup) for name in extractor_names]
-    except Exception as e:
-        print(f"Error extracting HTML features: {e}")
-        return [0] * 43
-def extract_features_from_url(url):
-    """Return the URL feature list from ``featureExtraction``.
-
-    Returns None when the feature modules are unavailable or when the
-    extraction raises (the error is printed).
-    """
-    if XGBOOST_AVAILABLE:
-        try:
-            return featureExtraction(url)
-        except Exception as e:
-            print(f"Error extracting URL features: {e}")
-    return None
-class XGBoostPhishingDetector:
-    """Phishing detector backed by two models: one for HTML, one for URLs.
-
-    Models are loaded in the constructor when the feature modules are
-    available. Every predict method returns a dict with 'prediction',
-    'probability' and 'is_phishing', or None when no prediction can be made.
-    """
-    def __init__(self):
-        self.html_model = None
-        self.url_model = None
-        self.available = XGBOOST_AVAILABLE
-        if self.available:
-            self._load_models()
-    def _load_models(self):
-        """Load the XGBoost models from Hugging Face Hub"""
-        repo_id = "th1enq/xgboost_checkpoint"
-        print("🔄 Loading XGBoost models from Hugging Face Hub...")
-        self.html_model = self._load_one(repo_id, 'xgboost_html.joblib', "HTML")
-        self.url_model = self._load_one(repo_id, 'xgboost_url.joblib', "URL")
-    @staticmethod
-    def _load_one(repo_id, filename, label):
-        """Load one model from the Hub, falling back to the local xgboost directory."""
-        model = load_model_from_hub(repo_id, filename)
-        if model:
-            print(f"✅ {label} XGBoost model loaded from Hugging Face Hub")
-            return model
-        print(f"❌ Failed to load {label} XGBoost model from Hugging Face Hub")
-        # Fallback to local file
-        local_path = os.path.join(xgboost_dir, filename)
-        if os.path.exists(local_path):
-            model = load_model_safe(local_path)
-            # Report success only when the local load really produced a model
-            if model is not None:
-                print(f"✅ {label} XGBoost model loaded from local file")
-        return model
-    @staticmethod
-    def _predict(model, features, label):
-        """Run *model* on one feature vector; return the result dict or None on error."""
-        try:
-            prediction = model.predict([features])[0]
-            if hasattr(model, 'predict_proba'):
-                probability = model.predict_proba([features])[0]
-            else:
-                # No probabilities available: derive a hard 0/1 pair from the label
-                probability = [1-prediction, prediction]
-            return {
-                'prediction': int(prediction),
-                'probability': probability,
-                'is_phishing': prediction == 1
-            }
-        except Exception as e:
-            print(f"Error predicting {label}: {e}")
-            return None
-    def predict_html(self, html_content):
-        """Predict phishing from HTML content"""
-        if not self.available or not self.html_model:
-            return None
-        features = extract_features_from_html(html_content)
-        if features is None:
-            return None
-        return self._predict(self.html_model, features, "HTML")
-    def predict_url(self, url):
-        """Predict phishing from URL"""
-        if not self.available or not self.url_model:
-            return None
-        features = extract_features_from_url(url)
-        if features is None:
-            return None
-        return self._predict(self.url_model, features, "URL")
-    def predict_combined(self, url, html_content=None, url_weight=0.3, html_weight=0.7):
-        """Predict using both URL and HTML analysis
-
-        When both results exist, the class probabilities are blended with
-        the given weights and the larger one decides the prediction. When
-        only one exists it is returned unchanged; with neither, None.
-        """
-        url_result = self.predict_url(url)
-        html_result = self.predict_html(html_content) if html_content else None
-        if not (url_result and html_result):
-            # At most one analysis succeeded: pass it through, or None
-            return url_result or html_result or None
-        combined_prob = [
-            url_weight * url_result['probability'][idx] + html_weight * html_result['probability'][idx]
-            for idx in (0, 1)
-        ]
-        combined_prediction = 1 if combined_prob[1] > combined_prob[0] else 0
-        return {
-            'prediction': combined_prediction,
-            'probability': combined_prob,
-            'is_phishing': combined_prediction == 1,
-            'url_result': url_result,
-            'html_result': html_result
-        }
-# Global instance, created at import time. The constructor loads the models when
-# XGBOOST_AVAILABLE is true, so importing this module may trigger model downloads.
-xgboost_detector = XGBoostPhishingDetector()