Spaces:

quantumbit
/

check

Sleeping

App Files Files Community

quantumbit committed 5 days ago

Commit

8980309

verified ·

1 Parent(s): a526fb5

Update app.py

Browse files

Files changed (1) hide show

app.py +190 -115

app.py CHANGED Viewed

@@ -3,11 +3,16 @@ import re
 import json
 import base64
 import logging
 from typing import List, Dict, Any
 from fastapi import FastAPI
 from pydantic import BaseModel
-import requests
-from bs4 import BeautifulSoup
 # -------------------------
 # Logging
@@ -52,115 +57,173 @@ def try_decode_jwt(token: str) -> Dict[str, Any]:
         return {}
 # -------------------------
-# Scraper
 # -------------------------
-def scrape_with_requests(url: str) -> Dict[str, Any]:
-    """Scrape a webpage and extract visible + hidden info (expanded)."""
     try:
-        headers = {
-            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/118.0 Safari/537.36"
-        }
-        r = requests.get(url, headers=headers, timeout=30)
-        r.raise_for_status()
-        html = r.text
-        soup = BeautifulSoup(html, "html.parser")
-        title = soup.title.get_text(strip=True) if soup.title else "No title"
-        visible_text = soup.get_text(separator=" ", strip=True)[:6000]
         hidden_values: List[str] = []
         jwt_data: Dict[str, Any] = {}
-        # Hidden inputs
-        for inp in soup.find_all("input", {"type": "hidden"}):
-            name = inp.get("name", "")
-            value = inp.get("value", "")
-            if value:
-                hidden_values.append(f"hidden_input {name}={value}")
-        # Elements with display:none
-        for elem in soup.find_all(attrs={"style": re.compile(r"display\s*:\s*none", re.I)}):
-            txt = elem.get_text(" ", strip=True)
-            if txt:
-                hidden_values.append(f"display_none {txt}")
-        # HTML comments
-        for comment in soup.find_all(string=lambda t: isinstance(t, str) and ("<!--" in t or "-->" in t)):
-            c = str(comment).strip()
-            if c:
-                hidden_values.append(f"comment {c}")
-        # data-* attributes
-        for tag in soup.find_all():
-            for k, v in tag.attrs.items():
-                if k.startswith("data-") and isinstance(v, str) and v.strip():
-                    hidden_values.append(f"{k}={v.strip()}")
-        # Script tags (look for JSON-like challenge info and completion codes)
-        for script in soup.find_all("script"):
-            txt = script.get_text(" ", strip=True)
-            if txt:
-                # Look for completion codes or challenge codes
-                completion_matches = re.findall(r"(completion[_\s]*code|challenge[_\s]*code|code)\s*[:=]\s*['\"]?([A-Za-z0-9\-_]{6,})['\"]?", txt, flags=re.I)
-                for k, v in completion_matches:
-                    hidden_values.append(f"script completion_code={v}")
-                # General matches for challenge info
-                matches = re.findall(r"(challenge\w*|code|completion)\s*[:=]\s*['\"]?([A-Za-z0-9\-_]+)['\"]?", txt, flags=re.I)
-                for k, v in matches:
-                    hidden_values.append(f"script {k}={v}")
-        # ✅ Enhanced JWT token detection and decoding
-        # Look for JWT patterns in the entire HTML content
         jwt_patterns = [
-            r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+",  # Standard JWT
-            r"[A-Za-z0-9_-]{20,}\.[A-Za-z0-9_-]{20,}\.[A-Za-z0-9_-]{20,}"  # Generic three-part tokens
         ]
         for pattern in jwt_patterns:
-            jwt_matches = re.findall(pattern, html)
             for token in jwt_matches:
-                logger.info(f"Found potential JWT: {token[:50]}...")
                 data = try_decode_jwt(token)
                 if data:
                     jwt_data.update(data)
                     for k, v in data.items():
                         hidden_values.append(f"jwt {k}={v}")
-        # Look for completion codes in various formats
-        completion_patterns = [
-            r"completion[_\s]*code[:\s]*([A-Za-z0-9\-_]{6,})",
-            r"challenge[_\s]*complete[_\s]*code[:\s]*([A-Za-z0-9\-_]{6,})",
-            r"code[:\s]*([A-Za-z0-9\-_]{10,})",
-        ]
-        for pattern in completion_patterns:
-            matches = re.findall(pattern, html, flags=re.I)
-            for match in matches:
-                hidden_values.append(f"completion_code {match}")
-        # Enhanced token detection
-        tokens = re.findall(r"[A-Za-z0-9_\-]{12,}", html)
-        for t in tokens:
-            if any(x in t.lower() for x in ["chall", "code", "id", "completion"]):
-                hidden_values.append(f"token {t}")
         logger.info(f"Found {len(hidden_values)} hidden values")
         logger.info(f"JWT data: {jwt_data}")
         return {
-            "title": title,
-            "visible_text": visible_text,
-            "hidden_values": hidden_values[:500],
             "jwt_data": jwt_data,
         }
     except Exception as e:
-        logger.error(f"Request scraping failed for {url}: {e}")
         return {}
 # -------------------------
@@ -169,57 +232,69 @@ def scrape_with_requests(url: str) -> Dict[str, Any]:
 def answer_question(question: str, content: Dict[str, Any]) -> str:
     """Enhanced rule-based extraction for Round 5 questions."""
     ql = question.lower()
-    title = content.get("title", "")
     hidden = content.get("hidden_values", [])
     jwt_data = content.get("jwt_data", {})
-    # Direct JWT data extraction
     if "challenge id" in ql or "challengeid" in ql:
         # First check JWT data directly
         if "challengeID" in jwt_data:
-            return str(jwt_data["challengeID"])
-        # Then check hidden values
         for h in hidden:
             if "challengeid" in h.lower():
-                return h.split("=", 1)[-1].strip()
     if "completion" in ql and "code" in ql:
-        # Look for completion codes in various formats
         for h in hidden:
             if "completion_code" in h.lower():
-                return h.split("=", 1)[-1].strip()
-            if "code" in h.lower() and len(h.split("=", 1)[-1].strip()) > 10:
-                return h.split("=", 1)[-1].strip()
-        # Check JWT data for any field that might be a completion code
-        for key, value in jwt_data.items():
-            if isinstance(value, str) and len(value) > 10 and key.lower() != "email":
-                return str(value)
     if "challenge name" in ql:
-        # Check JWT data first
         if "coolGuy" in jwt_data:
-            return str(jwt_data["coolGuy"])
-        # Then check hidden values
-        for h in hidden:
-            if "challenge" in h.lower() and "name" in h.lower():
-                return h.split("=", 1)[-1].strip()
-    # Fallbacks
-    if "challenge name" in ql and title:
-        return title.strip()
-    # If we have JWT data, return the most likely candidate
     if jwt_data:
-        # For challenge ID questions, return challengeID if present
-        if "challenge" in ql and "id" in ql and "challengeID" in jwt_data:
-            return str(jwt_data["challengeID"])
-        # For other questions, return the first non-standard field
         for key, value in jwt_data.items():
-            if key not in ["iat", "exp", "email"] and isinstance(value, str):
                 return str(value)
     return "Challenge information not found"
@@ -229,7 +304,7 @@ def answer_question(question: str, content: Dict[str, Any]) -> str:
 @app.get("/")
 def root():
     return {
-        "message": "HackRx Round 5 API - Ready",
         "endpoints": {"challenge": "POST /challenge", "health": "GET /health"},
     }
@@ -242,7 +317,7 @@ def health():
 @app.post("/challenge", response_model=ChallengeResponse)
 def challenge(req: ChallengeRequest):
     logger.info(f"Round 5 request: url={req.url}, questions={req.questions}")
-    content = scrape_with_requests(req.url)
     if not content:
         return ChallengeResponse(answers=["Challenge information not found" for _ in req.questions])

 import json
 import base64
 import logging
+import time
 from typing import List, Dict, Any
 from fastapi import FastAPI
 from pydantic import BaseModel
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
 # -------------------------
 # Logging
         return {}
+def setup_chrome_driver():
+    """Setup Chrome driver with appropriate options."""
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")  # Run in background
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--window-size=1920,1080")
+    chrome_options.add_argument("--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36")
+    # Enable logging to capture console messages
+    chrome_options.add_argument("--enable-logging")
+    chrome_options.add_argument("--log-level=0")
+    try:
+        driver = webdriver.Chrome(options=chrome_options)
+        return driver
+    except Exception as e:
+        logger.error(f"Failed to create Chrome driver: {e}")
+        return None
 # -------------------------
+# Interactive Scraper
 # -------------------------
+def scrape_with_selenium(url: str) -> Dict[str, Any]:
+    """Scrape webpage with Selenium, click Start Challenge, and extract data."""
+    driver = None
     try:
+        driver = setup_chrome_driver()
+        if not driver:
+            return {}
+        logger.info(f"Loading URL: {url}")
+        driver.get(url)
+        # Wait for page to load
+        WebDriverWait(driver, 10).until(
+            EC.presence_of_element_located((By.TAG_NAME, "body"))
+        )
+        time.sleep(2)
+        # Look for and click "Start Challenge" button
+        start_button_selectors = [
+            "button:contains('Start Challenge')",
+            "button[id*='start']",
+            "button[class*='start']",
+            "input[value*='Start']",
+            "a[href*='start']",
+            ".btn:contains('Start')",
+            "[onclick*='start']"
+        ]
+        button_clicked = False
+        for selector in start_button_selectors:
+            try:
+                if "contains" in selector:
+                    # Use XPath for text-based selection
+                    xpath_selector = f"//button[contains(text(), 'Start Challenge')] | //button[contains(text(), 'Start')] | //input[contains(@value, 'Start')]"
+                    elements = driver.find_elements(By.XPATH, xpath_selector)
+                else:
+                    elements = driver.find_elements(By.CSS_SELECTOR, selector)
+                if elements:
+                    logger.info(f"Found start button with selector: {selector}")
+                    elements[0].click()
+                    button_clicked = True
+                    time.sleep(3)  # Wait for challenge to start
+                    break
+            except Exception as e:
+                logger.debug(f"Selector {selector} failed: {e}")
+                continue
+        if not button_clicked:
+            logger.warning("Could not find Start Challenge button, proceeding with current page")
+        # Get page source after interaction
+        html = driver.page_source
+        # Get console logs
+        console_logs = []
+        try:
+            logs = driver.get_log('browser')
+            for log in logs:
+                console_logs.append(log['message'])
+                logger.info(f"Console log: {log['message']}")
+        except Exception as e:
+            logger.warning(f"Could not get console logs: {e}")
+        # Extract data from HTML
         hidden_values: List[str] = []
         jwt_data: Dict[str, Any] = {}
+        # Look for JWT tokens in HTML and console logs
+        all_text = html + " ".join(console_logs)
         jwt_patterns = [
+            r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+",
+            r"[A-Za-z0-9_-]{20,}\.[A-Za-z0-9_-]{20,}\.[A-Za-z0-9_-]{20,}"
         ]
         for pattern in jwt_patterns:
+            jwt_matches = re.findall(pattern, all_text)
             for token in jwt_matches:
+                logger.info(f"Found JWT token: {token[:50]}...")
                 data = try_decode_jwt(token)
                 if data:
                     jwt_data.update(data)
                     for k, v in data.items():
                         hidden_values.append(f"jwt {k}={v}")
+        # Look for completion codes in console logs
+        for log in console_logs:
+            # Look for completion codes
+            completion_matches = re.findall(r"completion[_\s]*code[:\s]*([A-Za-z0-9\-_]{6,})", log, flags=re.I)
+            for code in completion_matches:
+                hidden_values.append(f"completion_code {code}")
+            # Look for challenge completion messages
+            if "challenge" in log.lower() and ("complete" in log.lower() or "finished" in log.lower()):
+                hidden_values.append(f"console_message {log}")
+        # Execute JavaScript to check for global variables or challenge data
+        try:
+            js_result = driver.execute_script("""
+                var data = {};
+                if (window.challengeData) data.challengeData = window.challengeData;
+                if (window.challenge) data.challenge = window.challenge;
+                if (window.completionCode) data.completionCode = window.completionCode;
+                return data;
+            """)
+            if js_result:
+                for k, v in js_result.items():
+                    hidden_values.append(f"js_global {k}={v}")
+                    logger.info(f"Found JS global: {k} = {v}")
+        except Exception as e:
+            logger.debug(f"JS execution failed: {e}")
+        # Look for data in local storage
+        try:
+            local_storage = driver.execute_script("return window.localStorage;")
+            if local_storage:
+                for k, v in local_storage.items():
+                    if any(keyword in k.lower() for keyword in ['challenge', 'code', 'completion']):
+                        hidden_values.append(f"localStorage {k}={v}")
+        except Exception as e:
+            logger.debug(f"LocalStorage check failed: {e}")
         logger.info(f"Found {len(hidden_values)} hidden values")
         logger.info(f"JWT data: {jwt_data}")
         return {
+            "title": driver.title,
+            "visible_text": driver.find_element(By.TAG_NAME, "body").text[:6000],
+            "hidden_values": hidden_values,
             "jwt_data": jwt_data,
+            "console_logs": console_logs,
+            "button_clicked": button_clicked
         }
     except Exception as e:
+        logger.error(f"Selenium scraping failed for {url}: {e}")
         return {}
+    finally:
+        if driver:
+            driver.quit()
 # -------------------------
 def answer_question(question: str, content: Dict[str, Any]) -> str:
     """Enhanced rule-based extraction for Round 5 questions."""
     ql = question.lower()
     hidden = content.get("hidden_values", [])
     jwt_data = content.get("jwt_data", {})
+    console_logs = content.get("console_logs", [])
+    logger.info(f"Answering question: {question}")
+    logger.info(f"Available JWT data: {jwt_data}")
+    logger.info(f"Hidden values count: {len(hidden)}")
+    # Challenge ID extraction
     if "challenge id" in ql or "challengeid" in ql:
         # First check JWT data directly
         if "challengeID" in jwt_data:
+            result = str(jwt_data["challengeID"])
+            logger.info(f"Found challengeID in JWT: {result}")
+            return result
+        # Check hidden values
         for h in hidden:
             if "challengeid" in h.lower():
+                result = h.split("=", 1)[-1].strip()
+                logger.info(f"Found challengeID in hidden values: {result}")
+                return result
+    # Completion code extraction
     if "completion" in ql and "code" in ql:
+        # Look for explicit completion codes
         for h in hidden:
             if "completion_code" in h.lower():
+                result = h.split("=", 1)[-1].strip()
+                logger.info(f"Found completion code: {result}")
+                return result
+        # Look in console logs for completion codes
+        for log in console_logs:
+            completion_matches = re.findall(r"completion[_\s]*code[:\s]*([A-Za-z0-9\-_]{6,})", log, flags=re.I)
+            if completion_matches:
+                result = completion_matches[0]
+                logger.info(f"Found completion code in console: {result}")
+                return result
+        # Look for any long tokens that might be completion codes
+        for h in hidden:
+            if "token" in h.lower() or "code" in h.lower():
+                token = h.split("=", 1)[-1].strip()
+                if len(token) > 15:  # Assuming completion codes are reasonably long
+                    logger.info(f"Found potential completion code: {token}")
+                    return token
+    # Challenge name extraction
     if "challenge name" in ql:
         if "coolGuy" in jwt_data:
+            result = str(jwt_data["coolGuy"])
+            logger.info(f"Found challenge name in JWT: {result}")
+            return result
+    # Fallback: return any relevant data from JWT
     if jwt_data:
         for key, value in jwt_data.items():
+            if key not in ["iat", "exp"] and isinstance(value, str):
+                logger.info(f"Fallback: returning JWT field {key}: {value}")
                 return str(value)
+    logger.warning("No matching data found for question")
     return "Challenge information not found"
 @app.get("/")
 def root():
     return {
+        "message": "HackRx Round 5 API - Ready (with Selenium support)",
         "endpoints": {"challenge": "POST /challenge", "health": "GET /health"},
     }
 @app.post("/challenge", response_model=ChallengeResponse)
 def challenge(req: ChallengeRequest):
     logger.info(f"Round 5 request: url={req.url}, questions={req.questions}")
+    content = scrape_with_selenium(req.url)
     if not content:
         return ChallengeResponse(answers=["Challenge information not found" for _ in req.questions])