Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,85 +1,152 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
from
|
|
|
|
|
|
|
5 |
from bs4 import BeautifulSoup
|
6 |
-
import time
|
7 |
|
8 |
-
|
|
|
|
|
|
|
|
|
9 |
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
chrome_options.add_argument("--disable-dev-shm-usage")
|
15 |
-
chrome_options.add_argument("--disable-gpu")
|
16 |
-
chrome_options.add_argument("--disable-software-rasterizer")
|
17 |
-
chrome_options.add_argument("--incognito")
|
18 |
-
chrome_options.add_argument("--remote-debugging-port=9222")
|
19 |
|
20 |
-
driver = webdriver.Chrome(options=chrome_options)
|
21 |
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
try:
|
25 |
-
driver.get(url)
|
26 |
-
time.sleep(3)
|
27 |
-
|
28 |
-
# Try clicking the button
|
29 |
-
try:
|
30 |
-
start_button = driver.find_element(By.XPATH, "//button[contains(text(), 'Start Challenge')]")
|
31 |
-
start_button.click()
|
32 |
-
time.sleep(3)
|
33 |
-
except Exception as e:
|
34 |
-
print("⚠️ Could not click Start Challenge button:", e)
|
35 |
-
|
36 |
-
html = driver.page_source
|
37 |
-
soup = BeautifulSoup(html, "html.parser")
|
38 |
-
|
39 |
-
# Scrape hidden elements
|
40 |
-
for tag in soup.find_all(True):
|
41 |
-
if tag.has_attr("style") and "display:none" in tag["style"]:
|
42 |
-
token_found.append(tag.get_text(strip=True))
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
token_found.append(hidden["value"])
|
47 |
|
48 |
-
if "hidden code" in html.lower():
|
49 |
-
start = html.lower().find("hidden code")
|
50 |
-
token_found.append(html[start:start+100])
|
51 |
|
52 |
-
|
53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
except Exception as e:
|
56 |
-
|
57 |
-
return
|
58 |
-
|
59 |
-
|
60 |
-
def
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import logging
|
4 |
+
from typing import List, Dict, Any
|
5 |
+
from fastapi import FastAPI
|
6 |
+
from pydantic import BaseModel
|
7 |
+
import requests
|
8 |
from bs4 import BeautifulSoup
|
|
|
9 |
|
10 |
+
# -------------------------
# Logging
# -------------------------
# Log level is overridable via the LOG_LEVEL env var; defaults to INFO.
logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
logger = logging.getLogger("hackrx-round5")

# -------------------------
# FastAPI app
# -------------------------
app = FastAPI(title="HackRx Round 5 API", version="1.0.0")
|
|
|
|
|
|
|
|
|
|
20 |
|
|
|
21 |
|
22 |
+
# -------------------------
# Models
# -------------------------
class ChallengeRequest(BaseModel):
    """Request payload for POST /challenge."""

    # URL of the challenge page to scrape.
    url: str
    # Natural-language questions to answer from the scraped content.
    questions: List[str]
|
28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
+
class ChallengeResponse(BaseModel):
|
31 |
+
answers: List[str]
|
|
|
32 |
|
|
|
|
|
|
|
33 |
|
34 |
+
# -------------------------
# Scraper
# -------------------------
def scrape_with_requests(url: str) -> Dict[str, Any]:
    """Scrape a webpage and extract visible + hidden info.

    Fetches *url* with a desktop User-Agent and returns a dict with:
      - "title": page <title> text (or "No title"),
      - "visible_text": whole-page text, truncated to 6000 chars,
      - "hidden_values": up to 200 strings harvested from hidden inputs,
        display:none elements, HTML comments, and data-* attributes.
    Returns {} on any failure (network error, bad status, parse error).
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/118.0 Safari/537.36"
        }
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        html = r.text
        soup = BeautifulSoup(html, "html.parser")

        title = soup.title.get_text(strip=True) if soup.title else "No title"
        visible_text = soup.get_text(separator=" ", strip=True)[:6000]

        hidden_values: List[str] = []

        # Hidden inputs
        for inp in soup.find_all("input", {"type": "hidden"}):
            name = inp.get("name", "")
            value = inp.get("value", "")
            if value:
                hidden_values.append(f"hidden_input {name}={value}")

        # Elements with display:none
        for elem in soup.find_all(attrs={"style": re.compile(r"display\s*:\s*none", re.I)}):
            txt = elem.get_text(" ", strip=True)
            if txt:
                hidden_values.append(f"display_none {txt}")

        # HTML comments. BUGFIX: BeautifulSoup strips the "<!--"/"-->"
        # delimiters from Comment nodes, so a substring test on them never
        # matched; detect the Comment node type instead.
        from bs4 import Comment  # local import; module top only imports BeautifulSoup

        for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
            c = str(comment).strip()
            if c:
                hidden_values.append(f"comment {c}")

        # data-* attributes (multi-valued attrs come back as lists; skip those)
        for tag in soup.find_all():
            for k, v in tag.attrs.items():
                if k.startswith("data-") and isinstance(v, str) and v.strip():
                    hidden_values.append(f"{k}={v.strip()}")

        return {
            "title": title,
            "visible_text": visible_text,
            "hidden_values": hidden_values[:200],
        }
    except Exception as e:
        logger.error(f"Request scraping failed for {url}: {e}")
        return {}
|
88 |
+
|
89 |
+
|
90 |
+
def answer_question(question: str, content: Dict[str, Any]) -> str:
    """Simple rule-based extraction for Round 5 questions.

    Scans the harvested hidden values first (most reliable), then falls
    back to the page title and regex searches over the visible text.
    Returns a fixed sentinel string when nothing matches.
    """
    q = question.lower()
    page_title = content.get("title", "")
    page_text = content.get("visible_text", "")

    # Classify the question once, up front.
    wants_name = "challenge" in q and "name" in q
    wants_id = "challenge id" in q
    wants_code = "completion code" in q or "hidden code" in q or "code" in q

    # Hidden values carry the most reliable answers, so scan them first.
    for entry in content.get("hidden_values", []):
        lowered = entry.lower()
        if wants_name and "challenge" in lowered:
            return entry.split("=", 1)[-1].strip()
        if wants_id and "id" in lowered:
            return entry.split("=", 1)[-1].strip()
        if wants_code and "code" in lowered:
            return entry.split("=", 1)[-1].strip()

    # Fallback: the page title often doubles as the challenge name.
    if "challenge name" in q and page_title:
        return page_title.strip()

    # Fallback: pattern-match the visible text.
    if wants_id:
        match = re.search(r"challenge\s*id\s*[:\-]\s*([A-Za-z0-9\-_]+)", page_text, flags=re.I)
        if match:
            return match.group(1)

    if "completion code" in q or "hidden code" in q:
        match = re.search(r"(?:completion|hidden)\s*code\s*[:\-]\s*([A-Za-z0-9\-_]+)", page_text, flags=re.I)
        if match:
            return match.group(1)

    return "Challenge information not found"
|
122 |
+
|
123 |
+
|
124 |
+
# -------------------------
# Routes
# -------------------------
@app.get("/")
def root():
    """Service banner listing the available endpoints."""
    endpoints = {"challenge": "POST /challenge", "health": "GET /health"}
    return {"message": "HackRx Round 5 API - Ready", "endpoints": endpoints}
|
133 |
+
|
134 |
+
|
135 |
+
@app.get("/health")
def health():
    """Liveness probe for deployment health checks."""
    return {"status": "healthy"}
|
138 |
+
|
139 |
+
|
140 |
+
@app.post("/challenge", response_model=ChallengeResponse)
def challenge(req: ChallengeRequest):
    """Scrape the challenge page at req.url and answer each question.

    Returns one answer per question, in order. If scraping fails
    entirely, every answer is the not-found sentinel.
    """
    # Lazy %-style args: the message is only formatted if the level is enabled.
    logger.info("Round 5 request: url=%s, questions=%s", req.url, req.questions)
    content = scrape_with_requests(req.url)
    if not content:
        # Scrape failed; return the sentinel answer for every question.
        return ChallengeResponse(answers=["Challenge information not found"] * len(req.questions))

    answers = []
    for q in req.questions:
        ans = answer_question(q, content)
        answers.append(ans)
        logger.info("Q: %s → A: %s", q, ans)
    return ChallengeResponse(answers=answers)
|