Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -14,6 +14,48 @@ from watchdog.events import FileSystemEventHandler
|
|
14 |
from bs4 import BeautifulSoup
|
15 |
from urllib.parse import urlparse, urljoin
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
def auto_detect_mode(content):
|
18 |
if isinstance(content, str):
|
19 |
return 'w' # If content is a string, it's text
|
@@ -528,53 +570,8 @@ async def handle_user_input(websocket,key, process, connected,process_ids):
|
|
528 |
soup.body.append(new_body)
|
529 |
else:
|
530 |
print("No <body> tag found in the HTML")
|
531 |
-
|
532 |
-
|
533 |
-
# base_host = base_url.netloc.rstrip("/")
|
534 |
-
|
535 |
-
# Function to join relative URLs with base URL
|
536 |
-
def join_url_with_base(url_arg):
|
537 |
-
if url_arg.startswith('/'):
|
538 |
-
# print(url+url_arg)
|
539 |
-
return url+url_arg
|
540 |
-
elif url_arg.startswith('http'):
|
541 |
-
# print(here)
|
542 |
-
# print(url_arg)
|
543 |
-
return url_arg
|
544 |
-
else:
|
545 |
-
# print(url_arg)
|
546 |
-
return url+url_arg
|
547 |
-
|
548 |
-
# Find and modify CSS links
|
549 |
-
for link in soup.find_all("link", rel="stylesheet", href=True):
|
550 |
-
link['href'] = join_url_with_base(link['href'])
|
551 |
-
|
552 |
-
# Find and modify JS links
|
553 |
-
for script in soup.find_all("script", src=True):
|
554 |
-
script['src'] = join_url_with_base(script['src'])
|
555 |
-
|
556 |
-
# Find and modify regular anchor links
|
557 |
-
for anchor in soup.find_all("a", href=True):
|
558 |
-
anchor['href'] = join_url_with_base(anchor['href'])
|
559 |
-
# Find and modify CSS links
|
560 |
-
for link in soup.find_all("link", rel="stylesheet", href=True):
|
561 |
-
if link['href'].startswith('/'):
|
562 |
-
css_url = url + link['href']
|
563 |
-
css_response = requests.get(css_url)
|
564 |
-
css_content = css_response.text
|
565 |
-
style_tag = soup.new_tag("style")
|
566 |
-
style_tag.string = css_content
|
567 |
-
link.replace_with(style_tag)
|
568 |
-
|
569 |
-
# Find and modify JS links
|
570 |
-
for script in soup.find_all("script", src=True):
|
571 |
-
if script['src'].startswith('/'):
|
572 |
-
js_url = url + script['src']
|
573 |
-
js_response = requests.get(js_url)
|
574 |
-
js_content = js_response.text
|
575 |
-
script_tag = soup.new_tag("script")
|
576 |
-
script_tag.string = js_content
|
577 |
-
script.replace_with(script_tag)
|
578 |
response_data = {
|
579 |
"type": "web-data",
|
580 |
"data": str(soup),
|
|
|
14 |
from bs4 import BeautifulSoup
|
15 |
from urllib.parse import urlparse, urljoin
|
16 |
|
17 |
+
|
18 |
+
def is_absolute(url):
    """Return True if *url* is a fully qualified URL (scheme and host).

    A protocol-relative reference such as ``//cdn.example.com/a.css`` has a
    host but no scheme, so it is not fetchable on its own and is reported
    as not absolute; callers should resolve it against a base URL instead.
    """
    parts = urlparse(url)
    # Both pieces are required: a netloc without a scheme cannot be requested.
    return bool(parts.scheme and parts.netloc)
|
20 |
+
|
21 |
+
def handle_protocol_relative(url, base_url):
    """Give a protocol-relative URL (``//host/path``) an explicit scheme.

    Args:
        url: The reference found in the page; returned unchanged unless it
            starts with ``//``.
        base_url: URL of the page the reference came from; its scheme is
            reused for the result.

    Returns:
        ``url`` prefixed with ``<scheme>:`` when it is protocol-relative,
        otherwise ``url`` as given.
    """
    if not url.startswith("//"):
        return url
    # A base URL without a scheme would otherwise produce "://host/path",
    # which is not a valid URL; fall back to https in that case.
    scheme = urlparse(base_url).scheme or "https"
    return f"{scheme}:{url}"
|
27 |
+
|
28 |
+
def inline_resources(soup, base_url, timeout=10):
    """Replace external stylesheets and scripts in *soup* with inline copies.

    Every ``<link rel="stylesheet">`` with an ``href`` is downloaded and
    swapped for a ``<style>`` tag holding the CSS text; every ``<script>``
    with a ``src`` is downloaded, its text placed inside the tag, and the
    ``src`` attribute removed. A resource that cannot be fetched (network
    error, timeout, malformed URL, non-200 status) is left untouched so one
    bad asset does not abort processing of the whole page.

    Args:
        soup: Parsed document to modify in place.
        base_url: URL of the page, used to resolve relative references.
        timeout: Seconds to wait for each download before giving up.

    Returns:
        The same ``soup`` object, after modification.
    """

    def fetch_text(ref):
        """Return the body of *ref* as text, or None if it is unavailable."""
        try:
            ref = handle_protocol_relative(ref, base_url)
            full_url = ref if is_absolute(ref) else urljoin(base_url, ref)
            # Without a timeout a single stalled host blocks the caller forever.
            response = requests.get(full_url, timeout=timeout)
        except (requests.RequestException, ValueError) as exc:
            print(f"Failed to fetch resource {ref}: {exc}")
            return None
        if response.status_code != 200:
            return None
        return response.text

    # Inline CSS
    for link_tag in soup.find_all('link', rel='stylesheet'):
        href = link_tag.get('href')
        if not href:
            continue
        css_text = fetch_text(href)
        if css_text is None:
            continue
        style_tag = soup.new_tag('style')
        style_tag.string = css_text
        link_tag.replace_with(style_tag)

    # Inline JavaScript
    for script_tag in soup.find_all('script', src=True):
        src = script_tag.get('src')
        if not src:
            continue
        js_text = fetch_text(src)
        if js_text is None:
            continue
        script_tag.string = js_text
        # Drop src so the inlined body is what gets executed.
        del script_tag['src']

    return soup
|
57 |
+
|
58 |
+
|
59 |
def auto_detect_mode(content):
|
60 |
if isinstance(content, str):
|
61 |
return 'w' # If content is a string, it's text
|
|
|
570 |
soup.body.append(new_body)
|
571 |
else:
|
572 |
print("No <body> tag found in the HTML")
|
573 |
+
# Inline CSS and JavaScript
|
574 |
+
soup = inline_resources(soup, url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
575 |
response_data = {
|
576 |
"type": "web-data",
|
577 |
"data": str(soup),
|