Ramesh-vani committed on
Commit
233a7ce
·
verified ·
1 Parent(s): f7ae6d8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -47
app.py CHANGED
@@ -14,6 +14,48 @@ from watchdog.events import FileSystemEventHandler
14
  from bs4 import BeautifulSoup
15
  from urllib.parse import urlparse, urljoin
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def auto_detect_mode(content):
18
  if isinstance(content, str):
19
  return 'w' # If content is a string, it's text
@@ -528,53 +570,8 @@ async def handle_user_input(websocket,key, process, connected,process_ids):
528
  soup.body.append(new_body)
529
  else:
530
  print("No <body> tag found in the HTML")
531
- # Get base URL
532
- base_url = url
533
- # base_host = base_url.netloc.rstrip("/")
534
-
535
- # Function to join relative URLs with base URL
536
- def join_url_with_base(url_arg):
537
- if url_arg.startswith('/'):
538
- # print(url+url_arg)
539
- return url+url_arg
540
- elif url_arg.startswith('http'):
541
- # print(here)
542
- # print(url_arg)
543
- return url_arg
544
- else:
545
- # print(url_arg)
546
- return url+url_arg
547
-
548
- # Find and modify CSS links
549
- for link in soup.find_all("link", rel="stylesheet", href=True):
550
- link['href'] = join_url_with_base(link['href'])
551
-
552
- # Find and modify JS links
553
- for script in soup.find_all("script", src=True):
554
- script['src'] = join_url_with_base(script['src'])
555
-
556
- # Find and modify regular anchor links
557
- for anchor in soup.find_all("a", href=True):
558
- anchor['href'] = join_url_with_base(anchor['href'])
559
- # Find and modify CSS links
560
- for link in soup.find_all("link", rel="stylesheet", href=True):
561
- if link['href'].startswith('/'):
562
- css_url = url + link['href']
563
- css_response = requests.get(css_url)
564
- css_content = css_response.text
565
- style_tag = soup.new_tag("style")
566
- style_tag.string = css_content
567
- link.replace_with(style_tag)
568
-
569
- # Find and modify JS links
570
- for script in soup.find_all("script", src=True):
571
- if script['src'].startswith('/'):
572
- js_url = url + script['src']
573
- js_response = requests.get(js_url)
574
- js_content = js_response.text
575
- script_tag = soup.new_tag("script")
576
- script_tag.string = js_content
577
- script.replace_with(script_tag)
578
  response_data = {
579
  "type": "web-data",
580
  "data": str(soup),
 
14
  from bs4 import BeautifulSoup
15
  from urllib.parse import urlparse, urljoin
16
 
17
+
18
def is_absolute(url):
    """Return True when ``url`` carries both a scheme and a network location.

    Protocol-relative references such as ``//cdn.example.com/a.css`` have a
    network location but no scheme, so they cannot be fetched as-is and are
    reported as not absolute; the caller is expected to resolve them against
    a base URL.

    Args:
        url: The URL or URL reference to inspect.

    Returns:
        True for fully qualified URLs (``https://host/path``), False for
        relative, root-relative, protocol-relative or empty references.
    """
    parts = urlparse(url)
    return bool(parts.scheme and parts.netloc)
20
+
21
def handle_protocol_relative(url, base_url):
    """Give a protocol-relative URL (``//host/path``) an explicit scheme.

    Args:
        url: The URL reference to normalise.
        base_url: URL of the page ``url`` came from; its scheme is reused.

    Returns:
        ``url`` prefixed with ``<scheme>:`` when it starts with ``//``;
        otherwise ``url`` unchanged.
    """
    if not url.startswith("//"):
        return url
    # A base URL without a scheme would otherwise produce "://host/path",
    # which is not a fetchable URL, so fall back to https.
    scheme = urlparse(base_url).scheme or "https"
    return f"{scheme}:{url}"
27
+
28
def inline_resources(soup, base_url, timeout=10):
    """Inline external stylesheets and scripts into ``soup``.

    Every ``<link rel="stylesheet">`` that has an ``href`` is replaced by a
    ``<style>`` tag holding the downloaded CSS, and every ``<script src=...>``
    receives the downloaded JavaScript as its content and loses its ``src``
    attribute. A tag whose resource cannot be downloaded (network error,
    timeout or a non-200 status) is left untouched.

    Args:
        soup: Parsed document (BeautifulSoup); it is modified in place.
        base_url: URL the document was loaded from, used to resolve relative
            and protocol-relative references.
        timeout: Seconds to wait for each download before giving up.

    Returns:
        The same ``soup`` object that was passed in.
    """

    def fetch_text(reference):
        """Return the body of ``reference`` as text, or None on failure."""
        # Protocol-relative references ("//host/path") borrow the base scheme.
        reference = handle_protocol_relative(reference, base_url)
        if is_absolute(reference):
            full_url = reference
        else:
            full_url = urljoin(base_url, reference)
        try:
            # Without a timeout a single unresponsive host would block the
            # caller indefinitely.
            response = requests.get(full_url, timeout=timeout)
        except requests.RequestException:
            # Best effort: one broken asset must not abort the whole page.
            return None
        if response.status_code != 200:
            return None
        return response.text

    # Inline CSS
    for link_tag in soup.find_all('link', rel='stylesheet'):
        href = link_tag.get('href')
        if not href:
            continue
        css_text = fetch_text(href)
        if css_text is None:
            continue
        style_tag = soup.new_tag('style')
        style_tag.string = css_text
        link_tag.replace_with(style_tag)

    # Inline JavaScript
    for script_tag in soup.find_all('script', src=True):
        src = script_tag.get('src')
        if not src:
            continue
        js_text = fetch_text(src)
        if js_text is None:
            continue
        script_tag.string = js_text
        # The tag now carries its code inline, so the external reference goes.
        del script_tag['src']

    return soup
57
+
58
+
59
  def auto_detect_mode(content):
60
  if isinstance(content, str):
61
  return 'w' # If content is a string, it's text
 
570
  soup.body.append(new_body)
571
  else:
572
  print("No <body> tag found in the HTML")
573
+ # Inline CSS and JavaScript
574
+ soup = inline_resources(soup, url)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
575
  response_data = {
576
  "type": "web-data",
577
  "data": str(soup),