collab_v2.0.0

Running

App Files Files Community

Ramesh-vani committed on May 29, 2024

Commit

233a7ce

verified ·

1 Parent(s): f7ae6d8

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -47

app.py CHANGED Viewed

@@ -14,6 +14,48 @@ from watchdog.events import FileSystemEventHandler
 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
 def auto_detect_mode(content):
     if isinstance(content, str):
         return 'w'  # If content is a string, it's text
@@ -528,53 +570,8 @@ async def handle_user_input(websocket,key, process, connected,process_ids):
                             soup.body.append(new_body)
                         else:
                             print("No <body> tag found in the HTML")
-                        # Get base URL
-                        base_url = url
-                        # base_host = base_url.netloc.rstrip("/")
-                        # Function to join relative URLs with base URL
-                        def join_url_with_base(url_arg):
-                            if url_arg.startswith('/'):
-                                # print(url+url_arg)
-                                return url+url_arg
-                            elif url_arg.startswith('http'):
-                                # print(here)
-                                # print(url_arg)
-                                return url_arg
-                            else:
-                                # print(url_arg)
-                                return url+url_arg
-                        # Find and modify CSS links
-                        for link in soup.find_all("link", rel="stylesheet", href=True):
-                            link['href'] = join_url_with_base(link['href'])
-                        # Find and modify JS links
-                        for script in soup.find_all("script", src=True):
-                            script['src'] = join_url_with_base(script['src'])
-                        # Find and modify regular anchor links
-                        for anchor in soup.find_all("a", href=True):
-                            anchor['href'] = join_url_with_base(anchor['href'])
-                        # Find and modify CSS links
-                        for link in soup.find_all("link", rel="stylesheet", href=True):
-                            if link['href'].startswith('/'):
-                                css_url = url + link['href']
-                                css_response = requests.get(css_url)
-                                css_content = css_response.text
-                                style_tag = soup.new_tag("style")
-                                style_tag.string = css_content
-                                link.replace_with(style_tag)
-                        # Find and modify JS links
-                        for script in soup.find_all("script", src=True):
-                            if script['src'].startswith('/'):
-                                js_url = url + script['src']
-                                js_response = requests.get(js_url)
-                                js_content = js_response.text
-                                script_tag = soup.new_tag("script")
-                                script_tag.string = js_content
-                                script.replace_with(script_tag)
                         response_data = {
                             "type": "web-data",
                             "data": str(soup),

 from bs4 import BeautifulSoup
 from urllib.parse import urlparse, urljoin
def is_absolute(url):
    """Return True when *url* carries its own network location (host).

    Scheme-qualified URLs such as ``https://host/x`` and protocol-relative
    ones such as ``//host/x`` both qualify; bare paths do not.
    """
    host = urlparse(url).netloc
    return len(host) > 0
def handle_protocol_relative(url, base_url, default_scheme="https"):
    """Give a protocol-relative URL (``//host/path``) an explicit scheme.

    Args:
        url: The reference to normalise. Anything that does not start with
            ``//`` is returned unchanged.
        base_url: URL of the page the reference came from; its scheme is
            reused for the result.
        default_scheme: Scheme to use when ``base_url`` has none (for
            example ``"site.org/page"``). Without this fallback the result
            would be ``"://host/path"``, which is not a usable URL.

    Returns:
        ``url`` with a scheme prepended when it was protocol-relative,
        otherwise ``url`` as given.
    """
    if not url.startswith("//"):
        return url
    scheme = urlparse(base_url).scheme or default_scheme
    return f"{scheme}:{url}"
def _fetch_resource_text(ref, base_url, timeout):
    """Download the asset *ref* (resolved against *base_url*) and return its text.

    Returns None when the asset should be left alone: the reference is
    malformed, the request fails or times out, or the server answers with a
    status other than 200.
    """
    try:
        # Protocol-relative ("//host/x") references get a scheme first; any
        # reference without its own host is then resolved against the page URL.
        ref = handle_protocol_relative(ref, base_url)
        full_url = ref if is_absolute(ref) else urljoin(base_url, ref)
        response = requests.get(full_url, timeout=timeout)
    except (ValueError, requests.RequestException) as exc:
        # ValueError covers URLs that urllib cannot parse; RequestException
        # covers connection errors, timeouts and unsupported schemes.
        print(f"Skipping resource {ref!r}: {exc}")
        return None
    if response.status_code != 200:
        return None
    return response.text


def inline_resources(soup, base_url, timeout=10):
    """Replace linked stylesheets and external scripts with inline copies.

    Every ``<link rel="stylesheet" href=...>`` is swapped for a ``<style>``
    tag holding the downloaded CSS, and every ``<script src=...>`` has its
    ``src`` removed and the downloaded JavaScript placed in its body.
    Inlining is best effort: an asset that cannot be fetched keeps its
    original tag, so one bad reference does not abort the whole page.

    NOTE(review): ``requests.get`` blocks. The visible call site sits in an
    ``async`` handler, so each fetch presumably stalls the event loop for up
    to ``timeout`` seconds -- consider running this in a worker thread.

    Args:
        soup: Parsed document (BeautifulSoup tree); modified in place.
        base_url: URL of the page, used to resolve relative references.
        timeout: Seconds to wait for each asset before giving up.

    Returns:
        The same ``soup`` object, for call-site convenience.
    """
    # Inline CSS
    for link_tag in soup.find_all('link', rel='stylesheet'):
        href = link_tag.get('href')
        if not href:
            continue
        css_text = _fetch_resource_text(href, base_url, timeout)
        if css_text is None:
            continue
        style_tag = soup.new_tag('style')
        style_tag.string = css_text
        link_tag.replace_with(style_tag)

    # Inline JavaScript
    for script_tag in soup.find_all('script', src=True):
        src = script_tag.get('src')
        if not src:
            continue
        js_text = _fetch_resource_text(src, base_url, timeout)
        if js_text is None:
            continue
        script_tag.string = js_text
        # Drop src so the browser runs the inlined body instead of refetching.
        del script_tag['src']

    return soup
 def auto_detect_mode(content):
     if isinstance(content, str):
         return 'w'  # If content is a string, it's text
                             soup.body.append(new_body)
                         else:
                             print("No <body> tag found in the HTML")
+                        # Inline CSS and JavaScript
+                        soup = inline_resources(soup, url)
                         response_data = {
                             "type": "web-data",
                             "data": str(soup),