webscrapper

Sleeping

App Files Files Community

Arafath10 committed on Oct 2, 2024

Commit

cb2cabb

verified ·

1 Parent(s): c5d8e33

Update main.py

Browse files

Files changed (1) hide show

main.py +72 -0

main.py CHANGED Viewed

@@ -1,6 +1,14 @@
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from scraper import Scraper
 try: from pip._internal.operations import freeze
@@ -28,3 +36,67 @@ async def get_data(url: str):
         except:
             return {"title": "error", "URL": url, "Content": "none"}

 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
 from scraper import Scraper
+import nest_asyncio
+import asyncio
+from playwright.async_api import async_playwright
+from fastapi import FastAPI
+import random
+# Allow nested use of asyncio.run() in Jupyter
+nest_asyncio.apply()
 try: from pip._internal.operations import freeze
         except:
             return {"title": "error", "URL": url, "Content": "none"}
+# FastAPI route to scrape the website
+@app.get("/scrape")
+async def scrape_website(url):
+    async with async_playwright() as p:
+        # Try using WebKit or Firefox if Chromium fails
+        browser = await p.webkit.launch(headless=True)  # Switch to WebKit
+        # Create a new browser context with a realistic user-agent
+        context = await browser.new_context(
+            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        )
+        # Set additional headers to force HTTP/1.1 and avoid detection
+        await context.set_extra_http_headers({
+            "Accept-Language": "en-US,en;q=0.9",
+            "Upgrade-Insecure-Requests": "1",
+            "Connection": "keep-alive"  # Force HTTP/1.1 instead of HTTP/2
+        })
+        # Open a new page
+        page = await context.new_page()
+        # Route to block images, videos, and CSS to speed up page load
+        await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())
+        # Navigate to the page with an extended timeout and alternate loading strategy
+        await page.goto(url, wait_until='domcontentloaded', timeout=60000)
+        try:
+            # Get the title of the page
+            title = await page.title()
+            # Introduce a slight delay before fetching the links
+            await asyncio.sleep(random.uniform(1, 2))
+            # Get all links on the page
+            links = await page.evaluate("""() => {
+                return Array.from(document.querySelectorAll('a')).map(a => a.href);
+            }""")
+            # Introduce another slight delay before fetching the content
+            await asyncio.sleep(random.uniform(1, 2))
+            # Get page content (text from paragraphs and headers)
+            content = await page.evaluate("""() => {
+                let elements = Array.from(document.querySelectorAll('body *'));
+                return elements
+                    .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
+                    .map(element => element.innerText.trim())
+                    .join('\\n');
+            }""")
+            # Close the browser
+            await browser.close()
+            return {
+                "title": title,
+                "links": links,
+                "content": content
+            }
+        except Exception as e:
+            return {"error": str(e)}