File size: 3,490 Bytes
e84fb4f
d92c861
462e814
cb2cabb
 
 
 
 
 
 
 
462e814
d92c861
9ba3ade
 
 
 
 
 
 
d92c861
 
 
 
 
 
 
 
accd0f3
92199f9
0f0c7dc
 
ef24fee
92199f9
 
ef24fee
 
0f0c7dc
cb2cabb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from scraper import Scraper
import nest_asyncio
import asyncio
from playwright.async_api import async_playwright
from fastapi import FastAPI
import random

# Allow nested use of asyncio.run() in Jupyter
nest_asyncio.apply()


# Debug aid: dump the installed package list to stdout at startup so the
# runtime environment can be inspected from the container/server logs.
try:
    from pip._internal.operations import freeze
except ImportError:
    # pip < 10.0 exposed the operations module at the top level.
    from pip.operations import freeze

pkgs = freeze.freeze()
for pkg in pkgs:
    print(pkg)

app = FastAPI()
# Fully permissive CORS so the API can be called from any browser origin.
# NOTE(review): the CORS spec disallows a wildcard origin together with
# credentials; Starlette works around it by echoing the request origin, which
# effectively trusts every site — confirm this is intended for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# NOTE(review): `time` is not used anywhere in the visible code — presumably
# leftover from debugging; verify before removing.
import time

@app.get("/get_scraped_data")
async def get_data(url: str):
    """Scrape ``url`` with the project :class:`Scraper` and return its payload.

    On any scraper failure a stub payload is returned instead of an HTTP 500,
    so clients always receive a JSON body.
    """
    try:
        return await Scraper.scrape(url)
    except Exception:
        # A bare ``except:`` here would also swallow asyncio.CancelledError
        # and KeyboardInterrupt; restrict handling to ordinary exceptions.
        return {"title": "error", "URL": url, "Content": "none"}


# FastAPI route to scrape the website
@app.get("/scrape")
async def scrape_website(url: str):
    """Scrape title, links, and visible text from ``url`` using Playwright.

    Launches a headless WebKit browser with a spoofed desktop user-agent,
    blocks heavy resources (images, media, stylesheets, fonts, xhr) to speed
    up loading, and returns ``{"title", "links", "content"}`` on success or
    ``{"error": <message>}`` on failure.
    """
    async with async_playwright() as p:
        # Try using WebKit or Firefox if Chromium fails
        browser = await p.webkit.launch(headless=True)  # Switch to WebKit
        try:
            # Create a new browser context with a realistic user-agent
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )

            # Set additional headers to force HTTP/1.1 and avoid detection
            await context.set_extra_http_headers({
                "Accept-Language": "en-US,en;q=0.9",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"  # Force HTTP/1.1 instead of HTTP/2
            })

            # Open a new page
            page = await context.new_page()

            # Route to block images, videos, and CSS to speed up page load
            await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())

            # Navigate with an extended timeout; kept inside the try so a
            # navigation failure returns an error payload instead of a 500.
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)

            # Get the title of the page
            title = await page.title()

            # Introduce a slight delay before fetching the links
            await asyncio.sleep(random.uniform(1, 2))

            # Get all links on the page
            links = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")

            # Introduce another slight delay before fetching the content
            await asyncio.sleep(random.uniform(1, 2))

            # Get page content (text from paragraphs and headers)
            content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('body *'));
                return elements
                    .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
                    .map(element => element.innerText.trim())
                    .join('\\n');
            }""")

            return {
                "title": title,
                "links": links,
                "content": content
            }

        except Exception as e:
            return {"error": str(e)}

        finally:
            # Always close the browser — the original only closed it on the
            # success path, leaking a WebKit process on every failed request.
            await browser.close()