File size: 3,876 Bytes
e84fb4f
d92c861
462e814
a6e8718
cb2cabb
 
 
 
 
 
a6e8718
462e814
d92c861
9ba3ade
 
 
 
 
 
 
d92c861
 
 
 
 
 
 
 
accd0f3
92199f9
0f0c7dc
 
ef24fee
92199f9
 
ef24fee
 
0f0c7dc
cb2cabb
 
 
 
 
 
c0bee13
cb2cabb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0bee13
 
 
 
 
 
 
 
 
 
cb2cabb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from scraper import Scraper
#import nest_asyncio
import asyncio
from playwright.async_api import async_playwright
from fastapi import FastAPI
import random

# Allow nested use of asyncio.run() in Jupyter
#nest_asyncio.apply()


# Startup diagnostic: print every entry reported by pip's ``freeze`` operation
# when this module is imported.
# The module lives under ``pip._internal`` on current pip; the fallback path
# covers pip releases older than 10.0.
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0
    from pip.operations import freeze

# Each item produced by freeze.freeze() is written to stdout on its own line.
pkgs = freeze.freeze()
for pkg in pkgs:
    print(pkg)

# Application instance; the route decorators in this module register on it.
app = FastAPI()

# CORS policy: any origin may call the API with any method and any header.
# Credentialed requests (cookies / Authorization) are deliberately NOT
# enabled: wildcard origins must not be combined with allow_credentials=True,
# because that would let any website issue credentialed cross-origin requests.
# If credentials are ever needed, list the trusted origins explicitly instead
# of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
import time

@app.get("/get_scraped_data")
async def get_data(url: str):
    """Scrape ``url`` with ``Scraper.scrape`` and return its result.

    Args:
        url: Address of the page to scrape.

    Returns:
        Whatever ``Scraper.scrape`` returns on success. If scraping raises,
        a placeholder dict is returned instead, with ``"title": "error"``,
        the requested URL under ``"URL"`` and ``"Content": "none"``.
    """
    try:
        return await Scraper.scrape(url)
    except Exception:
        # Deliberately ``except Exception`` rather than a bare ``except:``.
        # A bare clause would also trap asyncio.CancelledError (a
        # BaseException since Python 3.8), KeyboardInterrupt and SystemExit,
        # which must propagate so request cancellation and server shutdown
        # keep working.
        import logging

        # Record the traceback so the failure is not silently swallowed; the
        # client still receives the same best-effort placeholder as before.
        logging.getLogger(__name__).exception("Scraping failed for %s", url)
        return {"title": "error", "URL": url, "Content": "none"}


# FastAPI route to scrape the website
@app.get("/scrape")
async def scrape_website(url: str):
    """Load ``url`` in headless WebKit and return its title, links and text.

    NOTE(review): ``url`` is supplied by the caller and opened by a
    server-side browser without any scheme or host validation. Confirm this
    endpoint is not reachable by untrusted clients, or restrict the targets.

    Args:
        url: Address of the page to load.

    Returns:
        On success, a dict with ``"title"`` (the page title), ``"links"``
        (the ``href`` of every ``<a>`` element) and ``"content"`` (the trimmed
        ``innerText`` of every non-empty P/H1-H6/LI/DIV/SPAN element under
        ``<body>``, joined with newlines). If navigation or extraction
        raises, ``{"error": <exception message>}`` is returned instead.

    Raises:
        Exception: Errors raised while launching the browser or while
            creating the context/page are not caught here and propagate.
    """
    # Requests of these resource types are aborted instead of fetched, to
    # speed up page load. Note this covers fonts and XHR calls as well as
    # images, media and stylesheets.
    blocked_resource_types = {"image", "media", "stylesheet", "font", "xhr"}

    async def skip_blocked_resources(route):
        """Abort requests for blocked resource types; let the rest through."""
        if route.request.resource_type in blocked_resource_types:
            await route.abort()
        else:
            await route.continue_()

    async with async_playwright() as p:
        # Only WebKit is launched; there is no fallback to another engine.
        browser = await p.webkit.launch(headless=True)

        try:
            # The context presents a desktop Chrome-on-Windows user-agent
            # string, even though the engine actually running is WebKit.
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )

            # Extra headers sent with every request from this context.
            # NOTE(review): the original comment said the Connection header
            # forces HTTP/1.1 instead of HTTP/2; that effect is unverified.
            await context.set_extra_http_headers({
                "Accept-Language": "en-US,en;q=0.9",
                "Upgrade-Insecure-Requests": "1",
                "Connection": "keep-alive"
            })

            page = await context.new_page()

            # Intercept every request so blocked resource types are dropped.
            await page.route("**/*", skip_blocked_resources)

            try:
                # Random pause before navigating, to look less like a bot.
                await asyncio.sleep(random.uniform(1, 3))

                # Wait only for DOMContentLoaded, with a 60 s timeout.
                await page.goto(url, wait_until='domcontentloaded', timeout=60000)

                # Mimic a visitor: small mouse move, a scroll, then a pause.
                await page.mouse.move(random.uniform(0, 100), random.uniform(0, 100))
                await page.mouse.wheel(0, random.uniform(200, 400))
                await asyncio.sleep(random.uniform(1, 3))

                title = await page.title()

                # Short random pause before collecting the links.
                await asyncio.sleep(random.uniform(1, 2))

                # Resolved href of every anchor element on the page.
                links = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")

                # Short random pause before collecting the text content.
                await asyncio.sleep(random.uniform(1, 2))

                # Text of paragraph, heading, list-item, div and span
                # elements. Because container elements (DIV/SPAN) are
                # included, nested text can appear more than once.
                content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('body *'));
                return elements
                    .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
                    .map(element => element.innerText.trim())
                    .join('\\n');
            }""")

                return {
                    "title": title,
                    "links": links,
                    "content": content
                }

            except Exception as e:
                # Navigation/extraction failures are reported to the caller
                # in the response body rather than raised.
                return {"error": str(e)}
        finally:
            # Close the browser on every path (success, handled error, or a
            # propagating setup error), not only after a successful scrape.
            await browser.close()