# scraper.py
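"""Scraping utilities: a fast static fetch with requests + BeautifulSoup,
with headless Playwright browsers (power_scrapper / power_scrapper_2) as a
fallback for pages that fail or return no links."""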

import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests

class Scraper:
    @staticmethod
    async def power_scrapper_2(url):
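        """Scrape ``url`` with headless WebKit; return (title, links, page text)."""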
        async with async_playwright() as p:
            #browser = await p.chromium.launch(headless=True)
            browser = await p.webkit.launch(headless=True)  # Switch to WebKit
            # Create a new browser context with a realistic user-agent
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )
    
            # Open a new page
            page = await context.new_page()
    
            # Route to block images, videos, and CSS to speed up page load
            await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())
    
            # Navigate to the page with an extended timeout and alternate loading strategy
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
    
            # Get the title
            title = await page.title()
    
            # Get all links
            page_url = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")
    
            # Get page content (text from paragraphs and headers)
            page_content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('body *'));
                return elements
                    .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
                    .map(element => element.innerText.trim())
                    .join('\\n');
            }""")

            
            await browser.close()
            return title, page_url, page_content
    
    @staticmethod
    async def power_scrapper(url):
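        """Scrape ``url`` with headless Chromium; return (links, text chunks)."""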
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            # Block unnecessary resources to speed up loading
            await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())

            # Open the target website
            await page.goto(url, wait_until='domcontentloaded')

            # Wait for a short time to ensure dynamic content is loaded
            await page.wait_for_timeout(1000)

            # Extract all links
            links = await page.query_selector_all('a')
            page_url = []
            page_content = []
            for link in links:
                href = await link.get_attribute('href')
                page_url.append(href)

            # Extract all text content
            elements = await page.query_selector_all('body *')
        
            for element in elements:
                text_content = await element.text_content()
                if text_content and text_content.strip():
                    page_content.append(text_content.strip())

            await browser.close()
            return page_url, page_content

    @staticmethod
    def get_links(soup):
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            links.append(href)
        return links

    @staticmethod
    def get_text_content(soup):
        text_elements = []
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
            elements = soup.find_all(tag)
            for element in elements:
                text_elements.append(element.get_text())
        return text_elements

    @staticmethod
    def get_title(soup):
        title = soup.find('title').get_text()
        return title

    @staticmethod
    async def scrape(url):
        """Scrape ``url``: try a static fetch first, fall back to a headless browser."""
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            response = requests.get(url, headers=headers, timeout=3)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            title = Scraper.get_title(soup)
            links = Scraper.get_links(soup)
            text_content = Scraper.get_text_content(soup)

            # Fall back to the headless browser if the static page exposes no links
            if not links:
                print("Running alternative scraper")
                title, links, text_content = await Scraper.power_scrapper_2(url)

            return {"title": title, "URL": links, "Content": text_content}
        except Exception:
            # Static fetch failed (timeout, HTTP error, missing <title>, ...); use the browser
            title, links, text_content = await Scraper.power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}