# scraper.py
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests
import time
class Scraper:
    @staticmethod
    async def power_scrapper_2(url):
        async with async_playwright() as p:
            # browser = await p.chromium.launch(headless=True)
            browser = await p.webkit.launch(headless=True)  # Switch to WebKit
            # Create a new browser context with a realistic user-agent
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
            )
            # Open a new page
            page = await context.new_page()
            # Route to block images, videos, and CSS to speed up page load
            await page.route("**/*", lambda route: route.abort() if route.request.resource_type in ["image", "media", "stylesheet", "font", "xhr"] else route.continue_())
            # Navigate to the page with an extended timeout and alternate loading strategy
            await page.goto(url, wait_until='domcontentloaded', timeout=60000)
            # Get the title
            title = await page.title()
            # Get all links
            page_url = await page.evaluate("""() => {
                return Array.from(document.querySelectorAll('a')).map(a => a.href);
            }""")
            # Get page content (text from paragraphs and headers)
            page_content = await page.evaluate("""() => {
                let elements = Array.from(document.querySelectorAll('body *'));
                return elements
                    .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
                    .map(element => element.innerText.trim())
                    .join('\\n');
            }""")
            await browser.close()
            return title, page_url, page_content

    @staticmethod
    async def power_scrapper(url):
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            # Block unnecessary resources to speed up loading
            await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())
            # Open the target website
            await page.goto(url, wait_until='domcontentloaded')
            # Wait for a short time to ensure dynamic content is loaded
            await page.wait_for_timeout(1000)
            # Extract all links
            links = await page.query_selector_all('a')
            page_url = []
            page_content = []
            for link in links:
                href = await link.get_attribute('href')
                page_url.append(href)
            # Extract all text content
            elements = await page.query_selector_all('body *')
            for element in elements:
                text_content = await element.text_content()
                if text_content and text_content.strip():
                    page_content.append(text_content.strip())
            await browser.close()
            return page_url, page_content

    @staticmethod
    def get_links(soup):
        links = []
        for link in soup.find_all('a'):
            href = link.get('href')
            links.append(href)
        return links

    @staticmethod
    def get_text_content(soup):
        text_elements = []
        for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
            elements = soup.find_all(tag)
            for element in elements:
                text_elements.append(element.get_text())
        return text_elements

    @staticmethod
    def get_title(soup):
        title = soup.find('title').get_text()
        return title

    @staticmethod
    async def scrape(url):
        try:
            headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
            response = requests.get(url, headers=headers, timeout=3)
            soup = BeautifulSoup(response.content, 'html.parser')
            title = Scraper.get_title(soup)
            links = Scraper.get_links(soup)
            text_content = Scraper.get_text_content(soup)
            if not links:
                print("Running alternative scraper")
                # power_scrapper_2 returns (title, links, content), so unpack all three
                title, links, text_content = await Scraper.power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}
        except Exception:
            # Plain requests failed (timeout, connection error, missing <title>, ...),
            # so fall back to the headless-browser scraper
            title, links, text_content = await Scraper.power_scrapper_2(url)
            return {"title": title, "URL": links, "Content": text_content}