Delete scraper.py

scraper.py
DELETED  +0 -73
@@ -1,73 +0,0 @@
from playwright.async_api import async_playwright
from urllib.parse import urljoin, urlparse
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[dict, set]:
    """Scrape a single page for text, images, and links using Playwright."""
    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 720}
            )
            page = await context.new_page()
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            await page.wait_for_timeout(2000)

            # Extract text content
            text_content = await page.evaluate("document.body.innerText")
            text_content = ' '.join(text_content.split()) if text_content else ""

            # Extract images (only JPEG, PNG, WebP; exclude data URLs and SVGs)
            images = await page.evaluate(
                """() => {
                    const validExtensions = ['.jpg', '.jpeg', '.png', '.webp'];
                    const imgElements = document.querySelectorAll('img');
                    const imgUrls = new Set();
                    imgElements.forEach(img => {
                        const src = img.src || '';
                        const dataSrc = img.dataset.src || '';
                        const srcset = img.srcset || '';
                        // Check src
                        if (src && !src.startsWith('data:') && validExtensions.some(ext => src.toLowerCase().endsWith(ext))) {
                            imgUrls.add(src);
                        }
                        // Check data-src
                        if (dataSrc && !dataSrc.startsWith('data:') && validExtensions.some(ext => dataSrc.toLowerCase().endsWith(ext))) {
                            imgUrls.add(dataSrc);
                        }
                        // Check srcset
                        if (srcset) {
                            srcset.split(',').forEach(src => {
                                const url = src.trim().split(' ')[0];
                                if (url && !url.startsWith('data:') && validExtensions.some(ext => url.toLowerCase().endsWith(ext))) {
                                    imgUrls.add(url);
                                }
                            });
                        }
                    });
                    return Array.from(imgUrls);
                }"""
            )
            images = [urljoin(url, img) for img in images if img]

            # Extract same-domain links that have not been visited yet
            links = await page.evaluate("Array.from(document.querySelectorAll('a')).map(a => a.href)")
            links = set(urljoin(url, link) for link in links
                        if urlparse(urljoin(url, link)).netloc == base_domain
                        and urljoin(url, link) not in visited)

            await browser.close()

            page_data = {"url": url, "text": text_content, "images": images}
            logging.info(f"Scraped data: url={url}, text_length={len(text_content)}, images={images}")
            return page_data, links

    except Exception as e:
        logging.error(f"Error scraping {url}: {e}")
        return {}, set()