AiDeveloper1 committed
Commit 632c1f1 · verified · 1 Parent(s): 022eb64

Delete scraper.py

Files changed (1)
  1. scraper.py +0 -73
scraper.py DELETED
@@ -1,73 +0,0 @@
- from playwright.async_api import async_playwright
- from urllib.parse import urljoin, urlparse
- import logging
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[dict, set]:
-     """Scrape a single page for text, images, and links using Playwright."""
-     try:
-         async with async_playwright() as p:
-             browser = await p.chromium.launch(headless=True)
-             context = await browser.new_context(
-                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
-                 viewport={"width": 1280, "height": 720}
-             )
-             page = await context.new_page()
-             await page.goto(url, wait_until="networkidle", timeout=30000)
-             # Scroll to the bottom to trigger lazy-loaded content, then give it time to render
-             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-             await page.wait_for_timeout(2000)
-
-             # Extract text content, collapsing all whitespace runs to single spaces
-             text_content = await page.evaluate("document.body.innerText")
-             text_content = ' '.join(text_content.split()) if text_content else ""
-
-             # Extract images (only JPEG, PNG, WebP; exclude data URLs and SVGs)
-             images = await page.evaluate(
-                 """() => {
-                     const validExtensions = ['.jpg', '.jpeg', '.png', '.webp'];
-                     const imgElements = document.querySelectorAll('img');
-                     const imgUrls = new Set();
-                     imgElements.forEach(img => {
-                         const src = img.src || '';
-                         const dataSrc = img.dataset.src || '';
-                         const srcset = img.srcset || '';
-                         // Check src
-                         if (src && !src.startsWith('data:') && validExtensions.some(ext => src.toLowerCase().endsWith(ext))) {
-                             imgUrls.add(src);
-                         }
-                         // Check data-src (common lazy-loading attribute)
-                         if (dataSrc && !dataSrc.startsWith('data:') && validExtensions.some(ext => dataSrc.toLowerCase().endsWith(ext))) {
-                             imgUrls.add(dataSrc);
-                         }
-                         // Check srcset (keep only the URL part of each candidate)
-                         if (srcset) {
-                             srcset.split(',').forEach(src => {
-                                 const url = src.trim().split(' ')[0];
-                                 if (url && !url.startsWith('data:') && validExtensions.some(ext => url.toLowerCase().endsWith(ext))) {
-                                     imgUrls.add(url);
-                                 }
-                             });
-                         }
-                     });
-                     return Array.from(imgUrls);
-                 }"""
-             )
-             images = [urljoin(url, img) for img in images if img]
-
-             # Extract links, keeping only same-domain URLs not yet visited
-             links = await page.evaluate("Array.from(document.querySelectorAll('a')).map(a => a.href)")
-             links = set(urljoin(url, link) for link in links
-                         if urlparse(urljoin(url, link)).netloc == base_domain
-                         and urljoin(url, link) not in visited)
-
-             await browser.close()
-
-             page_data = {"url": url, "text": text_content, "images": images}
-             logging.info(f"Scraped data: url={url}, text_length={len(text_content)}, images={images}")
-             return page_data, links
-
-     except Exception as e:
-         logging.error(f"Error scraping {url}: {e}")
-         return {}, set()
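
For context, the deleted scrape_page was shaped for use inside a crawl loop: it returns the page payload plus the set of same-domain links not yet visited. A minimal sketch of such a driver is below; the crawl_site name, max_pages limit, and entry URL are illustrative and not part of this repo, and the import targets the scraper module this commit removes.

import asyncio
from urllib.parse import urlparse

from scraper import scrape_page  # the module deleted by this commit

async def crawl_site(start_url: str, max_pages: int = 20) -> list[dict]:
    """Breadth-first crawl: scrape each page, then follow same-domain links."""
    base_domain = urlparse(start_url).netloc
    visited: set[str] = set()
    queue: list[str] = [start_url]
    results: list[dict] = []
    while queue and len(results) < max_pages:
        url = queue.pop(0)
        if url in visited:
            continue
        visited.add(url)
        # scrape_page returns ({} , set()) on error, so failed pages are skipped
        page_data, links = await scrape_page(url, visited, base_domain)
        if page_data:
            results.append(page_data)
        queue.extend(links - visited)
    return results

if __name__ == "__main__":
    pages = asyncio.run(crawl_site("https://example.com"))
    print(f"Crawled {len(pages)} pages")

Because scrape_page already filters links to base_domain and excludes visited URLs, the driver only needs the visited check to guard against a URL being queued twice before it is scraped.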