AiDeveloper1 committed
Commit bc1da69 · verified · 1 Parent(s): 8d27b8a

Update main.py

Files changed (1)
  1. main.py +126 -126
main.py CHANGED
@@ -1,126 +1,126 @@
- from fastapi import FastAPI, HTTPException, Request
- from pydantic import HttpUrl
- from playwright.async_api import async_playwright
- from urllib.parse import urljoin, urlparse
- import logging
- from fastapi.responses import JSONResponse
- from fastapi.templating import Jinja2Templates
- from fastapi.staticfiles import StaticFiles
- from typing import List, Dict
- import asyncio
-
- # Set up logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-
- app = FastAPI(title="Website Scraper API (Enhanced for Images)")
-
- # Mount static files
- app.mount("/static", StaticFiles(directory="static"), name="static")
-
- # Set up Jinja2 templates
- templates = Jinja2Templates(directory="templates")
-
- # Maximum number of pages to scrape
- MAX_PAGES = 20
-
- async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
-     """Scrape a single page for text, images, and links using Playwright."""
-     try:
-         async with async_playwright() as p:
-             browser = await p.chromium.launch(headless=True)
-             context = await browser.new_context(
-                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
-                 viewport={"width": 1280, "height": 720}
-             )
-             page = await context.new_page()
-
-             # Navigate and wait for content
-             await page.goto(url, wait_until="networkidle", timeout=30000)
-
-             # Scroll to trigger lazy-loaded images
-             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-             await page.wait_for_timeout(2000)  # Wait for lazy-loaded content
-
-             # Extract text content
-             text_content = await page.evaluate(
-                 """() => document.body.innerText"""
-             )
-             text_content = ' '.join(text_content.split()) if text_content else ""
-
-             # Extract images from src, data-src, and srcset
-             images = await page.evaluate(
-                 """() => {
-                     const imgElements = document.querySelectorAll('img');
-                     const imgUrls = new Set();
-                     imgElements.forEach(img => {
-                         if (img.src) imgUrls.add(img.src);
-                         if (img.dataset.src) imgUrls.add(img.dataset.src);
-                         if (img.srcset) {
-                             img.srcset.split(',').forEach(src => {
-                                 const url = src.trim().split(' ')[0];
-                                 if (url) imgUrls.add(url);
-                             });
-                         }
-                     });
-                     return Array.from(imgUrls);
-                 }"""
-             )
-             images = [urljoin(url, img) for img in images if img]
-
-             # Extract links
-             links = await page.evaluate(
-                 """() => Array.from(document.querySelectorAll('a')).map(a => a.href)"""
-             )
-             links = set(urljoin(url, link) for link in links if urlparse(urljoin(url, link)).netloc == base_domain and urljoin(url, link) not in visited)
-
-             await browser.close()
-
-             page_data = {
-                 "url": url,
-                 "text": text_content,
-                 "images": images
-             }
-             return page_data, links
-
-     except Exception as e:
-         logging.error(f"Error scraping {url}: {e}")
-         return {}, set()
-
- @app.get("/scrape")
- async def crawl_website(url: HttpUrl):
-     """Crawl the website starting from the given URL and return scraped data for up to 20 pages as JSON."""
-     try:
-         visited = set()
-         to_visit = {str(url)}
-         base_domain = urlparse(str(url)).netloc
-         results = []
-
-         while to_visit and len(visited) < MAX_PAGES:
-             current_url = to_visit.pop()
-             if current_url in visited:
-                 continue
-
-             logging.info(f"Scraping: {current_url}")
-             visited.add(current_url)
-
-             page_data, new_links = await scrape_page(current_url, visited, base_domain)
-             if page_data:
-                 results.append(page_data)
-                 to_visit.update(new_links)
-
-             # Small delay to avoid overwhelming the server
-             await asyncio.sleep(0.5)
-
-         return JSONResponse(content={"pages": results})
-
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
-
- @app.get("/")
- async def serve_home(request: Request):
-     """Serve the frontend HTML page."""
-     return templates.TemplateResponse("index.html", {"request": request})
-
- if __name__ == "__main__":
-     import uvicorn
-     uvicorn.run(app, host="0.0.0.0", port=8001)
+ from fastapi import FastAPI, HTTPException, Request
+ from pydantic import HttpUrl
+ from playwright.async_api import async_playwright
+ from urllib.parse import urljoin, urlparse
+ import logging
+ from fastapi.responses import JSONResponse
+ from fastapi.templating import Jinja2Templates
+ from fastapi.staticfiles import StaticFiles
+ from typing import List, Dict
+ import asyncio
+
+ # Set up logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+ app = FastAPI(title="Website Scraper API (Enhanced for Images)")
+
+ # Mount static files
+ app.mount("/static", StaticFiles(directory="static"), name="static")
+
+ # Set up Jinja2 templates
+ templates = Jinja2Templates(directory="templates")
+
+ # Maximum number of pages to scrape
+ MAX_PAGES = 20
+
+ async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
+     """Scrape a single page for text, images, and links using Playwright."""
+     try:
+         async with async_playwright() as p:
+             browser = await p.chromium.launch(headless=True)
+             context = await browser.new_context(
+                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
+                 viewport={"width": 1280, "height": 720}
+             )
+             page = await context.new_page()
+
+             # Navigate and wait for content
+             await page.goto(url, wait_until="networkidle", timeout=30000)
+
+             # Scroll to trigger lazy-loaded images
+             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+             await page.wait_for_timeout(2000)  # Wait for lazy-loaded content
+
+             # Extract text content
+             text_content = await page.evaluate(
+                 """() => document.body.innerText"""
+             )
+             text_content = ' '.join(text_content.split()) if text_content else ""
+
+             # Extract images from src, data-src, and srcset
+             images = await page.evaluate(
+                 """() => {
+                     const imgElements = document.querySelectorAll('img');
+                     const imgUrls = new Set();
+                     imgElements.forEach(img => {
+                         if (img.src) imgUrls.add(img.src);
+                         if (img.dataset.src) imgUrls.add(img.dataset.src);
+                         if (img.srcset) {
+                             img.srcset.split(',').forEach(src => {
+                                 const url = src.trim().split(' ')[0];
+                                 if (url) imgUrls.add(url);
+                             });
+                         }
+                     });
+                     return Array.from(imgUrls);
+                 }"""
+             )
+             images = [urljoin(url, img) for img in images if img]
+
+             # Extract links
+             links = await page.evaluate(
+                 """() => Array.from(document.querySelectorAll('a')).map(a => a.href)"""
+             )
+             links = set(urljoin(url, link) for link in links if urlparse(urljoin(url, link)).netloc == base_domain and urljoin(url, link) not in visited)
+
+             await browser.close()
+
+             page_data = {
+                 "url": url,
+                 "text": text_content,
+                 "images": images
+             }
+             return page_data, links
+
+     except Exception as e:
+         logging.error(f"Error scraping {url}: {e}")
+         return {}, set()
+
+ @app.get("/scrape")
+ async def crawl_website(url: HttpUrl):
+     """Crawl the website starting from the given URL and return scraped data for up to 20 pages as JSON."""
+     try:
+         visited = set()
+         to_visit = {str(url)}
+         base_domain = urlparse(str(url)).netloc
+         results = []
+
+         while to_visit and len(visited) < MAX_PAGES:
+             current_url = to_visit.pop()
+             if current_url in visited:
+                 continue
+
+             logging.info(f"Scraping: {current_url}")
+             visited.add(current_url)
+
+             page_data, new_links = await scrape_page(current_url, visited, base_domain)
+             if page_data:
+                 results.append(page_data)
+                 to_visit.update(new_links)
+
+             # Small delay to avoid overwhelming the server
+             await asyncio.sleep(0.5)
+
+         return JSONResponse(content={"pages": results})
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
+
+ @app.get("/")
+ async def serve_home(request: Request):
+     """Serve the frontend HTML page."""
+     return templates.TemplateResponse("index.html", {"request": request})
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
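
For reference, a minimal client sketch (not part of the commit) for exercising the /scrape endpoint after this change. It assumes the app is started with "uvicorn main:app --host 0.0.0.0 --port 8000", that Playwright's Chromium browser is installed, and that the requests library is available; the start URL and script name are placeholders.

# client_example.py -- hypothetical helper, not included in this repository
import requests

# Call the /scrape endpoint; the "url" query parameter maps to crawl_website(url: HttpUrl).
resp = requests.get(
    "http://localhost:8000/scrape",
    params={"url": "https://example.com"},  # placeholder start URL
    timeout=300,  # crawling up to MAX_PAGES pages with Playwright can take a while
)
resp.raise_for_status()

# The endpoint returns {"pages": [{"url": ..., "text": ..., "images": [...]}, ...]}.
for page in resp.json()["pages"]:
    print(page["url"], "-", len(page["images"]), "images")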