AiDeveloper1 committed · Commit b74cfe9 · verified · 1 Parent(s): fca7c02

Update main.py

Files changed (1):
  main.py  +31 -10
main.py CHANGED
@@ -8,11 +8,13 @@ from fastapi.templating import Jinja2Templates
 from fastapi.staticfiles import StaticFiles
 from typing import List, Dict
 import asyncio
+import os
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
 
-app = FastAPI(title="Website Scraper API (Enhanced for Images)")
+app = FastAPI(title="Website Scraper API with Frontend")
 
 # Mount static files
 app.mount("/static", StaticFiles(directory="static"), name="static")
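As a small aside on the new `logger = logging.getLogger(__name__)` line: a module-level logger created this way inherits the root handler configured by `basicConfig`. A tiny standalone check, with an invented message and URL, not part of the commit:

```python
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# The record is handled by the root handler set up by basicConfig above.
logger.info("Starting Playwright for URL: %s", "https://example.com")
# Output resembles: 2024-01-01 12:00:00,000 - INFO - Starting Playwright for URL: https://example.com
```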
@@ -26,21 +28,34 @@ MAX_PAGES = 20
 async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
     """Scrape a single page for text, images, and links using Playwright."""
     try:
+        logger.info(f"Starting Playwright for URL: {url}")
         async with async_playwright() as p:
             browser = await p.chromium.launch(headless=True)
             context = await browser.new_context(
                 user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
-                viewport={"width": 1280, "height": 720}
+                viewport={"width": 800, "height": 600},  # Reduced viewport for performance
+                bypass_csp=True  # Bypass Content Security Policy
             )
             page = await context.new_page()
-
-            # Navigate and wait for content
-            await page.goto(url, wait_until="networkidle", timeout=40000)
-
+
+            # Retry navigation with fallback
+            for attempt in range(2):  # Try up to 2 times
+                try:
+                    logger.info(f"Navigating to {url} (Attempt {attempt + 1})")
+                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)  # 30s timeout
+                    break  # Success, exit retry loop
+                except Exception as e:
+                    logger.warning(f"Navigation attempt {attempt + 1} failed for {url}: {str(e)}")
+                    if attempt == 1:  # Last attempt
+                        logger.error(f"All navigation attempts failed for {url}")
+                        await browser.close()
+                        return {}, set()
+                    await asyncio.sleep(1)  # Wait before retry
+
             # Scroll to trigger lazy-loaded images
             await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
             await page.wait_for_timeout(2000)  # Wait for lazy-loaded content
-
+
             # Extract text content
             text_content = await page.evaluate(
                 """() => document.body.innerText"""
@@ -74,6 +89,7 @@ async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
             links = set(urljoin(url, link) for link in links if urlparse(urljoin(url, link)).netloc == base_domain and urljoin(url, link) not in visited)
 
             await browser.close()
+            logger.info(f"Successfully scraped {url}")
 
             page_data = {
                 "url": url,
@@ -83,13 +99,14 @@ async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
             return page_data, links
 
     except Exception as e:
-        logging.error(f"Error scraping {url}: {e}")
+        logger.error(f"Error scraping {url}: {str(e)}")
        return {}, set()
 
 @app.get("/scrape")
 async def crawl_website(url: HttpUrl):
     """Crawl the website starting from the given URL and return scraped data for up to 10 pages as JSON."""
     try:
+        logger.info(f"Starting crawl for {url}")
         visited = set()
         to_visit = {str(url)}
         base_domain = urlparse(str(url)).netloc
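For reference, a small self-contained illustration of how the unchanged same-domain filter (the `links = set(...)` context line in the hunk above) interacts with `base_domain` and the `visited` set. The URLs here are invented for the example and are not part of the commit:

```python
from urllib.parse import urljoin, urlparse

url = "https://example.com/blog/"          # page currently being scraped
base_domain = "example.com"                # netloc of the crawl's start URL
visited = {"https://example.com/"}         # pages already scraped
links = ["/about", "https://example.com/", "https://other.site/page", "post-1"]

# Keep only links that resolve to the same domain and have not been visited yet.
filtered = set(
    urljoin(url, link)
    for link in links
    if urlparse(urljoin(url, link)).netloc == base_domain
    and urljoin(url, link) not in visited
)
print(sorted(filtered))
# ['https://example.com/about', 'https://example.com/blog/post-1']
```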
@@ -100,27 +117,31 @@ async def crawl_website(url: HttpUrl):
             if current_url in visited:
                 continue
 
-            logging.info(f"Scraping: {current_url}")
+            logger.info(f"Scraping: {current_url}")
             visited.add(current_url)
 
             page_data, new_links = await scrape_page(current_url, visited, base_domain)
             if page_data:
                 results.append(page_data)
                 to_visit.update(new_links)
-
+
             # Small delay to avoid overwhelming the server
             await asyncio.sleep(0.5)
 
+        logger.info(f"Crawl completed for {url}")
         return JSONResponse(content={"pages": results})
 
     except Exception as e:
+        logger.error(f"Scraping failed for {url}: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
 
 @app.get("/")
 async def serve_home(request: Request):
     """Serve the frontend HTML page."""
+    logger.info("Serving home page")
     return templates.TemplateResponse("index.html", {"request": request})
 
 if __name__ == "__main__":
+    logger.info("Starting FastAPI server on port 7860")
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
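The navigation-retry block added to `scrape_page` could also be expressed as a standalone helper. Below is a minimal sketch of that idea, not part of this commit, under the same assumptions as the diff (Playwright async API, `domcontentloaded`, 30-second timeout); the name `goto_with_retry` and its parameters are hypothetical:

```python
import asyncio
import logging

from playwright.async_api import Page

logger = logging.getLogger(__name__)


async def goto_with_retry(page: Page, url: str, attempts: int = 2, timeout_ms: int = 30000) -> bool:
    """Navigate `page` to `url`, retrying after a short pause; return True on success."""
    for attempt in range(attempts):
        try:
            await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
            return True  # navigation succeeded
        except Exception as exc:
            logger.warning("Navigation attempt %d failed for %s: %s", attempt + 1, url, exc)
            if attempt < attempts - 1:
                await asyncio.sleep(1)  # brief pause before the next attempt
    return False  # caller decides how to clean up (e.g., close the browser)
```

A `False` return would correspond to the existing failure path in `scrape_page` (`await browser.close()` followed by `return {}, set()`).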
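Finally, a minimal sketch of exercising the updated `/scrape` endpoint once the app is running on port 7860 as in the `__main__` block; the target URL and the 120-second client timeout are assumptions for the example:

```python
import requests

resp = requests.get(
    "http://localhost:7860/scrape",
    params={"url": "https://example.com"},  # example target; any reachable site works
    timeout=120,                             # assumption: crawling several pages can be slow
)
resp.raise_for_status()
# The endpoint returns {"pages": [...]}, where each entry includes the scraped page's URL.
for page in resp.json()["pages"]:
    print(page["url"])
```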