AiDeveloper1 committed
Commit 0bebdd8 (verified) · 1 Parent(s): 62780c7

Update main.py

Files changed (1):
  1. main.py +23 -105
main.py CHANGED

```diff
@@ -1,20 +1,19 @@
 from fastapi import FastAPI, HTTPException, Request
-from pydantic import HttpUrl
-from playwright.async_api import async_playwright
-from urllib.parse import urljoin, urlparse
-import logging
-from fastapi.responses import JSONResponse
+from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
 from fastapi.staticfiles import StaticFiles
-from typing import List, Dict
+from pydantic import HttpUrl
+from scraper import scrape_page
+from summarizer import summarize_text
+from rich_card_builder import build_rich_card
 import asyncio
-import os
+from urllib.parse import urlparse
+import logging
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
 
-app = FastAPI(title="Website Scraper API with Frontend")
+app = FastAPI(title="Website Scraper API (Enhanced for RCS)")
 
 # Mount static files
 app.mount("/static", StaticFiles(directory="static"), name="static")
@@ -22,123 +21,42 @@ app.mount("/static", StaticFiles(directory="static"), name="static")
 # Set up Jinja2 templates
 templates = Jinja2Templates(directory="templates")
 
-# Maximum number of pages to scrape
-MAX_PAGES = 20
-
-async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
-    """Scrape a single page for text, images, and links using Playwright."""
-    try:
-        logger.info(f"Starting Playwright for URL: {url}")
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True)
-            context = await browser.new_context(
-                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
-                viewport={"width": 800, "height": 600},  # Reduced viewport for performance
-                bypass_csp=True  # Bypass Content Security Policy
-            )
-            page = await context.new_page()
-
-            # Retry navigation with fallback
-            for attempt in range(2):  # Try up to 2 times
-                try:
-                    logger.info(f"Navigating to {url} (Attempt {attempt + 1})")
-                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)  # 30s timeout
-                    break  # Success, exit retry loop
-                except Exception as e:
-                    logger.warning(f"Navigation attempt {attempt + 1} failed for {url}: {str(e)}")
-                    if attempt == 1:  # Last attempt
-                        logger.error(f"All navigation attempts failed for {url}")
-                        await browser.close()
-                        return {}, set()
-                    await asyncio.sleep(1)  # Wait before retry
-
-            # Scroll to trigger lazy-loaded images
-            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-            await page.wait_for_timeout(2000)  # Wait for lazy-loaded content
-
-            # Extract text content
-            text_content = await page.evaluate(
-                """() => document.body.innerText"""
-            )
-            text_content = ' '.join(text_content.split()) if text_content else ""
-
-            # Extract images from src, data-src, and srcset
-            images = await page.evaluate(
-                """() => {
-                    const imgElements = document.querySelectorAll('img');
-                    const imgUrls = new Set();
-                    imgElements.forEach(img => {
-                        if (img.src) imgUrls.add(img.src);
-                        if (img.dataset.src) imgUrls.add(img.dataset.src);
-                        if (img.srcset) {
-                            img.srcset.split(',').forEach(src => {
-                                const url = src.trim().split(' ')[0];
-                                if (url) imgUrls.add(url);
-                            });
-                        }
-                    });
-                    return Array.from(imgUrls);
-                }"""
-            )
-            images = [urljoin(url, img) for img in images if img]
-
-            # Extract links
-            links = await page.evaluate(
-                """() => Array.from(document.querySelectorAll('a')).map(a => a.href)"""
-            )
-            links = set(urljoin(url, link) for link in links if urlparse(urljoin(url, link)).netloc == base_domain and urljoin(url, link) not in visited)
-
-            await browser.close()
-            logger.info(f"Successfully scraped {url}")
-
-            page_data = {
-                "url": url,
-                "text": text_content,
-                "images": images
-            }
-            return page_data, links
-
-    except Exception as e:
-        logger.error(f"Error scraping {url}: {str(e)}")
-        return {}, set()
-
 @app.get("/scrape")
 async def crawl_website(url: HttpUrl):
-    """Crawl the website starting from the given URL and return scraped data for up to 10 pages as JSON."""
+    """Crawl a website and return rich card JSON for up to 1 page (demo)."""
     try:
-        logger.info(f"Starting crawl for {url}")
         visited = set()
         to_visit = {str(url)}
         base_domain = urlparse(str(url)).netloc
         results = []
 
-        while to_visit and len(visited) < MAX_PAGES:
+        while to_visit and len(visited) < 1:  # Limited to 1 for demo
             current_url = to_visit.pop()
             if current_url in visited:
                 continue
-
-            logger.info(f"Scraping: {current_url}")
             visited.add(current_url)
-
+
+            logging.info(f"Scraping page: {current_url}")
             page_data, new_links = await scrape_page(current_url, visited, base_domain)
             if page_data:
-                results.append(page_data)
+                logging.info(f"Scraped data: {page_data}")
+                summary = await summarize_text(page_data["text"], page_data["url"])
+                rich_card = build_rich_card(page_data, summary)
+                results.append(rich_card)
+
             to_visit.update(new_links)
-
-            # Small delay to avoid overwhelming the server
             await asyncio.sleep(0.5)
-
-        logger.info(f"Crawl completed for {url}")
-        return JSONResponse(content={"pages": results})
-
+
+        logging.info(f"Final response: {results}")
+        return {"rich_cards": results}
+
     except Exception as e:
-        logger.error(f"Scraping failed for {url}: {str(e)}")
+        logging.error(f"Scraping failed: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
 
-@app.get("/")
+@app.get("/", response_class=HTMLResponse)
 async def serve_home(request: Request):
     """Serve the frontend HTML page."""
-    logger.info("Serving home page")
     return templates.TemplateResponse("index.html", {"request": request})
 
 if __name__ == "__main__":
```
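The refactored main.py delegates to three helper modules that are not part of this commit: scraper (which presumably now holds the Playwright-based scrape_page removed above), summarizer, and rich_card_builder. The sketch below only illustrates the interfaces main.py appears to expect; the function names, argument order, and the page_data keys ("url", "text", "images") come from the diff, while the bodies and the exact rich-card fields are assumptions and may differ from the real modules.

```python
# summarizer.py -- hypothetical minimal implementation, matching the call
# `await summarize_text(page_data["text"], page_data["url"])` in main.py.
async def summarize_text(text: str, url: str) -> str:
    """Fallback summary: the first ~200 characters of the cleaned page text."""
    cleaned = " ".join(text.split())
    return cleaned[:200] + ("..." if len(cleaned) > 200 else "")


# rich_card_builder.py -- hypothetical minimal implementation. The card layout
# below is loosely modeled on RCS rich cards; the real module may differ.
def build_rich_card(page_data: dict, summary: str) -> dict:
    """Assemble a standalone rich-card dict from scraped page data and a summary."""
    return {
        "title": page_data["url"],
        "description": summary,
        # Use the first scraped image (if any) as the card media.
        "media_url": page_data["images"][0] if page_data.get("images") else None,
        "suggestions": [
            {"type": "openUrl", "text": "Visit site", "url": page_data["url"]},
        ],
    }
```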
 
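Once the app is running, the updated endpoint can be exercised as below. The response shape ({"rich_cards": [...]}) and the url query parameter come from the diff; the host, port, and example URL are placeholders, since the uvicorn call under the truncated `if __name__ == "__main__":` block is not shown in this commit.

```python
# Example client call against the /scrape endpoint (assumes the app is
# reachable at localhost:8000; adjust to wherever uvicorn actually serves it).
import httpx

resp = httpx.get(
    "http://localhost:8000/scrape",
    params={"url": "https://example.com"},  # passed as the `url` query parameter
    timeout=120.0,  # scraping plus summarization can take a while
)
resp.raise_for_status()
for card in resp.json()["rich_cards"]:
    print(card)
```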