Update main.py
main.py (CHANGED)
@@ -1,20 +1,19 @@
 from fastapi import FastAPI, HTTPException, Request
-from …
-from playwright.async_api import async_playwright
-from urllib.parse import urljoin, urlparse
-import logging
-from fastapi.responses import JSONResponse
+from fastapi.responses import HTMLResponse
 from fastapi.templating import Jinja2Templates
 from fastapi.staticfiles import StaticFiles
-from …
+from pydantic import HttpUrl
+from scraper import scrape_page
+from summarizer import summarize_text
+from rich_card_builder import build_rich_card
 import asyncio
-import …
+from urllib.parse import urlparse
+import logging
 
 # Set up logging
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
 
-app = FastAPI(title="Website Scraper API")
+app = FastAPI(title="Website Scraper API (Enhanced for RCS)")
 
 # Mount static files
 app.mount("/static", StaticFiles(directory="static"), name="static")
@@ -22,123 +21,42 @@ app.mount("/static", StaticFiles(directory="static"), name="static")
 # Set up Jinja2 templates
 templates = Jinja2Templates(directory="templates")
 
-# Maximum number of pages to scrape
-MAX_PAGES = 20
-
-async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
-    """Scrape a single page for text, images, and links using Playwright."""
-    try:
-        logger.info(f"Starting Playwright for URL: {url}")
-        async with async_playwright() as p:
-            browser = await p.chromium.launch(headless=True)
-            context = await browser.new_context(
-                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
-                viewport={"width": 800, "height": 600},  # Reduced viewport for performance
-                bypass_csp=True  # Bypass Content Security Policy
-            )
-            page = await context.new_page()
-
-            # Retry navigation with fallback
-            for attempt in range(2):  # Try up to 2 times
-                try:
-                    logger.info(f"Navigating to {url} (Attempt {attempt + 1})")
-                    await page.goto(url, wait_until="domcontentloaded", timeout=30000)  # 30s timeout
-                    break  # Success, exit retry loop
-                except Exception as e:
-                    logger.warning(f"Navigation attempt {attempt + 1} failed for {url}: {str(e)}")
-                    if attempt == 1:  # Last attempt
-                        logger.error(f"All navigation attempts failed for {url}")
-                        await browser.close()
-                        return {}, set()
-                    await asyncio.sleep(1)  # Wait before retry
-
-            # Scroll to trigger lazy-loaded images
-            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
-            await page.wait_for_timeout(2000)  # Wait for lazy-loaded content
-
-            # Extract text content
-            text_content = await page.evaluate(
-                """() => document.body.innerText"""
-            )
-            text_content = ' '.join(text_content.split()) if text_content else ""
-
-            # Extract images from src, data-src, and srcset
-            images = await page.evaluate(
-                """() => {
-                    const imgElements = document.querySelectorAll('img');
-                    const imgUrls = new Set();
-                    imgElements.forEach(img => {
-                        if (img.src) imgUrls.add(img.src);
-                        if (img.dataset.src) imgUrls.add(img.dataset.src);
-                        if (img.srcset) {
-                            img.srcset.split(',').forEach(src => {
-                                const url = src.trim().split(' ')[0];
-                                if (url) imgUrls.add(url);
-                            });
-                        }
-                    });
-                    return Array.from(imgUrls);
-                }"""
-            )
-            images = [urljoin(url, img) for img in images if img]
-
-            # Extract links
-            links = await page.evaluate(
-                """() => Array.from(document.querySelectorAll('a')).map(a => a.href)"""
-            )
-            links = set(urljoin(url, link) for link in links if urlparse(urljoin(url, link)).netloc == base_domain and urljoin(url, link) not in visited)
-
-            await browser.close()
-            logger.info(f"Successfully scraped {url}")
-
-            page_data = {
-                "url": url,
-                "text": text_content,
-                "images": images
-            }
-            return page_data, links
-
-    except Exception as e:
-        logger.error(f"Error scraping {url}: {str(e)}")
-        return {}, set()
-
 @app.get("/scrape")
 async def crawl_website(url: HttpUrl):
-    """Crawl …"""
+    """Crawl a website and return rich card JSON for up to 1 page (demo)."""
     try:
-        logger.info(f"Starting crawl for {url}")
         visited = set()
         to_visit = {str(url)}
         base_domain = urlparse(str(url)).netloc
         results = []
 
-        while to_visit and len(visited) < MAX_PAGES:
+        while to_visit and len(visited) < 1:  # Limited to 1 for demo
             current_url = to_visit.pop()
             if current_url in visited:
                 continue
-
-            logger.info(f"Scraping: {current_url}")
             visited.add(current_url)
-
+
+            logging.info(f"Scraping page: {current_url}")
             page_data, new_links = await scrape_page(current_url, visited, base_domain)
             if page_data:
-                results.append(page_data)
+                logging.info(f"Scraped data: {page_data}")
+                summary = await summarize_text(page_data["text"], page_data["url"])
+                rich_card = build_rich_card(page_data, summary)
+                results.append(rich_card)
+
             to_visit.update(new_links)
-
-            # Small delay to avoid overwhelming the server
             await asyncio.sleep(0.5)
-
-
-        return …
-
+
+        logging.info(f"Final response: {results}")
+        return {"rich_cards": results}
+
     except Exception as e:
-        logger.error(f"Scraping failed: {str(e)}")
+        logging.error(f"Scraping failed: {str(e)}")
         raise HTTPException(status_code=500, detail=f"Scraping failed: {str(e)}")
 
-@app.get("/")
+@app.get("/", response_class=HTMLResponse)
 async def serve_home(request: Request):
     """Serve the frontend HTML page."""
-    logger.info("Serving home page")
     return templates.TemplateResponse("index.html", {"request": request})
 
 if __name__ == "__main__":
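The removed scrape_page implementation is presumably what the new "from scraper import scrape_page" line now points at; that module is not included in this commit. A sketch of how its header might look, assuming the function body moved over unchanged: the moved code references async_playwright, urljoin/urlparse, Dict, and a module-level logger, so it would need at least these imports (all of which were dropped from main.py).

# scraper.py -- presumed new home of scrape_page (hypothetical: imported by
# the new main.py but not shown in this commit)
import asyncio
import logging
from typing import Dict
from urllib.parse import urljoin, urlparse

from playwright.async_api import async_playwright

logger = logging.getLogger(__name__)

async def scrape_page(url: str, visited: set, base_domain: str) -> tuple[Dict, set]:
    """Scrape a single page for text, images, and links using Playwright."""
    ...  # body as removed from main.py above, unchanged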
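summarizer.py is likewise new and not shown. The only constraints visible in the diff are the call site, await summarize_text(page_data["text"], page_data["url"]), and that the result feeds the card builder as a summary string. A minimal sketch satisfying that interface, with a placeholder truncation standing in for whatever summarization model the Space actually uses:

# summarizer.py -- hypothetical sketch, not part of this commit.
# Only the signature is inferred from main.py; the body is a placeholder.
import asyncio

async def summarize_text(text: str, url: str) -> str:
    """Return a short summary of the scraped page text."""
    await asyncio.sleep(0)  # stand-in for a real async model call
    if not text:
        return f"No text content found at {url}"
    if len(text) <= 200:
        return text
    # Placeholder: truncate to ~200 characters on a word boundary
    return text[:200].rsplit(" ", 1)[0] + "..."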
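The same goes for rich_card_builder.py: build_rich_card(page_data, summary) receives the scraped dict (url, text, images, per the removed scrape_page) plus the summary, and its output is collected under the "rich_cards" key. The field names below are assumptions for illustration only; the commit does not reveal the actual RCS card schema.

# rich_card_builder.py -- hypothetical sketch, not part of this commit.
# The card layout is an assumption; only the inputs are known from main.py.
def build_rich_card(page_data: dict, summary: str) -> dict:
    """Assemble a rich-card-style dict from scraped page data and a summary."""
    images = page_data.get("images", [])
    return {
        "title": page_data.get("url", ""),
        "description": summary,
        "media_url": images[0] if images else None,  # first scraped image, if any
        "source_url": page_data.get("url", ""),
    }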
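A quick way to exercise the changed endpoint once the Space is running (host and port are assumptions; the response shape follows the new return statement):

# Quick check of GET /scrape (assumes uvicorn serving main:app on port 8000)
import requests

resp = requests.get(
    "http://localhost:8000/scrape",
    params={"url": "https://example.com"},  # validated as HttpUrl by FastAPI
    timeout=120,  # headless scraping plus summarization can be slow
)
resp.raise_for_status()
print(resp.json())  # {"rich_cards": [...]} -- one card for the single crawled page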