Arafath10 committed on
Commit
cb2cabb
1 Parent(s): c5d8e33

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +72 -0
main.py CHANGED
@@ -1,6 +1,14 @@
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from scraper import Scraper
 
 
 
 
 
 
 
 
4
 
5
 
6
  try: from pip._internal.operations import freeze
@@ -28,3 +36,67 @@ async def get_data(url: str):
28
  except:
29
  return {"title": "error", "URL": url, "Content": "none"}
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from fastapi import FastAPI, HTTPException
2
  from fastapi.middleware.cors import CORSMiddleware
3
  from scraper import Scraper
4
+ import nest_asyncio
5
+ import asyncio
6
+ from playwright.async_api import async_playwright
7
+ from fastapi import FastAPI
8
+ import random
9
+
10
+ # Allow nested use of asyncio.run() in Jupyter
11
+ nest_asyncio.apply()
12
 
13
 
14
  try: from pip._internal.operations import freeze
 
36
  except:
37
  return {"title": "error", "URL": url, "Content": "none"}
38
 
39
+
40
+ # FastAPI route to scrape the website
41
# Resource types aborted before they reach the network: the route only needs
# the DOM text and links. NOTE: "xhr" is blocked as well, so anything a page
# loads through XMLHttpRequest will be absent from the scraped result.
_BLOCKED_RESOURCE_TYPES = frozenset({"image", "media", "stylesheet", "font", "xhr"})

# Desktop-browser user agent sent with every request of the scraping context.
_SCRAPE_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/91.0.4472.124 Safari/537.36"
)


async def _abort_blocked_resources(route):
    """Abort requests for blocked resource types; let all others continue."""
    if route.request.resource_type in _BLOCKED_RESOURCE_TYPES:
        await route.abort()
    else:
        await route.continue_()


@app.get("/scrape")
async def scrape_website(url: str):
    """Load ``url`` in headless WebKit and return its title, links and text.

    Args:
        url: Address of the page to scrape, passed as a query parameter.

    Returns:
        A dict with ``"title"``, ``"links"`` (the ``href`` of every ``<a>``)
        and ``"content"`` (newline-joined inner text of paragraph, heading,
        list-item, div and span elements) on success, or
        ``{"error": <message>}`` if launching the browser, navigating or
        extracting data fails.
    """
    try:
        async with async_playwright() as p:
            browser = await p.webkit.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=_SCRAPE_USER_AGENT)

                # NOTE(review): the original comment said "Connection" forces
                # HTTP/1.1; a request header does not by itself choose the
                # protocol version -- confirm whether it is still needed.
                await context.set_extra_http_headers({
                    "Accept-Language": "en-US,en;q=0.9",
                    "Upgrade-Insecure-Requests": "1",
                    "Connection": "keep-alive",
                })

                page = await context.new_page()

                # Skip heavy or irrelevant resources to speed up page load.
                await page.route("**/*", _abort_blocked_resources)

                # Navigation is inside the try block so that timeouts and
                # network errors are reported through the "error" payload
                # rather than surfacing as an unhandled server error.
                await page.goto(url, wait_until="domcontentloaded", timeout=60000)

                title = await page.title()

                # Short randomized pause before collecting the links.
                await asyncio.sleep(random.uniform(1, 2))

                links = await page.evaluate("""() => {
                    return Array.from(document.querySelectorAll('a')).map(a => a.href);
                }""")

                # Another short randomized pause before reading the text.
                await asyncio.sleep(random.uniform(1, 2))

                content = await page.evaluate("""() => {
                    let elements = Array.from(document.querySelectorAll('body *'));
                    return elements
                        .filter(element => element.tagName.match(/^(P|H1|H2|H3|H4|H5|H6|LI|DIV|SPAN)$/i) && element.innerText.trim().length > 0)
                        .map(element => element.innerText.trim())
                        .join('\\n');
                }""")

                return {
                    "title": title,
                    "links": links,
                    "content": content,
                }
            finally:
                # Close the browser on every path, not only on success.
                await browser.close()
    except Exception as e:
        # Route boundary: report the failure to the client as a payload.
        return {"error": str(e)}
102
+