Arafath10 committed on
Commit
462e814
·
verified ·
1 Parent(s): 8efa796

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +7 -80
main.py CHANGED
@@ -1,15 +1,7 @@
1
  from fastapi import FastAPI, HTTPException
2
- import asyncio
3
- from playwright.async_api import async_playwright
4
- from fastapi.responses import HTMLResponse
5
- from fastapi.responses import StreamingResponse
6
- from fastapi.responses import FileResponse
7
  from fastapi.middleware.cors import CORSMiddleware
8
- from pydantic import BaseModel
9
- from io import StringIO
10
- from bs4 import BeautifulSoup
11
- import os
12
- import requests
13
 
14
  try: from pip._internal.operations import freeze
15
  except ImportError: # pip < 10.0
@@ -27,76 +19,11 @@ app.add_middleware(
27
  allow_headers=["*"],
28
  )
29
 
30
- async def power_scrapper(url):
31
- async with async_playwright() as p:
32
- browser = await p.chromium.launch(headless=True)
33
- page = await browser.new_page()
34
-
35
- # Block unnecessary resources to speed up loading
36
- await page.route("**/*", lambda route: route.continue_() if route.request.resource_type in ["document", "script"] else route.abort())
37
-
38
- # Open the target website
39
- await page.goto(url, wait_until='domcontentloaded')
40
-
41
- # Wait for a short time to ensure dynamic content is loaded
42
- await page.wait_for_timeout(10)
43
-
44
- # Extract all links
45
- links = await page.query_selector_all('a')
46
- page_url = []
47
- page_content = []
48
- for link in links:
49
- href = await link.get_attribute('href')
50
- page_url.append(href)
51
-
52
- # Extract all text content
53
- elements = await page.query_selector_all('body *')
54
-
55
- for element in elements:
56
- text_content = await element.text_content()
57
- if text_content and text_content.strip():
58
- page_content.append(text_content.strip())
59
-
60
- await browser.close()
61
- return page_url,page_content
62
-
63
-
64
- def get_links(soup):
65
- links = []
66
- title = soup.find('title').get_text()
67
- for link in soup.find_all('a'):
68
- href = link.get('href')
69
- links.append(href)
70
- return links
71
-
72
-
73
- def get_text_content(soup):
74
- text_elements = []
75
- for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
76
- elements = soup.find_all(tag)
77
- for element in elements:
78
- text_elements.append(element.get_text())
79
- return text_elements
80
-
81
-
82
- def get_title(soup):
83
- title = ""
84
- title = soup.find('title').get_text()
85
- return title
86
-
87
-
88
  @app.get("/get_scraped_data")
89
  async def get_data(url: str):
90
- headers = {'User-Agent': 'Mozilla/5.0'}
91
- response = requests.get(url, headers=headers)
92
- soup = BeautifulSoup(response.content, 'html.parser')
93
-
94
- title = get_title(soup)
95
- links = get_links(soup)
96
- text_content = get_text_content(soup)
97
-
98
- if links==[]:
99
- print("running alternative scrapper")
100
- links,text_content = await power_scrapper(url)
101
 
102
- return ({"title": title ,"URL":links,"Content":text_content})
 
1
  from fastapi import FastAPI, HTTPException
 
 
 
 
 
2
  from fastapi.middleware.cors import CORSMiddleware
3
+ from scraper import Scraper
4
+
 
 
 
5
 
6
  try: from pip._internal.operations import freeze
7
  except ImportError: # pip < 10.0
 
19
  allow_headers=["*"],
20
  )
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  @app.get("/get_scraped_data")
23
  async def get_data(url: str):
24
+ try:
25
+ data = await Scraper.scrape(url)
26
+ return data
27
+ except Exception as e:
28
+ raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
29