from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import requests
# Print the installed packages at startup (handy for debugging the environment).
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0
    from pip.operations import freeze

pkgs = freeze.freeze()
for pkg in pkgs:
    print(pkg)
app = FastAPI()

# Allow cross-origin requests from any origin so the API can be called from browsers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def power_scrapper(url):
    """Fallback scraper: renders the page with headless Chromium via Playwright."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        # Block everything except documents and scripts to speed up loading
        await page.route(
            "**/*",
            lambda route: route.continue_()
            if route.request.resource_type in ["document", "script"]
            else route.abort(),
        )

        # Open the target website
        await page.goto(url, wait_until='domcontentloaded')

        # Give dynamic content a moment to load (Playwright timeouts are in milliseconds)
        await page.wait_for_timeout(10)

        # Extract the href of every anchor on the page
        links = await page.query_selector_all('a')
        page_url = []
        page_content = []
        for link in links:
            href = await link.get_attribute('href')
            page_url.append(href)

        # Extract the text content of every element in the body
        elements = await page.query_selector_all('body *')
        for element in elements:
            text_content = await element.text_content()
            if text_content and text_content.strip():
                page_content.append(text_content.strip())

        await browser.close()
        return page_url, page_content
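
# Quick standalone check (a sketch, not part of the API): power_scrapper can be
# run outside FastAPI with asyncio, e.g.
#
#   import asyncio
#   urls, texts = asyncio.run(power_scrapper("https://example.com"))
#
# "https://example.com" is a placeholder URL for illustration.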
def get_links(soup):
    # Collect the href of every anchor in the statically fetched HTML.
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        links.append(href)
    return links
def get_text_content(soup):
    # Collect visible text from the most common text-bearing tags.
    text_elements = []
    for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
        elements = soup.find_all(tag)
        for element in elements:
            text_elements.append(element.get_text())
    return text_elements
def get_title(soup):
    # Return the page title, or an empty string if the page has no <title> tag.
    title_tag = soup.find('title')
    return title_tag.get_text() if title_tag else ""
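
# Tiny illustration of the helpers above (a sketch; the HTML string is made up):
#
#   soup = BeautifulSoup("<title>Hi</title><p>Hello</p>", "html.parser")
#   get_title(soup)         # -> "Hi"
#   get_links(soup)         # -> []  (no anchors, so the endpoint would fall back)
#   get_text_content(soup)  # -> ["Hello"]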
@app.get("/get_scraped_data")
async def get_data(url: str):
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        raise HTTPException(status_code=502, detail=f"Failed to fetch {url}: {exc}")

    soup = BeautifulSoup(response.content, 'html.parser')
    title = get_title(soup)
    links = get_links(soup)
    text_content = get_text_content(soup)

    # If the static fetch found no links, the page is likely rendered client-side,
    # so fall back to the Playwright-based scraper.
    if not links:
        print("running alternative scraper")
        links, text_content = await power_scrapper(url)

    return {"title": title, "URL": links, "Content": text_content}
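
# A minimal local entry point (a sketch, assuming uvicorn is installed, which
# FastAPI deployments typically use; port 7860 is the Hugging Face Spaces default).
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)

# Example request once the server is running:
#   curl "http://localhost:7860/get_scraped_data?url=https://example.com"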