# NOTE: Hugging Face Space status banner ("Spaces: Sleeping") — extraction
# residue from the web UI, not part of the program.
from fastapi import FastAPI, HTTPException | |
from fastapi.middleware.cors import CORSMiddleware | |
from scraper import Scraper | |
# Dump the installed package set at startup for debugging.
# pip >= 10 moved the freeze helper under pip._internal.
try:
    from pip._internal.operations import freeze
except ImportError:  # pip < 10.0
    from pip.operations import freeze

for installed_pkg in freeze.freeze():
    print(installed_pkg)
# FastAPI application instance for this service.
app = FastAPI()

# Allow cross-origin requests from any frontend.
# NOTE(review): allow_origins=["*"] combined with allow_credentials=True is
# wider than the CORS spec permits (Starlette handles it by echoing the
# request Origin) — confirm this open policy is intended for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
async def get_data(url: str) -> str:
    """Fetch *url*, print its readable text and links, and return "done".

    Sends a GET request; on HTTP 200 parses the page with BeautifulSoup,
    prints the text of paragraph/heading elements and every hyperlink
    target, then returns "done". On any other status it prints a failure
    message instead (no exception is raised).

    NOTE(review): this looks like it is meant to be a FastAPI route
    handler, but no @app.get(...) decorator is visible in this chunk —
    confirm against the rest of the file. The original body also had an
    unreachable `await Scraper.scrape(url)` try/except after the
    unconditional `return "done"`; that dead code has been removed.
    """
    # Local imports keep the scraping dependencies off module import time.
    import requests
    from bs4 import BeautifulSoup

    # Timeout prevents this (blocking) call from hanging the handler
    # forever on a stalled server.
    response = requests.get(url, timeout=30)

    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Readable text: paragraphs and all heading levels.
        elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
        body_text = "\n".join(element.get_text().strip() for element in elements)

        # Every hyperlink target on the page.
        links = [a_tag['href'] for a_tag in soup.find_all('a', href=True)]

        print("Body Text:")
        print(body_text)
        print("\nLinks:")
        for link in links:
            print(link)
    else:
        print("Failed to retrieve the webpage")

    return "done"