from fastapi import FastAPI, HTTPException
from bs4 import BeautifulSoup
import requests

app = FastAPI()


async def scrape_titles_and_links(url: str):
    try:
        # Send a GET request to the URL
        # (note: requests.get is blocking; a fully async app would use an async HTTP client)
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all h1 elements with class "entry-title"
            titles = soup.find_all('h1', class_='entry-title')

            # Find all div elements with class "entry-summary"
            summaries = soup.find_all('div', class_='entry-summary')

            # Initialize an empty list to store the results
            results = []

            # Iterate over the titles and summaries
            for title, summary in zip(titles, summaries):
                # Extract the title text and link
                title_text = title.find('a').text
                title_link = title.find('a')['href']

                # Extract the links from the summary
                links = summary.find('p').text.split()

                # Filter out links that don't start with 'http'
                links = [link for link in links if link.startswith('http')]

                # Append the result to the list
                results.append({
                    "title": title_text,
                    "link": title_link,
                    "links": links
                })

            # Return the results
            return {"results": results}
        else:
            raise HTTPException(
                status_code=500,
                detail="Failed to retrieve the webpage. Status code: " + str(response.status_code)
            )
    except HTTPException:
        # Re-raise HTTPExceptions as-is so the generic handler below does not rewrap them
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail="An error occurred: " + str(e))
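
The code above creates a FastAPI app and a scraper coroutine but never wires the two together, so as written there is no endpoint to call. A minimal sketch of how it could be exposed is shown below; the "/scrape" path, the "url" query parameter, and the module name "main" are illustrative assumptions, not part of the original code.

# Hypothetical route registration for the scraper above.
# The "/scrape" path and "url" query parameter are assumptions.
@app.get("/scrape")
async def scrape(url: str):
    return await scrape_titles_and_links(url)

With the route in place, the app could be started with "uvicorn main:app --reload" (assuming the file is saved as main.py) and queried with, for example, curl "http://127.0.0.1:8000/scrape?url=https://example.com", which returns the {"results": [...]} payload built by scrape_titles_and_links.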