# Page header residue: uploaded by slimshadow -- commit "Create app.py" (327fc2b, verified).
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
# FastAPI application instance; route handlers in this file register on it
# through its decorators (e.g. ``@app.get``).
app = FastAPI()
# Upper bound (seconds) for the outbound request so an unresponsive host
# cannot hold the handler open indefinitely.
_REQUEST_TIMEOUT_SECONDS = 10


def _extract_entries(html):
    """Parse *html* and return one result dict per title/summary pair.

    Titles are ``<h1 class="entry-title">`` elements and summaries are
    ``<div class="entry-summary">`` elements; they are paired by position.

    Args:
        html: Raw HTML document (bytes or str) to parse.

    Returns:
        A list of dicts with the keys ``"title"`` (anchor text), ``"link"``
        (anchor ``href``, or ``None`` when the attribute is absent) and
        ``"links"`` (whitespace-separated tokens of the summary's first
        ``<p>`` that start with ``"http"``).
    """
    soup = BeautifulSoup(html, "html.parser")
    titles = soup.find_all("h1", class_="entry-title")
    summaries = soup.find_all("div", class_="entry-summary")

    entries = []
    # zip() pairs by position and stops at the shorter of the two lists.
    for title, summary in zip(titles, summaries):
        anchor = title.find("a")
        if anchor is None:
            # A title without a link has nothing to report; skip it instead
            # of failing the whole page.
            continue

        paragraph = summary.find("p")
        tokens = paragraph.text.split() if paragraph is not None else []

        entries.append({
            "title": anchor.text,
            "link": anchor.get("href"),
            "links": [token for token in tokens if token.startswith("http")],
        })
    return entries


@app.get("/scrape")
async def scrape_titles_and_links(url: str):
    """Fetch *url* and return the entry titles, links and summary links.

    Args:
        url: Absolute ``http``/``https`` URL of the page to scrape.

    Returns:
        ``{"results": [...]}`` where each item has the keys ``"title"``,
        ``"link"`` and ``"links"``.

    Raises:
        HTTPException: 400 when *url* is not an http(s) URL; 500 when the
            request fails, the upstream status is not 200, or the page
            cannot be parsed.
    """
    # SECURITY NOTE: *url* is caller-supplied, so this endpoint makes the
    # server fetch arbitrary addresses (SSRF). Only the scheme is validated
    # here; restrict the allowed hosts before exposing this publicly.
    try:
        scheme = urlsplit(url).scheme
    except ValueError:
        scheme = ""
    if scheme not in ("http", "https"):
        raise HTTPException(
            status_code=400,
            detail="URL must start with http:// or https://",
        )

    try:
        # requests is blocking; run it in the thread pool so the event loop
        # keeps serving other requests while this one waits on the network.
        response = await run_in_threadpool(
            requests.get, url, timeout=_REQUEST_TIMEOUT_SECONDS
        )
    except requests.RequestException as e:
        raise HTTPException(
            status_code=500, detail="An error occurred: " + str(e)
        ) from e

    # Raised outside any try block so the message is not re-wrapped by a
    # generic exception handler.
    if response.status_code != 200:
        raise HTTPException(
            status_code=500,
            detail="Failed to retrieve the webpage. Status code: "
            + str(response.status_code),
        )

    try:
        results = _extract_entries(response.content)
    except Exception as e:
        raise HTTPException(
            status_code=500, detail="An error occurred: " + str(e)
        ) from e

    return {"results": results}