from fastapi import FastAPI, HTTPException
from bs4 import BeautifulSoup
import requests

app = FastAPI()


async def scrape_titles_and_links(url: str):
    try:
        # Send a GET request to the URL
        # (note: requests.get is blocking; a fully async app would use an async HTTP client)
        response = requests.get(url)

        # Check if the request was successful
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, 'html.parser')

            # Find all h1 elements with class "entry-title"
            titles = soup.find_all('h1', class_='entry-title')

            # Find all div elements with class "entry-summary"
            summaries = soup.find_all('div', class_='entry-summary')

            # Initialize an empty list to store the results
            results = []

            # Iterate over the titles and summaries
            for title, summary in zip(titles, summaries):
                # Extract the title text and link
                title_text = title.find('a').text
                title_link = title.find('a')['href']

                # Extract the links from the summary
                links = summary.find('p').text.split()

                # Filter out links that don't start with 'http'
                links = [link for link in links if link.startswith('http')]

                # Append the result to the list
                results.append({
                    "title": title_text,
                    "link": title_link,
                    "links": links
                })

            # Return the results
            return {"results": results}
        else:
            raise HTTPException(
                status_code=500,
                detail="Failed to retrieve the webpage. Status code: " + str(response.status_code)
            )
    except HTTPException:
        # Re-raise HTTPExceptions as-is so the generic handler below does not rewrap them
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail="An error occurred: " + str(e))
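
The code above creates a FastAPI app and a scraper coroutine but never wires the two together, so as written there is no endpoint to call. A minimal sketch of how it could be exposed is shown below; the "/scrape" path, the "url" query parameter, and the module name "main" are illustrative assumptions, not part of the original code.

# Hypothetical route registration for the scraper above.
# The "/scrape" path and "url" query parameter are assumptions.
@app.get("/scrape")
async def scrape(url: str):
    return await scrape_titles_and_links(url)

With the route in place, the app could be started with "uvicorn main:app --reload" (assuming the file is saved as main.py) and queried with, for example, curl "http://127.0.0.1:8000/scrape?url=https://example.com", which returns the {"results": [...]} payload built by scrape_titles_and_links.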