File size: 2,004 Bytes
327fc2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import asyncio

import requests
from bs4 import BeautifulSoup
from fastapi import FastAPI, HTTPException

# FastAPI application object; route handlers are registered on it through
# decorators such as ``@app.get``.
app = FastAPI()

# Upper bound (seconds) on the outbound fetch so an unresponsive host cannot
# hold a request open forever.
_SCRAPE_TIMEOUT_SECONDS = 10


@app.get("/scrape")
async def scrape_titles_and_links(url: str):
    """Fetch ``url`` and extract entry titles together with their summary links.

    The page is parsed for ``<h1 class="entry-title">`` headings and
    ``<div class="entry-summary">`` blocks, which are paired in document order.

    Args:
        url: Address of the page to download and parse.

    Returns:
        ``{"results": [...]}`` where each item is a dict with:
        ``"title"`` (text of the heading's first ``<a>``, or the heading text
        if it has no anchor), ``"link"`` (that anchor's ``href`` or ``None``),
        and ``"links"`` (whitespace-separated tokens of the summary's first
        ``<p>`` that start with ``"http"``).

    Raises:
        HTTPException: status 500 when the page does not answer with HTTP 200,
            or when fetching/parsing fails for any other reason.
    """
    # NOTE(security): ``url`` is caller-controlled and fetched from the server,
    # which allows requests to internal hosts (SSRF). Restrict the allowed
    # hosts/schemes before exposing this endpoint to untrusted clients.
    try:
        # ``requests`` is blocking; run it in a worker thread so the event
        # loop keeps serving other requests while the download is in flight.
        response = await asyncio.to_thread(
            requests.get, url, timeout=_SCRAPE_TIMEOUT_SECONDS
        )

        if response.status_code != 200:
            raise HTTPException(
                status_code=500,
                detail=f"Failed to retrieve the webpage. Status code: {response.status_code}",
            )

        soup = BeautifulSoup(response.content, 'html.parser')
        titles = soup.find_all('h1', class_='entry-title')
        summaries = soup.find_all('div', class_='entry-summary')

        results = []
        # Headings and summaries are matched purely by position; zip() stops
        # at the shorter list, so unmatched trailing elements are ignored.
        for title, summary in zip(titles, summaries):
            anchor = title.find('a')
            if anchor is None:
                # Heading without a link: keep the entry rather than failing
                # the whole response.
                title_text = title.get_text()
                title_link = None
            else:
                title_text = anchor.text
                title_link = anchor.get('href')

            paragraph = summary.find('p')
            words = paragraph.text.split() if paragraph is not None else []

            results.append({
                "title": title_text,
                "link": title_link,
                "links": [word for word in words if word.startswith('http')],
            })

        return {"results": results}

    except HTTPException:
        # Already a well-formed API error (e.g. the non-200 case above);
        # do not re-wrap it in the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail="An error occurred: " + str(e)) from e