slimshadow committed
Commit 327fc2b · verified · 1 Parent(s): 0b598f1

Create app.py

Files changed (1)
  1. app.py +53 -0
app.py ADDED
@@ -0,0 +1,53 @@
+ from fastapi import FastAPI, HTTPException
+ from bs4 import BeautifulSoup
+ import requests
+
+ app = FastAPI()
+
+ @app.get("/scrape")
+ async def scrape_titles_and_links(url: str):
+     try:
+         # Send a GET request to the URL
+         response = requests.get(url)
+
+         # Check if the request was successful
+         if response.status_code == 200:
+             # Parse the HTML content using BeautifulSoup
+             soup = BeautifulSoup(response.content, 'html.parser')
+
+             # Find all h1 elements with class "entry-title"
+             titles = soup.find_all('h1', class_='entry-title')
+
+             # Find all div elements with class "entry-summary"
+             summaries = soup.find_all('div', class_='entry-summary')
+
+             # Initialize an empty list to store the results
+             results = []
+
+             # Iterate over the titles and summaries
+             for title, summary in zip(titles, summaries):
+                 # Extract the title text and link
+                 title_text = title.find('a').text
+                 title_link = title.find('a')['href']
+
+                 # Extract the links from the summary
+                 links = summary.find('p').text.split()
+
+                 # Filter out links that don't start with 'http'
+                 links = [link for link in links if link.startswith('http')]
+
+                 # Append the result to the list
+                 results.append({
+                     "title": title_text,
+                     "link": title_link,
+                     "links": links
+                 })
+
+             # Return the results
+             return {"results": results}
+
+         else:
+             raise HTTPException(status_code=500, detail="Failed to retrieve the webpage. Status code: " + str(response.status_code))
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail="An error occurred: " + str(e))
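
A minimal way to try the new endpoint, assuming the file is saved as app.py, the dependencies (fastapi, uvicorn, requests, beautifulsoup4) are installed, and the server has been started locally with "uvicorn app:app --reload". The target URL below is only a placeholder for a page whose posts use <h1 class="entry-title"> and <div class="entry-summary"> markup, as the scraper expects:

import requests

# Call the /scrape endpoint of the locally running app (placeholder target URL).
resp = requests.get(
    "http://127.0.0.1:8000/scrape",
    params={"url": "https://example.com/blog"},
    timeout=30,
)
resp.raise_for_status()

# Print each scraped title with its link and any extra links found in the summary.
for item in resp.json()["results"]:
    print(item["title"], "->", item["link"], item["links"])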