Spaces:

slimshadow
/

4links-scraper-api

Running

App Files Files Community

slimshadow committed on Dec 9, 2024

Commit

327fc2b

verified ·

1 Parent(s): 0b598f1

Create app.py

Browse files

Files changed (1) hide show

app.py +53 -0

app.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from fastapi import FastAPI, HTTPException
+from bs4 import BeautifulSoup
+import requests
+# Application instance; the @app.get("/scrape") route below registers its handler on it.
+app = FastAPI()
+@app.get("/scrape")
+async def scrape_titles_and_links(url: str):
+    try:
+        # Send a GET request to the URL
+        response = requests.get(url)
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Parse the HTML content using BeautifulSoup
+            soup = BeautifulSoup(response.content, 'html.parser')
+            # Find all h1 elements with class "entry-title"
+            titles = soup.find_all('h1', class_='entry-title')
+            # Find all div elements with class "entry-summary"
+            summaries = soup.find_all('div', class_='entry-summary')
+            # Initialize an empty list to store the results
+            results = []
+            # Iterate over the titles and summaries
+            for title, summary in zip(titles, summaries):
+                # Extract the title text and link
+                title_text = title.find('a').text
+                title_link = title.find('a')['href']
+                # Extract the links from the summary
+                links = summary.find('p').text.split()
+                # Filter out links that don't start with 'http'
+                links = [link for link in links if link.startswith('http')]
+                # Append the result to the list
+                results.append({
+                    "title": title_text,
+                    "link": title_link,
+                    "links": links
+                })
+            # Return the results
+            return {"results": results}
+        else:
+            raise HTTPException(status_code=500, detail="Failed to retrieve the webpage. Status code: " + str(response.status_code))
+    except Exception as e:
+        raise HTTPException(status_code=500, detail="An error occurred: " + str(e))