Arafath10 committed on
Commit
5f7a419
1 Parent(s): 4728ec1

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +34 -0
scraper.py CHANGED
@@ -5,6 +5,40 @@ from playwright.async_api import async_playwright
5
  from bs4 import BeautifulSoup
6
  import requests
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  class Scraper:
9
  @staticmethod
10
  async def power_scrapper(url):
 
5
  from bs4 import BeautifulSoup
6
  import requests
7
 
8
+
9
import requests
from bs4 import BeautifulSoup

# Module-level script: fetch one IMF press-release page and print its
# paragraph/heading text followed by every hyperlink target found on it.

# URL of the page to scrape
url = "https://www.imf.org/en/News/Articles/2024/03/21/pr2494-sri-lanka-imf-staff-level-agreement-for-second-review-sla"

# Send a GET request to the URL. requests applies no timeout by default, so
# an explicit one keeps a stalled server from blocking this code forever.
try:
    response = requests.get(url, timeout=30)
except requests.RequestException:
    # Connection-level failures (DNS, refused connection, timeout) are
    # reported through the same failure branch as a non-200 status.
    response = None

# Check if the request was successful
if response is not None and response.status_code == 200:
    # Parse the page content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract all text content (paragraphs, headers, etc.)
    elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    body_text = "\n".join(element.get_text().strip() for element in elements)

    # Extract all links; href=True skips anchors that have no href attribute
    links = [a_tag['href'] for a_tag in soup.find_all('a', href=True)]

    # Print the extracted information
    print("Body Text:")
    print(body_text)
    print("\nLinks:")
    for link in links:
        print(link)
else:
    print("Failed to retrieve the webpage")
40
+
41
+
42
  class Scraper:
43
  @staticmethod
44
  async def power_scrapper(url):