Arafath10 committed on
Commit
9097392
1 Parent(s): 37c190c

Update scraper.py

Browse files
Files changed (1) hide show
  1. scraper.py +1 -33
scraper.py CHANGED
@@ -6,38 +6,6 @@ from bs4 import BeautifulSoup
6
  import requests
7
 
8
 
9
- import requests
10
- from bs4 import BeautifulSoup
11
-
12
- # URL of the page to scrape
13
- url = "https://www.imf.org/en/News/Articles/2024/03/21/pr2494-sri-lanka-imf-staff-level-agreement-for-second-review-sla"
14
-
15
- # Send a GET request to the URL
16
- response = requests.get(url)
17
-
18
- # Check if the request was successful
19
- if response.status_code == 200:
20
- # Parse the page content
21
- soup = BeautifulSoup(response.content, 'html.parser')
22
-
23
- # Extract all text content (paragraphs, headers, etc.)
24
- elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
25
- body_text = "\n".join([element.get_text().strip() for element in elements])
26
-
27
- # Extract all links
28
- links = []
29
- for a_tag in soup.find_all('a', href=True):
30
- links.append(a_tag['href'])
31
-
32
- # Print the extracted information
33
- print("Body Text:")
34
- print(body_text)
35
- print("\nLinks:")
36
- for link in links:
37
- print(link)
38
- else:
39
- print("Failed to retrieve the webpage")
40
-
41
 
42
  class Scraper:
43
  @staticmethod
@@ -99,7 +67,7 @@ class Scraper:
99
  @staticmethod
100
  async def scrape(url):
101
  headers = {'User-Agent': 'Mozilla/5.0'}
102
- response = requests.get(url, headers=headers)
103
  soup = BeautifulSoup(response.content, 'html.parser')
104
 
105
  title = Scraper.get_title(soup)
 
6
  import requests
7
 
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
  class Scraper:
11
  @staticmethod
 
67
  @staticmethod
68
  async def scrape(url):
69
  headers = {'User-Agent': 'Mozilla/5.0'}
70
+ response = requests.get(url)
71
  soup = BeautifulSoup(response.content, 'html.parser')
72
 
73
  title = Scraper.get_title(soup)