Arafath10 committed
Commit
92199f9
1 Parent(s): fdf1d1f

Update main.py

Files changed (1)
  1. main.py +37 -35
main.py CHANGED
@@ -19,44 +19,46 @@ app.add_middleware(
      allow_headers=["*"],
  )

+ def get_links(soup):
+     links = []
+     for link in soup.find_all('a'):
+         href = link.get('href')
+         links.append(href)
+     return links
+
+
+ def get_text_content(soup):
+     text_elements = []
+     for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span']:
+         elements = soup.find_all(tag)
+         for element in elements:
+             text_elements.append(element.get_text())
+     return text_elements
+
+ def get_title(soup):
+     title = soup.find('title').get_text()
+     return title
+
  @app.get("/get_scraped_data")
  async def get_data(url: str):
      import requests
      from bs4 import BeautifulSoup
-     # URL of the page to scrape
-     #url = "https://www.imf.org/en/News/Articles/2024/03/21/pr2494-sri-lanka-imf-staff-level-agreement-for-second-review-sla"
-     url = url
-
-     # Send a GET request to the URL
-     response = requests.get(url)
-
-     # Check if the request was successful
-     if response.status_code == 200:
-         # Parse the page content
-         soup = BeautifulSoup(response.content, 'html.parser')
-
-         # Extract all text content (paragraphs, headers, etc.)
-         elements = soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
-         body_text = "\n".join([element.get_text().strip() for element in elements])
-
-         # Extract all links
-         links = []
-         for a_tag in soup.find_all('a', href=True):
-             links.append(a_tag['href'])
-
-         # Print the extracted information
-         print("Body Text:")
-         print(body_text)
-         print("\nLinks:")
-         for link in links:
-             print(link)
-     else:
-         print("Failed to retrieve the webpage")
-     return "done"
-     try:
-         data = await Scraper.scrape(url)
-         return data
-     except Exception as e:
-         raise HTTPException(status_code=500, detail=str(e))
+     headers = {'User-Agent': 'Mozilla/5.0'}
+     response = requests.get(url, headers=headers)
+     soup = BeautifulSoup(response.content, 'html.parser')
+
+     title = get_title(soup)
+     links = get_links(soup)
+     text_content = get_text_content(soup)
+
+     if not links:
+         print("Running alternative scraper")
+
+         try:
+             data = await Scraper.scrape(url)
+             return data
+         except Exception as e:
+             raise HTTPException(status_code=500, detail=str(e))
+     else:
+         return {"title": title, "URL": links, "Content": text_content}
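For quick verification, the updated endpoint can be exercised as below. This is a minimal sketch assuming the app is served locally (e.g. uvicorn main:app --port 8000); the base URL and the page being scraped are illustrative, not part of the commit.

    import requests

    # Hypothetical local deployment; adjust the base URL to your setup.
    BASE = "http://localhost:8000"

    resp = requests.get(
        f"{BASE}/get_scraped_data",
        params={"url": "https://example.com"},  # any public page to scrape
    )
    resp.raise_for_status()
    data = resp.json()
    print(data["title"])    # page <title> text
    print(data["URL"])      # list of hrefs gathered by get_links
    print(data["Content"])  # text of p/h1-h6/span elements

Note that when the page yields no links, the endpoint falls back to Scraper.scrape(url) and returns its result instead, so the response keys above apply only to the primary path.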