from bs4 import BeautifulSoup
import requests


class ScrapeJobDetails:
    def __init__(self, urls: list):
        self.url_list = urls

    def _fetchUrlData(self, url: str):
        # Send a browser-like User-Agent and language header so the request
        # is less likely to be blocked by the target site.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
            'Accept-Language': 'en-US,en;q=0.9',
        }
        return requests.get(url, headers=headers).text

    def _parseHtml(self, html_content):
        # Parse the page and collect the text of paragraph and list elements.
        # Note: a 'ul' element's text includes its 'li' children, so list
        # content can appear twice in the output.
        soup = BeautifulSoup(html_content, 'lxml')
        tags_to_extract = ['p', 'li', 'ul']
        extracted_text = []
        for tag in tags_to_extract:
            for element in soup.find_all(tag):
                extracted_text.append(element.get_text(strip=True))
        return "\n".join(extracted_text)

    def _run(self):
        # Fetch and parse every URL, returning a dict keyed by URL.
        result_dict = {}
        for url in self.url_list:
            html_data = self._fetchUrlData(url)
            result_dict[url] = self._parseHtml(html_data)
        return result_dict
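

# Minimal usage sketch (not part of the original file): the URLs below are
# placeholder values, and the underscore-prefixed methods are called directly
# here purely for illustration.
if __name__ == "__main__":
    example_urls = [
        "https://example.com/job/123",  # hypothetical job-posting URL
        "https://example.com/job/456",  # hypothetical job-posting URL
    ]
    scraper = ScrapeJobDetails(example_urls)
    results = scraper._run()  # maps each URL to its extracted page text
    for url, text in results.items():
        print(url)
        print(text[:200])  # preview the first 200 characters of extracted text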