from selenium import webdriver
from time import sleep
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import json
import os
from datetime import datetime, timedelta


def init_driver():
    """Create a headless Edge driver with a desktop user agent."""
    edge_options = Options()
    options = [
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1200",
        "--ignore-certificate-errors",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ]
    edge_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    for option in options:
        edge_options.add_argument(option)
    driver = webdriver.Edge(options=edge_options)
    return driver

# ____________________________________________

def access(driver, url):
    """Open the target URL and wait for the page to settle."""
    print("_" * 30, "ACCESS URL", "_" * 30)
    driver.get(url)
    sleep(5)


def search(driver, job, location):
    """Fill in the 'what'/'where' search boxes and submit the query."""
    print("_" * 30, "SEARCH", "_" * 30)
    search_box_job = driver.find_element(By.XPATH, '//input[@id="text-input-what"]')
    search_box_location = driver.find_element(By.XPATH, '//input[@id="text-input-where"]')
    search_box_job.send_keys(job)
    search_box_location.send_keys(location)
    search_box_location.send_keys(Keys.RETURN)
    driver.implicitly_wait(5)


def save_data(dict_jd):
    """Dump the scraped jobs to ./data/data_YYYY_MM_DD.json."""
    directory = "./data"
    if not os.path.exists(directory):
        os.makedirs(directory)
    today = datetime.today().strftime("%Y_%m_%d")
    filename = f"{directory}/data_{today}.json"
    json_file = json.dumps(dict_jd, indent=4, ensure_ascii=False)
    with open(filename, "w", encoding="utf-8") as f:
        f.write(json_file)


def info_job(driver):
    """Walk the result pages, click each job card, and collect its details."""
    # NOTE: this XPath depends on Indeed's generated CSS classes and will
    # break whenever the site is restyled.
    num_job = driver.find_element(
        By.XPATH,
        '//div[@class="jobsearch-JobCountAndSortPane-jobCount css-13jafh6 eu4oa1w0"]//span',
    ).text
    num_job = int(re.sub(r"\D", "", num_job))  # e.g. "1,024 jobs" -> 1024
    num_next = num_job // 15  # roughly 15 results per page
    if num_next > 15:
        num_next = 15  # cap the crawl at 15 pages
    dict_job = {}
    for _ in range(num_next - 2):
        info_jobs = driver.find_elements(By.XPATH, '//div[@class="job_seen_beacon"]')
        print("_" * 30, "START", "_" * 30)
        # Dismiss the sign-in popup if it covers the results.
        try:
            driver.find_element(By.XPATH, '//button[@aria-label="close"]').click()
        except NoSuchElementException:
            pass
        for element in info_jobs:
            try:
                element.click()
                today = datetime.today()
                # "Posted 3 days ago" -> "3"; an empty string means posted today.
                date_post = element.find_element(
                    By.XPATH, './/span[@data-testid="myJobsStateDate"]'
                ).text
                days_ago = re.sub(r"\D", "", date_post)
                if days_ago != "":
                    posted_date_str = (today - timedelta(days=int(days_ago))).strftime("%Y-%m-%d")
                else:
                    posted_date_str = today.strftime("%Y-%m-%d")
                name_job = driver.find_element(
                    By.XPATH, '//h2[@data-testid="jobsearch-JobInfoHeader-title"]/span'
                ).text.replace("- job post", "").strip()
                name_company = driver.find_element(
                    By.XPATH, '//div[@data-testid="inlineHeader-companyName"]/span/a'
                ).text
                location = driver.find_element(
                    By.XPATH, '//div[@data-testid="inlineHeader-companyLocation"]/div'
                ).text
                job_description = driver.find_elements(By.XPATH, '//div[@id="jobDescriptionText"]')
                content_jd = ""
                for jd in job_description:
                    html = jd.get_attribute("innerHTML")
                    text = BeautifulSoup(html, "html.parser").get_text()
                    # Collapse newlines and runs of whitespace into single spaces.
                    content_jd += re.sub(r"\s+", " ", text).strip()
                # Key on company@title so the same posting is stored only once.
                job_id = name_company + "@" + name_job
                if job_id not in dict_job:
                    dict_job[job_id] = {
                        "ID": job_id,
                        "job": name_job,
                        "company": name_company,
                        "location": location,
                        "job_description": content_jd,
                        "date_post": posted_date_str,
                    }
                sleep(4)
            except (NoSuchElementException, ElementNotInteractableException):
                pass
        # Move to the next result page; stop when there is none.
        try:
            next_button = driver.find_element(By.XPATH, '//a[@data-testid="pagination-page-next"]')
            next_button.click()
            sleep(4)
        except NoSuchElementException:
            break
        try:
            driver.find_element(By.XPATH, '//button[@aria-label="close"]').click()
        except NoSuchElementException:
            pass
    driver.quit()
    return dict_job
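
# ____________________________________________

# Minimal usage sketch wiring the functions above together. The Indeed
# homepage URL is real; the query ("data engineer") and location ("Remote")
# are illustrative placeholders, and the XPaths above may need updating as
# the site changes.
if __name__ == "__main__":
    driver = init_driver()
    access(driver, "https://www.indeed.com")
    search(driver, "data engineer", "Remote")
    jobs = info_job(driver)  # also quits the driver when finished
    save_data(jobs)
    print(f"Saved {len(jobs)} job postings.")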