Nguyen Quang Truong committed on
Commit 6d19d4c · 1 Parent(s): 61162c9
Files changed (4)
  1. .github/workflows/scrape.yml +37 -0
  2. requirements.txt +6 -0
  3. scrape_data.py +17 -0
  4. utils.py +150 -0
.github/workflows/scrape.yml ADDED
@@ -0,0 +1,37 @@
+name: Scrape Data
+
+on:
+  schedule:
+    - cron: '0 0 * * *'       # daily at 00:00 UTC
+  workflow_dispatch:          # allow manual runs from the Actions tab
+
+jobs:
+  scrape:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+
+      - name: Run scraping script
+        run: python scrape_data.py
+
+      - name: Commit and push changes
+        run: |
+          git config --global user.name 'github-actions[bot]'
+          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
+          git add .
+          git diff --staged --quiet || git commit -m "Update scraped data"
+          git push
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
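
The scheduled job can be smoke-tested locally before trusting the nightly cron. A minimal sketch that mirrors the "Run scraping script" step (assuming scrape_data.py and the installed requirements are in the working directory):

    # Local stand-in for the workflow's "Run scraping script" step (a sketch).
    import subprocess
    import sys

    result = subprocess.run([sys.executable, "scrape_data.py"], text=True, capture_output=True)
    print(result.stdout)
    result.check_returncode()  # a non-zero exit would fail the Actions step as well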
requirements.txt ADDED
@@ -0,0 +1,6 @@
+webdriver-manager
+selenium
+beautifulsoup4
+numpy
+pandas
+chromedriver_autoinstaller
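
argparse ships with the standard library and bs4 is only a stub package that pulls in beautifulsoup4, so neither is pinned above. After pip install -r requirements.txt, a quick import check (a sketch, nothing project-specific):

    # Verify the third-party dependencies import cleanly.
    import bs4
    import chromedriver_autoinstaller
    import numpy
    import pandas
    import selenium
    import webdriver_manager

    print("selenium", selenium.__version__)
    print("pandas", pandas.__version__)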
scrape_data.py ADDED
@@ -0,0 +1,17 @@
+import argparse
+
+from utils import save_data, access, info_job, search, init_driver
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--url", default="https://vn.indeed.com/jobs")
+    parser.add_argument("--job", default="AI, Data Science")
+    parser.add_argument("--location", default="Thành Phố Hồ Chí Minh")
+    args = parser.parse_args()
+
+    # Launch a headless browser, run the search, scrape the results, save to ./data.
+    driver = init_driver()
+    access(driver, args.url)
+    search(driver, args.job, args.location)
+    data = info_job(driver)
+    save_data(data)
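
The defaults target Indeed Vietnam, so e.g. python scrape_data.py --job "Data Engineer" --location "Hà Nội" overrides them. The same pipeline can also be driven straight from Python (a sketch, assuming utils.py from this commit is importable):

    # Equivalent of the CLI defaults, calling the helpers directly.
    from utils import access, info_job, init_driver, save_data, search

    driver = init_driver()
    access(driver, "https://vn.indeed.com/jobs")
    search(driver, "AI, Data Science", "Thành Phố Hồ Chí Minh")
    save_data(info_job(driver))  # info_job() quits the driver before returning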
utils.py ADDED
@@ -0,0 +1,150 @@
+import json
+import os
+import re
+from datetime import datetime, timedelta
+from time import sleep
+
+from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.edge.options import Options
+
+
+def init_driver():
+    """Start a headless Edge driver with a desktop user agent."""
+    edge_options = Options()
+    arguments = [
+        "--headless",
+        "--disable-gpu",
+        "--window-size=1920,1200",
+        "--ignore-certificate-errors",
+        "--disable-extensions",
+        "--no-sandbox",
+        "--disable-dev-shm-usage",
+    ]
+
+    edge_options.add_argument(
+        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+    )
+    for argument in arguments:
+        edge_options.add_argument(argument)
+
+    return webdriver.Edge(options=edge_options)
+
+
+# ____________________________________________
+
+def access(driver, url):
+    print("_" * 30, "ACCESS URL", "_" * 30)
+    driver.get(url)
+    sleep(5)
+
+
+def search(driver, job, location):
+    """Fill the what/where boxes on the Indeed search form and submit."""
+    print("_" * 30, "SEARCH", "_" * 30)
+
+    search_box_job = driver.find_element(By.XPATH, '//input[@id="text-input-what"]')
+    search_box_location = driver.find_element(By.XPATH, '//input[@id="text-input-where"]')
+    search_box_job.send_keys(job)
+    search_box_location.send_keys(location)
+
+    search_box_location.send_keys(Keys.RETURN)
+    driver.implicitly_wait(5)  # sets the element-lookup timeout for later find_element calls
+
+
+def save_data(dict_jd):
+    """Write the scraped jobs to ./data/data_<YYYY_MM_DD>.json."""
+    directory = './data'
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+    today = datetime.today().strftime('%Y_%m_%d')
+    filename = f"{directory}/data_{today}.json"
+
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write(json.dumps(dict_jd, indent=4, ensure_ascii=False))
+
+
+def info_job(driver):
+    """Walk the result pages, scrape every job card, and return {id: job}."""
+    job_id = 0
+
+    # Indeed shows 15 results per page; cap the number of page turns at 15.
+    num_job = driver.find_element(
+        By.XPATH, '//div[@class="jobsearch-JobCountAndSortPane-jobCount css-13jafh6 eu4oa1w0"]//span'
+    ).text
+    num_job = int(re.sub(r'\D', '', num_job))
+    num_next = num_job // 15
+    if num_next > 15:
+        num_next = 15
+
+    dict_job = {}
+    for _ in range(num_next - 2):
+        info_jobs = driver.find_elements(By.XPATH, '//div[@class="job_seen_beacon"]')
+        print("_" * 30, "START", "_" * 30)
+
+        # Dismiss the sign-in popup if it appeared.
+        try:
+            driver.find_element(By.XPATH, '//button[@aria-label="close"]').click()
+        except NoSuchElementException:
+            pass
+
+        for element in info_jobs:
+            element.click()
+            try:
+                # Convert a relative date ("3 days ago") to an absolute YYYY-MM-DD.
+                today = datetime.today()
+                date_post = element.find_element(By.XPATH, './/span[@data-testid="myJobsStateDate"]').text
+                days_ago = re.sub(r'\D', '', date_post)
+                if days_ago != "":
+                    posted_date_str = (today - timedelta(days=int(days_ago))).strftime('%Y-%m-%d')
+                else:
+                    posted_date_str = today.strftime('%Y-%m-%d')
+
+                name_job_ = driver.find_element(By.XPATH, '//h2[@data-testid="jobsearch-JobInfoHeader-title"]/span').text
+                name_job = name_job_.replace("- job post", "").strip()
+
+                name_company = driver.find_element(By.XPATH, '//div[@data-testid="inlineHeader-companyName"]/span/a').text
+
+                location = driver.find_element(By.XPATH, '//div[@data-testid="inlineHeader-companyLocation"]/div').text
+
+                # The description pane is rendered HTML; strip the tags with BeautifulSoup.
+                job_description = driver.find_elements(By.XPATH, '//div[@id="jobDescriptionText"]')
+
+                content_jd = ""
+                for jd in job_description:
+                    text = BeautifulSoup(jd.get_attribute("innerHTML"), 'html.parser').get_text()
+                    content_jd += text.replace("\n", " ").replace("  ", " ")
+
+                job_id += 1
+                if job_id not in dict_job:
+                    dict_job[job_id] = {
+                        "ID": job_id,
+                        "job": name_job,
+                        "company": name_company,
+                        "location": location,
+                        "job_description": content_jd,
+                        "date_post": posted_date_str,
+                    }
+
+                sleep(4)
+            except NoSuchElementException:
+                pass
+
+        try:
+            next_button = driver.find_element(By.XPATH, '//a[@data-testid="pagination-page-next"]')
+            next_button.click()
+            sleep(4)
+        except NoSuchElementException:
+            break
+        try:
+            driver.find_element(By.XPATH, '//button[@aria-label="close"]').click()
+        except NoSuchElementException:
+            pass
+
+    driver.quit()
+    return dict_job
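
The paging count and the relative-date conversion are the least obvious parts of info_job. A self-contained sketch of both (the sample strings are illustrative, not Indeed's exact text):

    import re
    from datetime import datetime, timedelta

    # Paging: 15 results per page, at most 15 page turns, loop runs num_next - 2 times.
    for num_job in (30, 300, 1000):
        num_next = min(num_job // 15, 15)
        print(num_job, "results ->", max(num_next - 2, 0), "iterations")

    # Dates: strip non-digits, then subtract that many days from today.
    for date_post in ("Posted 3 days ago", "Just posted"):
        days = re.sub(r"\D", "", date_post)
        when = datetime.today() - timedelta(days=int(days)) if days else datetime.today()
        print(date_post, "->", when.strftime("%Y-%m-%d"))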