Spaces:
Runtime error
Runtime error
Nguyen Quang Truong
committed on
Commit
·
6d19d4c
1
Parent(s):
61162c9
load file
Browse files- .github/workflows/scrape.yml +38 -0
- requirements.txt +8 -0
- scrape_data.py +26 -0
- utils.py +168 -0
.github/workflows/scrape.yml
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
name: Scrape Data

on:
  schedule:
    - cron: '0 0 * * *'
  workflow_dispatch:

# The built-in GITHUB_TOKEN needs write access to push the scraped data back.
permissions:
  contents: write

jobs:
  scrape:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run scraping script
        run: python scrape_data.py

      - name: Commit and push changes
        run: |
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git add .
          # `git commit` exits non-zero when there is nothing staged, which
          # would fail the scheduled job on days with no new data — only
          # commit and push when the index actually changed.
          git diff --cached --quiet || (git commit -m "Update scraped data" && git push)
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Dependencies for scrape_data.py / utils.py (Indeed scraping).
# NOTE: `argparse` was removed — it ships with the Python standard library;
# the PyPI package of the same name is an obsolete backport that can shadow
# the stdlib module on modern interpreters.
# NOTE: `bs4` was removed — it is only a dummy wrapper that installs
# beautifulsoup4, which is already listed below.
beautifulsoup4
chromedriver_autoinstaller
numpy
pandas
selenium
webdriver-manager
scrape_data.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
from selenium.webdriver.edge.options import Options
from selenium import webdriver
from utils import save_data, access, info_job, search, init_driver


if __name__ == "__main__":
    # CLI: every option falls back to the Indeed Vietnam / HCMC defaults.
    cli = argparse.ArgumentParser()
    cli.add_argument("--url", default="https://vn.indeed.com/jobs")
    cli.add_argument("--job", default="AI, Data Science")
    cli.add_argument("--location", default="Thành Phố Hồ Chí Minh")
    opts = cli.parse_args()

    # Spin up the headless browser, run the search, scrape, and persist.
    browser = init_driver()
    access(browser, opts.url)
    search(browser, opts.job, opts.location)
    scraped = info_job(browser)
    save_data(scraped)
utils.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from selenium import webdriver
|
2 |
+
from time import sleep
|
3 |
+
import random
|
4 |
+
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
|
5 |
+
from selenium.webdriver.common.by import By
|
6 |
+
from selenium.webdriver.edge.options import Options
|
7 |
+
from selenium.webdriver.common.keys import Keys
|
8 |
+
from bs4 import BeautifulSoup
|
9 |
+
import re
|
10 |
+
import datetime
|
11 |
+
import json
|
12 |
+
import os
|
13 |
+
from datetime import datetime, timedelta
|
14 |
+
|
15 |
+
def init_driver():
    """Build a headless Edge WebDriver configured for CI/container use.

    Returns a ready-to-use ``webdriver.Edge`` instance with GPU/sandbox
    disabled and a desktop Chrome user-agent string.
    """
    driver_options = Options()

    # A desktop UA first (mirrors the original call order), then the flags.
    user_agent = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    flags = (
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1200",
        "--ignore-certificate-errors",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    for argument in (user_agent, *flags):
        driver_options.add_argument(argument)

    return webdriver.Edge(options=driver_options)
36 |
+
|
37 |
+
|
38 |
+
#____________________________________________
|
39 |
+
|
40 |
+
def access(driver, url):
    """Navigate *driver* to *url*, then pause so the page can finish loading."""
    banner = "_" * 30
    print(banner, "ACCESS URL", banner)
    driver.get(url)
    # Fixed wait: the result page renders asynchronously after navigation.
    sleep(5)
44 |
+
|
45 |
+
|
46 |
+
def search(driver, job, location):
    """Fill Indeed's "what"/"where" search boxes and submit the query."""
    banner = "_" * 30
    print(banner, "SEARCH", banner)

    what_box = driver.find_element(By.XPATH, '//input[@id="text-input-what"]')
    where_box = driver.find_element(By.XPATH, '//input[@id="text-input-where"]')

    what_box.send_keys(job)
    where_box.send_keys(location)

    # Pressing Enter in the location field triggers the search.
    where_box.send_keys(Keys.RETURN)
    driver.implicitly_wait(5)
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
def save_data(dict_jd):
    """Persist scraped jobs to ``./data/data_YYYY_MM_DD.json``.

    *dict_jd* is serialized as pretty-printed, non-ASCII-preserving JSON;
    the ``./data`` directory is created on first use.
    """
    out_dir = './data'
    os.makedirs(out_dir, exist_ok=True)

    stamp = datetime.today().strftime('%Y_%m_%d')
    out_path = f"{out_dir}/data_{stamp}.json"

    payload = json.dumps(dict_jd, indent=4, ensure_ascii=False)
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(payload)
72 |
+
|
73 |
+
|
74 |
+
def info_job(driver):
    """Scrape job postings from the Indeed result pages open in *driver*.

    Walks up to ~15 result pages, clicks each job card, and collects title,
    company, location, plain-text description and an absolute posting date.
    Quits the driver before returning.

    Returns a dict mapping a 1-based integer ID to a per-job dict with keys
    "ID", "job", "company", "location", "job_description", "date_post".
    """

    # NOTE(review): `id` shadows the builtin; it is the running job counter
    # used as the dict key.
    id=0

    # Total result count, e.g. "1,234 jobs" -> 1234; Indeed shows ~15 per page.
    num_job= driver.find_element(By.XPATH, '//div[@class="jobsearch-JobCountAndSortPane-jobCount css-13jafh6 eu4oa1w0"]//span').text
    num_job_=re.sub(r'\D', '', num_job)
    num_job=int(num_job_)
    num_next= num_job//15

    # Cap the crawl at 15 result pages.
    if num_next >15 :
        num_next=15

    dict_job={}
    # NOTE(review): range(0, num_next-2) visits num_next-2 pages, so nothing
    # is scraped when num_next <= 2 — confirm the -2 offset is intended.
    for i in range(0,num_next-2):
        info_jobs = driver.find_elements(By.XPATH, '//div[@class="job_seen_beacon"]')
        print("_"*30, "START","_"*30)

        # Dismiss a popup/overlay if present; otherwise it blocks card clicks.
        try:
            close = driver.find_element(By.XPATH, '//button[@aria-label="close"]')
            close.click()
        except NoSuchElementException:
            pass

        for element in info_jobs:
            # Clicking the card loads its details into the side pane.
            element.click()
            try:
                today = datetime.today()
                # Relative age like "3 days ago" -> digits only; empty string
                # (no digits) is treated as "posted today".
                date_post= element.find_element(By.XPATH, './/span[@data-testid="myJobsStateDate"]').text
                date_post_=re.sub(r'\D', '', date_post)
                if date_post_ != "":
                    posted_date = today - timedelta(days=int(date_post_))
                    posted_date_str = posted_date.strftime('%Y-%m-%d')
                else:
                    posted_date_str=today.strftime('%Y-%m-%d')

                # Detail-pane fields (searched on `driver`, not `element`,
                # because the pane lives outside the card).
                name_job_ = driver.find_element(By.XPATH, '//h2[@data-testid="jobsearch-JobInfoHeader-title"]/span').text
                name_job = name_job_.replace("- job post", "").strip()

                name_company = driver.find_element(By.XPATH, '//div[@data-testid="inlineHeader-companyName"]/span/a').text

                location = driver.find_element(By.XPATH, '//div[@data-testid="inlineHeader-companyLocation"]/div').text

                job_description = driver.find_elements(By.XPATH, '//div[@id="jobDescriptionText"]')

                # Flatten the description HTML to text.
                # NOTE(review): the second replace strips spaces — possibly
                # meant to collapse doubled spaces; confirm against the
                # original file (whitespace may have been lost in extraction).
                content_jd = ""
                for jd in job_description:
                    get_html = jd.get_attribute("innerHTML")
                    parser = BeautifulSoup(get_html, 'html.parser')
                    jd = parser.get_text()
                    content_jd += jd.replace("\n"," ").replace(" ","")
                id+=1

                # EAFP guard: only create the entry when this ID is unused.
                try:
                    dict_job[id]
                except KeyError:
                    dict_job[id] = {
                        "ID":id,
                        "job":name_job,
                        "company": name_company,
                        "location": location,
                        "job_description":content_jd,
                        "date_post": posted_date_str

                    }

                # Throttle between cards to avoid tripping rate limits.
                sleep(4)
            except NoSuchElementException:
                # Any missing sub-element skips this card entirely.
                pass

        # Advance to the next result page; stop the crawl when there is none.
        try:
            next = driver.find_element(By.XPATH, '//a[@data-testid="pagination-page-next"]')
            next.click()
            sleep(4)
        except NoSuchElementException:
            break;
        # Dismiss the popup that may appear after pagination.
        try:
            close = driver.find_element(By.XPATH, '//button[@aria-label="close"]')
            close.click()
        except NoSuchElementException:
            pass

    driver.quit()
    return dict_job
167 |
+
|
168 |
+
|