Spaces:
Runtime error
Runtime error
Nguyen Quang Truong
committed on
Commit
·
6d19d4c
1
Parent(s):
61162c9
load file
Browse files- .github/workflows/scrape.yml +38 -0
- requirements.txt +8 -0
- scrape_data.py +26 -0
- utils.py +168 -0
.github/workflows/scrape.yml
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
name: Scrape Data

on:
  schedule:
    - cron: '0 0 * * *'
  workflow_dispatch:

# The built-in GITHUB_TOKEN needs write access to push the scraped data back.
permissions:
  contents: write

jobs:
  scrape:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run scraping script
        run: python scrape_data.py

      - name: Commit and push changes
        run: |
          git config --global user.name 'github-actions[bot]'
          git config --global user.email 'github-actions[bot]@users.noreply.github.com'
          git add .
          # `git commit` exits non-zero when there is nothing staged, which
          # would fail the scheduled job on days with no new data — only
          # commit and push when the index actually changed.
          git diff --cached --quiet || (git commit -m "Update scraped data" && git push)
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Dependencies for scrape_data.py / utils.py (Indeed scraping).
# NOTE: `argparse` was removed — it ships with the Python standard library;
# the PyPI package of the same name is an obsolete backport that can shadow
# the stdlib module on modern interpreters.
# NOTE: `bs4` was removed — it is only a dummy wrapper that installs
# beautifulsoup4, which is already listed below.
beautifulsoup4
chromedriver_autoinstaller
numpy
pandas
selenium
webdriver-manager
scrape_data.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse
from selenium.webdriver.edge.options import Options
from selenium import webdriver
from utils import save_data, access, info_job, search, init_driver


if __name__ == "__main__":
    # CLI: every option falls back to the Indeed Vietnam / HCMC defaults.
    cli = argparse.ArgumentParser()
    cli.add_argument("--url", default="https://vn.indeed.com/jobs")
    cli.add_argument("--job", default="AI, Data Science")
    cli.add_argument("--location", default="Thành Phố Hồ Chí Minh")
    opts = cli.parse_args()

    # Spin up the headless browser, run the search, scrape, and persist.
    browser = init_driver()
    access(browser, opts.url)
    search(browser, opts.job, opts.location)
    scraped = info_job(browser)
    save_data(scraped)
utils.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from selenium import webdriver
|
2 |
+
from time import sleep
|
3 |
+
import random
|
4 |
+
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException
|
5 |
+
from selenium.webdriver.common.by import By
|
6 |
+
from selenium.webdriver.edge.options import Options
|
7 |
+
from selenium.webdriver.common.keys import Keys
|
8 |
+
from bs4 import BeautifulSoup
|
9 |
+
import re
|
10 |
+
import datetime
|
11 |
+
import json
|
12 |
+
import os
|
13 |
+
from datetime import datetime, timedelta
|
14 |
+
|
15 |
+
def init_driver():
    """Build a headless Edge WebDriver configured for CI/container use.

    Returns a ready-to-use ``webdriver.Edge`` instance with GPU/sandbox
    disabled and a desktop Chrome user-agent string.
    """
    driver_options = Options()

    # A desktop UA first (mirrors the original call order), then the flags.
    user_agent = (
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    flags = (
        "--headless",
        "--disable-gpu",
        "--window-size=1920,1200",
        "--ignore-certificate-errors",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    )
    for argument in (user_agent, *flags):
        driver_options.add_argument(argument)

    return webdriver.Edge(options=driver_options)
36 |
+
|
37 |
+
|
38 |
+
#____________________________________________
|
39 |
+
|
40 |
+
def access(driver, url):
    """Navigate *driver* to *url*, then pause so the page can finish loading."""
    banner = "_" * 30
    print(banner, "ACCESS URL", banner)
    driver.get(url)
    # Fixed wait: the result page renders asynchronously after navigation.
    sleep(5)
44 |
+
|
45 |
+
|
46 |
+
def search(driver, job, location):
    """Fill Indeed's "what"/"where" search boxes and submit the query."""
    banner = "_" * 30
    print(banner, "SEARCH", banner)

    what_box = driver.find_element(By.XPATH, '//input[@id="text-input-what"]')
    where_box = driver.find_element(By.XPATH, '//input[@id="text-input-where"]')

    what_box.send_keys(job)
    where_box.send_keys(location)

    # Pressing Enter in the location field triggers the search.
    where_box.send_keys(Keys.RETURN)
    driver.implicitly_wait(5)
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
def save_data(dict_jd):
    """Persist scraped jobs to ``./data/data_YYYY_MM_DD.json``.

    *dict_jd* is serialized as pretty-printed, non-ASCII-preserving JSON;
    the ``./data`` directory is created on first use.
    """
    out_dir = './data'
    os.makedirs(out_dir, exist_ok=True)

    stamp = datetime.today().strftime('%Y_%m_%d')
    out_path = f"{out_dir}/data_{stamp}.json"

    payload = json.dumps(dict_jd, indent=4, ensure_ascii=False)
    with open(out_path, "w", encoding="utf-8") as fh:
        fh.write(payload)
72 |
+
|
73 |
+
|
74 |
+
def info_job(driver):
    """Scrape job postings from the Indeed result pages open in *driver*.

    Walks up to ~15 result pages, clicks each job card, and collects title,
    company, location, plain-text description and an absolute posting date.
    Quits the driver before returning.

    Returns a dict mapping a 1-based integer ID to a per-job dict with keys
    "ID", "job", "company", "location", "job_description", "date_post".
    """

    # NOTE(review): `id` shadows the builtin; it is the running job counter
    # used as the dict key.
    id=0

    # Total result count, e.g. "1,234 jobs" -> 1234; Indeed shows ~15 per page.
    num_job= driver.find_element(By.XPATH, '//div[@class="jobsearch-JobCountAndSortPane-jobCount css-13jafh6 eu4oa1w0"]//span').text
    num_job_=re.sub(r'\D', '', num_job)
    num_job=int(num_job_)
    num_next= num_job//15

    # Cap the crawl at 15 result pages.
    if num_next >15 :
        num_next=15

    dict_job={}
    # NOTE(review): range(0, num_next-2) visits num_next-2 pages, so nothing
    # is scraped when num_next <= 2 — confirm the -2 offset is intended.
    for i in range(0,num_next-2):
        info_jobs = driver.find_elements(By.XPATH, '//div[@class="job_seen_beacon"]')
        print("_"*30, "START","_"*30)

        # Dismiss a popup/overlay if present; otherwise it blocks card clicks.
        try:
            close = driver.find_element(By.XPATH, '//button[@aria-label="close"]')
            close.click()
        except NoSuchElementException:
            pass

        for element in info_jobs:
            # Clicking the card loads its details into the side pane.
            element.click()
            try:
                today = datetime.today()
                # Relative age like "3 days ago" -> digits only; empty string
                # (no digits) is treated as "posted today".
                date_post= element.find_element(By.XPATH, './/span[@data-testid="myJobsStateDate"]').text
                date_post_=re.sub(r'\D', '', date_post)
                if date_post_ != "":
                    posted_date = today - timedelta(days=int(date_post_))
                    posted_date_str = posted_date.strftime('%Y-%m-%d')
                else:
                    posted_date_str=today.strftime('%Y-%m-%d')

                # Detail-pane fields (searched on `driver`, not `element`,
                # because the pane lives outside the card).
                name_job_ = driver.find_element(By.XPATH, '//h2[@data-testid="jobsearch-JobInfoHeader-title"]/span').text
                name_job = name_job_.replace("- job post", "").strip()

                name_company = driver.find_element(By.XPATH, '//div[@data-testid="inlineHeader-companyName"]/span/a').text

                location = driver.find_element(By.XPATH, '//div[@data-testid="inlineHeader-companyLocation"]/div').text

                job_description = driver.find_elements(By.XPATH, '//div[@id="jobDescriptionText"]')

                # Flatten the description HTML to text.
                # NOTE(review): the second replace strips spaces — possibly
                # meant to collapse doubled spaces; confirm against the
                # original file (whitespace may have been lost in extraction).
                content_jd = ""
                for jd in job_description:
                    get_html = jd.get_attribute("innerHTML")
                    parser = BeautifulSoup(get_html, 'html.parser')
                    jd = parser.get_text()
                    content_jd += jd.replace("\n"," ").replace(" ","")
                id+=1

                # EAFP guard: only create the entry when this ID is unused.
                try:
                    dict_job[id]
                except KeyError:
                    dict_job[id] = {
                        "ID":id,
                        "job":name_job,
                        "company": name_company,
                        "location": location,
                        "job_description":content_jd,
                        "date_post": posted_date_str

                    }

                # Throttle between cards to avoid tripping rate limits.
                sleep(4)
            except NoSuchElementException:
                # Any missing sub-element skips this card entirely.
                pass

        # Advance to the next result page; stop the crawl when there is none.
        try:
            next = driver.find_element(By.XPATH, '//a[@data-testid="pagination-page-next"]')
            next.click()
            sleep(4)
        except NoSuchElementException:
            break;
        # Dismiss the popup that may appear after pagination.
        try:
            close = driver.find_element(By.XPATH, '//button[@aria-label="close"]')
            close.click()
        except NoSuchElementException:
            pass

    driver.quit()
    return dict_job
167 |
+
|
168 |
+
|