# scrap_assessment.py (text-analysis / DAI scraper)
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import os
import csv
# Set up the Selenium driver (ensure you have the appropriate webdriver installed)
driver = webdriver.Chrome()
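# Optional sketch (assumption, not part of the original script): run Chrome headless
# so the scrape can run unattended, e.g.:
#   options = webdriver.ChromeOptions()
#   options.add_argument("--headless=new")
#   driver = webdriver.Chrome(options=options)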
# Open the webpage
mchp = "https://www.hdrn.ca/en/inventory/label/42/4826"
bc = 'https://www.hdrn.ca/en/inventory/label/46/4672/'
ab = "https://www.hdrn.ca/en/inventory/label/44/4684/"
sk = "https://www.hdrn.ca/en/inventory/label/51/4378/"
ices = "https://www.hdrn.ca/en/inventory/label/43/4436/"
nb = "https://www.hdrn.ca/en/inventory/label/47/4611/"
hdns = "https://www.hdrn.ca/en/inventory/label/49/4411/"
nlchi = "https://www.hdrn.ca/en/inventory/label/50/4350/"
cihi = "https://www.hdrn.ca/en/inventory/label/45/4744/"
jurisdictions = {
'mchp': mchp,
'bc': bc,
'ab': ab,
'sk': sk,
'ices': ices,
'nb': nb,
'hdns': hdns,
'nlchi': nlchi,
'cihi': cihi
}
# Skip the first two jurisdictions (mchp, bc)
for jurisdiction_name, jurisdiction in list(jurisdictions.items())[2:]:
    dataset_assessments = []  # reset per jurisdiction so each CSV holds only its own rows
    driver.get(jurisdiction)
while True:
try:
# Wait for the page to load after login (adjust the timeout as needed)
WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.CLASS_NAME, "table")))
title = driver.find_element(By.CLASS_NAME, 'panel-title').text
dataset = Select(driver.find_element(By.ID, "selected_dataset")).first_selected_option.text
dataset_dict = {'dataset': dataset}
# Find the table element with class "table"
table = driver.find_element(By.CLASS_NAME, "table")
# Find the tbody element within the table
tbody = table.find_element(By.TAG_NAME, "tbody")
# Find the first tr element within the tbody
first_tr = tbody.find_element(By.TAG_NAME, "tr")
# Extract the text or perform any other desired actions with the first tr block
            tr = first_tr.find_elements(By.TAG_NAME, "label")  # 6 labels normally, 8 when a discussion section is present
rationale = ""
discussion = ""
if len(tr) == 6:
rationale = tr[3].text
elif len(tr) == 8:
rationale = tr[3].text
discussion = tr[5].text
dataset_dict['rationale'] = rationale
dataset_dict['discussion'] = discussion
dataset_assessments.append(dataset_dict)
next_button = driver.find_elements(By.XPATH, "//*[contains(text(), 'Next')]")
if len(next_button) == 0:
break
next_button[0].click()
        except (TimeoutException, NoSuchElementException):
            # If the table element is not found, the session has likely expired, so log in
            # Find the login form elements (username and password inputs)
            username_input = driver.find_element(By.NAME, 'username')
            password_input = driver.find_element(By.NAME, 'password')
            # Fill in the login credentials; read them from environment variables rather than
            # hard-coding them (HDRN_USERNAME / HDRN_PASSWORD are assumed variable names)
            username_input.send_keys(os.environ["HDRN_USERNAME"])
            password_input.send_keys(os.environ["HDRN_PASSWORD"])
            # Submit the login form
            password_input.send_keys(Keys.RETURN)
    # Define the CSV file path for this jurisdiction
    csv_file = f'{jurisdiction_name}_assessment.csv'
    # Column names match the keys used when building each dataset_dict
    header = ['dataset', 'rationale', 'discussion']
# Open the CSV file in write mode
with open(csv_file, mode='w', newline='') as file:
writer = csv.DictWriter(file, fieldnames=header)
# Write the header row
writer.writeheader()
        # Write the data rows
        writer.writerows(dataset_assessments)
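# Assumption (not in the original script): close the browser once every jurisdiction has
# been scraped so the ChromeDriver process does not linger.
driver.quit()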