Spaces:

darylfunggg
/

text-analysis

Sleeping

text-analysis / DAI scraper /scrap_assessment.py

Daryl Fung

added top 10

2a000a7 about 2 years ago

3.91 kB

	import csv
	import os

	from selenium import webdriver
	from selenium.common.exceptions import TimeoutException
	from selenium.webdriver.common.by import By
	from selenium.webdriver.common.keys import Keys
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.support.ui import Select
	from selenium.webdriver.support.ui import WebDriverWait

	# Seconds to wait for the assessment table before assuming a login is needed.
	TABLE_WAIT_SECONDS = 2

	# Consecutive login attempts tolerated on one page before giving up, so bad
	# credentials cannot turn the scrape loop into an endless retry.
	MAX_LOGIN_ATTEMPTS = 3

	# Environment variables that supply the site credentials; they must never be
	# committed to the source file.
	USERNAME_ENV_VAR = "HDRN_USERNAME"
	PASSWORD_ENV_VAR = "HDRN_PASSWORD"

	# Column order of every exported CSV file.
	CSV_FIELDS = ['dataset', 'rationale', 'discussion']

	# NOTE(review): the original loop sliced the jurisdictions with [2:], which
	# skips 'mchp' and 'bc'. That is preserved here; set to 0 to scrape all of
	# them if the slice was only a leftover from resuming an interrupted run.
	FIRST_JURISDICTION_INDEX = 2

	# Entry page of the assessment listing for each jurisdiction.
	mchp = "https://www.hdrn.ca/en/inventory/label/42/4826"
	bc = 'https://www.hdrn.ca/en/inventory/label/46/4672/'
	ab = "https://www.hdrn.ca/en/inventory/label/44/4684/"
	sk = "https://www.hdrn.ca/en/inventory/label/51/4378/"
	ices = "https://www.hdrn.ca/en/inventory/label/43/4436/"
	nb = "https://www.hdrn.ca/en/inventory/label/47/4611/"
	hdns = "https://www.hdrn.ca/en/inventory/label/49/4411/"
	nlchi = "https://www.hdrn.ca/en/inventory/label/50/4350/"
	cihi = "https://www.hdrn.ca/en/inventory/label/45/4744/"

	jurisdictions = {
	    'mchp': mchp,
	    'bc': bc,
	    'ab': ab,
	    'sk': sk,
	    'ices': ices,
	    'nb': nb,
	    'hdns': hdns,
	    'nlchi': nlchi,
	    'cihi': cihi,
	}


	def load_credentials():
	    """Return ``(username, password)`` read from the environment.

	    Raises:
	        RuntimeError: if either environment variable is unset or empty.
	    """
	    username = os.environ.get(USERNAME_ENV_VAR)
	    password = os.environ.get(PASSWORD_ENV_VAR)
	    if not username or not password:
	        raise RuntimeError(
	            f"Set the {USERNAME_ENV_VAR} and {PASSWORD_ENV_VAR} environment "
	            "variables before running the scraper."
	        )
	    return username, password


	def split_rationale_discussion(label_texts):
	    """Pick the rationale and discussion out of the first row's label texts.

	    Args:
	        label_texts: text of every ``<label>`` in the first table row, in
	            document order.

	    Returns:
	        A ``(rationale, discussion)`` tuple. Six labels mean a rationale only
	        (index 3); eight labels mean a rationale (index 3) plus a discussion
	        (index 5). Any other count yields two empty strings.
	    """
	    label_count = len(label_texts)
	    if label_count not in (6, 8):
	        return "", ""
	    discussion = label_texts[5] if label_count == 8 else ""
	    return label_texts[3], discussion


	def log_in(driver, username, password):
	    """Fill in and submit the login form on the page currently displayed.

	    Raises whatever Selenium raises when the form fields are missing, i.e.
	    when the current page is not actually the login page.
	    """
	    username_input = driver.find_element(By.NAME, 'username')
	    password_input = driver.find_element(By.NAME, 'password')
	    username_input.send_keys(username)
	    password_input.send_keys(password)
	    # Pressing RETURN in the password field submits the form.
	    password_input.send_keys(Keys.RETURN)


	def read_assessment(driver):
	    """Read the assessment shown on the current page into a CSV row dict."""
	    dataset = Select(
	        driver.find_element(By.ID, "selected_dataset")
	    ).first_selected_option.text

	    # Only the first row of the first table body is inspected.
	    table = driver.find_element(By.CLASS_NAME, "table")
	    tbody = table.find_element(By.TAG_NAME, "tbody")
	    first_tr = tbody.find_element(By.TAG_NAME, "tr")
	    label_texts = [
	        label.text for label in first_tr.find_elements(By.TAG_NAME, "label")
	    ]

	    rationale, discussion = split_rationale_discussion(label_texts)
	    return {
	        'dataset': dataset,
	        'rationale': rationale,
	        'discussion': discussion,
	    }


	def scrape_jurisdiction(driver, url, username, password):
	    """Walk every assessment page reachable from ``url`` via the 'Next' link.

	    Args:
	        driver: a Selenium WebDriver instance.
	        url: first assessment page of the jurisdiction.
	        username: login name used when the site asks for authentication.
	        password: password used when the site asks for authentication.

	    Returns:
	        A list with one row dict (keys as in ``CSV_FIELDS``) per page visited.

	    Raises:
	        RuntimeError: if the table still does not appear after
	            ``MAX_LOGIN_ATTEMPTS`` consecutive login attempts.
	    """
	    driver.get(url)
	    rows = []
	    login_attempts = 0
	    while True:
	        try:
	            WebDriverWait(driver, TABLE_WAIT_SECONDS).until(
	                EC.presence_of_element_located((By.CLASS_NAME, "table"))
	            )
	        except TimeoutException:
	            # No table within the timeout: treat the page as the login form.
	            if login_attempts >= MAX_LOGIN_ATTEMPTS:
	                raise RuntimeError(
	                    f"Assessment table not found at {driver.current_url} "
	                    f"after {login_attempts} login attempts."
	                )
	            login_attempts += 1
	            log_in(driver, username, password)
	            continue

	        # The table is visible, so any earlier login succeeded.
	        login_attempts = 0
	        rows.append(read_assessment(driver))

	        next_button = driver.find_elements(
	            By.XPATH, "//*[contains(text(), 'Next')]"
	        )
	        if not next_button:
	            break
	        next_button[0].click()
	    return rows


	def write_assessments_csv(csv_file, rows):
	    """Write ``rows`` to ``csv_file`` with a header line.

	    The header is always written, so an empty ``rows`` list produces a
	    header-only file instead of failing. UTF-8 is used so scraped text with
	    non-ASCII characters is written the same way on every platform.
	    """
	    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
	        writer = csv.DictWriter(file, fieldnames=CSV_FIELDS)
	        writer.writeheader()
	        writer.writerows(rows)


	def main():
	    """Scrape each selected jurisdiction and export one CSV file per jurisdiction."""
	    # Fail on missing credentials before a browser window is opened.
	    username, password = load_credentials()

	    # Set up the Selenium driver (the matching webdriver must be installed).
	    driver = webdriver.Chrome()
	    try:
	        selected = list(jurisdictions.items())[FIRST_JURISDICTION_INDEX:]
	        for jurisdiction_name, jurisdiction in selected:
	            # A fresh row list per jurisdiction keeps each CSV limited to its
	            # own datasets.
	            dataset_assessments = scrape_jurisdiction(
	                driver, jurisdiction, username, password
	            )
	            write_assessments_csv(
	                f'{jurisdiction_name}_assessment.csv', dataset_assessments
	            )
	    finally:
	        # Always release the browser, even when scraping fails midway.
	        driver.quit()


	if __name__ == "__main__":
	    main()