Daryl Fung committed · 2a000a7
Parent(s): c082b57

added top 10

Files changed:
- DAI scraper/scrap_assessment.py  +108 -0
- keyphrase_extraction.py  +46 -2
- keyword_extraction.py  +4 -4
DAI scraper/scrap_assessment.py  ADDED
@@ -0,0 +1,108 @@
+from selenium import webdriver
+from selenium.webdriver.common.keys import Keys
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import Select
+import csv
+
+# Set up the Selenium driver (ensure you have the appropriate webdriver installed)
+driver = webdriver.Chrome()
+
+# Inventory pages to scrape, one per jurisdiction
+
+mchp = "https://www.hdrn.ca/en/inventory/label/42/4826"
+bc = 'https://www.hdrn.ca/en/inventory/label/46/4672/'
+ab = "https://www.hdrn.ca/en/inventory/label/44/4684/"
+sk = "https://www.hdrn.ca/en/inventory/label/51/4378/"
+ices = "https://www.hdrn.ca/en/inventory/label/43/4436/"
+nb = "https://www.hdrn.ca/en/inventory/label/47/4611/"
+hdns = "https://www.hdrn.ca/en/inventory/label/49/4411/"
+nlchi = "https://www.hdrn.ca/en/inventory/label/50/4350/"
+cihi = "https://www.hdrn.ca/en/inventory/label/45/4744/"
+
+jurisdictions = {
+    'mchp': mchp,
+    'bc': bc,
+    'ab': ab,
+    'sk': sk,
+    'ices': ices,
+    'nb': nb,
+    'hdns': hdns,
+    'nlchi': nlchi,
+    'cihi': cihi
+}
+
+dataset_assessments = []
+
+for jurisdiction_name, jurisdiction in list(jurisdictions.items())[2:]:
+    driver.get(jurisdiction)
+    while True:
+        try:
+            # Wait for the page to load after login (adjust the timeout as needed)
+            WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.CLASS_NAME, "table")))
+
+            title = driver.find_element(By.CLASS_NAME, 'panel-title').text
+            dataset = Select(driver.find_element(By.ID, "selected_dataset")).first_selected_option.text
+            dataset_dict = {'dataset': dataset}
+
+            # Find the table element with class "table"
+            table = driver.find_element(By.CLASS_NAME, "table")
+
+            # Find the tbody element within the table
+            tbody = table.find_element(By.TAG_NAME, "tbody")
+
+            # Find the first tr element within the tbody
+            first_tr = tbody.find_element(By.TAG_NAME, "tr")
+
+            # Extract the text or perform any other desired actions with the first tr block
+            tr = first_tr.find_elements(By.TAG_NAME, "label")  # should return 8 if there is discussion
+
+            rationale = ""
+            discussion = ""
+            if len(tr) == 6:
+                rationale = tr[3].text
+            elif len(tr) == 8:
+                rationale = tr[3].text
+                discussion = tr[5].text
+
+            dataset_dict['rationale'] = rationale
+            dataset_dict['discussion'] = discussion
+            dataset_assessments.append(dataset_dict)
+
+            next_button = driver.find_elements(By.XPATH, "//*[contains(text(), 'Next')]")
+            if len(next_button) == 0:
+                break
+            next_button[0].click()
+
+        except:
+            # If the table element is not found, perform login
+
+            # Find the login form elements (e.g., username and password inputs)
+            username_input = driver.find_element('name', 'username')
+            password_input = driver.find_element('name', 'password')
+
+            # Fill in the login credentials
+            username_input.send_keys("dfung")  # Replace with your username
+            password_input.send_keys("Daryl_1212hdrnhdrn")  # Replace with your password
+
+            # Submit the login form
+            password_input.send_keys(Keys.RETURN)
+
+    # Define the CSV file path
+    csv_file = f'{jurisdiction_name}_assessment.csv'
+
+    # Extract the column names from the first dictionary
+    header = list(dataset_assessments[0].keys())
+
+    # Open the CSV file in write mode
+    with open(csv_file, mode='w', newline='') as file:
+        writer = csv.DictWriter(file, fieldnames=header)
+
+        # Write the header row
+        writer.writeheader()
+
+        # Write the data rows
+        for row in dataset_assessments:
+            writer.writerow(row)
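One note on the login branch: the committed script embeds a literal username and password, which will stay in the git history permanently. A minimal sketch of the same login step with credentials read from the environment instead (the HDRN_USERNAME / HDRN_PASSWORD variable names are hypothetical, not part of the original script):

import os

from selenium.webdriver.common.keys import Keys

def login(driver):
    # Locate the login form fields, as in the scraper's except branch
    username_input = driver.find_element('name', 'username')
    password_input = driver.find_element('name', 'password')

    # Read credentials from the environment rather than hard-coding them
    # (HDRN_USERNAME / HDRN_PASSWORD are assumed names)
    username_input.send_keys(os.environ['HDRN_USERNAME'])
    password_input.send_keys(os.environ['HDRN_PASSWORD'])

    # Submit the form
    password_input.send_keys(Keys.RETURN)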
keyphrase_extraction.py  CHANGED
@@ -3,8 +3,18 @@ import spacy
 from spacy import displacy
 import pandas as pd
 import seaborn as sns
+import textrank
 import matplotlib.pyplot as plt
 from pathlib import Path
+from nltk.tokenize import sent_tokenize, word_tokenize
+from nltk.corpus import stopwords
+from nltk.probability import FreqDist
+from nltk.tokenize import word_tokenize
+from nltk.stem import PorterStemmer
+from sklearn.metrics.pairwise import cosine_similarity
+import networkx as nx
+import matplotlib.pyplot as plt
+import numpy as np
 import pytextrank

 # Load the pre-trained NLP model
@@ -42,6 +52,39 @@ def get_top_key_phrases(text, top_n, save_output):
     plt.savefig(save_output, dpi=300, bbox_inches="tight")
     plt.close()

+def visualize_textrank(text):
+    # Get text
+    # Generate TextRank scores
+    tr = textrank.TextRank()
+    tr.calculate_scores(text)
+
+    # Get top 10 words
+    words = [w for w, s in tr.top_words(10)]
+
+    # Create graph
+    G = nx.Graph()
+
+    # Add nodes
+    for w in words:
+        G.add_node(w)
+
+    # Find co-occurrence counts
+    counts = {}
+    for i in range(len(words) - 1):
+        w1 = words[i]
+        w2 = words[i + 1]
+        key = (w1, w2)
+        counts[key] = counts.get(key, 0) + 1
+
+    # Add edges with weights
+    for key, count in counts.items():
+        w1, w2 = key
+        G.add_edge(w1, w2, weight=count)
+
+    # Draw graph with weighted edges (data=True yields (u, v, attrs) triples)
+    nx.draw(G, with_labels=True, width=[e[2]['weight'] for e in G.edges(data=True)])
+    plt.show()
+

 def display_key_phrases(text, save_output):
     text = text.replace('\n', ' \n')
@@ -69,5 +112,6 @@ def display_key_phrases(text, save_output):


 if __name__ == '__main__':
-    get_top_key_phrases(text, 10, 'test_results/keyphrase.png')
-    display_key_phrases(text)
+    visualize_textrank(text)
+    # get_top_key_phrases(text, 10, 'test_results/keyphrase.png')
+    # display_key_phrases(text)
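Worth flagging: visualize_textrank assumes a textrank module with a TextRank class exposing calculate_scores() and top_words(); that API does not match pytextrank, which this file already imports as a spaCy pipeline component. A minimal sketch of pulling the top 10 ranked phrases via pytextrank itself (assumes the en_core_web_sm model is installed; the sample text is invented):

import spacy
import pytextrank  # registers the "textrank" spaCy pipeline component

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank")

text = "Hospital discharge records capture administrative and clinical data."
doc = nlp(text)

# doc._.phrases is sorted by descending TextRank score
top_phrases = [(p.text, p.rank) for p in doc._.phrases[:10]]
print(top_phrases)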
keyword_extraction.py  CHANGED
@@ -32,7 +32,7 @@ Captures administrative, clinical and demographic information on discharges for
 """

 def keyword_extract(doc, kw_model, n_grams, save_output='results/'):
-    keyword_onegram = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, n_grams), stop_words=None)
+    keyword_onegram = kw_model.extract_keywords(doc, top_n=10, keyphrase_ngram_range=(1, n_grams), stop_words=None)
     words = list(zip(*keyword_onegram))[0]
     scores = list(zip(*keyword_onegram))[1]
     keyword_df = pd.DataFrame({'words': words, 'scores': scores})
@@ -47,8 +47,8 @@ def keyword_extract(doc, kw_model, n_grams, save_output='results/'):

 if __name__ == '__main__':
     kw_model = KeyBERT()
-    keyword_extract(kw_model, 1)
-    keyword_extract(kw_model, 2)
-    keyword_extract(kw_model, 3)
+    keyword_extract(test_doc, kw_model, 1)
+    keyword_extract(test_doc, kw_model, 2)
+    keyword_extract(test_doc, kw_model, 3)
     keywords = kw_model.extract_keywords(test_doc, highlight=True)
     print(keywords)
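The new top_n=10 argument is the "top 10" the commit message refers to: KeyBERT's extract_keywords returns a list of (keyword, score) tuples capped at top_n, scored by cosine similarity between candidate and document embeddings. A minimal standalone check of that call (the sample document here is invented):

from keybert import KeyBERT

kw_model = KeyBERT()
doc = "Hospital discharge abstracts capture administrative, clinical and demographic information."

# Up to 10 (keyword, score) pairs, highest similarity first
keywords = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2), stop_words=None, top_n=10)
print(keywords)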