|
import json
import os
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
|
|
|
|
|
# Page to scrape.
url = 'https://XXX.com'

# Chrome is started headless; the sandbox and /dev/shm switches are
# presumably there so it can run inside a container or CI box -- confirm
# against the deployment environment.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

# NOTE(review): this path is defined but is not handed to the driver
# constructor below, so Selenium locates chromedriver on its own.
chromedriver_path = '/usr/local/bin/chromedriver'

driver = webdriver.Chrome(options=chrome_options)
try:
    driver.get(url)

    # Fixed pause before reading the DOM; presumably gives client-side
    # scripts time to render the page -- confirm 5 s is sufficient.
    time.sleep(5)

    html = driver.page_source
finally:
    # Always shut the browser down, even when loading the page fails;
    # otherwise the headless Chrome process is left running.
    driver.quit()

# Parse the rendered markup with the stdlib-backed HTML parser.
soup = BeautifulSoup(html, 'html.parser')
|
|
|
# Heading tags that delimit sections, ordered from shallowest to deepest.
HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5')


def _section_text(heading):
    """Return the stripped text of the siblings that follow *heading*.

    Siblings are collected in document order until the next sibling that is
    itself one of ``HEADING_TAGS`` (or until there are no more siblings).
    """
    parts = []
    sibling = heading.find_next_sibling()
    while sibling is not None and sibling.name not in HEADING_TAGS:
        parts.append(sibling.text)
        sibling = sibling.find_next_sibling()
    return ''.join(parts).strip()


def build_heading_map(headings):
    """Map each heading's hierarchical key to the text of its section.

    The key is the text of the heading and of its most recent ancestors at
    every shallower level, joined with ``-`` (for example
    ``"<h1>-<h2>-<h3>"`` for an ``h3``). A level that has not been seen
    under the current parent contributes an empty string.

    Args:
        headings: Heading tags in document order, each exposing ``name``,
            ``text`` and ``find_next_sibling()``.

    Returns:
        A dict from key to section text. Headings that yield the same key
        overwrite earlier entries.
    """
    trail = [''] * len(HEADING_TAGS)
    sections = {}
    for heading in headings:
        depth = HEADING_TAGS.index(heading.name)
        trail[depth] = heading.text
        # A new heading starts a fresh subtree: forget deeper levels so a
        # later heading that skips a level does not inherit a stale ancestor
        # from the previous section.
        for deeper in range(depth + 1, len(trail)):
            trail[deeper] = ''
        # Every level up to and including this one takes part in the key
        # (h5 keys therefore include the h4 text as well).
        key = '-'.join(trail[:depth + 1])
        sections[key] = _section_text(heading)
    return sections


headings = soup.find_all(list(HEADING_TAGS))
data = build_heading_map(headings)

print(len(data), data.keys())