"""Scraper for the SHL product catalog.

Collects the test name, detail link, description, Remote Testing and
Adaptive/IRT flags, and test type for each catalog entry, then writes the
results to a CSV file.
"""

import logging
import time
import urllib.parse
from typing import Dict, List

import pandas as pd
import requests
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class SHLScraper:

    def __init__(self):
        self.base_url = "https://www.shl.com/solutions/products/product-catalog/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        # Load the sentence-embedding model on CPU; fall back to None so the
        # scraper can still run if the model fails to download or initialize.
        try:
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        except Exception as e:
            logging.error(f"Error initializing embedding model: {e}")
            self.embedding_model = None

    def get_page_content(self, start: int, type_num: int) -> str:
        """Fetch page content with given start and type parameters."""
        params = {
            'start': start,
            'type': type_num
        }
        try:
            response = requests.get(self.base_url, params=params, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logging.error(f"Error fetching page: {e}")
            return ""

    def check_yes_no(self, cell) -> str:
        """Check if a cell contains a yes or no indicator based on CSS classes."""
        yes_span = cell.find('span', class_='catalogue__circle -yes')
        no_span = cell.find('span', class_='catalogue__circle -no')

        if yes_span:
            return "Yes"
        elif no_span:
            return "No"
        return ""

    def get_test_link(self, cell) -> str:
        """Extract the href link from the test name cell."""
        link = cell.find('a')
        if link and 'href' in link.attrs:
            return link['href']
        return ""

    def get_test_description(self, test_link: str) -> str:
        """Fetch and extract the description from a test's detail page."""
        if not test_link:
            return ""

        # Relative links on the catalog page are resolved against the site root.
        if test_link.startswith('/'):
            test_link = urllib.parse.urljoin("https://www.shl.com", test_link)

        try:
            logging.info(f"Fetching description for: {test_link}")
            response = requests.get(test_link, headers=self.headers, timeout=30)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, 'html.parser')
            description_parts = []

            # Collect text from the main content sections of the detail page.
            for section_class in ('product-description', 'product-details',
                                  'product-features', 'product-benefits'):
                section = soup.find('div', class_=section_class)
                if section:
                    description_parts.append(section.get_text(strip=True))

            # Fall back to the meta description if no content sections were found.
            if not description_parts:
                meta_desc = soup.find('meta', {'name': 'description'})
                if meta_desc and 'content' in meta_desc.attrs:
                    description_parts.append(meta_desc['content'])

            full_description = " | ".join(filter(None, description_parts))

            # Be polite to the server between detail-page requests.
            time.sleep(1)
            return full_description

        except requests.RequestException as e:
            logging.error(f"Error fetching description from {test_link}: {e}")
            return ""

    def extract_table_data(self, html_content: str) -> List[Dict]:
        """Extract table data from HTML content."""
        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')
        tables = soup.find_all('table')

        all_data = []
        for table in tables:
            rows = table.find_all('tr')
            # Skip the header row; each remaining row describes one assessment.
            for row in rows[1:]:
                cols = row.find_all('td')
                if len(cols) >= 4:
                    test_link = self.get_test_link(cols[0])
                    data = {
                        'Test Name': cols[0].get_text(strip=True),
                        'Test Link': test_link,
                        'Description': self.get_test_description(test_link),
                        'Remote Testing': self.check_yes_no(cols[1]),
                        'Adaptive/IRT': self.check_yes_no(cols[2]),
                        'Test Type': cols[3].get_text(strip=True)
                    }
                    all_data.append(data)
        return all_data

    def scrape_all_tables(self, max_pages: int = 10) -> List[Dict]:
        """Scrape tables from multiple pages."""
        all_data = []

        # The catalog is paged in steps of 12 items and queried once per type value 1-8.
        for start in range(0, max_pages * 12, 12):
            for type_num in range(1, 9):
                logging.info(f"Scraping page with start={start}, type={type_num}")

                html_content = self.get_page_content(start, type_num)
                if not html_content:
                    continue

                page_data = self.extract_table_data(html_content)
                if page_data:
                    all_data.extend(page_data)
                    logging.info(f"Found {len(page_data)} items on this page")

                # Throttle requests between catalog pages.
                time.sleep(1)

        return all_data

    def save_to_csv(self, data: List[Dict], filename: str = 'shl_products.csv'):
        """Save scraped data to CSV file."""
        if not data:
            logging.warning("No data to save")
            return

        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        logging.info(f"Saved {len(data)} records to {filename}")
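
    # Note: embedding_model is initialized in __init__ but not used anywhere
    # else in this script. The helper below is an illustrative sketch (not part
    # of the original scraper) of how the scraped descriptions could be embedded
    # for downstream semantic search; it assumes SentenceTransformer.encode()
    # returning one vector per input text is the intended use.
    def embed_descriptions(self, data: List[Dict]):
        """Sketch: encode each record's Description with the embedding model."""
        if self.embedding_model is None or not data:
            return None
        texts = [row.get('Description', '') for row in data]
        # encode() accepts a list of strings and returns an array of embeddings.
        return self.embedding_model.encode(texts, show_progress_bar=False)
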
def main():
    scraper = SHLScraper()
    logging.info("Starting SHL product catalog scraping...")

    data = scraper.scrape_all_tables()
    logging.info(f"Total records scraped: {len(data)}")

    scraper.save_to_csv(data)
    logging.info("Scraping completed!")


if __name__ == "__main__":
    main()