import logging
import time
import urllib.parse
from typing import List, Dict

import requests
import pandas as pd
import torch
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


class SHLScraper:
    def __init__(self):
        self.base_url = "https://www.shl.com/solutions/products/product-catalog/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Initialize the embedding model (loaded here for downstream use;
        # the scraping methods below do not call it)
        try:
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        except Exception as e:
            logging.error(f"Error initializing embedding model: {e}")
            self.embedding_model = None

    def get_page_content(self, start: int, type_num: int) -> str:
        """Fetch page content with the given start and type parameters."""
        params = {
            'start': start,
            'type': type_num
        }
        try:
            response = requests.get(self.base_url, params=params, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logging.error(f"Error fetching page: {e}")
            return ""

    def check_yes_no(self, cell) -> str:
        """Check if a cell contains a yes or no indicator based on CSS classes."""
        # Match on the individual class token so the check is robust to the
        # order of classes in the attribute: with a single token, BeautifulSoup
        # matches any element whose class list contains it.
        yes_span = cell.find('span', class_='-yes')
        no_span = cell.find('span', class_='-no')
        if yes_span:
            return "Yes"
        elif no_span:
            return "No"
        return ""

    def get_test_link(self, cell) -> str:
        """Extract the href link from the test name cell."""
        link = cell.find('a')
        if link and 'href' in link.attrs:
            return link['href']
        return ""

    def get_test_description(self, test_link: str) -> str:
        """Fetch and extract the description from a test's detail page."""
        if not test_link:
            return ""
        # Construct a full URL if it's a relative path
        if test_link.startswith('/'):
            test_link = urllib.parse.urljoin("https://www.shl.com", test_link)
        try:
            logging.info(f"Fetching description for: {test_link}")
            response = requests.get(test_link, headers=self.headers, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            # Collect text from each known content section, in order
            description_parts = []
            for class_name in ('product-description', 'product-details',
                               'product-features', 'product-benefits'):
                div = soup.find('div', class_=class_name)
                if div:
                    description_parts.append(div.get_text(strip=True))
            # Fall back to the meta description if no sections were found
            if not description_parts:
                meta_desc = soup.find('meta', {'name': 'description'})
                if meta_desc and 'content' in meta_desc.attrs:
                    description_parts.append(meta_desc['content'])
            # Combine all parts with a separator
            full_description = " | ".join(filter(None, description_parts))
            time.sleep(1)  # Be respectful with requests
            return full_description
        except requests.RequestException as e:
            logging.error(f"Error fetching description from {test_link}: {e}")
            return ""

    def extract_table_data(self, html_content: str) -> List[Dict]:
        """Extract table data from HTML content."""
        if not html_content:
            return []
        soup = BeautifulSoup(html_content, 'html.parser')
        tables = soup.find_all('table')
        all_data = []
        for table in tables:
            rows = table.find_all('tr')
            for row in rows[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) >= 4:  # Ensure we have all columns
                    test_link = self.get_test_link(cols[0])
                    data = {
                        'Test Name': cols[0].get_text(strip=True),
                        'Test Link': test_link,
                        'Description': self.get_test_description(test_link),
                        'Remote Testing': self.check_yes_no(cols[1]),
                        'Adaptive/IRT': self.check_yes_no(cols[2]),
                        'Test Type': cols[3].get_text(strip=True)
                    }
                    all_data.append(data)
        return all_data
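
    # Shape of one extracted record (field values are illustrative only):
    #   {'Test Name': 'Example Assessment', 'Test Link': '/products/...',
    #    'Description': '...', 'Remote Testing': 'Yes',
    #    'Adaptive/IRT': 'No', 'Test Type': 'A'}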

    def scrape_all_tables(self, max_pages: int = 10):
        """Scrape tables from multiple pages."""
        all_data = []
        for start in range(0, max_pages * 12, 12):  # Each page has 12 items
            for type_num in range(1, 9):  # Types 1-8
                logging.info(f"Scraping page with start={start}, type={type_num}")
                html_content = self.get_page_content(start, type_num)
                if not html_content:
                    continue
                page_data = self.extract_table_data(html_content)
                if page_data:
                    all_data.extend(page_data)
                    logging.info(f"Found {len(page_data)} items on this page")
                # Add delay to be respectful to the server
                time.sleep(1)
        return all_data
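
    # Note: with the default max_pages=10 this issues up to 10 * 8 = 80
    # catalog-page requests (10 start offsets times 8 types), plus one
    # detail-page request per row found, each followed by a 1-second delay.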

    def save_to_csv(self, data: List[Dict], filename: str = 'shl_products.csv'):
        """Save scraped data to a CSV file."""
        if not data:
            logging.warning("No data to save")
            return
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        logging.info(f"Saved {len(data)} records to {filename}")


def main():
    scraper = SHLScraper()
    logging.info("Starting SHL product catalog scraping...")
    data = scraper.scrape_all_tables()
    logging.info(f"Total records scraped: {len(data)}")
    scraper.save_to_csv(data)
    logging.info("Scraping completed!")


if __name__ == "__main__":
    main()