import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from typing import List, Dict
import logging
import urllib.parse
from sentence_transformers import SentenceTransformer

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class SHLScraper:
    def __init__(self):
        self.base_url = "https://www.shl.com/solutions/products/product-catalog/"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Load a sentence-embedding model for later use (see embed_descriptions below)
        try:
            self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
        except Exception as e:
            logging.error(f"Error initializing embedding model: {e}")
            self.embedding_model = None

    def get_page_content(self, start: int, type_num: int) -> str:
        """Fetch page content with given start and type parameters."""
        params = {
            'start': start,
            'type': type_num
        }
        try:
            response = requests.get(self.base_url, params=params, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logging.error(f"Error fetching page: {e}")
            return ""

    def check_yes_no(self, cell) -> str:
        """Check if a cell contains a yes or no indicator based on CSS classes."""
        yes_span = cell.find('span', class_='catalogue__circle -yes')
        no_span = cell.find('span', class_='catalogue__circle -no')
        
        if yes_span:
            return "Yes"
        elif no_span:
            return "No"
        return ""

    def get_test_link(self, cell) -> str:
        """Extract the href link from the test name cell."""
        link = cell.find('a')
        if link and 'href' in link.attrs:
            return link['href']
        return ""

    def get_test_description(self, test_link: str) -> str:
        """Fetch and extract the description from a test's detail page."""
        if not test_link:
            return ""

        # Construct full URL if it's a relative path
        if test_link.startswith('/'):
            test_link = urllib.parse.urljoin("https://www.shl.com", test_link)

        try:
            logging.info(f"Fetching description for: {test_link}")
            response = requests.get(test_link, headers=self.headers, timeout=30)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, 'html.parser')
            
            # Initialize description parts
            description_parts = []
            
            # Try to find main description
            desc_div = soup.find('div', class_='product-description')
            if desc_div:
                description_parts.append(desc_div.get_text(strip=True))
            
            # Try to find additional details
            details_div = soup.find('div', class_='product-details')
            if details_div:
                description_parts.append(details_div.get_text(strip=True))
            
            # Try to find features
            features_div = soup.find('div', class_='product-features')
            if features_div:
                description_parts.append(features_div.get_text(strip=True))
            
            # Try to find benefits
            benefits_div = soup.find('div', class_='product-benefits')
            if benefits_div:
                description_parts.append(benefits_div.get_text(strip=True))
            
            # Try to find meta description as fallback
            if not description_parts:
                meta_desc = soup.find('meta', {'name': 'description'})
                if meta_desc and 'content' in meta_desc.attrs:
                    description_parts.append(meta_desc['content'])

            # Combine all parts with proper spacing
            full_description = " | ".join(filter(None, description_parts))
            
            time.sleep(1)  # Be respectful with requests
            return full_description

        except requests.RequestException as e:
            logging.error(f"Error fetching description from {test_link}: {e}")
            return ""

    def extract_table_data(self, html_content: str) -> List[Dict]:
        """Extract table data from HTML content."""
        if not html_content:
            return []

        soup = BeautifulSoup(html_content, 'html.parser')
        tables = soup.find_all('table')
        
        all_data = []
        for table in tables:
            rows = table.find_all('tr')
            for row in rows[1:]:  # Skip header row
                cols = row.find_all('td')
                if len(cols) >= 4:  # Ensure we have all columns
                    test_link = self.get_test_link(cols[0])
                    data = {
                        'Test Name': cols[0].get_text(strip=True),
                        'Test Link': test_link,
                        'Description': self.get_test_description(test_link),
                        'Remote Testing': self.check_yes_no(cols[1]),
                        'Adaptive/IRT': self.check_yes_no(cols[2]),
                        'Test Type': cols[3].get_text(strip=True)
                    }
                    all_data.append(data)
        return all_data

    def scrape_all_tables(self, max_pages: int = 10):
        """Scrape tables from multiple pages."""
        all_data = []
        
        for start in range(0, max_pages * 12, 12):  # Each page has 12 items
            for type_num in range(1, 9):  # Types 1-8
                logging.info(f"Scraping page with start={start}, type={type_num}")
                
                html_content = self.get_page_content(start, type_num)
                if not html_content:
                    continue
                
                page_data = self.extract_table_data(html_content)
                if page_data:
                    all_data.extend(page_data)
                    logging.info(f"Found {len(page_data)} items on this page")
                
                # Add delay to be respectful to the server
                time.sleep(1)
        
        return all_data

    def save_to_csv(self, data: List[Dict], filename: str = 'shl_products.csv'):
        """Save scraped data to CSV file."""
        if not data:
            logging.warning("No data to save")
            return
        
        df = pd.DataFrame(data)
        df.to_csv(filename, index=False)
        logging.info(f"Saved {len(data)} records to {filename}")

def main():
    scraper = SHLScraper()
    logging.info("Starting SHL product catalog scraping...")
    
    data = scraper.scrape_all_tables()
    logging.info(f"Total records scraped: {len(data)}")
    
    scraper.save_to_csv(data)
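    # Optional sketch (not part of the original flow): embed the scraped
    # descriptions for downstream semantic search, e.g.
    #   embeddings = scraper.embed_descriptions(data)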
    logging.info("Scraping completed!")

if __name__ == "__main__":
    main()