# lol_champion_pick_predictor/util/Leaderboard_scrapper.py
# Author: Jimin Park
# Commit: abcb943 ("added model")
import pandas as pd
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def scrape_leaderboards(regions=None, pages_per_region=5, output_file=None, delay=2):
"""
Scrape leaderboard data from op.gg for specified regions and return as DataFrame.
Args:
regions (list): List of regions to scrape. Defaults to ["kr", "na", "vn", "euw"]
pages_per_region (int): Number of pages to scrape per region. Defaults to 5
output_file (str): Path to output file. Defaults to "util/data/leaderboard_data.csv"
delay (int): Delay between requests in seconds. Defaults to 2
Returns:
pandas.DataFrame: Scraped leaderboard data
"""
# Set defaults
if regions is None:
regions = ["kr", "na", "vn", "euw"]
if output_file is None:
output_file = os.path.join("util", "data", "leaderboard_data.csv")
# Initialize data list to store rows
leaderboard_data = []
try:
# Setup Chrome options
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--disable-logging")
chrome_options.add_argument("--log-level=3")
chrome_options.add_argument("--disable-extensions")
chrome_options.page_load_strategy = 'eager'
chrome_options.add_argument(
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
# Initialize WebDriver
driver = webdriver.Chrome(
service=Service(ChromeDriverManager().install()),
options=chrome_options
)
for region in regions:
print(f"\nScraping {region.upper()} region...")
for page in range(1, pages_per_region + 1):
print(f"Processing page {page}/{pages_per_region}")
url = f"https://www.op.gg/leaderboards/tier?region={region}&type=ladder&page={page}"
try:
# Access the webpage
driver.get(url)
# Wait for table to load
table = WebDriverWait(driver, 15).until(
EC.presence_of_element_located((By.CSS_SELECTOR, "table.css-1l95r9q.e4dns9u11"))
)
# Process rows
rows = table.find_elements(By.TAG_NAME, "tr")[1:] # Skip header row
for row in rows:
try:
cells = row.find_elements(By.TAG_NAME, "td")
if len(cells) >= 7:
# Extract basic data
summoner = cells[1].text.strip().replace("\n", " ")
rank = cells[0].text.strip()
tier = cells[2].text.strip()
lp = cells[3].text.strip()
level = cells[5].text.strip()
# Extract champion data
champion_imgs = cells[4].find_elements(By.TAG_NAME, "img")
champions = [img.get_attribute("alt") for img in champion_imgs]
champion_data = champions + [""] * (3 - len(champions))
# Parse win/loss data
winrate_text = cells[6].text.strip().split("\n")
wins = winrate_text[0].rstrip("W") if len(winrate_text) > 0 else ""
losses = winrate_text[1].rstrip("L") if len(winrate_text) > 1 else ""
winrate = winrate_text[2] if len(winrate_text) > 2 else ""
# Append row data
leaderboard_data.append({
"summoner": summoner,
"region": region,
"rank": rank,
"tier": tier,
"lp": lp,
"most_champion_1": champion_data[0],
"most_champion_2": champion_data[1],
"most_champion_3": champion_data[2],
"level": level,
"win": wins,
"loss": losses,
"winrate": winrate
})
except Exception as e:
print(f"Error processing row in {region} page {page}: {e}")
continue
except Exception as e:
print(f"Error processing {region} page {page}: {e}")
continue
time.sleep(delay)
except Exception as e:
print(f"Fatal error: {e}")
return None
finally:
driver.quit()
# Create DataFrame
df = pd.DataFrame(leaderboard_data)
# Clean and convert data types
df['lp'] = df['lp'].str.replace(',', '').str.replace('LP', '').astype(float)
df['level'] = df['level'].astype(int)
df['win'] = pd.to_numeric(df['win'], errors='coerce')
df['loss'] = pd.to_numeric(df['loss'], errors='coerce')
df['winrate'] = df['winrate'].str.rstrip('%').astype(float) / 100
# Save to CSV if output_file is specified
if output_file:
os.makedirs(os.path.dirname(output_file), exist_ok=True)
df.to_csv(output_file, index=False)
print(f"Leaderboard data saved to {output_file}")
return df