|
import pandas as pd |
|
import os |
|
import time |
|
from selenium import webdriver |
|
from selenium.webdriver.chrome.service import Service |
|
from selenium.webdriver.chrome.options import Options |
|
from webdriver_manager.chrome import ChromeDriverManager |
|
from selenium.webdriver.common.by import By |
|
from selenium.webdriver.support.ui import WebDriverWait |
|
from selenium.webdriver.support import expected_conditions as EC |
|
|
|
def _create_chrome_driver():
    """Build the headless Chrome WebDriver used for scraping."""
    chrome_options = Options()
    for flag in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-logging",
        "--log-level=3",
        "--disable-extensions",
    ):
        chrome_options.add_argument(flag)
    chrome_options.page_load_strategy = 'eager'
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )


def _parse_leaderboard_row(row, region):
    """Turn one leaderboard ``<tr>`` element into a record dict.

    Returns None when the row has fewer than 7 ``<td>`` cells. All values
    are kept as raw strings here; numeric conversion happens later.
    """
    cells = row.find_elements(By.TAG_NAME, "td")
    if len(cells) < 7:
        return None

    # Cell 4 holds champion icons; the champion name is read from each <img> alt.
    champion_imgs = cells[4].find_elements(By.TAG_NAME, "img")
    # A missing alt attribute comes back as None; normalise to "" and cap at 3.
    champions = [img.get_attribute("alt") or "" for img in champion_imgs][:3]
    champions += [""] * (3 - len(champions))

    # Cell 6 text is split on newlines into wins / losses / winrate parts;
    # pad so that missing parts become empty strings.
    winrate_text = cells[6].text.strip().split("\n")
    winrate_text += [""] * (3 - len(winrate_text))

    return {
        "summoner": cells[1].text.strip().replace("\n", " "),
        "region": region,
        "rank": cells[0].text.strip(),
        "tier": cells[2].text.strip(),
        "lp": cells[3].text.strip(),
        "most_champion_1": champions[0],
        "most_champion_2": champions[1],
        "most_champion_3": champions[2],
        "level": cells[5].text.strip(),
        "win": winrate_text[0].rstrip("W"),
        "loss": winrate_text[1].rstrip("L"),
        "winrate": winrate_text[2],
    }


def _build_leaderboard_frame(records):
    """Create the result DataFrame from raw records and convert numeric columns.

    Unparseable or empty values become NaN instead of raising, so one
    malformed row cannot discard the whole scrape.
    """
    columns = [
        "summoner", "region", "rank", "tier", "lp",
        "most_champion_1", "most_champion_2", "most_champion_3",
        "level", "win", "loss", "winrate",
    ]
    # Passing explicit columns keeps the schema stable even with zero records.
    df = pd.DataFrame(records, columns=columns)

    lp_text = (
        df['lp'].astype(str)
        .str.replace(',', '', regex=False)
        .str.replace('LP', '', regex=False)
        .str.strip()
    )
    df['lp'] = pd.to_numeric(lp_text, errors='coerce').astype(float)
    df['level'] = pd.to_numeric(df['level'], errors='coerce')
    df['win'] = pd.to_numeric(df['win'], errors='coerce')
    df['loss'] = pd.to_numeric(df['loss'], errors='coerce')
    winrate_text = df['winrate'].astype(str).str.rstrip('%')
    df['winrate'] = pd.to_numeric(winrate_text, errors='coerce').astype(float) / 100
    return df


def scrape_leaderboards(regions=None, pages_per_region=5, output_file=None, delay=2):
    """
    Scrape leaderboard data from op.gg for specified regions and return as DataFrame.

    Args:
        regions (list): List of regions to scrape. Defaults to ["kr", "na", "vn", "euw"]
        pages_per_region (int): Number of pages to scrape per region. Defaults to 5
        output_file (str): Path to output CSV file. Defaults to
            "util/data/leaderboard_data.csv". Pass an empty string to skip writing.
        delay (int): Delay in seconds after each successfully processed page. Defaults to 2

    Returns:
        pandas.DataFrame: Scraped leaderboard data with columns summoner, region,
        rank, tier, lp, most_champion_1..3, level, win, loss, winrate.
        Returns None if a fatal error occurs (for example the browser cannot
        be started). Page-level and row-level errors are printed and skipped.
    """
    if regions is None:
        regions = ["kr", "na", "vn", "euw"]

    if output_file is None:
        output_file = os.path.join("util", "data", "leaderboard_data.csv")

    leaderboard_data = []

    # Initialised before the try so the finally clause can tell whether a
    # browser was ever started; quitting an unbound driver would raise and
    # hide the original failure.
    driver = None
    try:
        driver = _create_chrome_driver()

        for region in regions:
            print(f"\nScraping {region.upper()} region...")
            for page in range(1, pages_per_region + 1):
                print(f"Processing page {page}/{pages_per_region}")
                url = f"https://www.op.gg/leaderboards/tier?region={region}&type=ladder&page={page}"

                try:
                    driver.get(url)

                    # NOTE(review): the selector uses what look like generated
                    # CSS class names; confirm it still matches the live site.
                    table = WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "table.css-1l95r9q.e4dns9u11"))
                    )

                    # The first <tr> is skipped (treated as the header row).
                    rows = table.find_elements(By.TAG_NAME, "tr")[1:]
                    for row in rows:
                        try:
                            record = _parse_leaderboard_row(row, region)
                            if record is not None:
                                leaderboard_data.append(record)
                        except Exception as e:
                            print(f"Error processing row in {region} page {page}: {e}")
                            continue

                except Exception as e:
                    print(f"Error processing {region} page {page}: {e}")
                    continue

                time.sleep(delay)

    except Exception as e:
        print(f"Fatal error: {e}")
        return None

    finally:
        if driver is not None:
            driver.quit()

    if not leaderboard_data:
        print("Warning: no leaderboard rows were scraped.")

    df = _build_leaderboard_frame(leaderboard_data)

    if output_file:
        # dirname is "" for a bare filename, and os.makedirs("") raises.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"Leaderboard data saved to {output_file}")

    return df