Spaces:
Sleeping
Sleeping
import pandas as pd | |
import os | |
import time | |
from selenium import webdriver | |
from selenium.webdriver.chrome.service import Service | |
from selenium.webdriver.chrome.options import Options | |
from webdriver_manager.chrome import ChromeDriverManager | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.support import expected_conditions as EC | |
# Column order of the returned DataFrame / written CSV. Declared explicitly so
# that an empty scrape still yields a frame with the expected schema.
_LEADERBOARD_COLUMNS = [
    "summoner",
    "region",
    "rank",
    "tier",
    "lp",
    "most_champion_1",
    "most_champion_2",
    "most_champion_3",
    "level",
    "win",
    "loss",
    "winrate",
]


def _parse_leaderboard_row(cells, region):
    """Build one leaderboard record from the ``<td>`` elements of a table row.

    Args:
        cells (list): ``<td>`` WebElements of the row; the caller guarantees
            at least 7 entries.
        region (str): Region code the row was scraped from.

    Returns:
        dict: Raw (string) values keyed by the names in
        ``_LEADERBOARD_COLUMNS``.
    """
    # Champion names are read from the ``alt`` text of the icons in cell 4.
    champion_imgs = cells[4].find_elements(By.TAG_NAME, "img")
    champions = [img.get_attribute("alt") for img in champion_imgs]
    # Pad to three entries so the fixed champion columns always exist.
    champion_data = champions + [""] * (3 - len(champions))

    # Cell 6 is split on newlines into "<wins>W", "<losses>L", "<winrate>";
    # pad so that missing pieces become empty strings instead of IndexError.
    winrate_parts = cells[6].text.strip().split("\n") + ["", "", ""]

    return {
        "summoner": cells[1].text.strip().replace("\n", " "),
        "region": region,
        "rank": cells[0].text.strip(),
        "tier": cells[2].text.strip(),
        "lp": cells[3].text.strip(),
        "most_champion_1": champion_data[0],
        "most_champion_2": champion_data[1],
        "most_champion_3": champion_data[2],
        "level": cells[5].text.strip(),
        "win": winrate_parts[0].rstrip("W"),
        "loss": winrate_parts[1].rstrip("L"),
        "winrate": winrate_parts[2],
    }


def _clean_leaderboard_frame(df):
    """Convert the scraped text columns of a non-empty frame to numeric dtypes.

    Values that cannot be parsed become NaN rather than aborting the run.
    """
    lp_text = (
        df["lp"]
        .str.replace(",", "", regex=False)
        .str.replace("LP", "", regex=False)
        .str.strip()
    )
    df["lp"] = pd.to_numeric(lp_text, errors="coerce").astype(float)
    df["level"] = pd.to_numeric(df["level"], errors="coerce")
    df["win"] = pd.to_numeric(df["win"], errors="coerce")
    df["loss"] = pd.to_numeric(df["loss"], errors="coerce")
    winrate_text = df["winrate"].str.rstrip("%").str.strip()
    # Stored as a fraction (e.g. "57%" -> 0.57).
    df["winrate"] = pd.to_numeric(winrate_text, errors="coerce") / 100
    return df


def scrape_leaderboards(regions=None, pages_per_region=5, output_file=None, delay=2):
    """
    Scrape leaderboard data from op.gg for specified regions and return as DataFrame.

    Errors on a single row or page are printed and skipped; an error outside
    the per-page handling (for example while starting the browser) aborts the
    scrape and returns None.

    Args:
        regions (list): List of regions to scrape. Defaults to ["kr", "na", "vn", "euw"]
        pages_per_region (int): Number of pages to scrape per region. Defaults to 5
        output_file (str): Path to output file. Defaults to
            "util/data/leaderboard_data.csv" when None. Pass an empty string
            to skip writing a file.
        delay (int): Delay after each page request in seconds. Defaults to 2

    Returns:
        pandas.DataFrame: Scraped leaderboard data (empty, with the expected
        columns, if no rows were found), or None if a fatal error occurred.
    """
    # Set defaults
    if regions is None:
        regions = ["kr", "na", "vn", "euw"]
    if output_file is None:
        output_file = os.path.join("util", "data", "leaderboard_data.csv")

    leaderboard_data = []
    # Bound before the try so the finally clause can tell whether a browser
    # was actually started; otherwise a setup failure would raise from
    # ``finally`` and hide the real error.
    driver = None
    try:
        # Setup Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.page_load_strategy = 'eager'
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        )

        # Initialize WebDriver
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )

        for region in regions:
            print(f"\nScraping {region.upper()} region...")
            for page in range(1, pages_per_region + 1):
                print(f"Processing page {page}/{pages_per_region}")
                url = f"https://www.op.gg/leaderboards/tier?region={region}&type=ladder&page={page}"
                try:
                    driver.get(url)
                    # Wait for table to load
                    table = WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "table.css-1l95r9q.e4dns9u11"))
                    )
                    rows = table.find_elements(By.TAG_NAME, "tr")[1:]  # Skip header row
                    for row in rows:
                        try:
                            cells = row.find_elements(By.TAG_NAME, "td")
                            # Rows with fewer than 7 cells are skipped.
                            if len(cells) >= 7:
                                leaderboard_data.append(_parse_leaderboard_row(cells, region))
                        except Exception as e:
                            # Best effort: one bad row must not lose the page.
                            print(f"Error processing row in {region} page {page}: {e}")
                except Exception as e:
                    # Best effort: one bad page must not lose the whole run.
                    print(f"Error processing {region} page {page}: {e}")
                # Pause after every page attempt, including failed ones, so
                # consecutive failures do not turn into back-to-back requests.
                time.sleep(delay)
    except Exception as e:
        print(f"Fatal error: {e}")
        return None
    finally:
        if driver is not None:
            driver.quit()

    # Create DataFrame with a fixed schema so an empty scrape is still usable.
    df = pd.DataFrame(leaderboard_data, columns=_LEADERBOARD_COLUMNS)
    if df.empty:
        # Nothing to convert, and writing would replace any existing CSV
        # with an empty one.
        print("No leaderboard rows were scraped; nothing to clean or save.")
        return df

    # Clean and convert data types
    df = _clean_leaderboard_frame(df)

    # Save to CSV if output_file is specified
    if output_file:
        output_dir = os.path.dirname(output_file)
        # dirname is "" for a bare filename; os.makedirs("") would raise.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"Leaderboard data saved to {output_file}")
    return df