"""Scrape League of Legends ladder leaderboards from op.gg with headless Selenium Chrome."""
import pandas as pd
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
def scrape_leaderboards(regions=None, pages_per_region=5, output_file=None, delay=2):
    """
    Scrape leaderboard data from op.gg for specified regions and return as DataFrame.

    Args:
        regions (list): List of regions to scrape. Defaults to ["kr", "na", "vn", "euw"]
        pages_per_region (int): Number of pages to scrape per region. Defaults to 5
        output_file (str): Path to output file. Defaults to "util/data/leaderboard_data.csv"
        delay (int): Delay between requests in seconds. Defaults to 2

    Returns:
        pandas.DataFrame: Scraped leaderboard data (empty DataFrame if nothing was
        scraped), or None on a fatal setup/scraping error.
    """
    # Set defaults
    if regions is None:
        regions = ["kr", "na", "vn", "euw"]
    if output_file is None:
        output_file = os.path.join("util", "data", "leaderboard_data.csv")

    leaderboard_data = []
    # Bind driver up front so the finally block is safe even if
    # webdriver.Chrome() itself raises (original code hit NameError there,
    # masking the real failure).
    driver = None
    try:
        driver = _create_driver()
        for region in regions:
            print(f"\nScraping {region.upper()} region...")
            for page in range(1, pages_per_region + 1):
                print(f"Processing page {page}/{pages_per_region}")
                url = f"https://www.op.gg/leaderboards/tier?region={region}&type=ladder&page={page}"
                try:
                    driver.get(url)
                    # Wait for the leaderboard table to render. NOTE(review):
                    # these CSS class names are op.gg build artifacts and may
                    # change between site deployments — verify periodically.
                    table = WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "table.css-1l95r9q.e4dns9u11"))
                    )
                    for row in table.find_elements(By.TAG_NAME, "tr")[1:]:  # skip header row
                        try:
                            record = _parse_leaderboard_row(row, region)
                            if record is not None:
                                leaderboard_data.append(record)
                        except Exception as e:
                            print(f"Error processing row in {region} page {page}: {e}")
                            continue
                except Exception as e:
                    print(f"Error processing {region} page {page}: {e}")
                    continue
                time.sleep(delay)  # polite pause between page loads
    except Exception as e:
        print(f"Fatal error: {e}")
        return None
    finally:
        if driver is not None:
            driver.quit()

    if not leaderboard_data:
        # An empty list would produce a column-less DataFrame and the numeric
        # conversions below would raise KeyError; return an empty frame so
        # callers can detect "nothing scraped" gracefully.
        print("No leaderboard rows were scraped; returning empty DataFrame.")
        return pd.DataFrame()

    return _finalize_dataframe(pd.DataFrame(leaderboard_data), output_file)


def _create_driver():
    """Build a headless Chrome WebDriver configured for quiet, fast scraping."""
    chrome_options = Options()
    for arg in (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--disable-logging",
        "--log-level=3",
        "--disable-extensions",
    ):
        chrome_options.add_argument(arg)
    # 'eager' returns control after DOMContentLoaded instead of full page load.
    chrome_options.page_load_strategy = 'eager'
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    return webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=chrome_options
    )


def _parse_leaderboard_row(row, region):
    """Parse one leaderboard <tr> into a record dict; None if it lacks 7 cells."""
    cells = row.find_elements(By.TAG_NAME, "td")
    if len(cells) < 7:
        return None
    summoner = cells[1].text.strip().replace("\n", " ")
    rank = cells[0].text.strip()
    tier = cells[2].text.strip()
    lp = cells[3].text.strip()
    level = cells[5].text.strip()
    # Champion names come from the <img alt="..."> attributes; pad to 3 slots
    # so the output columns are always present.
    champions = [img.get_attribute("alt") for img in cells[4].find_elements(By.TAG_NAME, "img")]
    champion_data = champions + [""] * (3 - len(champions))
    # Win/loss cell renders as up to three lines, e.g. "123W", "45L", "73%".
    winrate_text = cells[6].text.strip().split("\n")
    wins = winrate_text[0].rstrip("W") if len(winrate_text) > 0 else ""
    losses = winrate_text[1].rstrip("L") if len(winrate_text) > 1 else ""
    winrate = winrate_text[2] if len(winrate_text) > 2 else ""
    return {
        "summoner": summoner,
        "region": region,
        "rank": rank,
        "tier": tier,
        "lp": lp,
        "most_champion_1": champion_data[0],
        "most_champion_2": champion_data[1],
        "most_champion_3": champion_data[2],
        "level": level,
        "win": wins,
        "loss": losses,
        "winrate": winrate,
    }


def _finalize_dataframe(df, output_file):
    """Coerce scraped string columns to numeric types and optionally save CSV."""
    df['lp'] = df['lp'].str.replace(',', '').str.replace('LP', '').astype(float)
    # errors='coerce' keeps one malformed scraped cell from crashing the whole
    # run (original astype(int) would raise); bad values become NaN.
    df['level'] = pd.to_numeric(df['level'], errors='coerce')
    df['win'] = pd.to_numeric(df['win'], errors='coerce')
    df['loss'] = pd.to_numeric(df['loss'], errors='coerce')
    df['winrate'] = df['winrate'].str.rstrip('%').astype(float) / 100
    if output_file:
        out_dir = os.path.dirname(output_file)
        if out_dir:  # dirname is "" for a bare filename; makedirs("") raises
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(output_file, index=False)
        print(f"Leaderboard data saved to {output_file}")
    return df