# Source: chess-mamba-vs-xformer / filter_lichess.py
# Page header residue: "HaileyStorm's picture"
# Last commit message: "Update filter_lichess.py"
# Commit c0179a9 (verified)
import chess
import chess.pgn
import csv
import os
# Total game count used as the denominator when estimating the resume
# offset into the PGN file and when printing progress.
total_games = 92_055_571

# Number of games to skip (approximately) when resuming a previous run.
# A value of 0 means a fresh run, in which case the CSV header is written.
start_at = 0
def _elo_in_range(headers, tag, low, high):
    """Return True when header *tag* is an integer strictly between *low* and *high*.

    A missing tag or a non-numeric value (for example ``"?"``) counts as
    "not in range" instead of raising, so one odd game cannot abort the run.
    """
    try:
        rating = int(headers.get(tag, ""))
    except ValueError:
        return False
    return low < rating < high


def _matches_filter(headers):
    """Return True for rated games won by White inside the accepted Elo bands."""
    return (
        headers.get('Result') == '1-0'
        and 'Rated' in headers.get('Event', '')
        and _elo_in_range(headers, 'WhiteElo', 1500, 2400)
        and _elo_in_range(headers, 'BlackElo', 1400, 2800)
    )


def _white_win_transcript(game):
    """Replay *game* from the standard start position and build its transcript.

    The transcript has the form ``1.e4 e5 2.Nf3 ...``. Returns ``None`` unless
    the final position is game-over on the board with result ``1-0``; games
    that ended any other way (e.g. resignation or timeout) are rejected.
    """
    board = chess.Board()
    tokens = []
    move_number = 1
    for move in game.mainline_moves():
        san = board.san(move)
        if board.turn == chess.WHITE:
            # White's move carries the move number, with no space after the dot.
            tokens.append(f"{move_number}.{san}")
            move_number += 1
        else:
            tokens.append(san)
        # Push the move object directly; re-parsing the SAN we just produced
        # yields the same move at extra cost.
        board.push(move)
    if board.is_game_over() and board.result() == "1-0":
        return ' '.join(tokens)
    return None


def process_pgn_file(input_file, output_file):
    """Filter a PGN dump and append matching game transcripts to a CSV file.

    Args:
        input_file: Path of the PGN file to read.
        output_file: Path of the CSV file to append to. A ``transcript``
            header row is written first when the module-level ``start_at``
            is 0.

    A game is kept when it is rated, White won, both Elo ratings fall inside
    the accepted bands, and the replayed board itself ends with result
    ``1-0``. Progress is printed after every 100 games added.

    Resuming relies on the module-level ``start_at`` and ``total_games``: the
    reader seeks to the same fraction of the file size, so the resume point
    is only approximate and the first game read may be a fragment. Fragments
    fail the header filter and are skipped.
    """
    with open(input_file, 'r', encoding='utf-8') as pgn_file, \
            open(output_file, 'a', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        if start_at == 0:
            csv_writer.writerow(['transcript'])

        # Estimate the byte offset of game number `start_at` from its share
        # of the total game count.
        file_size = os.fstat(pgn_file.fileno()).st_size
        pgn_file.seek(int(file_size * (start_at / total_games)))

        games_seen = 0
        games_added = 0
        while True:
            game = chess.pgn.read_game(pgn_file)
            if game is None:
                break
            games_seen += 1

            # Filter games based on the specified criteria
            if not _matches_filter(game.headers):
                continue
            transcript = _white_win_transcript(game)
            if transcript is None:
                continue

            csv_writer.writerow([transcript])
            games_added += 1
            if games_added % 100 == 0:
                print(f"Added {games_added} of {games_seen} games. {(games_seen+start_at)/float(total_games):.2%} complete.")
# Usage example
# Paths of the monthly PGN dump to read and the CSV of transcripts to append to.
input_file = './chess-mamba-vs-xformer/lichess_db_standard_rated_2022-07.pgn'
output_file = './chess-mamba-vs-xformer/lichess_transcripts_phase2_stable.csv'

if __name__ == "__main__":
    # Run the filtering job only when executed as a script, so that importing
    # this module does not start it as a side effect.
    process_pgn_file(input_file, output_file)