HaileyStorm
committed on
Commit
•
22cfeab
1
Parent(s):
d3da7b8
Delete filter_lichess_multi.py
Browse files- filter_lichess_multi.py +0 -77
filter_lichess_multi.py
DELETED
@@ -1,77 +0,0 @@
|
|
1 |
-
import chess
|
2 |
-
import chess.pgn
|
3 |
-
import csv
|
4 |
-
import os
|
5 |
-
import threading
|
6 |
-
import mmap
|
7 |
-
|
8 |
-
# Byte offset into the input PGN file at which chunking begins
# (used by process_pgn_file when computing chunk boundaries).
start_at = 0
# Denominator used in the progress message printed by process_pgn_chunk.
total_games = 92055571
# Number of worker threads, and therefore of chunks and per-thread CSV files.
num_threads = 8
|
11 |
-
|
12 |
-
def process_pgn_chunk(pgn_data, output_file, start_index, end_index):
    """Filter the games in one PGN chunk and append transcripts to a CSV.

    A game is kept when its headers say White won a rated game, the Elo
    ratings fall inside the configured windows, and replaying the mainline
    ends in a finished position whose result is "1-0".  Each kept game is
    written as a single-column row such as ``1.e4 e5 2.Nf3 Nc6``.

    Args:
        pgn_data: Raw bytes of the PGN chunk.
        output_file: Path of the CSV file to append rows to.
        start_index: Byte offset of the chunk start in the source file.
        end_index: Byte offset of the chunk end in the source file.
    """
    import io  # local import so this function does not depend on file-level imports

    # Decode once and keep a single stream so successive read_game() calls
    # advance through the chunk instead of re-reading the first game.
    # errors='replace' because a byte-offset chunk edge may split a
    # multi-byte UTF-8 sequence.
    pgn_stream = io.StringIO(pgn_data.decode('utf-8', errors='replace'))

    with open(output_file, 'a', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)

        games_seen = 0
        games_added = 0
        while True:
            # NOTE(review): this compares a game count with a byte span, so it
            # only acts as a loose upper bound; kept from the original.
            if games_seen >= end_index - start_index:
                break
            pgn = chess.pgn.read_game(pgn_stream)
            if pgn is None:
                break
            games_seen += 1

            # Filter games based on the specified criteria
            headers = pgn.headers
            if headers.get('Result') != '1-0':
                continue
            if 'Rated' not in headers.get('Event', ''):
                continue
            try:
                white_elo = int(headers.get('WhiteElo', ''))
                black_elo = int(headers.get('BlackElo', ''))
            except ValueError:
                # Missing or non-numeric rating (e.g. "?"): cannot match the filter.
                continue
            if not (1500 < white_elo < 2400 and 1400 < black_elo < 2800):
                continue

            board = chess.Board()
            moves = []
            move_number = 1
            for move in pgn.mainline_moves():
                if board.turn == chess.WHITE:
                    moves.append(f"{move_number}.")
                    move_number += 1
                moves.append(board.san(move) + " ")
                board.push(move)

            # Keep only games that actually end on the board with a White win.
            if board.is_game_over() and board.result() == "1-0":
                transcript = ''.join(moves)
                csv_writer.writerow([transcript.rstrip()])
                games_added += 1
                if games_added % 100 == 0:
                    # NOTE(review): start_index is a byte offset while total_games
                    # is a game count, so this percentage is only a rough indicator.
                    print(f"Thread {threading.current_thread().name} - Added {games_added} of {games_seen} games. {(games_seen+start_index)/float(total_games):.2%} complete.")
|
50 |
-
|
51 |
-
def _align_to_game_start(pgn_mmap, offset, file_size):
    """Return the first offset >= ``offset`` at which a line starts with "[Event "."""
    if offset <= 0:
        return 0
    if offset >= file_size:
        return file_size
    # Search from offset - 1 so an offset that already sits on a game start
    # (i.e. is preceded by a newline) is returned unchanged.
    marker = pgn_mmap.find(b'\n[Event ', offset - 1)
    return file_size if marker == -1 else marker + 1


def process_pgn_file(input_file, output_file):
    """Split a PGN file into chunks and filter each one in its own thread.

    Writes a header-only CSV to ``output_file``, then starts ``num_threads``
    workers running ``process_pgn_chunk``; worker ``i`` appends to
    ``<output_file without extension>_<i>.csv``.  Chunk boundaries are moved
    forward to the next line starting with "[Event " so that no game is cut
    in half between two workers.

    Args:
        input_file: Path of the PGN file to read.
        output_file: Path of the header CSV; also the stem of per-thread files.
    """
    with open(output_file, 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['transcript'])

    file_size = os.path.getsize(input_file)
    if file_size <= start_at:
        # Nothing to process; mmap also refuses to map an empty file.
        return

    chunk_size = (file_size - start_at) // num_threads
    # splitext instead of [:-4] so extensions of any length are handled.
    output_stem = os.path.splitext(output_file)[0]
    threads = []
    with open(input_file, 'rb') as pgn_file:
        with mmap.mmap(pgn_file.fileno(), 0, access=mmap.ACCESS_READ) as pgn_mmap:
            boundaries = [
                _align_to_game_start(pgn_mmap, start_at + i * chunk_size, file_size)
                for i in range(num_threads)
            ]
            boundaries.append(file_size)

            for i in range(num_threads):
                start_index = boundaries[i]
                end_index = boundaries[i + 1]
                # Slicing the mmap copies the bytes, so workers stay valid
                # after the mapping is closed.
                pgn_data = pgn_mmap[start_index:end_index]
                thread = threading.Thread(target=process_pgn_chunk, args=(pgn_data, f"{output_stem}_{i}.csv", start_index, end_index))
                threads.append(thread)
                thread.start()

    for thread in threads:
        thread.join()
|
73 |
-
|
74 |
-
# Usage example; guarded so importing this module does not start processing.
if __name__ == "__main__":
    input_file = './chess-mamba-vs-xformer/lichess_db_standard_rated_2022-07.pgn'
    output_file = './chess-mamba-vs-xformer/lichess_transcripts_phase2_stable.csv'
    process_pgn_file(input_file, output_file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|