HaileyStorm
/

chess-mamba-vs-xformer

Model card Files Files and versions Community

HaileyStorm committed on Mar 26, 2024

Commit

5122a87

•

1 Parent(s): 03a4fe1

Update filter_lichess_multi.py

Browse files

Files changed (1) hide show

filter_lichess_multi.py +31 -26

filter_lichess_multi.py CHANGED Viewed

@@ -3,37 +3,34 @@ import chess.pgn
 import csv
 import os
 import threading
 start_at = 0
 total_games = 92055571
 num_threads = 8
-def process_pgn_chunk(input_file, output_file, start_index, end_index):
-    with open(input_file, 'r') as pgn_file, open(output_file, 'a', newline='') as csv_file:
         csv_writer = csv.writer(csv_file)
-        file_size = os.stat(pgn_file.fileno()).st_size
-        pgn_file.seek(int(file_size * (start_index / total_games)))
         games_seen = 0
         games_added = 0
-        while True:
-            game = chess.pgn.read_game(pgn_file)
-            if game is None or games_seen >= end_index - start_index:
-                break
             games_seen += 1
             # Filter games based on the specified criteria
             if (
-                game.headers['Result'] == '1-0' and
-                'Rated' in game.headers['Event'] and
-                1500 < int(game.headers['WhiteElo']) < 2400 and
-                1400 < int(game.headers['BlackElo']) < 2800
             ):
                 board = chess.Board()
                 moves = []
                 move_number = 1
-                for move in game.mainline_moves():
                     if board.turn == chess.WHITE:
                         moves.append(f"{move_number}.")
                         move_number += 1
@@ -46,28 +43,36 @@ def process_pgn_chunk(input_file, output_file, start_index, end_index):
                     csv_writer.writerow([transcript.rstrip()])
                     games_added += 1
                     if games_added % 100 == 0:
-                        print(f"Thread {threading.current_thread().name} - Added {games_added} of {games_seen} games.")  # {(games_seen+start_index)/float(total_games):.2%} complete.")
 def process_pgn_file(input_file, output_file):
     with open(output_file, 'w', newline='') as csv_file:
         csv_writer = csv.writer(csv_file)
         csv_writer.writerow(['transcript'])
-    chunk_size = (total_games - start_at) // num_threads
-    threads = []
-    for i in range(num_threads):
-        start_index = start_at + i * chunk_size
-        end_index = start_at + (i + 1) * chunk_size
-        if i == num_threads - 1:
-            end_index = total_games
-        thread = threading.Thread(target=process_pgn_chunk, args=(input_file, f"{output_file[:-4]}_{i}.csv", start_index, end_index))
-        threads.append(thread)
-        thread.start()
-    for thread in threads:
-        thread.join()
 input_file = './chess-mamba-vs-xformer/lichess_db_standard_rated_2022-07.pgn'
 output_file = './chess-mamba-vs-xformer/lichess_transcripts_phase2_stable.csv'
 process_pgn_file(input_file, output_file)

 import csv
 import os
 import threading
+import mmap
 start_at = 0
 total_games = 92055571
 num_threads = 8
+def process_pgn_chunk(pgn_data, output_file, start_index, end_index):
+    with open(output_file, 'a', newline='') as csv_file:
         csv_writer = csv.writer(csv_file)
+        pgn = chess.pgn.read_game(chess.pgn.StringIO(pgn_data.decode('utf-8')))
         games_seen = 0
         games_added = 0
+        while pgn is not None and games_seen < end_index - start_index:
             games_seen += 1
             # Filter games based on the specified criteria
             if (
+                pgn.headers['Result'] == '1-0' and
+                'Rated' in pgn.headers['Event'] and
+                1500 < int(pgn.headers['WhiteElo']) < 2400 and
+                1400 < int(pgn.headers['BlackElo']) < 2800
             ):
                 board = chess.Board()
                 moves = []
                 move_number = 1
+                for move in pgn.mainline_moves():
                     if board.turn == chess.WHITE:
                         moves.append(f"{move_number}.")
                         move_number += 1
                     csv_writer.writerow([transcript.rstrip()])
                     games_added += 1
                     if games_added % 100 == 0:
+                        print(f"Thread {threading.current_thread().name} - Added {games_added} of {games_seen} games. {(games_seen+start_index)/float(total_games):.2%} complete.")
+            pgn = chess.pgn.read_game(chess.pgn.StringIO(pgn_data.decode('utf-8')))
 def process_pgn_file(input_file, output_file):
     with open(output_file, 'w', newline='') as csv_file:
         csv_writer = csv.writer(csv_file)
         csv_writer.writerow(['transcript'])
+    file_size = os.path.getsize(input_file)
+    chunk_size = (file_size - start_at) // num_threads
+    with open(input_file, 'rb') as pgn_file:
+        with mmap.mmap(pgn_file.fileno(), 0, access=mmap.ACCESS_READ) as pgn_mmap:
+            threads = []
+            for i in range(num_threads):
+                start_index = start_at + i * chunk_size
+                end_index = start_at + (i + 1) * chunk_size
+                if i == num_threads - 1:
+                    end_index = file_size
+                pgn_chunk = pgn_mmap[start_index:end_index]
+                thread = threading.Thread(target=process_pgn_chunk, args=(pgn_chunk, f"{output_file[:-4]}_{i}.csv", start_index, end_index))
+                threads.append(thread)
+                thread.start()
+            for thread in threads:
+                thread.join()
+# Usage example
 input_file = './chess-mamba-vs-xformer/lichess_db_standard_rated_2022-07.pgn'
 output_file = './chess-mamba-vs-xformer/lichess_transcripts_phase2_stable.csv'
 process_pgn_file(input_file, output_file)