HaileyStorm
/

chess-mamba-vs-xformer

HaileyStorm committed on Mar 26

Commit

d3da7b8

•

1 Parent(s): 5122a87

Update filter_lichess_multi.py

Files changed (1) hide show

filter_lichess_multi.py CHANGED Viewed

@@ -14,10 +14,11 @@ def process_pgn_chunk(pgn_data, output_file, start_index, end_index):
         csv_writer = csv.writer(csv_file)
         pgn = chess.pgn.read_game(chess.pgn.StringIO(pgn_data.decode('utf-8')))
         games_seen = 0
         games_added = 0
-        while pgn is not None and games_seen < end_index - start_index:
             games_seen += 1
             # Filter games based on the specified criteria
@@ -54,18 +55,16 @@ def process_pgn_file(input_file, output_file):
     file_size = os.path.getsize(input_file)
     chunk_size = (file_size - start_at) // num_threads
     with open(input_file, 'rb') as pgn_file:
         with mmap.mmap(pgn_file.fileno(), 0, access=mmap.ACCESS_READ) as pgn_mmap:
-            threads = []
             for i in range(num_threads):
                 start_index = start_at + i * chunk_size
                 end_index = start_at + (i + 1) * chunk_size
                 if i == num_threads - 1:
                     end_index = file_size
-                pgn_chunk = pgn_mmap[start_index:end_index]
-                thread = threading.Thread(target=process_pgn_chunk, args=(pgn_chunk, f"{output_file[:-4]}_{i}.csv", start_index, end_index))
                 threads.append(thread)
                 thread.start()

         csv_writer = csv.writer(csv_file)
         pgn = chess.pgn.read_game(chess.pgn.StringIO(pgn_data.decode('utf-8')))
         games_seen = 0
         games_added = 0
+        while pgn is not None:
+            if games_seen >= end_index - start_index:
+                break
             games_seen += 1
             # Filter games based on the specified criteria
     file_size = os.path.getsize(input_file)
     chunk_size = (file_size - start_at) // num_threads
+    threads = []
     with open(input_file, 'rb') as pgn_file:
         with mmap.mmap(pgn_file.fileno(), 0, access=mmap.ACCESS_READ) as pgn_mmap:
             for i in range(num_threads):
                 start_index = start_at + i * chunk_size
                 end_index = start_at + (i + 1) * chunk_size
                 if i == num_threads - 1:
                     end_index = file_size
+                pgn_data = pgn_mmap[start_index:end_index]
+                thread = threading.Thread(target=process_pgn_chunk, args=(pgn_data, f"{output_file[:-4]}_{i}.csv", start_index, end_index))
                 threads.append(thread)
                 thread.start()