HaileyStorm committed on
Commit
80bc2b3
·
verified ·
1 Parent(s): 062c52f

Upload 5 files

Browse files
Files changed (5) hide show
  1. csv2pqt_windraw.py +78 -0
  2. filter_csv.py +16 -0
  3. filter_lichess.py +53 -0
  4. merge_csv.py +14 -0
  5. sort_split.py +62 -0
csv2pqt_windraw.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pyarrow as pa
3
+ import pyarrow.parquet as pq
4
+ import numpy as np
5
+ import tiktoken
6
+ import pickle
7
+ from sklearn.model_selection import train_test_split
8
+ import random
9
+ import os
10
+
11
+
12
+ move_num_in_gamestate = False
13
+
14
def tokenize_game(game, stoi):
    """Convert one game transcript into an array of token ids.

    Args:
        game: Transcript text. If it contains a blank line (two consecutive
            newlines), everything up to and including the first one is
            discarded before tokenizing.
        stoi: Mapping from single characters to integer token ids. Ids must
            fit in an unsigned 8-bit integer.

    Returns:
        A 1-D ``np.uint8`` array holding one id per remaining character.

    Raises:
        KeyError: If the cleaned transcript contains a character that is not
            a key of ``stoi`` (other than a dropped ``-``, see below).
    """
    # Drop the prefix that precedes the first blank line, if there is one.
    _, separator, remainder = game.partition('\n\n')
    moves_text = remainder if separator else game

    # Any whitespace-separated word containing a "." keeps only the text after
    # its last "." and gets a bare "." in front, so "12.e4" becomes ".e4".
    cleaned = ' '.join(
        '.' + word.split('.')[-1] if '.' in word else word
        for word in moves_text.split()
    )

    # Encode with the vocabulary that was passed in rather than relying on a
    # module-level encoder. Hyphens are dropped when the vocabulary has no
    # "-" entry, so castling "O-O" is encoded as "OO".
    keep_hyphen = '-' in stoi
    token_ids = [stoi[ch] for ch in cleaned if keep_hyphen or ch != '-']
    return np.array(token_ids, dtype=np.uint8)
19
+
20
if __name__ == "__main__":
    # Script entry point: tokenize the 'transcript' column of a CSV and write
    # the resulting 'tokenized' column to a Parquet file.
    dataset_path = "/media/hailey/TVBox/csv_datasets/anneal.csv"
    meta_path = os.path.join('data', 'chess', 'meta.pkl')

    # Build the character <-> id tables used for tokenization.
    if move_num_in_gamestate:
        # NOTE: pickle.load can execute arbitrary code contained in the file;
        # only point meta_path at a trusted meta.pkl.
        with open(meta_path, "rb") as f:
            meta = pickle.load(f)
        stoi, itos = meta["stoi"], meta["itos"]

        def encode(s):
            """Map every character of ``s`` to its token id."""
            return [stoi[c] for c in s]

        def decode(ids):
            """Map a sequence of token ids back to text."""
            return "".join([itos[i] for i in ids])
    else:
        stoi = {' ': 0, '.': 1, 'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6, 'f': 7, 'g': 8, 'h': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, 'B': 18, 'N': 19, 'R': 20, 'Q': 21, 'K': 22, 'O': 23, 'x': 24, '+': 25, '#': 26, '=': 27}
        # Derive the inverse table instead of maintaining a second literal
        # by hand; the two can then never drift apart.
        itos = {token_id: ch for ch, token_id in stoi.items()}
        if len(itos) != len(stoi):
            raise ValueError("stoi maps more than one character to the same id")

        def encode(s):
            """Map every character of ``s`` to its id, dropping '-'."""
            return [stoi[c] for c in s.replace('-', '')]

        def decode(ids):
            """Map token ids back to text, restoring castling hyphens."""
            return "".join([itos[i] for i in ids]).replace("OOO", "O-O-O").replace("OO", "O-O")

    # Read CSV with headers
    print("Opening csv...")
    df = pd.read_csv(dataset_path)

    # Report statistics
    total_games = len(df)
    print(f"Total games: {total_games}. Tokenizing...")

    # Tokenize games in the 'transcript' column
    df['tokenized'] = df['transcript'].apply(lambda x: tokenize_game(x, stoi))
    print("Tokenized. Writing parquet file...")

    # No validation split is made: every game goes to the training set.
    train_df = df
    val_df = None

    def write_parquet_with_row_groups(df, file_name, rows_per_group=100):
        """Write df['tokenized'] to ``file_name`` in chunks of ``rows_per_group`` rows.

        Each chunk is passed to a separate ``write_table`` call. The writer is
        used as a context manager so the file is closed even when a write
        raises part-way through.
        """
        table = pa.Table.from_pandas(df[['tokenized']])
        with pq.ParquetWriter(file_name, table.schema) as writer:
            for start in range(0, len(df), rows_per_group):
                chunk_rows = min(rows_per_group, len(df) - start)
                writer.write_table(table.slice(start, chunk_rows))

    write_parquet_with_row_groups(train_df, '/media/hailey/TVBox/NEW_anneal.parquet')
    print("Done.")
filter_csv.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# Load the complete game table from the archive.
df = pd.read_csv('gt1_8kElo_all.zip')

# Build one boolean mask per criterion, then combine them:
# white won, white rated strictly between 1900 and 2300, black below 2600.
white_won = df['Result'] == '1-0'
white_elo_ok = (df['WhiteElo'] > 1900) & (df['WhiteElo'] < 2300)
black_elo_ok = df['BlackElo'] < 2600
filtered_df = df[white_won & white_elo_ok & black_elo_ok]

# Keep only the 'transcript' column of the surviving rows.
transcript_df = filtered_df[['transcript']]

# Write the result without the pandas index column.
transcript_df.to_csv('NEW_lichess_filtered.csv', index=False)
filter_lichess.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import chess
2
+ import chess.pgn
3
+ import csv
4
+ import os
5
+
6
+ start_at = 0
7
+ total_games = 92055571
8
+ def process_pgn_file(input_file, output_file):
9
+ with open(input_file, 'r') as pgn_file, open(output_file, 'a', newline='') as csv_file:
10
+ csv_writer = csv.writer(csv_file)
11
+ if start_at == 0:
12
+ csv_writer.writerow(['transcript'])
13
+
14
+ file_size = os.stat(pgn_file.fileno()).st_size
15
+ pgn_file.seek(int(file_size * (start_at / total_games)))
16
+
17
+ games_seen = 0
18
+ games_added = 0
19
+ while True:
20
+ game = chess.pgn.read_game(pgn_file)
21
+ if game is None:
22
+ break
23
+ games_seen += 1
24
+
25
+ # Filter games based on the specified criteria
26
+ if (
27
+ game.headers['Result'] == '1-0' and
28
+ 'Rated' in game.headers['Event'] and
29
+ 1500 < int(game.headers['WhiteElo']) < 2400 and
30
+ 1400 < int(game.headers['BlackElo']) < 2800
31
+ ):
32
+ board = chess.Board()
33
+ moves = []
34
+ move_number = 1
35
+ for move in game.mainline_moves():
36
+ if board.turn == chess.WHITE:
37
+ moves.append(f"{move_number}.")
38
+ move_number += 1
39
+ san = board.san(move)
40
+ moves.append(san + " ")
41
+ board.push(board.parse_san(san))
42
+
43
+ if board.is_game_over() and board.result() == "1-0":
44
+ transcript = ''.join(moves)
45
+ csv_writer.writerow([transcript.rstrip()])
46
+ games_added += 1
47
+ if games_added % 100 == 0:
48
+ print(f"Added {games_added} of {games_seen} games. {(games_seen+start_at)/float(total_games):.2%} complete.")
49
+
50
# Example run: read one monthly PGN dump and append the selected
# transcripts to the CSV next to it.
input_file, output_file = (
    './lichess_db_standard_rated_2022-07.pgn',
    './lichess_transcripts_phase2_stable.csv',
)
process_pgn_file(input_file, output_file)
merge_csv.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
# The two CSV files to combine, in the order their rows should appear.
source_paths = (
    '/media/hailey/TVBox/lichess_db_stable.csv',
    '/media/hailey/TVBox/lichess_transcripts_2.csv',
)
df1, df2 = (pd.read_csv(path) for path in source_paths)

# Stack the second table under the first and renumber the rows from zero.
merged_df = pd.concat([df1, df2], ignore_index=True)

# Write the combined table without the pandas index column.
merged_df.to_csv('/media/hailey/TVBox/lichess_db_stable2.csv', index=False)
sort_split.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import pyarrow.parquet as pq
3
+ import os
4
+ import numpy as np
5
+ import math
6
+
7
+ def sort_and_split_parquet(input_file, output_dir, n_splits, prefix, min_len, max_len):
8
+ # Load the parquet file
9
+ print("Loading parquet file...")
10
+ df = pq.read_table(input_file).to_pandas()
11
+
12
+ # Sort by the length of the 'tokenized' column
13
+ print("Sorting games & filtering by length...")
14
+ df['length'] = df['tokenized'].apply(len)
15
+ df_sorted = df.sort_values(by='length').drop(columns=['length'])
16
+ lenb4 = len(df_sorted)
17
+ df_sorted = df_sorted[df_sorted['tokenized'].apply(len) <= max_len]
18
+ df_sorted = df_sorted[df_sorted['tokenized'].apply(len) >= min_len]
19
+ if len(df_sorted) < lenb4:
20
+ removed = lenb4 - len(df_sorted)
21
+ print(f"Removed {removed} ({float(removed)/lenb4:.2%}) short and long games.")
22
+
23
+ # Calculate the number of rows per split
24
+ total_rows = len(df_sorted)
25
+ rows_per_split = math.ceil(total_rows / n_splits)
26
+
27
+ print("Dataset sorted. Splitting...")
28
+ games = 0
29
+ # Split and save each part
30
+ for i in range(n_splits):
31
+ start_row = i * rows_per_split
32
+ end_row = min(start_row + rows_per_split, total_rows)
33
+ split_df = df_sorted.iloc[start_row:end_row]
34
+ #lenb4 = len(split_df)
35
+ #split_df = split_df[split_df['tokenized'].apply(len) <= max_len]
36
+ #if len(split_df) < lenb4:
37
+ # print(f"\tRemoved {lenb4 - len(split_df)} long games.")
38
+ games += len(split_df)
39
+
40
+ first_game_length = len(split_df.iloc[0]['tokenized'])
41
+ last_game_length = len(split_df.iloc[-1]['tokenized'])
42
+
43
+ # Save the split DataFrame as a parquet file
44
+ split_file_name = f"{prefix}_{i}.parquet"
45
+ split_df.to_parquet(os.path.join(output_dir, split_file_name))
46
+
47
+ print(f"Saved {split_file_name}... Game lengths: {first_game_length} - {last_game_length}")
48
+ print(f"Saved {games} games total.")
49
+
50
+
51
+
52
# Run configuration.
prefix = "stable"
min_len, max_len = 200, 1536
n_splits = 360  # should be roughly input size / 10MB
input_file = '/media/hailey/TVBox/NEW_stable.parquet'
output_dir = '/media/hailey/More/AI/mamba.py/data/stable'

# Create the destination directory if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

sort_and_split_parquet(
    input_file,
    output_dir,
    n_splits,
    prefix,
    min_len,
    max_len,
)
print("Done.")
62
+