Liss, Alex (NYC-HUG) committed on
Commit
9e940d2
·
1 Parent(s): 6382248

added WIP player search functionality

Browse files
data/april_11_multimedia_data_collect/new_final_april 11/neo4j_player_update/update_player_nodes.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ update_player_nodes.py - Updates existing Player nodes in Neo4j with additional attributes
4
+
5
+ This script reads player data from the roster_april_11.csv file and updates
6
+ existing Player nodes in Neo4j with the following attributes:
7
+ - headshot_url
8
+ - instagram_url
9
+ - highlight_video_url
10
+
11
+ The script uses player_id as the primary key for matching and updating nodes.
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import pandas as pd
17
+ from neo4j import GraphDatabase
18
+ from dotenv import load_dotenv
19
+
20
# Resolve the project layout relative to this script's location.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# The script lives four directories below the project root, so five levels up
# is the workspace that is expected to contain the "ifx-sandbox" checkout.
WORKSPACE_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "../../../../.."))
PROJECT_DIR = os.path.join(WORKSPACE_ROOT, "ifx-sandbox")

# Environment variables are read from <project root>/.env
ENV_FILE = os.path.join(PROJECT_DIR, ".env")

if not os.path.exists(ENV_FILE):
    # Fallback: take the directory four levels above the script as the project
    # root. This is the same directory as PROJECT_DIR in the standard layout,
    # but it also works when the checkout is not named "ifx-sandbox".
    alt_project_dir = os.path.abspath(os.path.join(SCRIPT_DIR, "../../../.."))
    alt_env_file = os.path.join(alt_project_dir, ".env")
    if os.path.exists(alt_env_file):
        print("Fallback: Found .env using alternative path calculation.")
        PROJECT_DIR = alt_project_dir
        ENV_FILE = alt_env_file
    else:
        print(f"Error: .env file not found at {ENV_FILE}")
        sys.exit(1)

# Data paths are derived only after PROJECT_DIR is final so that the roster
# file is looked up in the same project root the .env file was found in.
DATA_DIR = os.path.join(PROJECT_DIR, "data")
ROSTER_DATA_DIR = os.path.join(DATA_DIR, "april_11_multimedia_data_collect", "new_final_april 11")
ROSTER_FILE = os.path.join(ROSTER_DATA_DIR, "roster_april_11.csv")

# Explicitly pass the path to load_dotenv
load_dotenv(dotenv_path=ENV_FILE)
print(f"Loading environment variables from: {ENV_FILE}")

# Neo4j connection credentials
NEO4J_URI = os.getenv('AURA_CONNECTION_URI')
NEO4J_USER = os.getenv('AURA_USERNAME')
NEO4J_PASS = os.getenv('AURA_PASSWORD')

# Abort early: nothing below can work without all three credentials.
if not all([NEO4J_URI, NEO4J_USER, NEO4J_PASS]):
    print(f"Error: Missing required Neo4j credentials in {ENV_FILE}")
    print("Required variables: AURA_CONNECTION_URI, AURA_USERNAME, AURA_PASSWORD")
    sys.exit(1)
62
+
63
def clean_row_dict(row):
    """Convert a pandas row (or any mapping) to a dict with missing values as None.

    A value is treated as missing when it is NaN/None/NaT (per ``pd.isna``) or
    a string that is empty or contains only whitespace.

    Args:
        row: Object exposing ``.items()`` that yields (column, value) pairs,
            such as a ``pandas.Series`` from ``DataFrame.iterrows()`` or a dict.

    Returns:
        dict: The same keys, with missing values replaced by ``None`` and all
        other values passed through unchanged.
    """
    cleaned = {}
    for key, value in row.items():
        # Whitespace-only cells carry no data; treat them like empty strings.
        is_blank_text = isinstance(value, str) and not value.strip()
        cleaned[key] = None if is_blank_text or pd.isna(value) else value
    return cleaned
66
+
67
def update_player_nodes():
    """Update existing Player nodes with URL attributes from the roster CSV.

    Reads ROSTER_FILE, then for every row matches a ``Player`` node on its
    ``player_id`` property and sets ``headshot_url``, ``instagram_url`` and
    ``highlight_video_url``. Rows without a player_id and rows whose player
    is not present in Neo4j are counted as errors and reported. A summary is
    printed at the end and, if anything was updated, ``verify_updates()`` runs.

    Returns:
        bool: True if at least one Player node was updated, False otherwise
        (including when the CSV cannot be loaded, required columns are
        missing, or the Neo4j connection fails).
    """
    print(f"Loading player roster data from: {ROSTER_FILE}")

    # Check if the file exists
    if not os.path.exists(ROSTER_FILE):
        print(f"Error: Roster file not found at {ROSTER_FILE}")
        return False

    # Load the roster data
    try:
        roster_df = pd.read_csv(ROSTER_FILE)
        print(f"Loaded {len(roster_df)} players from CSV")
    except Exception as e:
        print(f"Error loading roster CSV: {str(e)}")
        return False

    # Verify required columns exist
    required_columns = ['player_id', 'headshot_url', 'instagram_url', 'highlight_video_url']
    missing_columns = [col for col in required_columns if col not in roster_df.columns]

    if missing_columns:
        print(f"Error: Missing required columns in CSV: {', '.join(missing_columns)}")
        return False

    # The statement is identical for every row, so build it once.
    query = """
    MATCH (p:Player {player_id: $match_player_id})
    SET p.headshot_url = $headshot_url,
        p.instagram_url = $instagram_url,
        p.highlight_video_url = $highlight_video_url
    RETURN p.player_id as player_id
    """

    success_count = 0
    error_count = 0

    print(f"Connecting to Neo4j at {NEO4J_URI}")
    driver = None
    try:
        # Connect and run a sanity query before touching any data.
        try:
            driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))
            driver.verify_connectivity()
            print("Neo4j connection successful.")
            with driver.session() as session:
                result = session.run("MATCH (p:Player) RETURN count(p) as count")
                player_count = result.single()["count"]
                print(f"Found {player_count} Player nodes in Neo4j")
        except Exception as e:
            print(f"Error connecting to or querying Neo4j: {str(e)}")
            return False

        # Update player nodes
        with driver.session() as session:
            for index, row in roster_df.iterrows():
                player_id_val = row.get('player_id')

                # An empty CSV cell arrives as NaN, which is truthy, so a plain
                # truthiness test would not catch it (and would wrongly reject
                # a legitimate id of 0). Test for "missing" explicitly.
                if pd.isna(player_id_val) or str(player_id_val).strip() == '':
                    error_count += 1
                    print(f"Skipping row {index + 1}: Missing player_id")
                    continue

                params = clean_row_dict(row)
                # Ensure the key used for matching exists in params for the query
                params['match_player_id'] = player_id_val

                try:
                    result = session.run(query, params)
                    updated_player = result.single()
                except Exception as e:
                    error_count += 1
                    print(f"Error updating player {player_id_val}: {str(e)}")
                    continue

                if updated_player:
                    success_count += 1
                    if success_count % 10 == 0 or success_count == 1:
                        print(f"Updated {success_count} players...")
                else:
                    # MATCH found nothing, so no record was returned.
                    error_count += 1
                    print(f"Warning: Player with ID {player_id_val} not found in Neo4j")
    finally:
        # Release the connection pool on every exit path, including errors.
        if driver:
            driver.close()

    # Print summary
    print("\nUpdate Summary:")
    print(f"Total players in CSV: {len(roster_df)}")
    print(f"Successfully updated: {success_count}")
    print(f"Errors/not found: {error_count}")

    # Verify updates
    if success_count > 0:
        print("\nVerifying updates...")
        verify_updates()

    return success_count > 0
169
+
170
def verify_updates():
    """Print how many Player nodes now carry the URL attributes.

    Opens its own Neo4j connection, counts the Player nodes that have both
    ``headshot_url`` and ``instagram_url`` set, then those that have
    ``highlight_video_url`` set, and prints both totals. Any exception is
    reported on stdout rather than raised; the driver is always closed.
    """
    # Players with headshot & instagram URLs
    headshot_and_instagram_query = """
    MATCH (p:Player)
    WHERE p.headshot_url IS NOT NULL AND p.instagram_url IS NOT NULL
    RETURN count(p) as count
    """
    # Players with highlight URLs
    highlight_query = """
    MATCH (p:Player)
    WHERE p.highlight_video_url IS NOT NULL
    RETURN count(p) as count
    """

    driver = None
    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))
        driver.verify_connectivity()
        with driver.session() as session:
            # Run both counts before printing so output appears only if both succeed.
            with_profile_urls = session.run(headshot_and_instagram_query).single()["count"]
            with_highlights = session.run(highlight_query).single()["count"]

            print(f"Players with headshot & Instagram URLs: {with_profile_urls}")
            print(f"Players with highlight URLs: {with_highlights}")
    except Exception as exc:
        print(f"Error during verification: {str(exc)}")
    finally:
        if driver:
            driver.close()
202
+
203
def main():
    """Command-line entry point: confirm with the user, then run the update.

    The confirmation prompt is skipped when ``--yes`` appears anywhere among
    the command-line arguments. Otherwise the user must answer "y" or "yes"
    (case-insensitive, surrounding whitespace ignored); any other answer, or
    a closed stdin, cancels the update.
    """
    print("=== Player Node Update Tool ===")
    print("This script will update existing Player nodes in Neo4j with additional attributes")
    print(f"from the {ROSTER_FILE} file.")

    # Check for --yes flag in any argument position
    if '--yes' in sys.argv[1:]:
        print("Automatic confirmation enabled. Proceeding with update...")
        confirmed = True
    else:
        # Confirm with user
        try:
            user_input = input("\nDo you want to proceed with the update? (y/n): ")
        except EOFError:
            # No interactive stdin (e.g. piped or cron): treat as "no".
            user_input = ''
        confirmed = user_input.strip().lower() in ('y', 'yes')

    if not confirmed:
        print("Update cancelled.")
        return

    # Run the update
    success = update_player_nodes()

    if success:
        print("\n✅ Player nodes updated successfully!")
    else:
        print("\n❌ Player node update failed. Please check the errors above.")


if __name__ == "__main__":
    main()