Spaces:
No application file
No application file
Liss, Alex (NYC-HUG)
committed on
Commit
·
9e940d2
1
Parent(s):
6382248
added WIP player search functionality
Browse files
data/april_11_multimedia_data_collect/new_final_april 11/neo4j_player_update/update_player_nodes.py
ADDED
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
"""
|
3 |
+
update_player_nodes.py - Updates existing Player nodes in Neo4j with additional attributes
|
4 |
+
|
5 |
+
This script reads player data from the roster_april_11.csv file and updates
|
6 |
+
existing Player nodes in Neo4j with the following attributes:
|
7 |
+
- headshot_url
|
8 |
+
- instagram_url
|
9 |
+
- highlight_video_url
|
10 |
+
|
11 |
+
The script uses player_id as the primary key for matching and updating nodes.
|
12 |
+
"""
|
13 |
+
|
14 |
+
import os
|
15 |
+
import sys
|
16 |
+
import pandas as pd
|
17 |
+
from neo4j import GraphDatabase
|
18 |
+
from dotenv import load_dotenv
|
19 |
+
|
20 |
+
# Define base project directory relative to script location
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Name of the project directory expected inside the workspace root.
_PROJECT_DIR_NAME = "ifx-sandbox"

# Construct absolute path to project root (ifx-sandbox) based on known workspace structure.
# This assumes the script is always located at the same relative depth:
# five levels below the workspace root.
WORKSPACE_ROOT = os.path.abspath(os.path.join(SCRIPT_DIR, "../../../../.."))
PROJECT_DIR = os.path.join(WORKSPACE_ROOT, _PROJECT_DIR_NAME)

# Add parent directory (ifx-sandbox) to path if needed for imports, though unlikely needed here
# sys.path.append(PROJECT_DIR)

# Load environment variables from ifx-sandbox/.env
ENV_FILE = os.path.join(PROJECT_DIR, ".env")

if not os.path.exists(ENV_FILE):
    print(f"Error: .env file not found at {ENV_FILE}")
    # Fallback in case the script sits one directory deeper than assumed:
    # go up six levels. This must differ from the five-level primary lookup,
    # otherwise it only re-tests the path that was just found missing.
    alt_workspace_root = os.path.abspath(os.path.join(SCRIPT_DIR, "../../../../../.."))
    alt_project_dir = os.path.join(alt_workspace_root, _PROJECT_DIR_NAME)
    alt_env_file = os.path.join(alt_project_dir, ".env")
    if os.path.exists(alt_env_file):
        print("Fallback: Found .env using alternative path calculation.")
        # Keep every derived path consistent with the root that was actually found.
        WORKSPACE_ROOT = alt_workspace_root
        PROJECT_DIR = alt_project_dir
        ENV_FILE = alt_env_file
    else:
        sys.exit(1)

# Set up data paths using the resolved PROJECT_DIR
DATA_DIR = os.path.join(PROJECT_DIR, "data")
ROSTER_DATA_DIR = os.path.join(DATA_DIR, "april_11_multimedia_data_collect", "new_final_april 11")
ROSTER_FILE = os.path.join(ROSTER_DATA_DIR, "roster_april_11.csv")

# Explicitly pass the path to load_dotenv
load_dotenv(dotenv_path=ENV_FILE)
print(f"Loading environment variables from: {ENV_FILE}")

# Neo4j connection credentials
NEO4J_URI = os.getenv('AURA_CONNECTION_URI')
NEO4J_USER = os.getenv('AURA_USERNAME')
NEO4J_PASS = os.getenv('AURA_PASSWORD')

# Abort at import time when any credential is unset or empty.
if not all([NEO4J_URI, NEO4J_USER, NEO4J_PASS]):
    print(f"Error: Missing required Neo4j credentials in {ENV_FILE}")
    print("Required variables: AURA_CONNECTION_URI, AURA_USERNAME, AURA_PASSWORD")
    sys.exit(1)
|
62 |
+
|
63 |
+
def clean_row_dict(row):
    """Convert a row to a plain dict, mapping missing values to None.

    Args:
        row: Any mapping-like object exposing ``items()`` (a pandas Series
            row or a plain dict).

    Returns:
        dict: The same keys as ``row``. A value becomes ``None`` when it is
        missing according to ``pd.isna`` (NaN, None, NaT, ...) or is a string
        that is empty or contains only whitespace. All other values are
        returned unchanged.
    """
    cleaned = {}
    for key, value in row.items():
        if isinstance(value, str):
            # A cell holding only spaces carries no more data than an empty one.
            is_blank = not value.strip()
        else:
            is_blank = pd.isna(value)
        cleaned[key] = None if is_blank else value
    return cleaned
|
66 |
+
|
67 |
+
def update_player_nodes():
    """
    Update existing Player nodes with media attributes from the roster CSV.

    Reads ROSTER_FILE, checks that the required columns are present, then for
    each row matches a Player node on ``player_id`` and sets ``headshot_url``,
    ``instagram_url`` and ``highlight_video_url``. Cells that clean_row_dict
    maps to None are passed to the query as null parameters. Progress and a
    summary are printed, and verify_updates() is called when at least one
    node was updated.

    Returns:
        bool: True if at least one Player node was updated. False if the CSV
        is missing, unreadable or lacks a required column, if Neo4j cannot be
        reached or queried, or if no node was updated.
    """
    print(f"Loading player roster data from: {ROSTER_FILE}")

    # Check if the file exists
    if not os.path.exists(ROSTER_FILE):
        print(f"Error: Roster file not found at {ROSTER_FILE}")
        return False

    # Load the roster data
    try:
        roster_df = pd.read_csv(ROSTER_FILE)
        print(f"Loaded {len(roster_df)} players from CSV")
    except Exception as e:
        print(f"Error loading roster CSV: {str(e)}")
        return False

    # Verify required columns exist
    url_columns = ['headshot_url', 'instagram_url', 'highlight_video_url']
    required_columns = ['player_id'] + url_columns
    missing_columns = [col for col in required_columns if col not in roster_df.columns]

    if missing_columns:
        print(f"Error: Missing required columns in CSV: {', '.join(missing_columns)}")
        return False

    # Connect to Neo4j and run a cheap query to confirm the database is usable
    print(f"Connecting to Neo4j at {NEO4J_URI}")
    driver = None  # Stays None if driver construction itself fails
    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS))
        driver.verify_connectivity()
        print("Neo4j connection successful.")
        with driver.session() as session:
            result = session.run("MATCH (p:Player) RETURN count(p) as count")
            player_count = result.single()["count"]
            print(f"Found {player_count} Player nodes in Neo4j")
    except Exception as e:
        print(f"Error connecting to or querying Neo4j: {str(e)}")
        if driver:
            driver.close()
        return False

    # The statement is identical for every row, so build it once.
    query = """
        MATCH (p:Player {player_id: $match_player_id})
        SET p.headshot_url = $headshot_url,
            p.instagram_url = $instagram_url,
            p.highlight_video_url = $highlight_video_url
        RETURN p.player_id as player_id
    """

    # Update player nodes
    success_count = 0
    error_count = 0

    try:
        with driver.session() as session:
            for index, row in roster_df.iterrows():
                player_id_val = row.get('player_id')

                # pandas reads an empty CSV cell as NaN, which is truthy, so a
                # plain truthiness test alone would not catch it.
                if pd.isna(player_id_val) or not player_id_val:
                    error_count += 1
                    print(f"Skipping row {index + 1}: Missing player_id")
                    continue

                # Send only the values the query references instead of every
                # CSV column.
                cleaned = clean_row_dict(row)
                params = {column: cleaned[column] for column in url_columns}
                params['match_player_id'] = player_id_val

                try:
                    result = session.run(query, params)
                    updated_player = result.single()

                    if updated_player:
                        success_count += 1
                        if success_count % 10 == 0 or success_count == 1:
                            print(f"Updated {success_count} players...")
                    else:
                        # MATCH found no node, so nothing was returned.
                        error_count += 1
                        print(f"Warning: Player with ID {player_id_val} not found in Neo4j")
                except Exception as e:
                    # One failing row must not abort the remaining rows.
                    error_count += 1
                    print(f"Error updating player {player_id_val}: {str(e)}")
    finally:
        # Close the driver even if the session or the loop raised.
        driver.close()

    # Print summary
    print("\nUpdate Summary:")
    print(f"Total players in CSV: {len(roster_df)}")
    print(f"Successfully updated: {success_count}")
    print(f"Errors/not found: {error_count}")

    # Verify updates
    if success_count > 0:
        print("\nVerifying updates...")
        verify_updates()

    return success_count > 0
|
169 |
+
|
170 |
+
def verify_updates():
    """Print how many Player nodes carry the media attributes.

    Opens its own Neo4j connection and prints two counts: Player nodes that
    have both ``headshot_url`` and ``instagram_url`` set, and Player nodes
    that have ``highlight_video_url`` set. The counts cover every Player node
    in the database, not only the ones touched by the current run.

    Any exception is caught and reported on stdout; nothing is returned.
    """
    # Check for players with headshot & instagram URLs
    headshot_instagram_query = """
        MATCH (p:Player)
        WHERE p.headshot_url IS NOT NULL AND p.instagram_url IS NOT NULL
        RETURN count(p) as count
    """

    # Check for players with highlight URLs
    highlight_query = """
        MATCH (p:Player)
        WHERE p.highlight_video_url IS NOT NULL
        RETURN count(p) as count
    """

    try:
        # The driver is a context manager; leaving the block closes it on
        # both the success and the error path.
        with GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASS)) as driver:
            driver.verify_connectivity()
            with driver.session() as session:
                count1 = session.run(headshot_instagram_query).single()["count"]
                count2 = session.run(highlight_query).single()["count"]

                print(f"Players with headshot & Instagram URLs: {count1}")
                print(f"Players with highlight URLs: {count2}")
    except Exception as e:
        print(f"Error during verification: {str(e)}")
|
202 |
+
|
203 |
+
def main():
    """Command-line entry point: confirm with the user, then run the update.

    Passing ``--yes`` anywhere on the command line skips the interactive
    prompt. Otherwise the user is asked to confirm; only "y" or "yes"
    (case-insensitive, surrounding whitespace ignored) proceeds. A closed
    stdin is treated as a refusal. The outcome of update_player_nodes() is
    reported on stdout.
    """
    print("=== Player Node Update Tool ===")
    print("This script will update existing Player nodes in Neo4j with additional attributes")
    print(f"from the {ROSTER_FILE} file.")

    # Check for --yes flag
    if '--yes' in sys.argv[1:]:
        print("Automatic confirmation enabled. Proceeding with update...")
        confirmed = True
    else:
        # Confirm with user
        try:
            user_input = input("\nDo you want to proceed with the update? (y/n): ")
        except EOFError:
            # No stdin available (e.g. a non-interactive run without --yes):
            # do not modify the database without an explicit confirmation.
            user_input = ''
        confirmed = user_input.strip().lower() in ('y', 'yes')

    if not confirmed:
        print("Update cancelled.")
        return

    # Run the update
    success = update_player_nodes()

    if success:
        print("\n✅ Player nodes updated successfully!")
    else:
        print("\n❌ Player node update failed. Please check the errors above.")


if __name__ == "__main__":
    main()
|