Update app.py
app.py
CHANGED
@@ -324,8 +324,28 @@ def download_ibtracs_file(basin, force_download=False):
         logging.error(f"Failed to download {basin} basin file: {e}")
         return None
 
+def examine_ibtracs_structure(file_path):
+    """Examine the actual structure of an IBTrACS CSV file"""
+    try:
+        with open(file_path, 'r') as f:
+            lines = f.readlines()
+
+        # Show first 5 lines
+        logging.info("First 5 lines of IBTrACS file:")
+        for i, line in enumerate(lines[:5]):
+            logging.info(f"Line {i}: {line.strip()}")
+
+        # Try to read with proper skip
+        df = pd.read_csv(file_path, skiprows=3, nrows=5)
+        logging.info(f"Columns after skipping 3 rows: {list(df.columns)}")
+
+        return list(df.columns)
+    except Exception as e:
+        logging.error(f"Error examining IBTrACS structure: {e}")
+        return None
+
 def load_ibtracs_csv_directly(basin='WP'):
-    """Load IBTrACS data directly from CSV
+    """Load IBTrACS data directly from CSV - FIXED VERSION"""
     filename = BASIN_FILES[basin]
     local_path = os.path.join(DATA_PATH, filename)
 
@@ -336,47 +356,102 @@ def load_ibtracs_csv_directly(basin='WP'):
         return None
 
     try:
-        # Read with error handling for missing columns
+        # First, examine the structure
+        actual_columns = examine_ibtracs_structure(local_path)
+        if not actual_columns:
+            logging.error("Could not examine IBTrACS file structure")
+            return None
+
+        # Read IBTrACS CSV with proper number of header rows skipped
+        # IBTrACS v04r01 has 3 header rows that need to be skipped
         logging.info(f"Reading IBTrACS CSV file: {local_path}")
-        df = pd.read_csv(local_path, low_memory=False, skiprows=
+        df = pd.read_csv(local_path, low_memory=False, skiprows=3)  # Skip 3 metadata rows
+
+        logging.info(f"Original columns: {list(df.columns)}")
+        logging.info(f"Data shape before cleaning: {df.shape}")
+
+        # Map actual column names to our expected names
+        # Based on IBTrACS documentation, typical column names are:
+        column_mapping = {}
+
+        # Look for common variations of column names
+        for col in df.columns:
+            col_upper = col.upper()
+            if 'SID' in col_upper or col_upper == 'STORM_ID':
+                column_mapping[col] = 'SID'
+            elif 'SEASON' in col_upper and col_upper != 'SUB_SEASON':
+                column_mapping[col] = 'SEASON'
+            elif 'NAME' in col_upper and 'FILE' not in col_upper:
+                column_mapping[col] = 'NAME'
+            elif 'ISO_TIME' in col_upper or col_upper == 'TIME':
+                column_mapping[col] = 'ISO_TIME'
+            elif col_upper == 'LAT' or 'LATITUDE' in col_upper:
+                column_mapping[col] = 'LAT'
+            elif col_upper == 'LON' or 'LONGITUDE' in col_upper:
+                column_mapping[col] = 'LON'
+            elif 'USA_WIND' in col_upper or col_upper == 'WIND':
+                column_mapping[col] = 'USA_WIND'
+            elif 'USA_PRES' in col_upper or col_upper == 'PRESSURE':
+                column_mapping[col] = 'USA_PRES'
+            elif 'BASIN' in col_upper and 'SUB' not in col_upper:
+                column_mapping[col] = 'BASIN'
 
+        # Rename columns
+        df = df.rename(columns=column_mapping)
+        logging.info(f"Mapped columns: {list(df.columns)}")
 
+        # If we still don't have essential columns, try creating them
+        if 'SID' not in df.columns:
+            # Try to create SID from other columns
+            possible_sid_cols = [col for col in df.columns if 'id' in col.lower() or 'sid' in col.lower()]
+            if possible_sid_cols:
+                df['SID'] = df[possible_sid_cols[0]]
+                logging.info(f"Created SID from {possible_sid_cols[0]}")
 
+        if 'ISO_TIME' not in df.columns:
+            # Look for time-related columns
+            time_cols = [col for col in df.columns if 'time' in col.lower() or 'date' in col.lower()]
+            if time_cols:
+                df['ISO_TIME'] = df[time_cols[0]]
+                logging.info(f"Created ISO_TIME from {time_cols[0]}")
+
+        # Ensure we have minimum required columns
+        required_cols = ['LAT', 'LON']
+        available_required = [col for col in required_cols if col in df.columns]
+
+        if len(available_required) < 2:
+            logging.error(f"Missing critical columns. Available: {list(df.columns)}")
+            return None
 
         # Clean and standardize the data
+        if 'ISO_TIME' in df.columns:
+            df['ISO_TIME'] = pd.to_datetime(df['ISO_TIME'], errors='coerce')
 
         # Clean numeric columns
-        numeric_columns = ['LAT', 'LON', '
+        numeric_columns = ['LAT', 'LON', 'USA_WIND', 'USA_PRES']
         for col in numeric_columns:
             if col in df.columns:
                 df[col] = pd.to_numeric(df[col], errors='coerce')
 
         # Filter out invalid/missing critical data
+        valid_rows = df['LAT'].notna() & df['LON'].notna()
+        df = df[valid_rows]
 
         # Ensure LAT/LON are in reasonable ranges
         df = df[(df['LAT'] >= -90) & (df['LAT'] <= 90)]
         df = df[(df['LON'] >= -180) & (df['LON'] <= 180)]
 
+        # Add basin info if missing
+        if 'BASIN' not in df.columns:
+            df['BASIN'] = basin
+
+        # Add default columns if missing
+        if 'NAME' not in df.columns:
+            df['NAME'] = 'UNNAMED'
+
+        if 'SEASON' not in df.columns and 'ISO_TIME' in df.columns:
+            df['SEASON'] = df['ISO_TIME'].dt.year
+
         logging.info(f"Successfully loaded {len(df)} records from {basin} basin")
         return df
 
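The loader above follows an examine-then-read pattern: peek at the first raw lines, skip the metadata rows, then coerce the critical columns. Below is a minimal standalone sketch of that flow; the CSV path is a hypothetical local copy of a basin file, and the 3-row skip mirrors the diff's assumption about the v04r01 layout rather than a verified constant.

```python
import logging
import pandas as pd

logging.basicConfig(level=logging.INFO)

csv_path = "ibtracs.WP.list.csv"  # hypothetical local copy of a basin file

# Peek at the raw header rows, as examine_ibtracs_structure() does above
with open(csv_path, "r") as f:
    for i, line in enumerate(f):
        if i >= 5:
            break
        logging.info("Line %d: %s", i, line.strip())

# Read the data portion, skipping the metadata rows the peek revealed
# (the diff assumes 3 for v04r01), then coerce the critical columns
df = pd.read_csv(csv_path, low_memory=False, skiprows=3)
for col in ["LAT", "LON", "USA_WIND", "USA_PRES"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")
if {"LAT", "LON"}.issubset(df.columns):
    df = df.dropna(subset=["LAT", "LON"])
logging.info("Kept %d rows; first columns: %s", len(df), list(df.columns)[:8])
```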
@@ -439,10 +514,10 @@ def load_data_fixed(oni_path, typhoon_path):
     try:
         typhoon_data = pd.read_csv(typhoon_path, low_memory=False)
         # Ensure basic columns exist and are valid
-        required_cols = ['
+        required_cols = ['LAT', 'LON']
         if all(col in typhoon_data.columns for col in required_cols):
+            if 'ISO_TIME' in typhoon_data.columns:
+                typhoon_data['ISO_TIME'] = pd.to_datetime(typhoon_data['ISO_TIME'], errors='coerce')
             logging.info(f"Loaded processed typhoon data with {len(typhoon_data)} records")
         else:
             logging.warning("Processed typhoon data missing required columns, will reload from IBTrACS")
@@ -469,10 +544,14 @@ def load_data_fixed(oni_path, typhoon_path):
         # Ensure SID has proper format
         if 'SID' not in typhoon_data.columns and 'BASIN' in typhoon_data.columns:
             # Create SID from basin and other identifiers if missing
-            if '
+            if 'SEASON' in typhoon_data.columns:
                 typhoon_data['SID'] = (typhoon_data['BASIN'].astype(str) +
-                                       typhoon_data
+                                       typhoon_data.index.astype(str).str.zfill(2) +
                                        typhoon_data['SEASON'].astype(str))
+            else:
+                typhoon_data['SID'] = (typhoon_data['BASIN'].astype(str) +
+                                       typhoon_data.index.astype(str).str.zfill(2) +
+                                       '2000')
 
         # Save the processed data for future use
         safe_file_write(typhoon_path, typhoon_data, get_fallback_data_dir())
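When `SID` has to be synthesized, the branch above concatenates the basin code, a zero-padded row index, and the season (or a literal '2000' when no season column exists). A tiny illustration with made-up rows:

```python
import pandas as pd

# Made-up records missing SID, mirroring the fallback above
t = pd.DataFrame({"BASIN": ["WP", "WP", "EP"], "SEASON": [2019, 2019, 2020]})
t["SID"] = (t["BASIN"].astype(str)
            + t.index.astype(str).str.zfill(2)
            + t["SEASON"].astype(str))
print(t["SID"].tolist())  # ['WP002019', 'WP012019', 'EP022020']
```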
@@ -502,23 +581,25 @@ def load_data_fixed(oni_path, typhoon_path):
                 logging.warning(f"Added missing column {col} with default value")
 
         # Ensure data types
+        if 'ISO_TIME' in typhoon_data.columns:
+            typhoon_data['ISO_TIME'] = pd.to_datetime(typhoon_data['ISO_TIME'], errors='coerce')
         typhoon_data['LAT'] = pd.to_numeric(typhoon_data['LAT'], errors='coerce')
         typhoon_data['LON'] = pd.to_numeric(typhoon_data['LON'], errors='coerce')
         typhoon_data['USA_WIND'] = pd.to_numeric(typhoon_data['USA_WIND'], errors='coerce')
         typhoon_data['USA_PRES'] = pd.to_numeric(typhoon_data['USA_PRES'], errors='coerce')
 
-        # Remove rows with invalid
-        typhoon_data = typhoon_data.dropna(subset=['
+        # Remove rows with invalid coordinates
+        typhoon_data = typhoon_data.dropna(subset=['LAT', 'LON'])
 
         logging.info(f"Final typhoon data: {len(typhoon_data)} records after validation")
 
         return oni_data, typhoon_data
 
 def create_fallback_typhoon_data():
-    """Create minimal fallback typhoon data"""
+    """Create minimal fallback typhoon data - FIXED VERSION"""
+    # Use proper pandas date_range instead of numpy
     dates = pd.date_range(start='2000-01-01', end='2023-12-31', freq='D')
-    storm_dates = np.random.choice(dates, size=100, replace=False)
+    storm_dates = dates[np.random.choice(len(dates), size=100, replace=False)]
 
     data = []
     for i, date in enumerate(storm_dates):
@@ -538,7 +619,7 @@ def create_fallback_typhoon_data():
 
             data.append({
                 'SID': sid,
-                'ISO_TIME': date +
+                'ISO_TIME': date + pd.Timedelta(hours=j*6),  # Use pd.Timedelta instead
                 'NAME': f'FALLBACK_{i+1}',
                 'SEASON': date.year,
                 'LAT': lat,
@@ -548,7 +629,9 @@ def create_fallback_typhoon_data():
                 'BASIN': 'WP'
             })
 
+    df = pd.DataFrame(data)
+    logging.info(f"Created fallback typhoon data with {len(df)} records")
+    return df
 
 def process_oni_data(oni_data):
     """Process ONI data into long format"""
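The fallback-data fix has two parts: sample start dates by position so they stay pandas Timestamps, and build the 6-hourly fixes with `pd.Timedelta`. A small sketch of both (the seed and sizes here are arbitrary):

```python
import numpy as np
import pandas as pd

np.random.seed(0)
dates = pd.date_range(start="2000-01-01", end="2023-12-31", freq="D")

# Choosing positions (not the dates themselves) keeps Timestamp objects
storm_starts = dates[np.random.choice(len(dates), size=3, replace=False)]

# Six-hourly track times for one synthetic storm
track_times = [storm_starts[0] + pd.Timedelta(hours=j * 6) for j in range(5)]
print(track_times[0], "->", track_times[-1])
```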
@@ -562,7 +645,8 @@ def process_oni_data(oni_data):
 
 def process_typhoon_data(typhoon_data):
     """Process typhoon data"""
+    if 'ISO_TIME' in typhoon_data.columns:
+        typhoon_data['ISO_TIME'] = pd.to_datetime(typhoon_data['ISO_TIME'], errors='coerce')
     typhoon_data['USA_WIND'] = pd.to_numeric(typhoon_data['USA_WIND'], errors='coerce')
     typhoon_data['USA_PRES'] = pd.to_numeric(typhoon_data['USA_PRES'], errors='coerce')
     typhoon_data['LON'] = pd.to_numeric(typhoon_data['LON'], errors='coerce')
@@ -574,8 +658,14 @@ def process_typhoon_data(typhoon_data):
         'LAT':'first','LON':'first'
     }).reset_index()
 
+    if 'ISO_TIME' in typhoon_max.columns:
+        typhoon_max['Month'] = typhoon_max['ISO_TIME'].dt.strftime('%m')
+        typhoon_max['Year'] = typhoon_max['ISO_TIME'].dt.year
+    else:
+        # Fallback if no ISO_TIME
+        typhoon_max['Month'] = '01'
+        typhoon_max['Year'] = typhoon_max['SEASON']
+
     typhoon_max['Category'] = typhoon_max['USA_WIND'].apply(categorize_typhoon)
     return typhoon_max
 
@@ -585,6 +675,8 @@ def merge_data(oni_long, typhoon_max):
 
 def categorize_typhoon(wind_speed):
     """Categorize typhoon based on wind speed"""
+    if pd.isna(wind_speed):
+        return 'Tropical Depression'
     if wind_speed >= 137:
         return 'C5 Super Typhoon'
     elif wind_speed >= 113:
@@ -604,6 +696,8 @@ def classify_enso_phases(oni_value):
     """Classify ENSO phases based on ONI value"""
     if isinstance(oni_value, pd.Series):
         oni_value = oni_value.iloc[0]
+    if pd.isna(oni_value):
+        return 'Neutral'
     if oni_value >= 0.5:
         return 'El Nino'
     elif oni_value <= -0.5:
@@ -778,6 +872,8 @@ def get_full_tracks(start_year, start_month, end_year, end_month, enso_phase, ty
     fig = go.Figure()
     for sid in unique_storms:
         storm_data = typhoon_data[typhoon_data['SID']==sid]
+        if storm_data.empty:
+            continue
         name = storm_data['NAME'].iloc[0] if pd.notnull(storm_data['NAME'].iloc[0]) else "Unnamed"
         basin = storm_data['SID'].iloc[0][:2]
         storm_oni = filtered_data[filtered_data['SID']==sid]['ONI'].iloc[0]
@@ -842,6 +938,9 @@ def get_longitude_analysis(start_year, start_month, end_year, end_month, enso_ph
 
 def categorize_typhoon_by_standard(wind_speed, standard='atlantic'):
     """Categorize typhoon by standard"""
+    if pd.isna(wind_speed):
+        return 'Tropical Depression', '#808080'
+
     if standard=='taiwan':
         wind_speed_ms = wind_speed * 0.514444
         if wind_speed_ms >= 51.0:
@@ -875,6 +974,10 @@ def update_route_clusters(start_year, start_month, end_year, end_month, enso_val
     try:
         # Merge raw typhoon data with ONI
         raw_data = typhoon_data.copy()
+        if 'ISO_TIME' not in raw_data.columns:
+            logging.error("ISO_TIME column not found in typhoon data")
+            return go.Figure(), go.Figure(), make_subplots(rows=2, cols=1), "Error: ISO_TIME column missing"
+
         raw_data['Year'] = raw_data['ISO_TIME'].dt.year
         raw_data['Month'] = raw_data['ISO_TIME'].dt.strftime('%m')
         merged_raw = pd.merge(raw_data, process_oni_data(oni_data), on=['Year','Month'], how='left')
@@ -918,7 +1021,7 @@ def update_route_clusters(start_year, start_month, end_year, end_month, enso_val
             return go.Figure(), go.Figure(), make_subplots(rows=2, cols=1), "No valid storms for clustering."
 
         # Interpolate each storm's route to a common length
-        max_length = max(len(item[1]) for item in all_storms_data)
+        max_length = min(50, max(len(item[1]) for item in all_storms_data))  # Cap at 50 points
         route_vectors = []
         wind_curves = []
         pres_curves = []
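Each storm track is then interpolated onto that common number of points (capped at 50 above) so every storm becomes one fixed-length row of `route_vectors`. The helper below sketches one way to do that with `np.interp`; the function name and layout are illustrative, not taken from app.py.

```python
import numpy as np

def resample_track(lats, lons, n_points=50):
    """Resample a variable-length track onto n_points evenly spaced samples."""
    t_old = np.linspace(0.0, 1.0, len(lats))
    t_new = np.linspace(0.0, 1.0, n_points)
    return np.concatenate([np.interp(t_new, t_old, lats),
                           np.interp(t_new, t_old, lons)])

track_lat = np.array([10.0, 12.5, 15.0, 18.0])
track_lon = np.array([135.0, 132.0, 128.0, 125.0])
vec = resample_track(track_lat, track_lon)
print(vec.shape)  # (100,) -> one fixed-length row for route_vectors
```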
@@ -972,31 +1075,34 @@ def update_route_clusters(start_year, start_month, end_year, end_month, enso_val
         pres_curves = np.array(pres_curves)
 
         # Run TSNE on route vectors
+        if len(route_vectors) < 5:
+            return go.Figure(), go.Figure(), make_subplots(rows=2, cols=1), "Need at least 5 storms for clustering."
+
+        tsne = TSNE(n_components=2, random_state=42, verbose=1, perplexity=min(30, len(route_vectors)-1))
         tsne_results = tsne.fit_transform(route_vectors)
 
         # Dynamic DBSCAN
         selected_labels = None
         selected_eps = None
         for eps in np.linspace(1.0, 10.0, 91):
-            dbscan = DBSCAN(eps=eps, min_samples=
+            dbscan = DBSCAN(eps=eps, min_samples=max(2, len(route_vectors)//10))
             labels = dbscan.fit_predict(tsne_results)
             clusters = set(labels) - {-1}
-            if
+            if 2 <= len(clusters) <= min(10, len(route_vectors)//2):
                 selected_labels = labels
                 selected_eps = eps
                 break
 
         if selected_labels is None:
             selected_eps = 5.0
-            dbscan = DBSCAN(eps=selected_eps, min_samples=
+            dbscan = DBSCAN(eps=selected_eps, min_samples=max(2, len(route_vectors)//10))
             selected_labels = dbscan.fit_predict(tsne_results)
 
         logging.info(f"Selected DBSCAN eps: {selected_eps:.2f} yielding {len(set(selected_labels)-{-1})} clusters.")
 
         # TSNE scatter plot
         fig_tsne = go.Figure()
-        colors = px.colors.qualitative.
+        colors = px.colors.qualitative.Set3
         unique_labels = sorted(set(selected_labels) - {-1})
 
         for i, label in enumerate(unique_labels):
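The eps sweep above is easy to exercise on its own: scan eps from 1 to 10, scale `min_samples` with the sample count, and stop at the first value that yields a sensible number of clusters. A sketch on synthetic 2-D points standing in for `tsne_results` (the blob layout and seed are assumptions made for the demo):

```python
import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(42)
# Three synthetic blobs as a stand-in for the t-SNE embedding
pts = np.vstack([rng.normal(loc=c, scale=0.5, size=(20, 2))
                 for c in ((0, 0), (8, 8), (-8, 6))])

selected_labels, selected_eps = None, None
for eps in np.linspace(1.0, 10.0, 91):
    labels = DBSCAN(eps=eps, min_samples=max(2, len(pts) // 10)).fit_predict(pts)
    clusters = set(labels) - {-1}
    if 2 <= len(clusters) <= min(10, len(pts) // 2):
        selected_labels, selected_eps = labels, eps
        break

if selected_labels is None:  # same fallback idea as the diff
    selected_eps = 5.0
    selected_labels = DBSCAN(eps=selected_eps, min_samples=2).fit_predict(pts)

n_clusters = len(set(selected_labels) - {-1})
print(f"eps={selected_eps:.2f}, clusters={n_clusters}")
```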
@@ -1053,6 +1159,12 @@ def update_route_clusters(start_year, start_month, end_year, end_month, enso_val
             mean_pres_curve = np.nanmean(cluster_pres, axis=0)
             cluster_stats.append((label, mean_wind_curve, mean_pres_curve))
 
+        fig_routes.update_layout(
+            title="Cluster Mean Routes",
+            geo=dict(projection_type='natural earth', showland=True),
+            height=600
+        )
+
         # Create cluster stats plot
         x_axis = np.linspace(0, 1, max_length)
         fig_stats = make_subplots(rows=2, cols=1, shared_xaxes=True,
@@ -1064,7 +1176,8 @@ def update_route_clusters(start_year, start_month, end_year, end_month, enso_val
                 y=wind_curve,
                 mode='lines',
                 line=dict(width=2, color=colors[i % len(colors)]),
-                name=f"Cluster {label} Mean Wind"
+                name=f"Cluster {label} Mean Wind",
+                showlegend=True
             ), row=1, col=1)
 
             fig_stats.add_trace(go.Scatter(
@@ -1072,7 +1185,8 @@ def update_route_clusters(start_year, start_month, end_year, end_month, enso_val
                 y=pres_curve,
                 mode='lines',
                 line=dict(width=2, color=colors[i % len(colors)]),
-                name=f"Cluster {label} Mean MSLP"
+                name=f"Cluster {label} Mean MSLP",
+                showlegend=False
             ), row=2, col=1)
 
         fig_stats.update_layout(
@@ -1081,10 +1195,10 @@ def update_route_clusters(start_year, start_month, end_year, end_month, enso_val
             yaxis_title="Mean Wind Speed (knots)",
             xaxis2_title="Normalized Route Index",
             yaxis2_title="Mean MSLP (hPa)",
+            height=600
         )
 
-        info = f"TSNE clustering complete. Selected eps: {selected_eps:.2f}. Clusters: {len(unique_labels)}."
+        info = f"TSNE clustering complete. Selected eps: {selected_eps:.2f}. Clusters: {len(unique_labels)}. Total storms: {len(route_vectors)}."
         return fig_tsne, fig_routes, fig_stats, info
 
     except Exception as e:
@@ -1112,9 +1226,9 @@ def generate_track_video_from_csv(year, storm_id, standard):
     else:
         winds = np.full(len(lats), np.nan)
 
-    storm_name = storm_df['NAME'].iloc[0]
+    storm_name = storm_df['NAME'].iloc[0] if pd.notnull(storm_df['NAME'].iloc[0]) else "Unnamed"
     basin = storm_df['SID'].iloc[0][:2]
-    season = storm_df['SEASON'].iloc[0]
+    season = storm_df['SEASON'].iloc[0] if 'SEASON' in storm_df.columns else year
 
     min_lat, max_lat = np.min(lats), np.max(lats)
     min_lon, max_lon = np.min(lons), np.max(lons)
@@ -1157,7 +1271,7 @@ def generate_track_video_from_csv(year, storm_id, standard):
     def update(frame):
         line.set_data(lons[:frame+1], lats[:frame+1])
         point.set_data([lons[frame]], [lats[frame]])
-        wind_speed = winds[frame] if frame < len(winds) else
+        wind_speed = winds[frame] if frame < len(winds) and not pd.isna(winds[frame]) else 0
         category, color = categorize_typhoon_by_standard(wind_speed, standard)
         point.set_color(color)
         dt_str = pd.to_datetime(times[frame]).strftime('%Y-%m-%d %H:%M')
@@ -1201,7 +1315,13 @@ def update_typhoon_options_fixed(year, basin):
         return gr.update(choices=[], value=None)
 
     # Filter by year
+    if 'ISO_TIME' in typhoon_data.columns:
+        year_data = typhoon_data[typhoon_data['ISO_TIME'].dt.year == int(year)].copy()
+    elif 'SEASON' in typhoon_data.columns:
+        year_data = typhoon_data[typhoon_data['SEASON'] == int(year)].copy()
+    else:
+        # Fallback: use all data
+        year_data = typhoon_data.copy()
 
     if basin != "All Basins":
         # Extract basin code
|
1343 |
for _, storm in storms.iterrows():
|
1344 |
name = storm.get('NAME', 'UNNAMED')
|
1345 |
+
if pd.isna(name) or name == '' or name == 'UNNAMED':
|
1346 |
name = 'UNNAMED'
|
1347 |
sid = storm['SID']
|
1348 |
options.append(f"{name} ({sid})")
|