ksaramout committed
Commit 14a5579 · 1 Parent(s): a234a1a

Upload app.py

Files changed (1)
  1. app.py +167 -121
app.py CHANGED
@@ -6,6 +6,23 @@ import time
 import pandas as pd
 import labelbox
 
+@st.cache_data(show_spinner=True)
+def fetch_databases(cluster_id, formatted_title, databricks_api_key):
+    query = "SHOW DATABASES;"
+    return execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)
+
+# Cached function to fetch tables
+@st.cache_data(show_spinner=True)
+def fetch_tables(selected_database, cluster_id, formatted_title, databricks_api_key):
+    query = f"SHOW TABLES IN {selected_database};"
+    return execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)
+
+# Cached function to fetch columns
+@st.cache_data(show_spinner=True)
+def fetch_columns(selected_database, selected_table, cluster_id, formatted_title, databricks_api_key):
+    query = f"SHOW COLUMNS IN {selected_database}.{selected_table};"
+    return execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)
+
 def validate_dataset_name(name):
     """Validate the dataset name."""
     # Check length
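These helpers memoize on their argument values via `st.cache_data`, so Streamlit reruns with the same cluster and database skip the round trip to Databricks. A minimal sketch of that behaviour; the `fetch_tables_demo` name and its return value are made up for illustration:

```python
import streamlit as st

@st.cache_data(show_spinner=True)
def fetch_tables_demo(database: str) -> list[str]:
    # Body runs only on a cache miss for this exact argument value.
    print(f"querying Databricks for {database}...")
    return [f"{database}.table_a", f"{database}.table_b"]  # stand-in for a real query

tables = fetch_tables_demo("analytics")  # executes the body
tables = fetch_tables_demo("analytics")  # same argument: served from cache
fetch_tables_demo.clear()                # drop cached entries, e.g. after a schema change
```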
@@ -193,7 +210,7 @@ if new_dataset_name or selected_dataset_name:
     formatted_title = re.sub(r'/$', '', formatted_title)  # Remove trailing slash if present
 
     if formatted_title:
-        st.subheader("Select and existing cluster or make a new one", divider='grey', help="Jobs in preview mode will use all purpose compute clusters to help you itersate faster. Jobs in production mode will use job clusters to reduce DBUs consumed.")
+        st.subheader("Select an existing cluster", divider='grey', help="Jobs will use job clusters to reduce DBUs consumed.")
         DOMAIN = f"https://{formatted_title}"
         TOKEN = f"Bearer {databricks_api_key}"
@@ -217,23 +234,17 @@ if new_dataset_name or selected_dataset_name:
             }
 
             # Display dropdown with cluster names
-            make_cluster = st.toggle('Make me a new cluster', value=False)
-            if make_cluster:
-                #make a cluster
-                st.write("Making a new cluster")
-            else:
-                if cluster_dict:
-                    selected_cluster_name = st.selectbox(
-                        'Select a cluster to run on',
-                        list(cluster_dict.keys()),
-                        key='unique_key_for_cluster_selectbox',
-                        index=None,
-                        placeholder="Select a cluster..",
-                    )
-                    if selected_cluster_name:
-                        cluster_id = cluster_dict[selected_cluster_name]
-                else:
-                    st.write("No UI or API-based compute clusters found.")
+
+            if cluster_dict:
+                selected_cluster_name = st.selectbox(
+                    'Select a cluster to run on',
+                    list(cluster_dict.keys()),
+                    key='unique_key_for_cluster_selectbox',
+                    index=None,
+                    placeholder="Select a cluster..",
+                )
+                if selected_cluster_name:
+                    cluster_id = cluster_dict[selected_cluster_name]
 
         except requests.RequestException as e:
             st.write(f"Error communicating with Databricks API: {str(e)}")
@@ -298,7 +309,7 @@ if new_dataset_name or selected_dataset_name:
     st.subheader("Run Frequency", divider='grey')
 
     # Dropdown to select frequency
-    freq_options = ["1 minute", "1 hour", "1 day", "1 week", "1 month"]
+    freq_options = ["1 day", "1 week", "1 month"]
     selected_freq = st.selectbox("Select frequency:", freq_options, placeholder="Select frequency..")
 
     day_of_week = None
@@ -306,12 +317,7 @@ if new_dataset_name or selected_dataset_name:
 
     # If the frequency is hourly, daily, weekly, or monthly, ask for a specific time
     if selected_freq != "1 minute":
-        col1, col2 = st.columns(2)
-        with col1:
-            hour = st.selectbox("Hour:", list(range(0, 24)))
-        with col2:
-            minute = st.selectbox("Minute:", list(range(0, 60)))
-
+
         if selected_freq == "1 week":
             days_options = ["MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN"]
             day_of_week = st.selectbox("Select day of the week:", days_options)
@@ -319,12 +325,37 @@ if new_dataset_name or selected_dataset_name:
         elif selected_freq == "1 month":
             day_of_month = st.selectbox("Select day of the month:", list(range(1, 32)))
 
+        col1, col2 = st.columns(2)
+        with col1:
+            hour = st.selectbox("Hour:", list(range(0, 24)))
+        with col2:
+            minute = st.selectbox("Minute:", list(range(0, 60)))
+
     else:
         hour, minute = 0, 0
 
     # Generate the cron expression
     frequency = generate_cron_expression(selected_freq, hour, minute, day_of_week, day_of_month)
 
+    # Assumed DBU consumption rate for a 32GB, 4-core node per hour
+    X = 1  # Replace this with the actual rate from Databricks' pricing or documentation
+
+    # Calculate DBU consumption for a single run
+    min_dbu_single_run = (X/6) * (1 + 10)  # Assuming maximum scaling to 10 workers
+    max_dbu_single_run = (2*X/3) * (1 + 10)
+
+    # Calculate runs per month from the selected frequency
+    if selected_freq == "1 day":
+        runs_per_month = 30
+    elif selected_freq == "1 week":
+        runs_per_month = 4
+    else:  # "1 month"
+        runs_per_month = 1
+
+    # Calculate estimated DBU consumption per month
+    min_dbu_monthly = runs_per_month * min_dbu_single_run
+    max_dbu_monthly = runs_per_month * max_dbu_single_run
+
     def generate_human_readable_message(freq, hour=0, minute=0, day_of_week=None, day_of_month=None):
         """
         Generate a human-readable message for the scheduling.
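`generate_cron_expression` is called here but defined outside this diff. A minimal sketch of what such a helper could look like, assuming Databricks-style Quartz cron strings (seconds, minutes, hours, day-of-month, month, day-of-week); this is an illustration, not the app's actual implementation:

```python
def generate_cron_expression(freq, hour=0, minute=0, day_of_week=None, day_of_month=None):
    # Quartz cron fields: sec min hour day-of-month month day-of-week
    if freq == "1 day":
        return f"0 {minute} {hour} * * ?"
    if freq == "1 week":
        return f"0 {minute} {hour} ? * {day_of_week}"
    if freq == "1 month":
        return f"0 {minute} {hour} {day_of_month} * ?"
    return None

# Example: every Monday at 09:30 -> "0 30 9 ? * MON"
```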
@@ -349,37 +392,32 @@ if new_dataset_name or selected_dataset_name:
     # Generate the human-readable message
     readable_msg = generate_human_readable_message(selected_freq, hour, minute, day_of_week, day_of_month)
 
+    # Main code block
     if frequency:
         st.success(readable_msg, icon="📅")
+        # Display the estimated DBU consumption to the user
+        st.warning(f"Estimated DBU Consumption:\n- For a single run: {min_dbu_single_run:.2f} to {max_dbu_single_run:.2f} DBUs\n- Monthly (based on {runs_per_month} runs): {min_dbu_monthly:.2f} to {max_dbu_monthly:.2f} DBUs")
 
+        # Disclaimer
+        st.info("Disclaimer: This is only an estimation. Always monitor the job in Databricks to assess actual DBU consumption.")
+
     st.subheader("Select a table", divider="grey")
 
-    with st.spinner('Querying Databricks...'):
-        query = "SHOW DATABASES;"
-        result_data = execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)
-
-    # Extract the databaseName values from the DataFrame
-    database_names = result_data['databaseName'].tolist()
-
-    # Create a dropdown with the database names
+    # Fetching databases
+    result_data = fetch_databases(cluster_id, formatted_title, databricks_api_key)
+    database_names = result_data['databaseName'].tolist()
     selected_database = st.selectbox("Select a Database:", database_names, index=None, placeholder="Select a database..")
 
     if selected_database:
-        with st.spinner('Querying Databricks...'):
-            query = f"SHOW TABLES IN {selected_database};"
-            result_data = execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)
-
-        # Extract the tableName values from the DataFrame
-        table_names = result_data['tableName'].tolist()
-
-        # Create a dropdown with the database names
+        # Fetching tables
+        result_data = fetch_tables(selected_database, cluster_id, formatted_title, databricks_api_key)
+        table_names = result_data['tableName'].tolist()
         selected_table = st.selectbox("Select a Table:", table_names, index=None, placeholder="Select a table..")
 
        if selected_table:
-            with st.spinner('Querying Databricks...'):
-                query = f"SHOW COLUMNS IN {selected_database}.{selected_table};"
-                result_data = execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)
-                column_names = result_data['col_name'].tolist()
+            # Fetching columns
+            result_data = fetch_columns(selected_database, selected_table, cluster_id, formatted_title, databricks_api_key)
+            column_names = result_data['col_name'].tolist()
 
             st.subheader("Map table schema to Labelbox schema", divider="grey")
             # Your existing code to handle schema mapping...
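Plugging the placeholder rate X = 1 into the formulas above gives a quick sanity check of the numbers the `st.warning` banner will show:

```python
X = 1
min_single = (X/6) * (1 + 10)    # ~1.83 DBUs per run
max_single = (2*X/3) * (1 + 10)  # ~7.33 DBUs per run
print(f"{min_single:.2f} to {max_single:.2f} DBUs per run")          # 1.83 to 7.33
print(f"{30*min_single:.0f} to {30*max_single:.0f} DBUs per month")  # 55 to 220 on a daily schedule
```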
@@ -388,11 +426,8 @@ if new_dataset_name or selected_dataset_name:
             with st.spinner('Fetching first 5 rows of the selected table...'):
                 query = f"SELECT * FROM {selected_database}.{selected_table} LIMIT 5;"
                 table_sample_data = execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key)
+                st.write(table_sample_data)
 
-            # Display the sample data in the Streamlit UI
-            st.write(table_sample_data)
-
-
             # Define two columns for side-by-side selectboxes
             col1, col2 = st.columns(2)
 
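`execute_databricks_query` itself is not defined in this diff. One plausible implementation, sketched here under the assumption that the app drives the Databricks Command Execution API 1.2 against the selected cluster; the real helper may work differently:

```python
import time
import requests
import pandas as pd

def execute_databricks_query(query, cluster_id, formatted_title, databricks_api_key):
    base = f"https://{formatted_title}/api/1.2"
    headers = {"Authorization": f"Bearer {databricks_api_key}"}

    # 1. Create a SQL execution context on the cluster
    ctx = requests.post(f"{base}/contexts/create", headers=headers,
                        json={"clusterId": cluster_id, "language": "sql"}).json()

    # 2. Submit the SQL command
    cmd = requests.post(f"{base}/commands/execute", headers=headers,
                        json={"clusterId": cluster_id, "contextId": ctx["id"],
                              "language": "sql", "command": query}).json()

    # 3. Poll until the command reaches a terminal state
    while True:
        status = requests.get(f"{base}/commands/status", headers=headers,
                              params={"clusterId": cluster_id, "contextId": ctx["id"],
                                      "commandId": cmd["id"]}).json()
        if status["status"] in ("Finished", "Error", "Cancelled"):
            break
        time.sleep(2)

    # 4. Convert the tabular result into a DataFrame
    results = status["results"]
    columns = [c["name"] for c in results["schema"]]
    return pd.DataFrame(results["data"], columns=columns)
```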
@@ -423,75 +458,86 @@ if new_dataset_name or selected_dataset_name:
             sample_row_data_value = result_sample[selected_row_data].iloc[0]
 
             # Validate the extracted value
-            if is_valid_url_or_uri(sample_row_data_value):
-                st.success(f"Sample URI/URL from selected row data column: {sample_row_data_value}", icon="✅")
-                dataset_id = create_new_dataset_labelbox(new_dataset_name) if create_new_dataset else dataset_id
-                # Mode
-                mode = "preview" if is_preview else "production"
-
-                # Databricks instance and API key
-                databricks_instance = formatted_title
-                databricks_api_key = databricks_api_key
-
-                # Dataset ID and New Dataset
-                new_dataset = 1 if create_new_dataset else 0
-                dataset_id = dataset_id
-
-                # Table Path
-                table_path = f"{selected_database}.{selected_table}"
-                # Frequency
-                frequency = frequency
-
-                # Cluster ID and New Cluster
-                new_cluster = 1 if make_cluster else 0
-                cluster_id = cluster_id if not make_cluster else ""
-
-                # Schema Map
-                row_data_input = selected_row_data
-                global_key_input = selected_global_key
-                schema_map_dict = {'row_data': row_data_input}
-                if global_key_input:
-                    schema_map_dict['global_key'] = global_key_input
-
-                # Convert the dict to a stringified JSON
-                schema_map_str = json.dumps(schema_map_dict)
-
-
-                data = {
-                    "mode": mode,
-                    "databricks_instance": databricks_instance,
-                    "databricks_api_key": databricks_api_key,
-                    "new_dataset": new_dataset,
-                    "dataset_id": dataset_id,
-                    "table_path": table_path,
-                    "labelbox_api_key": labelbox_api_key,
-                    "frequency": frequency,
-                    "new_cluster": new_cluster,
-                    "cluster_id": cluster_id,
-                    "schema_map": schema_map_str
-                }
-
-
-                if st.button("Deploy Pipeline!", type="primary"):
-                    # Ensure all fields are filled out
-                    required_fields = [
-                        mode, databricks_instance, databricks_api_key, new_dataset, dataset_id,
-                        table_path, labelbox_api_key, frequency, new_cluster, cluster_id, schema_map_str
-                    ]
-
-
-                    # Sending a POST request to the Flask app endpoint
-                    with st.spinner("Deploying pipeline..."):
-                        response = requests.post("https://us-central1-dbt-prod.cloudfunctions.net/deploy-databricks-pipeline", json=data)
-
-                    # Check if request was successful
-                    if response.status_code == 200:
-                        # Display the response using Streamlit
-                        st.balloons()
-                        st.success("Pipeline deployed successfully!", icon="🚀")
-                        st.json(response.json())
-                    else:
-                        st.error(f"Failed to deploy pipeline. Response: {response.text}", icon="🚫")
-
-            else:
-                st.error(f"row_data '{sample_row_data_value}' is not a valid URI or URL. Please select a different column.", icon="🚫")
+            dataset_id = create_new_dataset_labelbox(new_dataset_name) if create_new_dataset else dataset_id
+            # Mode
+            mode = "preview" if is_preview else "production"
+
+            # Databricks instance and API key
+            databricks_instance = formatted_title
+            databricks_api_key = databricks_api_key
+
+            # Dataset ID and New Dataset
+            new_dataset = 1 if create_new_dataset else 0
+            dataset_id = dataset_id
+
+            # Table Path
+            table_path = f"{selected_database}.{selected_table}"
+            # Frequency
+            frequency = frequency
+
+            # Schema Map
+            row_data_input = selected_row_data
+            global_key_input = selected_global_key
+            schema_map_dict = {'row_data': row_data_input}
+            if global_key_input:
+                schema_map_dict['global_key'] = global_key_input
+
+            # Convert the dict to a stringified JSON
+            schema_map_str = json.dumps(schema_map_dict)
+
+
+            data = {
+                "mode": mode,
+                "databricks_instance": databricks_instance,
+                "databricks_api_key": databricks_api_key,
+                "new_dataset": new_dataset,
+                "dataset_id": dataset_id,
+                "table_path": table_path,
+                "labelbox_api_key": labelbox_api_key,
+                "frequency": frequency,
+                "new_cluster": 0,
+                "cluster_id": cluster_id,
+                "schema_map": schema_map_str
+            }
+
+
+            if st.button("Deploy Pipeline!", type="primary"):
+                # Ensure all fields are filled out
+                required_fields = [
+                    mode, databricks_instance, databricks_api_key, new_dataset, dataset_id,
+                    table_path, labelbox_api_key, frequency, cluster_id, schema_map_str
+                ]
+
+
+                # Sending a POST request to the Flask app endpoint
+                with st.spinner("Deploying pipeline..."):
+                    response = requests.post("https://us-central1-dbt-prod.cloudfunctions.net/deploy-databricks-pipeline", json=data)
+
+                # Check if request was successful
+                if response.status_code == 200:
+                    # Display the response using Streamlit
+                    st.balloons()
+                    response = response.json()
+                    # Extract the job_id
+                    job_id = response['message'].split('job_id":')[1].split('}')[0]
+                    from urllib.parse import urlparse, parse_qs
+
+                    # Parse the Databricks instance URL to extract the organization ID
+                    parsed_url = urlparse(formatted_title)
+                    query_params = parse_qs(parsed_url.query)
+                    organization_id = query_params.get("o", [""])[0]
+
+                    # Generate the Databricks Job URL
+                    job_url = f"https://{formatted_title}/?o={organization_id}#job/{job_id}"
+                    st.success(f"Pipeline deployed successfully! [{job_url}]({job_url}) 🚀")
+                else:
+                    st.error(f"Failed to deploy pipeline. Response: {response.text}", icon="🚫")
+
+st.markdown("""
+<style>
+/* Add a large bottom padding to the main content */
+.main .block-container {
+    padding-bottom: 1000px;  /* Adjust this value as needed */
+}
+</style>
+""", unsafe_allow_html=True)