from flask import Flask, request, jsonify
import threading
import requests
import json

app = Flask(__name__)
def create_databricks_job(data):
    mode = data.get('mode')
    databricks_instance = data.get('databricks_instance')
    databricks_api_key = data.get('databricks_api_key')
    new_dataset = data.get('new_dataset')
    dataset_id = data.get('dataset_id')
    table_path = data.get('table_path')
    labelbox_api_key = data.get('labelbox_api_key')
    frequency = data.get('frequency')
    new_cluster = data.get('new_cluster')
    cluster_id = data.get('cluster_id')
    schema_map = data.get('schema_map')

    # Define the authentication headers
    headers = {
        "Authorization": f"Bearer {databricks_api_key}",
        "Content-Type": "application/json",
    }
    # ----- CLUSTER CREATION LOGIC -----
    def create_all_purpose_cluster(databricks_instance):
        url = f"https://{databricks_instance}/api/2.0/clusters/create"
        cluster_payload = {
            "autoscale": {
                "min_workers": 1,
                "max_workers": 10
            },
            "cluster_name": "Labelbox Worker",
            "spark_version": "11.3.x-scala2.12",
            "gcp_attributes": {
                "use_preemptible_executors": False,
                "availability": "PREEMPTIBLE_WITH_FALLBACK_GCP",
                "zone_id": "HA"
            },
            "node_type_id": "n2-highmem-4",
            "driver_node_type_id": "n2-highmem-4",
            "ssh_public_keys": [],
            "custom_tags": {},
            "cluster_log_conf": {
                "dbfs": {
                    "destination": "dbfs:/cluster-logs"
                }
            },
            "spark_env_vars": {},
            "autotermination_minutes": 60,
            "enable_elastic_disk": False,
            "init_scripts": [],
            "enable_local_disk_encryption": False,
            "runtime_engine": "STANDARD"
        }
        response = requests.post(url, data=json.dumps(cluster_payload), headers=headers)
        if response.status_code == 200:
            return response.json()['cluster_id']
        else:
            raise Exception(f"Failed to create all-purpose cluster. Error: {response.text}")
    # ----- PREVIEW MODE LOGIC -----
    def create_preview(dataset_id, table_path, labelbox_api_key, frequency, cluster_id):
        # ----- JOB SCHEDULING LOGIC -----
        if frequency == "continuous":
            schedule_block = {
                "continuous": {
                    "pause_status": "UNPAUSED"
                }
            }
        else:
            schedule_block = {
                "schedule": {
                    "quartz_cron_expression": frequency,
                    "timezone_id": "UTC",
                    "pause_status": "UNPAUSED"
                }
            }
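        # Example (placeholder value, not from the original source): a Quartz
        # cron expression such as frequency = "0 0 12 * * ?" schedules a run
        # daily at 12:00 UTC; frequency = "continuous" keeps the job always on.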
        # ----- JOB DEFINITION -----
        # Define the parameters and structure of the job to be created in Databricks
        payload = {
            "name": "PREVIEW_upload_to_labelbox",
            "email_notifications": {"no_alert_for_skipped_runs": False},
            "webhook_notifications": {},
            "timeout_seconds": 0,
            "max_concurrent_runs": 1,
            "tasks": [
                {
                    "existing_cluster_id": cluster_id,  # preview runs on the all-purpose cluster
                    "task_key": "PREVIEW_upload_to_labelbox",
                    "run_if": "ALL_SUCCESS",
                    "notebook_task": {
                        "notebook_path": "notebooks/databricks_pipeline_creator/preview_upload_to_labelbox",
                        "base_parameters": {
                            "dataset_id": dataset_id,
                            "table_path": table_path,
                            "labelbox_api_key": labelbox_api_key,
                            "schema_map": schema_map
                        },
                        "source": "GIT"
                    },
                    "libraries": [
                        {"pypi": {"package": "labelspark"}},
                        {"pypi": {"package": "labelbox==3.49.1"}},
                        {"pypi": {"package": "numpy==1.25"}},
                        {"pypi": {"package": "opencv-python==4.8.0.74"}}
                    ],
                    "timeout_seconds": 0,
                    "email_notifications": {},
                    "notification_settings": {
                        "no_alert_for_skipped_runs": False,
                        "no_alert_for_canceled_runs": False,
                        "alert_on_last_attempt": False
                    }
                }
            ],
            "git_source": {
                "git_url": "https://github.com/Labelbox/labelspark.git",
                "git_provider": "gitHub",
                "git_branch": "master"
            },
            "format": "MULTI_TASK"
        }
        # Merge the scheduling configuration into the main job payload
        payload.update(schedule_block)
        return payload
    # ----- PRODUCTION MODE LOGIC -----
    def create_production(dataset_id, table_path, labelbox_api_key, frequency):
        # ----- JOB SCHEDULING LOGIC -----
        # If the job needs to run continuously, use the "continuous" block;
        # otherwise use the "schedule" block with the specified cron frequency
        if frequency == "continuous":
            schedule_block = {
                "continuous": {
                    "pause_status": "UNPAUSED"
                }
            }
        else:
            schedule_block = {
                "schedule": {
                    "quartz_cron_expression": frequency,
                    "timezone_id": "UTC",
                    "pause_status": "UNPAUSED"
                }
            }
        # ----- JOB DEFINITION -----
        # Define the parameters and structure of the job to be created in Databricks
        payload = {
            "name": "upload_to_labelbox",
            "email_notifications": {"no_alert_for_skipped_runs": False},
            "webhook_notifications": {},
            "timeout_seconds": 0,
            "max_concurrent_runs": 1,
            "tasks": [
                {
                    "task_key": "upload_to_labelbox",
                    "run_if": "ALL_SUCCESS",
                    "notebook_task": {
                        "notebook_path": "notebooks/databricks_pipeline_creator/upload_to_labelbox",
                        "base_parameters": {
                            "dataset_id": dataset_id,
                            "table_path": table_path,
                            "labelbox_api_key": labelbox_api_key,
                            "schema_map": schema_map
                        },
                        "source": "GIT"
                    },
                    "job_cluster_key": "Job_cluster",
                    "libraries": [
                        {"pypi": {"package": "labelspark"}},
                        {"pypi": {"package": "labelbox==3.49.1"}},
                        {"pypi": {"package": "numpy==1.25"}},
                        {"pypi": {"package": "opencv-python==4.8.0.74"}}
                    ],
                    "timeout_seconds": 0,
                    "email_notifications": {},
                    "notification_settings": {
                        "no_alert_for_skipped_runs": False,
                        "no_alert_for_canceled_runs": False,
                        "alert_on_last_attempt": False
                    }
                }
            ],
            "job_clusters": [
                {
                    "job_cluster_key": "Job_cluster",
                    "new_cluster": {
                        "cluster_name": "",
                        "spark_version": "13.3.x-scala2.12",
                        "gcp_attributes": {
                            "use_preemptible_executors": False,
                            "availability": "ON_DEMAND_GCP",
                            "zone_id": "HA"
                        },
                        "node_type_id": "n2-highmem-4",
                        "enable_elastic_disk": True,
                        "data_security_mode": "SINGLE_USER",
                        "runtime_engine": "STANDARD",
                        "autoscale": {
                            "min_workers": 1,
                            "max_workers": 10
                        }
                    }
                }
            ],
            "git_source": {
                "git_url": "https://github.com/Labelbox/labelspark.git",
                "git_provider": "gitHub",
                "git_branch": "master"
            },
            "format": "MULTI_TASK"
        }
        # Merge the scheduling configuration into the main job payload
        payload.update(schedule_block)
        return payload
    # Create a fresh all-purpose cluster if requested; otherwise reuse the given cluster_id
    if new_cluster:
        cluster_id = create_all_purpose_cluster(databricks_instance)
        print(f"Created all-purpose cluster with ID: {cluster_id}")
    else:
        print(f"Using existing cluster with ID: {cluster_id}")

    if mode == "preview":
        payload = create_preview(dataset_id, table_path, labelbox_api_key, frequency, cluster_id)
    elif mode == "production":
        payload = create_production(dataset_id, table_path, labelbox_api_key, frequency)
    else:
        return f"Invalid mode: {mode}"
    # ----- JOB CREATION -----
    # Formulate the endpoint URL for the Databricks REST API job creation
    url = f"https://{databricks_instance}/api/2.0/jobs/create"
    # Send the POST request to Databricks to create the job
    response = requests.post(url, data=json.dumps(payload), headers=headers)

    # ----- RESPONSE HANDLING -----
    if response.status_code == 200:
        return f"Job created successfully. {response.text}"
    else:
        return f"Failed to create job. Error: {response.text}"
# Route path is an assumption (the handler is unreachable without a decorator);
# adjust it to match your deployment.
@app.route("/create_job", methods=["POST"])
def api_create_databricks_job():
    data = request.get_json()
    result = create_databricks_job(data)
    return jsonify({"message": result})
def run():
    app.run(port=5000)

# Serve the Flask app on a background thread so the calling process
# (for example, a notebook cell) is not blocked.
threading.Thread(target=run).start()
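
# Minimal client sketch, assuming the "/create_job" route above and a server
# listening on localhost:5000. The endpoint path and all field values are
# placeholder assumptions, not part of the original source:
#
# import requests
#
# resp = requests.post(
#     "http://localhost:5000/create_job",
#     json={
#         "mode": "preview",
#         "databricks_instance": "dbc-1234abcd-5678.cloud.databricks.com",
#         "databricks_api_key": "dapiXXXXXXXX",
#         "dataset_id": "cl1234567890",
#         "table_path": "/delta/my_table",
#         "labelbox_api_key": "lbXXXXXXXX",
#         "frequency": "continuous",
#         "new_cluster": True,
#         "schema_map": "{\"row_data\": \"image_url\"}",
#     },
# )
# print(resp.json())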