Spaces:

Labelbox
/

databricks_upload

Sleeping

File size: 10,809 Bytes

70a2533

from flask import Flask, request, jsonify
import threading
import requests
import json

# Flask application object; the route defined further down is registered on it
# and run() serves it.
app = Flask(__name__)

def _build_schedule_block(frequency):
    """Return the job-level scheduling fragment for ``frequency``.

    ``"continuous"`` yields a ``continuous`` block; any other value is passed
    through unchanged as the ``quartz_cron_expression`` of a ``schedule``
    block with a UTC timezone. Both variants are created unpaused.
    """
    if frequency == "continuous":
        return {"continuous": {"pause_status": "UNPAUSED"}}
    return {
        "schedule": {
            "quartz_cron_expression": frequency,
            "timezone_id": "UTC",
            "pause_status": "UNPAUSED",
        }
    }


def _create_all_purpose_cluster(databricks_instance, headers, timeout):
    """Create the "Labelbox Worker" all-purpose cluster and return its ID.

    Raises:
        RuntimeError: if the clusters/create call does not answer with HTTP 200.
    """
    url = f"https://{databricks_instance}/api/2.0/clusters/create"

    cluster_payload = {
        "autoscale": {
            "min_workers": 1,
            "max_workers": 10
        },
        "cluster_name": "Labelbox Worker",
        "spark_version": "11.3.x-scala2.12",
        "gcp_attributes": {
            "use_preemptible_executors": False,
            "availability": "PREEMPTIBLE_WITH_FALLBACK_GCP",
            "zone_id": "HA"
        },
        "node_type_id": "n2-highmem-4",
        "driver_node_type_id": "n2-highmem-4",
        "ssh_public_keys": [],
        "custom_tags": {},
        "cluster_log_conf": {
            "dbfs": {
                "destination": "dbfs:/cluster-logs"
            }
        },
        "spark_env_vars": {},
        "autotermination_minutes": 60,
        "enable_elastic_disk": False,
        "init_scripts": [],
        "enable_local_disk_encryption": False,
        "runtime_engine": "STANDARD"
    }

    response = requests.post(
        url, data=json.dumps(cluster_payload), headers=headers, timeout=timeout
    )
    if response.status_code != 200:
        # RuntimeError is still caught by any existing ``except Exception``.
        raise RuntimeError(
            f"Failed to create all-purpose cluster. Error: {response.text}"
        )
    return response.json()['cluster_id']


def _build_preview_payload(dataset_id, table_path, labelbox_api_key, frequency,
                           cluster_id, schema_map):
    """Build the jobs/create payload for the PREVIEW upload job.

    The single task runs the ``preview_upload_to_labelbox`` notebook from the
    labelspark Git repository on the existing cluster ``cluster_id``.
    """
    payload = {
        "name": "PREVIEW_upload_to_labelbox",
        "email_notifications": {"no_alert_for_skipped_runs": False},
        "webhook_notifications": {},
        "timeout_seconds": 0,
        "max_concurrent_runs": 1,
        "tasks": [
            {
                # Preview runs on an all-purpose cluster, bound per task.
                "existing_cluster_id": cluster_id,
                "task_key": "PREVIEW_upload_to_labelbox",
                "run_if": "ALL_SUCCESS",
                "notebook_task": {
                    "notebook_path": "notebooks/databricks_pipeline_creator/preview_upload_to_labelbox",
                    # NOTE(review): the Labelbox API key is stored as a plain
                    # notebook parameter in the job definition.
                    # NOTE(review): schema_map is forwarded as received;
                    # confirm the caller sends it in the form the notebook
                    # expects (presumably a string).
                    "base_parameters": {
                        "dataset_id": dataset_id,
                        "table_path": table_path,
                        "labelbox_api_key": labelbox_api_key,
                        "schema_map": schema_map
                    },
                    "source": "GIT"
                },
                "libraries": [
                    {"pypi": {"package": "labelspark"}},
                    {"pypi": {"package": "labelbox==3.49.1"}},
                    {"pypi": {"package": "numpy==1.25"}},
                    {"pypi": {"package": "opencv-python==4.8.0.74"}}
                ],
                "timeout_seconds": 0,
                "email_notifications": {},
                "notification_settings": {
                    "no_alert_for_skipped_runs": False,
                    "no_alert_for_canceled_runs": False,
                    "alert_on_last_attempt": False
                }
            }
        ],
        "git_source": {
            "git_url": "https://github.com/Labelbox/labelspark.git",
            "git_provider": "gitHub",
            "git_branch": "master"
        },
        "format": "MULTI_TASK"
    }

    # Merge the scheduling configuration into the main job payload
    payload.update(_build_schedule_block(frequency))
    return payload


def _build_production_payload(dataset_id, table_path, labelbox_api_key,
                              frequency, schema_map):
    """Build the jobs/create payload for the production upload job.

    The single task runs the ``upload_to_labelbox`` notebook from the
    labelspark Git repository on a dedicated job cluster declared in the
    same payload (``job_clusters``), not on an existing cluster.
    """
    payload = {
        "name": "upload_to_labelbox",
        "email_notifications": {"no_alert_for_skipped_runs": False},
        "webhook_notifications": {},
        "timeout_seconds": 0,
        "max_concurrent_runs": 1,
        "tasks": [
            {
                "task_key": "upload_to_labelbox",
                "run_if": "ALL_SUCCESS",
                "notebook_task": {
                    "notebook_path": "notebooks/databricks_pipeline_creator/upload_to_labelbox",
                    # NOTE(review): the Labelbox API key is stored as a plain
                    # notebook parameter in the job definition.
                    # NOTE(review): schema_map is forwarded as received;
                    # confirm the caller sends it in the form the notebook
                    # expects (presumably a string).
                    "base_parameters": {
                        "dataset_id": dataset_id,
                        "table_path": table_path,
                        "labelbox_api_key": labelbox_api_key,
                        "schema_map": schema_map
                    },
                    "source": "GIT"
                },
                "job_cluster_key": "Job_cluster",
                "libraries": [
                    {"pypi": {"package": "labelspark"}},
                    {"pypi": {"package": "labelbox==3.49.1"}},
                    {"pypi": {"package": "numpy==1.25"}},
                    {"pypi": {"package": "opencv-python==4.8.0.74"}}
                ],
                "timeout_seconds": 0,
                "email_notifications": {},
                "notification_settings": {
                    "no_alert_for_skipped_runs": False,
                    "no_alert_for_canceled_runs": False,
                    "alert_on_last_attempt": False
                }
            }
        ],
        "job_clusters": [
            {
                "job_cluster_key": "Job_cluster",
                "new_cluster": {
                    "cluster_name": "",
                    "spark_version": "13.3.x-scala2.12",
                    "gcp_attributes": {
                        "use_preemptible_executors": False,
                        "availability": "ON_DEMAND_GCP",
                        "zone_id": "HA"
                    },
                    "node_type_id": "n2-highmem-4",
                    "enable_elastic_disk": True,
                    "data_security_mode": "SINGLE_USER",
                    "runtime_engine": "STANDARD",
                    "autoscale": {
                        "min_workers": 1,
                        "max_workers": 10
                    }
                }
            }
        ],
        "git_source": {
            "git_url": "https://github.com/Labelbox/labelspark.git",
            "git_provider": "gitHub",
            "git_branch": "master"
        },
        "format": "MULTI_TASK"
    }

    # Merge the scheduling configuration into the main job payload
    payload.update(_build_schedule_block(frequency))
    return payload


def create_databricks_job(data, timeout=30):
    """Create a Labelbox upload job in a Databricks workspace.

    Args:
        data: Mapping with the request settings. Keys read:
            ``mode`` ("preview" or "production"), ``databricks_instance``
            (workspace host name), ``databricks_api_key``, ``dataset_id``,
            ``table_path``, ``labelbox_api_key``, ``frequency``
            ("continuous" or a Quartz cron expression), ``new_cluster``,
            ``cluster_id`` and ``schema_map``. Missing keys default to None.
        timeout: Seconds to wait on each Databricks HTTP call before
            ``requests`` gives up, so an unreachable workspace cannot hang
            the caller indefinitely.

    Returns:
        A human-readable status string: ``"Invalid mode: ..."``,
        ``"Job created successfully. ..."`` or ``"Failed to create job. ..."``.

    Raises:
        RuntimeError: if a new all-purpose cluster was requested and its
            creation did not return HTTP 200.
        requests.RequestException: network errors and timeouts are not
            caught here and propagate to the caller.
    """
    mode = data.get('mode')

    # Reject an unknown mode before any Databricks call is made; otherwise a
    # bad request could still provision an all-purpose cluster.
    if mode not in ("preview", "production"):
        return f"Invalid mode: {mode}"

    databricks_instance = data.get('databricks_instance')
    databricks_api_key = data.get('databricks_api_key')
    dataset_id = data.get('dataset_id')
    table_path = data.get('table_path')
    labelbox_api_key = data.get('labelbox_api_key')
    frequency = data.get('frequency')
    new_cluster = data.get('new_cluster')
    cluster_id = data.get('cluster_id')
    schema_map = data.get('schema_map')

    # Define the authentication headers
    headers = {
        "Authorization": f"Bearer {databricks_api_key}",
        "Content-Type": "application/json",
    }

    # ----- CLUSTER SELECTION -----
    # Explicit comparison kept on purpose: a truthiness test would treat
    # non-empty strings such as "false" as a request for a new cluster.
    # NOTE(review): the cluster is created in production mode as well, where
    # the job payload uses its own job cluster and never references it.
    if new_cluster == True:  # noqa: E712
        cluster_id = _create_all_purpose_cluster(
            databricks_instance, headers, timeout
        )
        print(f"Created all-purpose cluster with ID: {cluster_id}")
    else:
        print(f"Using existing cluster with ID: {cluster_id}")

    # ----- JOB DEFINITION -----
    if mode == "preview":
        payload = _build_preview_payload(
            dataset_id, table_path, labelbox_api_key, frequency,
            cluster_id, schema_map,
        )
    else:
        payload = _build_production_payload(
            dataset_id, table_path, labelbox_api_key, frequency, schema_map,
        )

    # ----- JOB CREATION -----
    url = f"https://{databricks_instance}/api/2.0/jobs/create"
    response = requests.post(
        url, data=json.dumps(payload), headers=headers, timeout=timeout
    )

    # ----- RESPONSE HANDLING -----
    if response.status_code == 200:
        return f"Job created successfully. {response.text}"
    return f"Failed to create job. Error: {response.text}"

@app.route('/create-databricks-job', methods=['POST'])
def api_create_databricks_job():
    """HTTP endpoint: create a Databricks job from a JSON request body.

    Expects a JSON object and passes it to ``create_databricks_job``. The
    resulting status string is returned as ``{"message": ...}``. A body that
    parses as JSON but is not an object (``null``, a list, a string, ...) is
    rejected with HTTP 400 instead of failing on ``data.get``.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        return jsonify({"message": "Invalid request: expected a JSON object body."}), 400
    result = create_databricks_job(data)
    return jsonify({"message": result})

def run():
    """Start the Flask application server listening on port 5000."""
    listen_port = 5000
    app.run(port=listen_port)

# Start the Flask server on its own thread as soon as this module executes,
# so the thread that loads the module is not blocked inside run().
threading.Thread(target=run).start()