maximevo committed on
Commit 70a2533 · 1 Parent(s): e073608

Create preview_mode_server.py

Files changed (1)
  1. preview_mode_server.py +270 -0
preview_mode_server.py ADDED
@@ -0,0 +1,270 @@
+ from flask import Flask, request, jsonify
+ import threading
+ import requests
+ import json
+
+ app = Flask(__name__)
+
+ def create_databricks_job(data):
+     mode = data.get('mode')
+     databricks_instance = data.get('databricks_instance')
+     databricks_api_key = data.get('databricks_api_key')
+     new_dataset = data.get('new_dataset')
+     dataset_id = data.get('dataset_id')
+     table_path = data.get('table_path')
+     labelbox_api_key = data.get('labelbox_api_key')
+     frequency = data.get('frequency')
+     new_cluster = data.get('new_cluster')
+     cluster_id = data.get('cluster_id')
+     schema_map = data.get('schema_map')
+
+     # Define the authentication headers
+     headers = {
+         "Authorization": f"Bearer {databricks_api_key}",
+         "Content-Type": "application/json",
+     }
+
+     # ----- CLUSTER CREATION LOGIC -----
+
+     def create_all_purpose_cluster(databricks_instance):
+         url = f"https://{databricks_instance}/api/2.0/clusters/create"
+
+         cluster_payload = {
+             "autoscale": {
+                 "min_workers": 1,
+                 "max_workers": 10
+             },
+             "cluster_name": "Labelbox Worker",
+             "spark_version": "11.3.x-scala2.12",
+             "gcp_attributes": {
+                 "use_preemptible_executors": False,
+                 "availability": "PREEMPTIBLE_WITH_FALLBACK_GCP",
+                 "zone_id": "HA"
+             },
+             "node_type_id": "n2-highmem-4",
+             "driver_node_type_id": "n2-highmem-4",
+             "ssh_public_keys": [],
+             "custom_tags": {},
+             "cluster_log_conf": {
+                 "dbfs": {
+                     "destination": "dbfs:/cluster-logs"
+                 }
+             },
+             "spark_env_vars": {},
+             "autotermination_minutes": 60,
+             "enable_elastic_disk": False,
+             "init_scripts": [],
+             "enable_local_disk_encryption": False,
+             "runtime_engine": "STANDARD"
+         }
+
+         response = requests.post(url, data=json.dumps(cluster_payload), headers=headers)
+         if response.status_code == 200:
+             return response.json()['cluster_id']
+         else:
+             raise Exception(f"Failed to create all-purpose cluster. Error: {response.text}")
+
+     # ----- PREVIEW MODE LOGIC -----
+
+     def create_preview(dataset_id, table_path, labelbox_api_key, frequency, cluster_id):
+         # ----- JOB SCHEDULING LOGIC -----
+         if frequency == "continuous":
+             schedule_block = {
+                 "continuous": {
+                     "pause_status": "UNPAUSED"
+                 }
+             }
+         else:
+             schedule_block = {
+                 "schedule": {
+                     "quartz_cron_expression": frequency,
+                     "timezone_id": "UTC",
+                     "pause_status": "UNPAUSED"
+                 }
+             }
+
+         # ----- JOB DEFINITION -----
+
+         # Define the parameters and structure of the job to be created in Databricks
+
+         payload = {
+             "name": "PREVIEW_upload_to_labelbox",
+             "email_notifications": {"no_alert_for_skipped_runs": False},
+             "webhook_notifications": {},
+             "timeout_seconds": 0,
+             "max_concurrent_runs": 1,
+             "tasks": [
+                 {
+                     "existing_cluster_id": cluster_id,  # Run the preview task on the all-purpose cluster
+                     "task_key": "PREVIEW_upload_to_labelbox",
+                     "run_if": "ALL_SUCCESS",
+                     "notebook_task": {
+                         "notebook_path": "notebooks/databricks_pipeline_creator/preview_upload_to_labelbox",
+                         "base_parameters": {
+                             "dataset_id": dataset_id,
+                             "table_path": table_path,
+                             "labelbox_api_key": labelbox_api_key,
+                             "schema_map": schema_map
+                         },
+                         "source": "GIT"
+                     },
+                     "libraries": [
+                         {"pypi": {"package": "labelspark"}},
+                         {"pypi": {"package": "labelbox==3.49.1"}},
+                         {"pypi": {"package": "numpy==1.25"}},
+                         {"pypi": {"package": "opencv-python==4.8.0.74"}}
+                     ],
+                     "timeout_seconds": 0,
+                     "email_notifications": {},
+                     "notification_settings": {
+                         "no_alert_for_skipped_runs": False,
+                         "no_alert_for_canceled_runs": False,
+                         "alert_on_last_attempt": False
+                     }
+                 }
+             ],
+             "git_source": {
+                 "git_url": "https://github.com/Labelbox/labelspark.git",
+                 "git_provider": "gitHub",
+                 "git_branch": "master"
+             },
+             "format": "MULTI_TASK"
+         }
+
+         # Merge the scheduling configuration into the main job payload
+         payload.update(schedule_block)
+
+         return payload
+
+     # ----- PRODUCTION MODE LOGIC -----
+     def create_production(dataset_id, table_path, labelbox_api_key, frequency):
+         # ----- JOB SCHEDULING LOGIC -----
+
+         # If the job needs to run continuously, use the "continuous" block
+         # Else, use the "schedule" block with the specified cron frequency
+         if frequency == "continuous":
+             schedule_block = {
+                 "continuous": {
+                     "pause_status": "UNPAUSED"
+                 }
+             }
+         else:
+             schedule_block = {
+                 "schedule": {
+                     "quartz_cron_expression": frequency,
+                     "timezone_id": "UTC",
+                     "pause_status": "UNPAUSED"
+                 }
+             }
+
+         # ----- JOB DEFINITION -----
+
+         # Define the parameters and structure of the job to be created in Databricks
+         payload = {
+             "name": "upload_to_labelbox",
+             "email_notifications": {"no_alert_for_skipped_runs": False},
+             "webhook_notifications": {},
+             "timeout_seconds": 0,
+             "max_concurrent_runs": 1,
+             "tasks": [
+                 {
+                     "task_key": "upload_to_labelbox",
+                     "run_if": "ALL_SUCCESS",
+                     "notebook_task": {
+                         "notebook_path": "notebooks/databricks_pipeline_creator/upload_to_labelbox",
+                         "base_parameters": {
+                             "dataset_id": dataset_id,
+                             "table_path": table_path,
+                             "labelbox_api_key": labelbox_api_key,
+                             "schema_map": schema_map
+                         },
+                         "source": "GIT"
+                     },
+                     "job_cluster_key": "Job_cluster",
+                     "libraries": [
+                         {"pypi": {"package": "labelspark"}},
+                         {"pypi": {"package": "labelbox==3.49.1"}},
+                         {"pypi": {"package": "numpy==1.25"}},
+                         {"pypi": {"package": "opencv-python==4.8.0.74"}}
+                     ],
+                     "timeout_seconds": 0,
+                     "email_notifications": {},
+                     "notification_settings": {
+                         "no_alert_for_skipped_runs": False,
+                         "no_alert_for_canceled_runs": False,
+                         "alert_on_last_attempt": False
+                     }
+                 }
+             ],
+             "job_clusters": [
+                 {
+                     "job_cluster_key": "Job_cluster",
+                     "new_cluster": {
+                         "cluster_name": "",
+                         "spark_version": "13.3.x-scala2.12",
+                         "gcp_attributes": {
+                             "use_preemptible_executors": False,
+                             "availability": "ON_DEMAND_GCP",
+                             "zone_id": "HA"
+                         },
+                         "node_type_id": "n2-highmem-4",
+                         "enable_elastic_disk": True,
+                         "data_security_mode": "SINGLE_USER",
+                         "runtime_engine": "STANDARD",
+                         "autoscale": {
+                             "min_workers": 1,
+                             "max_workers": 10
+                         }
+                     }
+                 }
+             ],
+             "git_source": {
+                 "git_url": "https://github.com/Labelbox/labelspark.git",
+                 "git_provider": "gitHub",
+                 "git_branch": "master"
+             },
+             "format": "MULTI_TASK"
+         }
+
+         # Merge the scheduling configuration into the main job payload
+         payload.update(schedule_block)
+         return payload
+
+     # Create a new all-purpose cluster when requested; otherwise reuse the supplied cluster_id
+     if new_cluster:
+         cluster_id = create_all_purpose_cluster(databricks_instance)
+         print(f"Created all-purpose cluster with ID: {cluster_id}")
+     else:
+         print(f"Using existing cluster with ID: {cluster_id}")
+
+     if mode == "preview":
+         payload = create_preview(dataset_id, table_path, labelbox_api_key, frequency, cluster_id)
+     elif mode == "production":
+         payload = create_production(dataset_id, table_path, labelbox_api_key, frequency)
+     else:
+         return f"Invalid mode: {mode}"
+
+     # ----- JOB CREATION -----
+
+     # Formulate the endpoint URL for the Databricks REST API job creation
+     url = f"https://{databricks_instance}/api/2.0/jobs/create"
+
+     # Send the POST request to Databricks to create the job
+     response = requests.post(url, data=json.dumps(payload), headers=headers)
+
+     # ----- RESPONSE HANDLING -----
+     if response.status_code == 200:
+         return f"Job created successfully. {response.text}"
+     else:
+         return f"Failed to create job. Error: {response.text}"
+
+ @app.route('/create-databricks-job', methods=['POST'])
+ def api_create_databricks_job():
+     data = request.get_json()
+     result = create_databricks_job(data)
+     return jsonify({"message": result})
+
+ def run():
+     app.run(port=5000)
+
+ threading.Thread(target=run).start()
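
Once the server is running, the endpoint can be exercised with a single POST. Below is a minimal client sketch, assuming the server is reachable on localhost:5000; every payload value is a placeholder (not a real workspace, key, dataset, or table), and the quartz cron string is just one illustrative choice for "frequency":

import requests

# Placeholder payload: all values below are illustrative. "frequency" accepts
# either the literal string "continuous" or a quartz cron expression, and
# "cluster_id" only matters when new_cluster is False (preview mode).
payload = {
    "mode": "preview",
    "databricks_instance": "1234567890123456.7.gcp.databricks.com",  # placeholder host
    "databricks_api_key": "dapiXXXXXXXXXXXXXXXX",                    # placeholder token
    "dataset_id": "cl0000000000000000000000",                        # placeholder Labelbox dataset
    "table_path": "main.default.unstructured_data",                  # placeholder table
    "labelbox_api_key": "lb_XXXXXXXXXXXX",                           # placeholder key
    "frequency": "0 0 12 * * ?",   # quartz cron: daily at 12:00 UTC
    "new_cluster": True,           # ask the server to create an all-purpose cluster
    "cluster_id": None,            # unused when new_cluster is True
    "schema_map": "{\"row_data\": \"uri_column\"}"                   # placeholder mapping
}

response = requests.post("http://localhost:5000/create-databricks-job", json=payload)
print(response.json()["message"])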