ksaramout committed on
Commit
a234a1a
·
1 Parent(s): 70a2533

Delete preview_mode_server.py

Files changed (1)
  1. preview_mode_server.py +0 -270
preview_mode_server.py DELETED
@@ -1,270 +0,0 @@
- from flask import Flask, request, jsonify
- import threading
- import requests
- import json
-
- app = Flask(__name__)
-
- def create_databricks_job(data):
-     mode = data.get('mode')
-     databricks_instance = data.get('databricks_instance')
-     databricks_api_key = data.get('databricks_api_key')
-     new_dataset = data.get('new_dataset')
-     dataset_id = data.get('dataset_id')
-     table_path = data.get('table_path')
-     labelbox_api_key = data.get('labelbox_api_key')
-     frequency = data.get('frequency')
-     new_cluster = data.get('new_cluster')
-     cluster_id = data.get('cluster_id')
-     schema_map = data.get('schema_map')
-
-     # Define the authentication headers
-     headers = {
-         "Authorization": f"Bearer {databricks_api_key}",
-         "Content-Type": "application/json",
-     }
-
-     # ----- CLUSTER CREATION LOGIC -----
-
-     def create_all_purpose_cluster(databricks_instance):
-         url = f"https://{databricks_instance}/api/2.0/clusters/create"
-
-         cluster_payload = {
-             "autoscale": {
-                 "min_workers": 1,
-                 "max_workers": 10
-             },
-             "cluster_name": "Labelbox Worker",
-             "spark_version": "11.3.x-scala2.12",
-             "gcp_attributes": {
-                 "use_preemptible_executors": False,
-                 "availability": "PREEMPTIBLE_WITH_FALLBACK_GCP",
-                 "zone_id": "HA"
-             },
-             "node_type_id": "n2-highmem-4",
-             "driver_node_type_id": "n2-highmem-4",
-             "ssh_public_keys": [],
-             "custom_tags": {},
-             "cluster_log_conf": {
-                 "dbfs": {
-                     "destination": "dbfs:/cluster-logs"
-                 }
-             },
-             "spark_env_vars": {},
-             "autotermination_minutes": 60,
-             "enable_elastic_disk": False,
-             "init_scripts": [],
-             "enable_local_disk_encryption": False,
-             "runtime_engine": "STANDARD"
-         }
-
-         response = requests.post(url, data=json.dumps(cluster_payload), headers=headers)
-         if response.status_code == 200:
-             return response.json()['cluster_id']
-         else:
-             raise Exception(f"Failed to create all-purpose cluster. Error: {response.text}")
-
-     # ----- PREVIEW MODE LOGIC -----
-
-     def create_preview(dataset_id, table_path, labelbox_api_key, frequency, cluster_id):
-         # ----- JOB SCHEDULING LOGIC -----
-         if frequency == "continuous":
-             schedule_block = {
-                 "continuous": {
-                     "pause_status": "UNPAUSED"
-                 }
-             }
-         else:
-             schedule_block = {
-                 "schedule": {
-                     "quartz_cron_expression": frequency,
-                     "timezone_id": "UTC",
-                     "pause_status": "UNPAUSED"
-                 }
-             }
-
-         # ----- JOB DEFINITION -----
-
-         # Define the parameters and structure of the job to be created in Databricks
-
-         payload = {
-             "name": "PREVIEW_upload_to_labelbox",
-             "email_notifications": {"no_alert_for_skipped_runs": False},
-             "webhook_notifications": {},
-             "timeout_seconds": 0,
-             "max_concurrent_runs": 1,
-             "tasks": [
-                 {
- "existing_cluster_id": cluster_id, # Move this inside the task
99
- "task_key": "PREVIEW_upload_to_labelbox",
100
- "run_if": "ALL_SUCCESS",
101
- "notebook_task": {
102
- "notebook_path": "notebooks/databricks_pipeline_creator/preview_upload_to_labelbox",
103
- "base_parameters": {
104
- "dataset_id": dataset_id,
105
- "table_path": table_path,
106
- "labelbox_api_key": labelbox_api_key,
107
- "schema_map": schema_map
108
- },
109
- "source": "GIT"
110
- },
111
- "libraries": [
112
- {"pypi": {"package": "labelspark"}},
113
- {"pypi": {"package": "labelbox==3.49.1"}},
114
- {"pypi": {"package": "numpy==1.25"}},
115
- {"pypi": {"package": "opencv-python==4.8.0.74"}}
116
- ],
117
- "timeout_seconds": 0,
118
- "email_notifications": {},
119
- "notification_settings": {
120
- "no_alert_for_skipped_runs": False,
121
- "no_alert_for_canceled_runs": False,
122
- "alert_on_last_attempt": False
123
- }
124
- }
125
- ],
126
- "git_source": {
127
- "git_url": "https://github.com/Labelbox/labelspark.git",
128
- "git_provider": "gitHub",
129
- "git_branch": "master"
130
- },
131
- "format": "MULTI_TASK"
132
- }
133
-
134
- # Merge the scheduling configuration into the main job payload
135
- payload.update(schedule_block)
136
-
137
- return payload
138
-
139
- # ----- PRODUCTION MODE LOGIC -----
140
- def create_production(dataset_id, table_path, labelbox_api_key, frequency):
141
- # ----- JOB SCHEDULING LOGIC -----
142
-
143
- # If the job needs to run continuously, use the "continuous" block
144
- # Else, use the "schedule" block with the specified cron frequency
145
- if frequency == "continuous":
146
- schedule_block = {
147
- "continuous": {
148
- "pause_status": "UNPAUSED"
149
- }
150
- }
151
- else:
152
- schedule_block = {
153
- "schedule": {
154
- "quartz_cron_expression": frequency,
155
- "timezone_id": "UTC",
156
- "pause_status": "UNPAUSED"
157
- }
158
- }
159
-
160
- # ----- JOB DEFINITION -----
161
-
162
- # Define the parameters and structure of the job to be created in Databricks
163
- payload = {
164
- "name": "upload_to_labelbox",
165
- "email_notifications": {"no_alert_for_skipped_runs": False},
166
- "webhook_notifications": {},
167
- "timeout_seconds": 0,
168
- "max_concurrent_runs": 1,
169
- "tasks": [
170
- {
171
- "task_key": "upload_to_labelbox",
172
- "run_if": "ALL_SUCCESS",
173
- "notebook_task": {
174
- "notebook_path": "notebooks/databricks_pipeline_creator/upload_to_labelbox",
175
- "base_parameters": {
176
- "dataset_id": dataset_id,
177
- "table_path": table_path,
178
- "labelbox_api_key": labelbox_api_key,
179
- "schema_map": schema_map
180
- },
181
- "source": "GIT"
182
- },
183
- "job_cluster_key": "Job_cluster",
184
- "libraries": [
185
- {"pypi": {"package": "labelspark"}},
186
- {"pypi": {"package": "labelbox==3.49.1"}},
187
- {"pypi": {"package": "numpy==1.25"}},
188
- {"pypi": {"package": "opencv-python==4.8.0.74"}}
189
- ],
190
- "timeout_seconds": 0,
191
- "email_notifications": {},
192
- "notification_settings": {
193
- "no_alert_for_skipped_runs": False,
194
- "no_alert_for_canceled_runs": False,
195
- "alert_on_last_attempt": False
196
- }
197
- }
198
- ],
199
- "job_clusters": [
200
- {
201
- "job_cluster_key": "Job_cluster",
202
- "new_cluster": {
203
- "cluster_name": "",
204
- "spark_version": "13.3.x-scala2.12",
205
- "gcp_attributes": {
206
- "use_preemptible_executors": False,
207
- "availability": "ON_DEMAND_GCP",
208
- "zone_id": "HA"
209
- },
210
- "node_type_id": "n2-highmem-4",
211
- "enable_elastic_disk": True,
212
- "data_security_mode": "SINGLE_USER",
213
- "runtime_engine": "STANDARD",
214
- "autoscale": {
215
- "min_workers": 1,
216
- "max_workers": 10
217
- }
218
- }
219
- }
220
- ],
221
- "git_source": {
222
- "git_url": "https://github.com/Labelbox/labelspark.git",
223
- "git_provider": "gitHub",
224
- "git_branch": "master"
225
- },
226
- "format": "MULTI_TASK"
227
- }
228
-
229
- # Merge the scheduling configuration into the main job payload
230
- payload.update(schedule_block)
231
- return payload
232
-
233
-     # Create a new all-purpose cluster when requested; otherwise reuse the caller-supplied cluster_id
-     if new_cluster:
-         cluster_id = create_all_purpose_cluster(databricks_instance)
-         print(f"Created all-purpose cluster with ID: {cluster_id}")
-     else:
-         print(f"Using existing cluster with ID: {cluster_id}")
-
-     if mode == "preview":
-         payload = create_preview(dataset_id, table_path, labelbox_api_key, frequency, cluster_id)
-     elif mode == "production":
-         payload = create_production(dataset_id, table_path, labelbox_api_key, frequency)
-     else:
-         return f"Invalid mode: {mode}"
-
-     # ----- JOB CREATION -----
-
-     # Formulate the endpoint URL for the Databricks REST API job creation
-     url = f"https://{databricks_instance}/api/2.0/jobs/create"
-
-     # Send the POST request to Databricks to create the job
-     response = requests.post(url, data=json.dumps(payload), headers=headers)
-
-     # ----- RESPONSE HANDLING -----
-     if response.status_code == 200:
-         return f"Job created successfully. {response.text}"
-     else:
-         return f"Failed to create job. Error: {response.text}"
-
- @app.route('/create-databricks-job', methods=['POST'])
- def api_create_databricks_job():
-     data = request.get_json()
-     result = create_databricks_job(data)
-     return jsonify({"message": result})
-
- def run():
-     app.run(port=5000)
-
- threading.Thread(target=run).start()
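
For context: the deleted file started a small Flask service on port 5000 (in a background thread, at import time) with a single POST endpoint, /create-databricks-job, which assembled a job payload and forwarded it to the Databricks Jobs API. A client call might have looked like the minimal sketch below; the field names mirror what create_databricks_job read from the request body, but every value is an illustrative placeholder, not something taken from this repo.

import requests

# All values below are hypothetical placeholders, not real credentials or IDs.
job_request = {
    "mode": "preview",                        # or "production"
    "databricks_instance": "dbc-example.cloud.databricks.com",
    "databricks_api_key": "<DATABRICKS_TOKEN>",
    "dataset_id": "<LABELBOX_DATASET_ID>",
    "table_path": "hive_metastore.default.assets",
    "labelbox_api_key": "<LABELBOX_API_KEY>",
    "frequency": "continuous",                # or a Quartz cron expression, e.g. "0 0 * * * ?"
    "new_cluster": True,                      # ask the server to create an all-purpose cluster
    "cluster_id": None,                       # only used when new_cluster is false
    "schema_map": {"row_data": "asset_url"},  # illustrative column mapping
}

resp = requests.post("http://localhost:5000/create-databricks-job", json=job_request)
print(resp.json()["message"])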