Update handler.py
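Adds an upload_file helper for Azure Blob Storage, has the handler upload the input PDF to the saasdev container and re-download it into the working directory before writing pipeline_input.json, silences verbose pipeline logging, and prints the evaluation results as JSON to stdout so Gradio can capture them.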
handler.py CHANGED (+40 -27)
@@ -11,24 +11,26 @@ from azure.storage.blob import BlobServiceClient
 
 def get_azure_connection_string():
     """Get Azure connection string from environment variable"""
-    conn_string ="DefaultEndpointsProtocol=https;AccountName=transcribedblobstorage;AccountKey=1Z7yKPP5DLbxnoHdh7NmHgwg3dFLaDiYHUELdid7dzfzR6/DvkZnnzpJ30lrXIMhtD5GYKo+71jP+AStC1TEvA==;EndpointSuffix=core.windows.net"
+    conn_string = "DefaultEndpointsProtocol=https;AccountName=transcribedblobstorage;AccountKey=1Z7yKPP5DLbxnoHdh7NmHgwg3dFLaDiYHUELdid7dzfzR6/DvkZnnzpJ30lrXIMhtD5GYKo+71jP+AStC1TEvA==;EndpointSuffix=core.windows.net"
     if not conn_string:
         raise ValueError("Azure Storage connection string not found in environment variables")
     return conn_string
 
-
-
-
-
-
-
-
-# container_client = blob_service_client.get_container_client(container_name)
+def upload_file(file_path: str) -> str:
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"The specified file does not exist: {file_path}")
+    container_name = "saasdev"
+    connection_string = get_azure_connection_string()
+    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
+    container_client = blob_service_client.get_container_client(container_name)
 
-#
-
-
-
+    # Generate a unique blob name using UUID
+    blob_name = f"{uuid.uuid4()}.pdf"
+    with open(file_path, 'rb') as file:
+        blob_client = container_client.get_blob_client(blob_name)
+        blob_client.upload_blob(file)
+    logging.info(f"File uploaded to blob: {blob_name}")
+    return blob_name
 
 def download_blob(blob_name: str, download_file_path: str) -> None:
     """Download a file from Azure Blob Storage"""
@@ -45,6 +47,7 @@ def download_blob(blob_name: str, download_file_path: str) -> None:
         download_file.write(download_stream.readall())
     logging.info(f"Blob '{blob_name}' downloaded to '{download_file_path}'")
 
+
 def clean_directory(directory: str) -> None:
     """Clean up a directory by removing all files and subdirectories"""
     if os.path.exists(directory):
@@ -62,7 +65,6 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
     start_time = time.time()
     logging.info("Handler function started")
 
-    # Extract job input and validate
     job_input = job.get('input', {})
     required_fields = ['pdf_file', 'system_prompt', 'model_name', 'max_step', 'learning_rate', 'epochs']
     missing_fields = [field for field in required_fields if field not in job_input]
@@ -76,22 +78,23 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
     work_dir = os.path.abspath(f"/tmp/work_{str(uuid.uuid4())}")
 
     try:
-        # Create working directory
         os.makedirs(work_dir, exist_ok=True)
         logging.info(f"Working directory created: {work_dir}")
 
-        #
-
-
+        # Upload PDF to Blob
+        pdf_path = job_input['pdf_file']
+        generated_blob_name = upload_file(pdf_path)
+        logging.info(f"PDF uploaded with blob name: {generated_blob_name}")
 
-        #
-
-
+        # Download the uploaded PDF using the internally generated blob name
+        downloaded_path = os.path.join(work_dir, "Downloaded_PDF.pdf")
+        download_blob(generated_blob_name, downloaded_path)
+        logging.info(f"PDF downloaded to: {downloaded_path}")
 
-        # Save pipeline input as
+        # Save pipeline input as JSON
         pipeline_input_path = os.path.join(work_dir, "pipeline_input.json")
         pipeline_input = {
-            "pdf_file":
+            "pdf_file": downloaded_path,
             "system_prompt": job_input['system_prompt'],
            "model_name": job_input['model_name'],
            "max_step": job_input['max_step'],
@@ -114,15 +117,18 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
         }
 
     finally:
-        # Clean up working directory
         try:
             clean_directory(work_dir)
             os.rmdir(work_dir)
         except Exception as e:
             logging.error(f"Failed to clean up working directory: {str(e)}")
 
+
 def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_time: float) -> Dict[str, Any]:
     try:
+        # Suppress logging output
+        logging.getLogger().setLevel(logging.ERROR)
+
         # Read the pipeline input file
         with open(pipeline_input_path, 'r') as f:
             pipeline_input = json.load(f)
@@ -131,7 +137,7 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
         pipeline_input_str = json.dumps(pipeline_input)
 
         # Run fine-tuning pipeline with JSON string as argument
-        logging.info(f"Running pipeline with input: {pipeline_input_str[:100]}...")
+        # logging.info(f"Running pipeline with input: {pipeline_input_str[:100]}...")
         finetuning_result = subprocess.run(
             ['python3', 'Finetuning_Pipeline.py', pipeline_input_str],
             capture_output=True,
@@ -166,6 +172,13 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
     except Exception as e:
         evaluation_results = {"error": f"Failed to process evaluation output: {str(e)}"}
 
+        # Print only the JSON part to stdout for capturing in Gradio
+        print(json.dumps({
+            "status": "success",
+            "model_name": f"PharynxAI/{model_name}",
+            "processing_time": time.time() - start_time,
+            "evaluation_results": evaluation_results
+        }))
 
     return {
         "status": "success",
@@ -180,8 +193,8 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
         return {
             "status": "error",
             "error": error_message,
-            "stdout": e.stdout,
-            "stderr": e.stderr
+            # "stdout": e.stdout,
+            # "stderr": e.stderr
         }
     except Exception as e:
         error_message = f"Pipeline execution failed: {str(e)}"