Diksha2001 committed
Commit 7552f31 · verified · 1 Parent(s): 30b9ad3

Update handler.py

Files changed (1)
  1. handler.py +40 -27
handler.py CHANGED
@@ -11,24 +11,26 @@ from azure.storage.blob import BlobServiceClient
 
 def get_azure_connection_string():
     """Get Azure connection string from environment variable"""
-    conn_string ="DefaultEndpointsProtocol=https;AccountName=transcribedblobstorage;AccountKey=1Z7yKPP5DLbxnoHdh7NmHgwg3dFLaDiYHUELdid7dzfzR6/DvkZnnzpJ30lrXIMhtD5GYKo+71jP+AStC1TEvA==;EndpointSuffix=core.windows.net"
+    conn_string = "DefaultEndpointsProtocol=https;AccountName=transcribedblobstorage;AccountKey=1Z7yKPP5DLbxnoHdh7NmHgwg3dFLaDiYHUELdid7dzfzR6/DvkZnnzpJ30lrXIMhtD5GYKo+71jP+AStC1TEvA==;EndpointSuffix=core.windows.net"
     if not conn_string:
         raise ValueError("Azure Storage connection string not found in environment variables")
     return conn_string
 
-# def upload_file(file_path: str, blob_name: str) -> str:
-#     """Upload a file to Azure Blob Storage"""
-#     if not os.path.isfile(file_path):
-#         raise FileNotFoundError(f"The specified file does not exist: {file_path}")
-#     container_name = "saasdev"
-#     connection_string = get_azure_connection_string()
-#     blob_service_client = BlobServiceClient.from_connection_string(connection_string)
-#     container_client = blob_service_client.get_container_client(container_name)
+def upload_file(file_path: str) -> str:
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"The specified file does not exist: {file_path}")
+    container_name = "saasdev"
+    connection_string = get_azure_connection_string()
+    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
+    container_client = blob_service_client.get_container_client(container_name)
 
-#     with open(file_path, 'rb') as file:
-#         blob_client = container_client.get_blob_client(blob_name)
-#         blob_client.upload_blob(file)
-#         return blob_client.blob_name
+    # Generate a unique blob name using UUID
+    blob_name = f"{uuid.uuid4()}.pdf"
+    with open(file_path, 'rb') as file:
+        blob_client = container_client.get_blob_client(blob_name)
+        blob_client.upload_blob(file)
+    logging.info(f"File uploaded to blob: {blob_name}")
+    return blob_name
 
 def download_blob(blob_name: str, download_file_path: str) -> None:
     """Download a file from Azure Blob Storage"""
@@ -45,6 +47,7 @@ def download_blob(blob_name: str, download_file_path: str) -> None:
         download_file.write(download_stream.readall())
         logging.info(f"Blob '{blob_name}' downloaded to '{download_file_path}'")
 
+
 def clean_directory(directory: str) -> None:
     """Clean up a directory by removing all files and subdirectories"""
     if os.path.exists(directory):
@@ -62,7 +65,6 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
     start_time = time.time()
     logging.info("Handler function started")
 
-    # Extract job input and validate
     job_input = job.get('input', {})
     required_fields = ['pdf_file', 'system_prompt', 'model_name', 'max_step', 'learning_rate', 'epochs']
     missing_fields = [field for field in required_fields if field not in job_input]
@@ -76,22 +78,23 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
     work_dir = os.path.abspath(f"/tmp/work_{str(uuid.uuid4())}")
 
     try:
-        # Create working directory
         os.makedirs(work_dir, exist_ok=True)
         logging.info(f"Working directory created: {work_dir}")
 
-        # Download and process PDF
-        download_path = os.path.join(work_dir, "Input_PDF.pdf")
-        download_blob(job_input['pdf_file'], download_path)
+        # Upload PDF to Blob
+        pdf_path = job_input['pdf_file']
+        generated_blob_name = upload_file(pdf_path)
+        logging.info(f"PDF uploaded with blob name: {generated_blob_name}")
 
-        # Verify downloaded file exists
-        if not os.path.exists(download_path):
-            raise FileNotFoundError(f"Downloaded PDF file not found at: {download_path}")
+        # Download the uploaded PDF using the internally generated blob name
+        downloaded_path = os.path.join(work_dir, "Downloaded_PDF.pdf")
+        download_blob(generated_blob_name, downloaded_path)
+        logging.info(f"PDF downloaded to: {downloaded_path}")
 
-        # Save pipeline input as a JSON file
+        # Save pipeline input as JSON
         pipeline_input_path = os.path.join(work_dir, "pipeline_input.json")
         pipeline_input = {
-            "pdf_file": download_path,
+            "pdf_file": downloaded_path,
             "system_prompt": job_input['system_prompt'],
             "model_name": job_input['model_name'],
             "max_step": job_input['max_step'],
@@ -114,15 +117,18 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
         }
 
     finally:
-        # Clean up working directory
         try:
             clean_directory(work_dir)
             os.rmdir(work_dir)
         except Exception as e:
            logging.error(f"Failed to clean up working directory: {str(e)}")
 
+
 def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_time: float) -> Dict[str, Any]:
     try:
+        # Suppress logging output
+        logging.getLogger().setLevel(logging.ERROR)
+
         # Read the pipeline input file
         with open(pipeline_input_path, 'r') as f:
             pipeline_input = json.load(f)
@@ -131,7 +137,7 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
         pipeline_input_str = json.dumps(pipeline_input)
 
         # Run fine-tuning pipeline with JSON string as argument
-        logging.info(f"Running pipeline with input: {pipeline_input_str[:100]}...")
+        # logging.info(f"Running pipeline with input: {pipeline_input_str[:100]}...")
         finetuning_result = subprocess.run(
             ['python3', 'Finetuning_Pipeline.py', pipeline_input_str],
             capture_output=True,
@@ -166,6 +172,13 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
         except Exception as e:
             evaluation_results = {"error": f"Failed to process evaluation output: {str(e)}"}
 
+        # Print only the JSON part to stdout for capturing in Gradio
+        print(json.dumps({
+            "status": "success",
+            "model_name": f"PharynxAI/{model_name}",
+            "processing_time": time.time() - start_time,
+            "evaluation_results": evaluation_results
+        }))
 
         return {
             "status": "success",
@@ -180,8 +193,8 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
         return {
             "status": "error",
             "error": error_message,
-            "stdout": e.stdout,
-            "stderr": e.stderr
+            # "stdout": e.stdout,
+            # "stderr": e.stderr
         }
     except Exception as e:
         error_message = f"Pipeline execution failed: {str(e)}"
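Note: the docstring and ValueError in get_azure_connection_string describe reading the connection string from an environment variable, while the committed code assigns a hardcoded literal, so the "if not conn_string" check can never trigger. A minimal sketch of the environment-variable approach the docstring describes; the variable name AZURE_STORAGE_CONNECTION_STRING is an assumption, not something defined in this commit:

    import os

    def get_azure_connection_string():
        """Get Azure connection string from environment variable"""
        # Assumed variable name; this commit hardcodes the string instead
        conn_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
        if not conn_string:
            raise ValueError("Azure Storage connection string not found in environment variables")
        return conn_string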
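For reference, handler() validates six required input fields before building pipeline_input.json and invoking the fine-tuning pipeline. A hypothetical local invocation, with every value a placeholder chosen only for illustration:

    # Hypothetical job payload; field names come from required_fields in handler()
    sample_job = {
        "input": {
            "pdf_file": "sample.pdf",            # local path handed to upload_file()
            "system_prompt": "You are a helpful assistant.",
            "model_name": "example-model",
            "max_step": 100,
            "learning_rate": 2e-4,
            "epochs": 1,
        }
    }
    result = handler(sample_job)                 # returns a status/result dict
    print(result.get("status"))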