Update handler.py
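Adds an upload_file helper for Azure Blob Storage, has the handler upload the input PDF to the saasdev container and re-download it into the working directory before writing pipeline_input.json, silences verbose pipeline logging, and prints the evaluation results as JSON to stdout so Gradio can capture them.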
handler.py CHANGED (+40 -27)
@@ -11,24 +11,26 @@ from azure.storage.blob import BlobServiceClient
 
 def get_azure_connection_string():
     """Get Azure connection string from environment variable"""
-    conn_string ="DefaultEndpointsProtocol=https;AccountName=transcribedblobstorage;AccountKey=1Z7yKPP5DLbxnoHdh7NmHgwg3dFLaDiYHUELdid7dzfzR6/DvkZnnzpJ30lrXIMhtD5GYKo+71jP+AStC1TEvA==;EndpointSuffix=core.windows.net"
+    conn_string = "DefaultEndpointsProtocol=https;AccountName=transcribedblobstorage;AccountKey=1Z7yKPP5DLbxnoHdh7NmHgwg3dFLaDiYHUELdid7dzfzR6/DvkZnnzpJ30lrXIMhtD5GYKo+71jP+AStC1TEvA==;EndpointSuffix=core.windows.net"
     if not conn_string:
         raise ValueError("Azure Storage connection string not found in environment variables")
     return conn_string
 
-
-
-
-
-
-
-
-# container_client = blob_service_client.get_container_client(container_name)
+def upload_file(file_path: str) -> str:
+    if not os.path.isfile(file_path):
+        raise FileNotFoundError(f"The specified file does not exist: {file_path}")
+    container_name = "saasdev"
+    connection_string = get_azure_connection_string()
+    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
+    container_client = blob_service_client.get_container_client(container_name)
 
-#
-
-
-
+    # Generate a unique blob name using UUID
+    blob_name = f"{uuid.uuid4()}.pdf"
+    with open(file_path, 'rb') as file:
+        blob_client = container_client.get_blob_client(blob_name)
+        blob_client.upload_blob(file)
+    logging.info(f"File uploaded to blob: {blob_name}")
+    return blob_name
 
 def download_blob(blob_name: str, download_file_path: str) -> None:
     """Download a file from Azure Blob Storage"""
@@ -45,6 +47,7 @@ def download_blob(blob_name: str, download_file_path: str) -> None:
         download_file.write(download_stream.readall())
     logging.info(f"Blob '{blob_name}' downloaded to '{download_file_path}'")
 
+
 def clean_directory(directory: str) -> None:
     """Clean up a directory by removing all files and subdirectories"""
     if os.path.exists(directory):
@@ -62,7 +65,6 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
     start_time = time.time()
     logging.info("Handler function started")
 
-    # Extract job input and validate
     job_input = job.get('input', {})
     required_fields = ['pdf_file', 'system_prompt', 'model_name', 'max_step', 'learning_rate', 'epochs']
     missing_fields = [field for field in required_fields if field not in job_input]
@@ -76,22 +78,23 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
     work_dir = os.path.abspath(f"/tmp/work_{str(uuid.uuid4())}")
 
     try:
-        # Create working directory
         os.makedirs(work_dir, exist_ok=True)
         logging.info(f"Working directory created: {work_dir}")
 
-        #
-
-
+        # Upload PDF to Blob
+        pdf_path = job_input['pdf_file']
+        generated_blob_name = upload_file(pdf_path)
+        logging.info(f"PDF uploaded with blob name: {generated_blob_name}")
 
-        #
-
-
+        # Download the uploaded PDF using the internally generated blob name
+        downloaded_path = os.path.join(work_dir, "Downloaded_PDF.pdf")
+        download_blob(generated_blob_name, downloaded_path)
+        logging.info(f"PDF downloaded to: {downloaded_path}")
 
-        # Save pipeline input as
+        # Save pipeline input as JSON
         pipeline_input_path = os.path.join(work_dir, "pipeline_input.json")
         pipeline_input = {
-            "pdf_file":
+            "pdf_file": downloaded_path,
             "system_prompt": job_input['system_prompt'],
            "model_name": job_input['model_name'],
            "max_step": job_input['max_step'],
@@ -114,15 +117,18 @@ def handler(job: Dict[str, Any]) -> Dict[str, Any]:
         }
 
     finally:
-        # Clean up working directory
         try:
             clean_directory(work_dir)
             os.rmdir(work_dir)
         except Exception as e:
             logging.error(f"Failed to clean up working directory: {str(e)}")
 
+
 def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_time: float) -> Dict[str, Any]:
     try:
+        # Suppress logging output
+        logging.getLogger().setLevel(logging.ERROR)
+
         # Read the pipeline input file
         with open(pipeline_input_path, 'r') as f:
             pipeline_input = json.load(f)
@@ -131,7 +137,7 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
         pipeline_input_str = json.dumps(pipeline_input)
 
         # Run fine-tuning pipeline with JSON string as argument
-        logging.info(f"Running pipeline with input: {pipeline_input_str[:100]}...")
+        # logging.info(f"Running pipeline with input: {pipeline_input_str[:100]}...")
         finetuning_result = subprocess.run(
             ['python3', 'Finetuning_Pipeline.py', pipeline_input_str],
             capture_output=True,
@@ -166,6 +172,13 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
     except Exception as e:
         evaluation_results = {"error": f"Failed to process evaluation output: {str(e)}"}
 
+        # Print only the JSON part to stdout for capturing in Gradio
+        print(json.dumps({
+            "status": "success",
+            "model_name": f"PharynxAI/{model_name}",
+            "processing_time": time.time() - start_time,
+            "evaluation_results": evaluation_results
+        }))
 
     return {
         "status": "success",
@@ -180,8 +193,8 @@ def run_pipeline_and_evaluate(pipeline_input_path: str, model_name: str, start_t
         return {
             "status": "error",
             "error": error_message,
-            "stdout": e.stdout,
-            "stderr": e.stderr
+            # "stdout": e.stdout,
+            # "stderr": e.stderr
         }
     except Exception as e:
         error_message = f"Pipeline execution failed: {str(e)}"