Spaces:
Runtime error
Runtime error
Fix long queue waits with mechanism to prevent running duplicate jobs
#42
by
ccocks-deca
- opened
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import pathlib
|
|
|
|
| 3 |
import random
|
| 4 |
import string
|
| 5 |
import tempfile
|
|
@@ -113,12 +114,40 @@ examples = [[str(f)] for f in pathlib.Path("examples").glob("*.yaml")]
|
|
| 113 |
COMMUNITY_HF_TOKEN = os.getenv("COMMUNITY_HF_TOKEN")
|
| 114 |
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]]:
|
| 117 |
runner = LogsViewRunner()
|
| 118 |
|
| 119 |
if not yaml_config:
|
| 120 |
yield runner.log("Empty yaml, pick an example below", level="ERROR")
|
| 121 |
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
try:
|
| 123 |
merge_config = MergeConfiguration.model_validate(yaml.safe_load(yaml_config))
|
| 124 |
except Exception as e:
|
|
@@ -170,7 +199,7 @@ def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]
|
|
| 170 |
return
|
| 171 |
|
| 172 |
# Set tmp HF_HOME to avoid filling up disk Space
|
| 173 |
-
tmp_env = os.environ.copy()
|
| 174 |
tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
|
| 175 |
full_cli = cli + f" --lora-merge-cache {tmpdirname}/.lora_cache"
|
| 176 |
yield from runner.run_command(full_cli.split(), cwd=merged_path, env=tmp_env)
|
|
@@ -188,6 +217,9 @@ def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]
|
|
| 188 |
)
|
| 189 |
yield runner.log(f"Model successfully uploaded to HF: {repo_url.repo_id}")
|
| 190 |
|
|
|
|
|
|
|
|
|
|
| 191 |
# This is a workaround, as the Space keeps getting stuck.
|
| 192 |
def _restart_space():
|
| 193 |
huggingface_hub.HfApi().restart_space(repo_id="arcee-ai/mergekit-gui", token=COMMUNITY_HF_TOKEN, factory_reboot=False)
|
|
|
|
| 1 |
import os
|
| 2 |
import pathlib
|
| 3 |
+
import hashlib
|
| 4 |
import random
|
| 5 |
import string
|
| 6 |
import tempfile
|
|
|
|
| 114 |
COMMUNITY_HF_TOKEN = os.getenv("COMMUNITY_HF_TOKEN")
|
| 115 |
|
| 116 |
|
| 117 |
+
# Maps the hash of each active job's YAML config to its job ID, so that an identical submission can be detected as a duplicate
|
| 118 |
+
active_jobs = {}
|
| 119 |
+
|
| 120 |
+
def get_yaml_hash(yaml_config: str) -> str:
    """Return the SHA-256 hex digest of a YAML config's raw text.

    The digest is used as a lookup key for detecting duplicate submissions.
    It is computed over the UTF-8 bytes of the text exactly as given, so two
    configs that differ in any character (for example a trailing newline)
    produce different digests.

    Args:
        yaml_config: The YAML document as a string.

    Returns:
        The digest as a 64-character lowercase hexadecimal string.
    """
    return hashlib.sha256(yaml_config.encode("utf-8")).hexdigest()
|
| 123 |
+
|
| 124 |
def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]]:
|
| 125 |
runner = LogsViewRunner()
|
| 126 |
|
| 127 |
if not yaml_config:
|
| 128 |
yield runner.log("Empty yaml, pick an example below", level="ERROR")
|
| 129 |
return
|
| 130 |
+
|
| 131 |
+
yaml_hash = get_yaml_hash(yaml_config)
|
| 132 |
+
|
| 133 |
+
# Check if this YAML job is already running
|
| 134 |
+
if yaml_hash in active_jobs:
|
| 135 |
+
old_job_id = active_jobs[yaml_hash]
|
| 136 |
+
yield runner.log(f"Duplicate job detected! An identical job is already running with Job ID: {old_job_id}.", level="WARNING")
|
| 137 |
+
user_input = yield gradio.inputs.Button(label="Continue with new job", info="Do you want to cancel the old job and continue with the new one?")
|
| 138 |
+
|
| 139 |
+
if user_input == "Continue with new job":
|
| 140 |
+
# Cancel the old job and remove it from active jobs
|
| 141 |
+
runner.log(f"Cancelling the old job with Job ID: {old_job_id}")
|
| 142 |
+
# This part assumes you have the ability to cancel the previous job if needed
|
| 143 |
+
# In real implementation, you'd stop the old task/process here
|
| 144 |
+
active_jobs.pop(yaml_hash) # Remove the old job from the active jobs list
|
| 145 |
+
else:
|
| 146 |
+
# If user chooses not to continue, exit
|
| 147 |
+
yield runner.log("Duplicate job detected. Operation aborted.", level="ERROR")
|
| 148 |
+
return
|
| 149 |
+
|
| 150 |
+
# Proceed with the merge
|
| 151 |
try:
|
| 152 |
merge_config = MergeConfiguration.model_validate(yaml.safe_load(yaml_config))
|
| 153 |
except Exception as e:
|
|
|
|
| 199 |
return
|
| 200 |
|
| 201 |
# Set tmp HF_HOME to avoid filling up disk Space
|
| 202 |
+
tmp_env = os.environ.copy() # taken from https://stackoverflow.com/a/4453495
|
| 203 |
tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
|
| 204 |
full_cli = cli + f" --lora-merge-cache {tmpdirname}/.lora_cache"
|
| 205 |
yield from runner.run_command(full_cli.split(), cwd=merged_path, env=tmp_env)
|
|
|
|
| 217 |
)
|
| 218 |
yield runner.log(f"Model successfully uploaded to HF: {repo_url.repo_id}")
|
| 219 |
|
| 220 |
+
# Track this YAML as an active job
|
| 221 |
+
active_jobs[yaml_hash] = "new_job_id"
|
| 222 |
+
|
| 223 |
# This is a workaround, as the Space keeps getting stuck.
|
| 224 |
def _restart_space():
|
| 225 |
huggingface_hub.HfApi().restart_space(repo_id="arcee-ai/mergekit-gui", token=COMMUNITY_HF_TOKEN, factory_reboot=False)
|