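"""Gradio app for a YourBench Hugging Face Space.

Lets a user build a YourBench pipeline config, upload source documents, and
run the pipeline as a subprocess while streaming its logs into the UI.
"""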
import io
import os
import pathlib
import shutil
import subprocess
import sys

import yaml
import gradio as gr
from loguru import logger

from yourbench.pipeline import run_pipeline

UPLOAD_DIRECTORY = pathlib.Path("/app/uploaded_files")
UPLOAD_DIRECTORY.mkdir(parents=True, exist_ok=True)
CONFIG_PATH = pathlib.Path("/app/yourbench_config.yml")

# Send loguru output to stderr at INFO level so it shows up in the Space logs.
logger.remove()
logger.add(sys.stderr, level="INFO")
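# Runs the YourBench pipeline as a child process. stdout is switched to
# non-blocking mode so the Gradio timer can poll for new log lines without
# ever stalling the UI; start_new_session=True puts the child in its own
# session so signals aimed at this app don't propagate to it.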
class SubprocessManager:
    def __init__(self, command):
        self.command = command
        self.process = None
        self.output_stream = io.StringIO()

    def start_process(self):
        """Start the subprocess."""
        if self.is_running():
            logger.info("Process is already running")
            return

        self.process = subprocess.Popen(
            self.command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # Combine stderr with stdout
            text=True,
            bufsize=1,                 # Line-buffered
            start_new_session=True,    # Run the child in its own session
        )
        # Make stdout non-blocking so reads from the UI thread never block.
        os.set_blocking(self.process.stdout.fileno(), False)
        logger.info("Started the process")
    def read_and_get_output(self):
        """Read any available subprocess output and return everything captured so far."""
        if self.process and self.process.stdout:
            try:
                while True:
                    line = self.process.stdout.readline()
                    if line:
                        self.output_stream.write(line)  # Accumulate in the StringIO buffer
                    else:
                        break
            except BlockingIOError:
                # The non-blocking pipe has no data right now; try again next tick.
                pass
        return self.output_stream.getvalue()
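    # Note: the whole buffer is returned on every poll, so the UI component
    # always receives the complete log rather than an incremental diff.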
    def stop_process(self):
        """Terminate the subprocess."""
        if not self.is_running():
            logger.info("Process is not running")
            return
        logger.info("Sending SIGTERM to the process")
        self.process.terminate()
        exit_code = self.process.wait()  # Wait for the process to terminate
        logger.info(f"Process stopped with exit code {exit_code}")
    def kill_process(self):
        """Forcefully kill the subprocess."""
        if not self.is_running():
            logger.info("Process is not running")
            return
        logger.info("Sending SIGKILL to the process")
        self.process.kill()
        exit_code = self.process.wait()  # Wait for the process to be killed
        logger.info(f"Process killed with exit code {exit_code}")

    def is_running(self):
        """Check whether the subprocess is still running."""
        return self.process is not None and self.process.poll() is None
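# The pipeline is launched via `uv run` as a separate process rather than by
# calling run_pipeline() in-process, which lets the UI stream its output live
# and kill it without taking the app down.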
command = ["uv", "run", "yourbench", f"--config={CONFIG_PATH}"]
manager = SubprocessManager(command)
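# generate_config() assembles the full YourBench pipeline definition:
# ingestion -> upload_ingest_to_hub -> summarization -> chunking ->
# question generation (single-shot and multi-hop) -> answer_generation ->
# judge_answers. The same model is assigned to every role.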
def generate_config(hf_token, hf_org, model_name, provider, base_url, api_key, max_concurrent_requests):
    """Build a complete YourBench pipeline config and return it as a YAML string."""
    config = {
        "hf_configuration": {
            "token": hf_token,
            "private": True,
            "hf_organization": hf_org,
        },
        "model_list": [{
            "model_name": model_name,
            "provider": provider,
            "base_url": base_url,
            "api_key": api_key,
            "max_concurrent_requests": max_concurrent_requests,
        }],
        "model_roles": {role: [model_name] for role in [
            "ingestion", "summarization", "single_shot_question_generation",
            "multi_hop_question_generation", "answer_generation", "judge_answers",
        ]},
        "inference_config": {"max_concurrent_requests": 16},
        "pipeline": {
            "ingestion": {
                "source_documents_dir": "/app/uploaded_files",
                "output_dir": "/app/ingested",
                "run": True,
            },
            "upload_ingest_to_hub": {
                "source_documents_dir": "/app/ingested",
                "hub_dataset_name": "test_ingested_documents",
                "local_dataset_path": "/app/ingested_dataset",
                "run": True,
            },
            "summarization": {
                "source_dataset_name": "test_ingested_documents",
                "output_dataset_name": "test_summaries",
                "local_dataset_path": "/results/test_summaries",
                "concat_existing_dataset": False,
                "run": True,
            },
            "chunking": {
                "source_dataset_name": "test_summaries",
                "output_dataset_name": "test_chunked_documents",
                "local_dataset_path": "/results/test_chunked_documents",
                "concat_existing_dataset": False,
                "chunking_configuration": {
                    "l_min_tokens": 64,
                    "l_max_tokens": 128,
                    "tau_threshold": 0.3,
                    "h_min": 2,
                    "h_max": 4,
                },
                "run": True,
            },
            "single_shot_question_generation": {
                "source_dataset_name": "test_chunked_documents",
                "output_dataset_name": "test_single_shot_questions",
                "local_dataset_path": "/results/test_single_shot_questions",
                "diversification_seed": "24 year old adult",
                "concat_existing_dataset": False,
                "run": True,
            },
            "multi_hop_question_generation": {
                "source_dataset_name": "test_chunked_documents",
                "output_dataset_name": "test_multi_hop_questions",
                "local_dataset_path": "/results/test_multi_hop_questions",
                "concat_existing_dataset": False,
                "run": True,
            },
            "answer_generation": {
                "run": True,
                "question_dataset_name": "test_single_shot_questions",
                "output_dataset_name": "test_answered_questions",
                "local_dataset_path": "/results/test_answered_questions",
                "concat_existing_dataset": False,
                "strategies": [{
                    "name": "zeroshot",
                    "prompt": "ZEROSHOT_QA_USER_PROMPT",
                    "model_name": model_name,
                }, {
                    "name": "gold",
                    "prompt": "GOLD_QA_USER_PROMPT",
                    "model_name": model_name,
                }],
            },
            "judge_answers": {
                "run": True,
                "source_judge_dataset_name": "test_answered_questions",
                "output_judged_dataset_name": "test_judged_comparisons",
                "local_dataset_path": "/results/test_judged_comparisons",
                "concat_existing_dataset": False,
                "comparing_strategies": [["zeroshot", "gold"]],
                "chunk_column_index": 0,
                "random_seed": 42,
            },
        },
    }
    return yaml.dump(config, default_flow_style=False)
def save_config(yaml_text):
    with open(CONFIG_PATH, "w") as file:
        file.write(yaml_text)
    return "✅ Config saved!"


def save_files(files: list[str]):
    """Move uploaded temp files into the persistent upload directory."""
    saved_paths = [
        shutil.move(str(pathlib.Path(file)), str(UPLOAD_DIRECTORY / pathlib.Path(file).name))
        for file in files
    ]
    return f"Files saved to: {', '.join(saved_paths)}"
app = gr.Blocks()

with app:
    gr.Markdown("## YourBench Configuration")

    with gr.Tab("Configuration"):
        hf_token = gr.Textbox(label="HF Token")
        hf_org = gr.Textbox(label="HF Organization")
        model_name = gr.Textbox(label="Model Name")
        provider = gr.Dropdown(["openrouter", "openai", "huggingface"], value="huggingface", label="Provider")
        base_url = gr.Textbox(label="Base URL")
        api_key = gr.Textbox(label="API Key")
        max_concurrent_requests = gr.Dropdown([8, 16, 32], value=16, label="Max Concurrent Requests")
        config_output = gr.Code(label="Generated Config", language="yaml")
        preview_button = gr.Button("Generate Config")
        save_button = gr.Button("Save Config")
        preview_button.click(
            generate_config,
            inputs=[hf_token, hf_org, model_name, provider, base_url, api_key, max_concurrent_requests],
            outputs=config_output,
        )
        save_button.click(save_config, inputs=[config_output], outputs=[gr.Textbox(label="Save Status")])

    with gr.Tab("Files"):
        file_input = gr.File(label="Upload text files", file_count="multiple", file_types=[".txt", ".md", ".html"])
        output = gr.Textbox(label="Log")
        file_input.upload(save_files, file_input, output)

    with gr.Tab("Run Generation"):
        log_output = gr.Code(label="Log Output", language=None, lines=20, interactive=False)
        start_button = gr.Button("Start Task")
        start_button.click(manager.start_process)
        # Poll the subprocess output every 100 ms and stream it into the log view.
        timer = gr.Timer(0.1, active=True)
        timer.tick(manager.read_and_get_output, outputs=log_output)
        kill_button = gr.Button("Kill Task")
        kill_button.click(manager.kill_process)

app.launch()