"""Example Python client for vllm.entrypoints.api_server""" import argparse import json from typing import Iterable, List import requests def clear_line(n: int = 1) -> None: LINE_UP = '\033[1A' LINE_CLEAR = '\x1b[2K' for _ in range(n): print(LINE_UP, end=LINE_CLEAR, flush=True) def post_http_request(prompt: str, api_url: str, n: int = 1, stream: bool = False) -> requests.Response: headers = {"User-Agent": "Test Client"} pload = { "prompt": prompt, "n": n, "use_beam_search": True, "temperature": 0.0, "max_tokens": 16, "stream": stream, } response = requests.post(api_url, headers=headers, json=pload, stream=True) return response def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): if chunk: data = json.loads(chunk.decode("utf-8")) output = data["text"] yield output def get_response(response: requests.Response) -> List[str]: data = json.loads(response.content) output = data["text"] return output if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--n", type=int, default=4) parser.add_argument("--prompt", type=str, default="San Francisco is a") parser.add_argument("--stream", action="store_true") args = parser.parse_args() prompt = args.prompt api_url = f"http://{args.host}:{args.port}/generate" n = args.n stream = args.stream print(f"Prompt: {prompt!r}\n", flush=True) response = post_http_request(prompt, api_url, n, stream) if stream: num_printed_lines = 0 for h in get_streaming_response(response): clear_line(num_printed_lines) num_printed_lines = 0 for i, line in enumerate(h): num_printed_lines += 1 print(f"Beam candidate {i}: {line!r}", flush=True) else: output = get_response(response) for i, line in enumerate(output): print(f"Beam candidate {i}: {line!r}", flush=True)