File size: 2,950 Bytes
efe0924
8910711
efe0924
8910711
efe0924
6a0a9f7
efe0924
 
 
 
 
 
 
 
 
80d4e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efe0924
 
 
 
 
 
80d4e55
 
 
 
 
 
 
 
 
efe0924
 
 
80d4e55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efe0924
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8910711
 
 
 
 
 
80d4e55
8910711
 
 
 
 
 
80d4e55
efe0924
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
"""
Client test.

Run server:

python generate.py  --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b

NOTE: For private models, add --use-auth_token=True

NOTE: --infer_devices=True (default) must be used for multi-GPU in case you see failures with cuda:x cuda:y mismatches.
Currently, this will force the model to be on a single GPU.

Then run this client as:

python client_test.py



For HF spaces:

HOST="https://h2oai-h2ogpt-chatbot.hf.space" python client_test.py

Result:

Loaded as API: https://h2oai-h2ogpt-chatbot.hf.space ✔
{'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a large language model developed by LAION.'}


For demo:

HOST="https://gpt.h2o.ai" python client_test.py

Result:

Loaded as API: https://gpt.h2o.ai ✔
{'instruction_nochat': 'Who are you?', 'iinput_nochat': '', 'response': 'I am h2oGPT, a chatbot created by LAION.'}

"""

# When True, get_client() prints the result of client.view_api(all_endpoints=True)
# right after the client is created.
debug = False

import os
# Set the Hugging Face Hub telemetry opt-out variable for this process at import time
# (presumably so it is in place before gradio_client is imported -- TODO confirm).
os.environ['HF_HUB_DISABLE_TELEMETRY'] = '1'


def get_client():
    """
    Build a gradio client for the server named by the HOST environment variable.

    When HOST is not set, "http://localhost:7860" is used.  If the module-level
    ``debug`` flag is truthy, the value returned by ``view_api`` for all
    endpoints is printed before the client is handed back.

    :return: the ``gradio_client.Client`` instance that was created
    """
    # Function-scope import: gradio_client is only needed once a client is built.
    from gradio_client import Client

    host = os.environ.get('HOST', "http://localhost:7860")
    api_client = Client(host)

    if not debug:
        return api_client

    print(api_client.view_api(all_endpoints=True))
    return api_client


def test_client_basic():
    """
    Send one non-chat request to the '/submit_nochat' endpoint and report the answer.

    The request asks "Who are you?" with fixed generation settings.  The raw
    response is passed through ``md_to_text`` and the resulting record is
    printed and returned.

    :return: dict with keys 'instruction_nochat', 'iinput_nochat' and 'response'
    """
    question = "Who are you?"
    extra_input = ''

    # Insertion order defines the positional order passed to predict();
    # presumably it must match the server endpoint's input order -- verify against generate.py.
    settings = dict(
        instruction='',  # only for chat=True
        iinput='',  # only for chat=True
        context='',
        # streaming output is supported (each generation is output as it arrives),
        # but stream_output=False keeps this a simple input/output call
        stream_output=False,
        prompt_type='human_bot',
        temperature=0.1,
        top_p=0.75,
        top_k=40,
        num_beams=1,
        max_new_tokens=50,
        min_new_tokens=0,
        early_stopping=False,
        max_time=20,
        repetition_penalty=1.0,
        num_return_sequences=1,
        do_sample=True,
        # only the 2 entries after `chat` are used when chat=False
        chat=False,
        instruction_nochat=question,
        iinput_nochat=extra_input,
    )

    raw_response = get_client().predict(*settings.values(), api_name='/submit_nochat')

    outcome = {
        'instruction_nochat': question,
        'iinput_nochat': extra_input,
        'response': md_to_text(raw_response),
    }
    print(outcome)
    return outcome


import markdown  # pip install markdown
from bs4 import BeautifulSoup  # pip install beautifulsoup4


def md_to_text(md):
    """
    Convert a markdown string to plain text.

    The markdown is first rendered to HTML with ``markdown.markdown``; the HTML
    is then parsed with BeautifulSoup's 'html.parser' and only its text content
    is returned.

    :param md: markdown source text
    :return: text content of the rendered HTML
    """
    rendered = markdown.markdown(md)
    return BeautifulSoup(rendered, features='html.parser').get_text()


# Script entry point: when run directly (not imported), perform the single
# basic client request and print its result.
if __name__ == '__main__':
    test_client_basic()