import os
from enum import Enum
from threading import Thread

import fire
from auto_gptq import AutoGPTQForCausalLM
from flask import Flask, request, jsonify
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

BOS, EOS = "<s>", "</s>"
E_INST = "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest cybersecurity analyst. As a security analyst you must scrutinize the details provided to ensure they are usable for penetration testing. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
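
# format_to_llama_chat_style builds a Llama-2 style chat prompt from the history.
# Sketch of the returned string for a single new message (no prior turns):
#   [INST]{user_instructions} <<SYS>>\n{system prompt}\n<</SYS>>\n\n{message} [/INST]
# Completed earlier turns are appended as "... [/INST] {response} </s>" pairs,
# with the "<s>" BOS token inserted between turns.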
def format_to_llama_chat_style(user_instructions, history) -> str:
    B_INST = f"[INST]{user_instructions}"
    prompt = ""
    for i, dialog in enumerate(history[:-1]):
        instruction, response = dialog[0], dialog[1]
        if i == 0:
            instruction = f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}" + instruction
        else:
            prompt += BOS
        prompt += f"{B_INST} {instruction.strip()} {E_INST} {response.strip()} " + EOS
    new_instruction = history[-1][0].strip()
    if len(history) > 1:
        prompt += BOS
    else:
        new_instruction = f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}" + new_instruction
    prompt += f"{B_INST} {new_instruction} {E_INST}"
    return prompt

class Model_Type(Enum):
    gptq = 1
    ggml = 2
    full_precision = 3
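
# The model type is inferred from the repository name: names containing "gptq"
# or "ggml" select the matching quantized loader, anything else is loaded at
# full precision through transformers.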
def get_model_type(model_name):
    if "gptq" in model_name.lower():
        return Model_Type.gptq
    elif "ggml" in model_name.lower():
        return Model_Type.ggml
    else:
        return Model_Type.full_precision

def create_folder_if_not_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def initialize_gpu_model_and_tokenizer(model_name, model_type):
    if model_type == Model_Type.gptq:
        model = AutoGPTQForCausalLM.from_quantized(
            model_name, device_map="auto", use_safetensors=True,
            use_triton=False)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map="auto", token=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
    return model, tokenizer

def init_auto_model_and_tokenizer(model_name, model_type, file_name=None):
    model_type = get_model_type(model_name)
    if Model_Type.ggml == model_type:
        models_folder = "./models"
        create_folder_if_not_exists(models_folder)
        file_path = hf_hub_download(
            repo_id=model_name, filename=file_name, local_dir=models_folder)
        model = Llama(file_path, n_ctx=4096)
        tokenizer = None
    else:
        model, tokenizer = initialize_gpu_model_and_tokenizer(
            model_name, model_type=model_type)
    return model, tokenizer
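
# The Flask endpoint below expects a JSON body with the fields read in
# chatbot_api: "user_instruction", "user_message", "model_name", and an
# optional "file_name" (required when model_name refers to a GGML model,
# since that specific quantized file is downloaded from the repo).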
app = Flask(__name__)

@app.route('/api/chatbot', methods=['POST'])
def chatbot_api():
    data = request.json
    user_instruction = data['user_instruction']
    user_message = data['user_message']
    model_name = data['model_name']
    file_name = data.get('file_name')

    is_chat_model = 'chat' in model_name.lower()
    model_type = get_model_type(model_name)
    if model_type == Model_Type.ggml:
        assert file_name is not None, """
        When model_name is provided for a GGML quantized model, file_name argument must also be provided."""

    model, tokenizer = init_auto_model_and_tokenizer(
        model_name, model_type, file_name)

    if is_chat_model:
        instruction = format_to_llama_chat_style(user_instruction, [[user_message, None]])
    else:
        instruction = user_message

    history = [[user_message, None]]
    response = generate_response(
        model, tokenizer, instruction, history, model_type)
    return jsonify({'bot_response': response})
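
# generate_response covers two back-ends: GGML models stream tokens directly
# from llama.cpp, while transformers/GPTQ models run model.generate in a
# background thread and stream tokens back through TextIteratorStreamer.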
def generate_response(model, tokenizer, instruction, history, model_type):
    response = ""
    kwargs = dict(temperature=0.6, top_p=0.9)
    if model_type == Model_Type.ggml:
        kwargs["max_tokens"] = 512
        for chunk in model(prompt=instruction, stream=True, **kwargs):
            token = chunk["choices"][0]["text"]
            response += token
    else:
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, timeout=5)
        inputs = tokenizer(instruction, return_tensors="pt").to(model.device)
        kwargs["max_new_tokens"] = 512
        kwargs["input_ids"] = inputs["input_ids"]
        kwargs["streamer"] = streamer
        thread = Thread(target=model.generate, kwargs=kwargs)
        thread.start()
        for token in streamer:
            response += token
    return response

def run_app(port=5000):
    app.run(port=port)

if __name__ == '__main__':
    fire.Fire(run_app)
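
# Example request (illustrative only; the model name and file are placeholders,
# substitute a repo and quantized file you actually have access to):
#
#   curl -X POST http://localhost:5000/api/chatbot \
#     -H "Content-Type: application/json" \
#     -d '{"user_instruction": "Summarize the scan results.",
#          "user_message": "Ports 22 and 443 are open on the target host.",
#          "model_name": "TheBloke/Llama-2-7B-Chat-GGML",
#          "file_name": "llama-2-7b-chat.ggmlv3.q4_0.bin"}'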