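"""Flask API that serves a Llama-2 style chat model for penetration-testing analysis.

Supports GPTQ, GGML (llama.cpp), and full-precision Hugging Face checkpoints; the
backend is chosen from the model name by get_model_type().
"""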
import os
import fire
from enum import Enum
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
from transformers import TextIteratorStreamer
from flask import Flask, request, jsonify
BOS, EOS = "<s>", "</s>"
E_INST = "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful, and honest cybersecurity analyst. As a security analyst, you must scrutinize the details provided to ensure they are usable for penetration testing. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something that is not correct. If you don't know the answer to a question, please don't share false information."""

def format_to_llama_chat_style(user_instructions, history) -> str:
    """Build a Llama-2 chat-style prompt from the conversation history."""
    # The per-request user instruction is appended to every [INST] opener.
    B_INST = f"[INST]{user_instructions}"
    prompt = ""
    for i, dialog in enumerate(history[:-1]):
        instruction, response = dialog[0], dialog[1]
        if i == 0:
            # The system prompt is injected into the first turn only.
            instruction = f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}" + instruction
        else:
            prompt += BOS
        prompt += f"{B_INST} {instruction.strip()} {E_INST} {response.strip()} " + EOS
    new_instruction = history[-1][0].strip()
    if len(history) > 1:
        prompt += BOS
    else:
        new_instruction = f"{B_SYS}{DEFAULT_SYSTEM_PROMPT}{E_SYS}" + new_instruction
    prompt += f"{B_INST} {new_instruction} {E_INST}"
    return prompt
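# Sketch of the prompt produced for a single-turn history [[user_message, None]]
# (traced from the function above; placeholders in braces, newlines shown literally):
#
#   [INST]{user_instructions} <<SYS>>
#   {DEFAULT_SYSTEM_PROMPT}
#   <</SYS>>
#
#   {user_message} [/INST]
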
class Model_Type(Enum):
    gptq = 1
    ggml = 2
    full_precision = 3

def get_model_type(model_name):
    if "gptq" in model_name.lower():
        return Model_Type.gptq
    elif "ggml" in model_name.lower():
        return Model_Type.ggml
    else:
        return Model_Type.full_precision

def create_folder_if_not_exists(folder_path):
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

def initialize_gpu_model_and_tokenizer(model_name, model_type):
    if model_type == Model_Type.gptq:
        model = AutoGPTQForCausalLM.from_quantized(
            model_name, device_map="auto", use_safetensors=True,
            use_triton=False)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
    else:
        model = AutoModelForCausalLM.from_pretrained(
            model_name, device_map="auto", token=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)
    return model, tokenizer

def init_auto_model_and_tokenizer(model_name, model_type, file_name=None):
    model_type = get_model_type(model_name)
    if Model_Type.ggml == model_type:
        # Download the GGML weights file and load it with llama.cpp.
        models_folder = "./models"
        create_folder_if_not_exists(models_folder)
        file_path = hf_hub_download(
            repo_id=model_name, filename=file_name, local_dir=models_folder)
        model = Llama(file_path, n_ctx=4096)
        tokenizer = None
    else:
        model, tokenizer = initialize_gpu_model_and_tokenizer(
            model_name, model_type=model_type)
    return model, tokenizer

app = Flask(__name__)
@app.route('/api/chatbot', methods=['POST'])
def chatbot_api():
    data = request.json
    user_instruction = data['user_instruction']
    user_message = data['user_message']
    model_name = data['model_name']
    file_name = data.get('file_name')
    is_chat_model = 'chat' in model_name.lower()
    model_type = get_model_type(model_name)
    if model_type == Model_Type.ggml:
        assert file_name is not None, """
        When model_name is provided for a GGML quantized model, the file_name argument must also be provided."""
    # Note: the model and tokenizer are (re)initialised on every request.
    model, tokenizer = init_auto_model_and_tokenizer(
        model_name, model_type, file_name)
    if is_chat_model:
        instruction = format_to_llama_chat_style(user_instruction, [[user_message, None]])
    else:
        instruction = user_message
    history = [[user_message, None]]
    response = generate_response(
        model, tokenizer, instruction, history, model_type)
    return jsonify({'bot_response': response})
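# Example request against the endpoint above. The model and file names are
# illustrative placeholders, not values fixed by this script:
#
#   curl -X POST http://localhost:5000/api/chatbot \
#     -H "Content-Type: application/json" \
#     -d '{"user_instruction": "Summarize the open ports and likely services.",
#          "user_message": "<nmap scan output>",
#          "model_name": "TheBloke/Llama-2-7B-Chat-GGML",
#          "file_name": "llama-2-7b-chat.ggmlv3.q4_0.bin"}'
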
def generate_response(model, tokenizer, instruction, history, model_type):
    response = ""
    kwargs = dict(temperature=0.6, top_p=0.9)
    if model_type == Model_Type.ggml:
        # llama.cpp path: stream completion chunks directly from the GGML model.
        kwargs["max_tokens"] = 512
        for chunk in model(prompt=instruction, stream=True, **kwargs):
            token = chunk["choices"][0]["text"]
            response += token
    else:
        # Transformers path: run generate() in a background thread and
        # collect the streamed tokens.
        streamer = TextIteratorStreamer(
            tokenizer, skip_prompt=True, timeout=5)
        inputs = tokenizer(instruction, return_tensors="pt").to(model.device)
        kwargs["max_new_tokens"] = 512
        kwargs["input_ids"] = inputs["input_ids"]
        kwargs["streamer"] = streamer
        thread = Thread(target=model.generate, kwargs=kwargs)
        thread.start()
        for token in streamer:
            response += token
    return response

def run_app(port=5000):
    app.run(port=port)


if __name__ == '__main__':
    # Pass the function itself to fire so the port can be set from the CLI.
    fire.Fire(run_app)
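# Usage: the port defaults to 5000 and can be overridden via python-fire, e.g.
#   python llama_api.py
#   python llama_api.py --port=8080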