import os
import chainlit as cl # importing chainlit for our app
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
)
from peft import PeftModel, PeftConfig
import bitsandbytes as bnb
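
# Restrict the process to the first visible GPU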
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Prompt Templates
INSTRUCTION_PROMPT_TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Please convert the following legal content into a human-readable summary<|eot_id|><|start_header_id|>user<|end_header_id|>
[LEGAL_DOC]
{input}
[END_LEGAL_DOC]<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
RESPONSE_TEMPLATE = """
{summary}<|eot_id|>
"""
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
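
# Load the LoRA adapter config and the quantized base model, then attach the
# fine-tuned summarization adapter on top of it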
config = PeftConfig.from_pretrained("lakshyaag/llama38binstruct_summarize")
base_model = AutoModelForCausalLM.from_pretrained(
"NousResearch/Meta-Llama-3-8B-Instruct",
quantization_config=bnb_config,
device_map="auto",
)
model = PeftModel.from_pretrained(base_model, "lakshyaag/llama38binstruct_summarize")
model = model.merge_and_unload()  # merge_and_unload() returns the merged model; keep the result
# device_map="auto" already places the quantized model on the available GPU;
# calling .to("cuda") on a 4-bit bitsandbytes model is not supported
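
# Llama-3 ships without a pad token, so reuse the EOS token for padding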
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
def create_prompt(sample, include_response=False):
"""
Parameters:
- sample: dict representing row of dataset
- include_response: bool
Functionality:
This function should build the Python str `full_prompt`.
If `include_response` is true, it should include the summary -
else it should not contain the summary (useful for prompting) and testing
Returns:
- full_prompt: str
"""
full_prompt = INSTRUCTION_PROMPT_TEMPLATE.format(input=sample["original_text"])
if include_response:
full_prompt += RESPONSE_TEMPLATE.format(summary=sample["reference_summary"])
full_prompt += "<|end_of_text|>"
return full_prompt
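
# Example (hypothetical input): create_prompt({"original_text": "Some clause..."})
# returns the system and user turns ending at the assistant header, ready for generation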
@cl.on_message # marks a function that should be run each time the chatbot receives a message from a user
async def main(message: cl.Message):
    # build an inference prompt (no reference summary) from the user's message
    prompt = create_prompt({"original_text": message.content}, include_response=False)

    # convert the str input into tokenized input
    encoded_input = tokenizer(prompt, return_tensors="pt")

    # send the tokenized inputs to the same device as the model
    model_inputs = encoded_input.to("cuda" if torch.cuda.is_available() else "cpu")

    # generate a response with the desired generation parameters
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=256,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )

    # decode the tokenized output back into a str
    decoded_output = tokenizer.batch_decode(generated_ids)

    # keep only the generated response (after the final assistant header) and
    # strip the trailing end-of-turn token
    response = decoded_output[0].split("<|end_header_id|>")[-1].replace("<|eot_id|>", "").strip()

    msg = cl.Message(content=response)
    await msg.send()
# The app is launched from the command line with `chainlit run <path to this file>`