---
base_model: unsloth/llama-3-8b-Instruct-bnb-4bit
language:
- en
license: apache-2.0
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
---

# Uploaded model

- **Developed by:** AlberBshara
- **License:** apache-2.0
- **Finetuned from model:** unsloth/llama-3-8b-Instruct-bnb-4bit

# How to Use it:

*Installs Unsloth, Xformers (Flash Attention), and all other required packages:*

```python
%%capture
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
!pip install triton
```

```python
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from unsloth.chat_templates import get_chat_template
from typing import Tuple, Dict, Any
import torch


class LLM:
    def __init__(self, load_in_4bit: bool = True,
                 load_cpu_mem_usage: bool = True,
                 hf_model_path: str = "AlberBshara/scholara_QA"):
        """
        Args:
            load_in_4bit (bool): Use 4-bit quantization. Defaults to True.
            load_cpu_mem_usage (bool): Reduce CPU memory usage. Defaults to True.
            hf_model_path (str): The path of your model on the Hugging Face Hub,
                e.g. "your-user-name/model-name".
        """
        assert torch.cuda.is_available(), "CUDA is not available. An NVIDIA GPU is required."

        # Assumes your Hugging Face access token is exported as the
        # HUGGING_FACE_API_TOKEN environment variable.
        hf_auth_token = os.environ["HUGGING_FACE_API_TOKEN"]

        # Specify the quantization config
        self._bnb_config = BitsAndBytesConfig(load_in_4bit=load_in_4bit)

        # Load the model directly with the quantization config
        self._model = AutoModelForCausalLM.from_pretrained(
            hf_model_path,
            low_cpu_mem_usage=load_cpu_mem_usage,
            quantization_config=self._bnb_config,
            use_auth_token=hf_auth_token,
        )

        # Load the tokenizer
        self._tokenizer = AutoTokenizer.from_pretrained(
            hf_model_path,
            use_auth_token=hf_auth_token,
        )
        self._tokenizer = get_chat_template(
            self._tokenizer,
            chat_template="llama-3",
            mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
        )

        self._hf_model_path = hf_model_path
        self._EOS_TOKEN_ID = self._tokenizer.eos_token_id
        self._prompt = lambda context, question: f"""
        Answer the following question, use the given context.
        Context: [{context}]
        Question: [{question}]
        """

    def invoke(self, context: str, question: str) -> Tuple:
        if not question.strip():
            raise ValueError("question cannot be empty or None")
        if not context.strip():
            raise ValueError("context cannot be empty or None")

        inputs = self._prompt(context, question)
        messages = [{"from": "human", "value": inputs}]
        inputs = self._tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,  # Must add for generation
            return_tensors="pt",
        ).to("cuda")

        # Increase max_new_tokens to allow more detailed responses
        output_ids = self._model.generate(inputs, max_new_tokens=2048, pad_token_id=self._EOS_TOKEN_ID)
        output_ids = output_ids.tolist()[0] if output_ids.size(0) == 1 else output_ids.tolist()
        output_text = self._tokenizer.decode(output_ids, skip_special_tokens=True)

        # Free GPU memory
        del inputs
        torch.cuda.empty_cache()

        return output_text, output_ids, None

    def extract_answer(self, response: str) -> str:
        start_with: str = ".assistant"
        start_index = response.find(start_with)

        # If the marker is found, extract the substring from that point onward
        if start_index != -1:
            # Move start_index to the end of the marker
            start_index += len(start_with)
            return response[start_index:]
        else:
            return response

    def get_metadata(self) -> Dict[str, Any]:
        return {
            "class_name": self.__class__.__name__,
            "init_params": {
                "load_in_4bit": True,
                "load_cpu_mem_usage": True,
                "hf_model_path": "AlberBshara/scholara_QA",
                "hf_auth_token": "--%$%--",
            },
            "methods": ["invoke", "extract_answer"],
        }


test_llm = LLM()
```
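
A minimal usage sketch of the class above, reusing the `test_llm` instance created at the end of the previous cell. It assumes the `HUGGING_FACE_API_TOKEN` environment variable holds a valid Hugging Face access token and that a CUDA GPU is available; the context and question strings are purely illustrative placeholders.

```python
# Illustrative context/question; replace them with your own retrieval output.
context = "Python was created by Guido van Rossum and first released in 1991."
question = "Who created Python, and when was it first released?"

# invoke() returns (decoded_text, token_ids, None)
output_text, output_ids, _ = test_llm.invoke(context, question)

# Strip everything up to the assistant marker so only the model's answer remains.
answer = test_llm.extract_answer(output_text)
print(answer)

# Inspect how the wrapper was configured.
print(test_llm.get_metadata())
```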