from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Define a Language Model class
class LLM:
    def __init__(self, model_name):
        # Determine the device to use (GPU if available, otherwise CPU)
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        
        # Load the pre-trained language model with specific settings.
        # load_in_8bit requires the bitsandbytes package and a CUDA device;
        # torch_dtype applies to the modules that remain unquantized.
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,  # Half precision for non-quantized weights
            load_in_8bit=True,          # 8-bit quantization to reduce memory use
            device_map='auto'           # Automatically place layers on available devices
        )  # Note: an 8-bit quantized model should not be re-cast to another dtype
        
        # Initialize the tokenizer for the same model
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Mistral has no padding token by default; reuse the EOS token and pad
        # on the left so generation continues directly from the end of the prompt
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

    def generate_response(self, messages, max_tokens=100, do_sample=True):
        # Tokenize the input messages and move them to the selected device (GPU or CPU)
        input_ids = self.tokenizer(
            messages,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).input_ids.to(self.device)
        
        with torch.no_grad():
            # Generate a response using the loaded model
            generated_ids = self.model.generate(
                input_ids,
                pad_token_id=self.tokenizer.pad_token_id,
                max_new_tokens=max_tokens,
                do_sample=do_sample,
                temperature=0.3  # Adjust the sampling temperature
            )
            # Decode the generated tokens into a human-readable response
            response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        
        return response

# Main program
if __name__ == '__main__':
    # Specify the model name to use
    model_name = "mistralai/Mistral-7B-Instruct-v0.1"
    
    # Create an instance of the Language Model class with the specified model
    llm = LLM(model_name)
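
    # Example usage with an illustrative prompt: Mistral-Instruct expects the
    # user turn to be wrapped in [INST] ... [/INST] tags
    prompt = "[INST] Explain what a language model is in one sentence. [/INST]"
    response = llm.generate_response(prompt, max_tokens=100)
    print(response)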