File size: 1,644 Bytes
1a81cef
2042c5e
81ee8f8
9313118
1a81cef
81ee8f8
1a81cef
9313118
 
 
 
 
 
 
1a81cef
9313118
 
 
1a81cef
 
 
 
 
9313118
 
 
 
 
 
 
1a81cef
 
12d6b61
 
ce5f258
9313118
 
 
ce5f258
9313118
12d6b61
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-Math-7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer (fp16, auto device placement to fit the 7B weights)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    trust_remote_code=True
).eval()

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Build the chat: system prompt per Qwen-Math usage, then the math word problem
chat = [
    {"role": "system", "content": "Please reason step by step, and put your final answer within \\boxed{}."},
    {"role": "user", "content": "Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?"}
]

# Render the chat template. add_generation_prompt=True is required for inference:
# it appends the assistant-turn header so the model produces an answer instead of
# continuing the user's text.
conversation_str = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# The template already contains all special tokens, so don't add them again.
input_ids = tokenizer.encode(conversation_str, return_tensors="pt", add_special_tokens=False).to(device)

# Build the attention mask. Chat tokenizers often have no pad token; fall back to
# eos_token_id so the comparison never runs against None. For a single unpadded
# sequence this yields an all-ones mask, which is correct.
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
attention_mask = (input_ids != pad_id).long()

# Run inference
try:
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=100,  # adjust as needed
            attention_mask=attention_mask
        )
    # Slice off the prompt so only the newly generated answer is printed.
    generated = outputs[0][input_ids.shape[-1]:]
    print("Antwort generiert:", tokenizer.decode(generated, skip_special_tokens=True))
except Exception as e:
    print(f"Fehler bei der Inferenz: {e}")