"""Evaluate a language model on the logical-reasoning dataset.

All settings come from environment variables (loaded from .env, or from
.env.example as a fallback); predictions and metrics are written to the
configured results path.
"""

import os
import sys

import torch
from dotenv import find_dotenv, load_dotenv

# Locate the env file, falling back to the checked-in example when no .env exists.
found_dotenv = find_dotenv(".env")

if not found_dotenv:
    found_dotenv = find_dotenv(".env.example")
print(f"loading env vars from: {found_dotenv}")
load_dotenv(found_dotenv, override=False)

# Make the directory containing the env file importable before pulling in llm_toolkit.
path = os.path.dirname(found_dotenv)
print(f"Adding {path} to sys.path")
sys.path.append(path)

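# The wildcard imports supply load_model, load_logical_reasoning_dataset,
# eval_model, print_row_details, save_results and calc_metrics used below.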
from llm_toolkit.llm_utils import *
from llm_toolkit.logical_reasoning_utils import *

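# All runtime configuration comes from environment variables; boolean flags
# must be the literal string "true".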
model_name = os.getenv("MODEL_NAME")
adapter_name_or_path = os.getenv("ADAPTER_NAME_OR_PATH")
load_in_4bit = os.getenv("LOAD_IN_4BIT") == "true"
data_path = os.getenv("LOGICAL_REASONING_DATA_PATH")
results_path = os.getenv("LOGICAL_REASONING_RESULTS_PATH")
use_english_datasets = os.getenv("USE_ENGLISH_DATASETS") == "true"
using_p1 = os.getenv("USING_P1_PROMPT_TEMPLATE") == "true"
using_llama_factory = os.getenv("USING_LLAMA_FACTORY") == "true"
max_new_tokens = int(os.getenv("MAX_NEW_TOKENS", 16))
repetition_penalty = float(os.getenv("REPETITION_PENALTY", 1.0))

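# Inference dtype: float32 wins if requested, then bfloat16, with float16 as the default.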
dtype = (
    torch.float32
    if os.getenv("USE_FLOAT32_FOR_INFERENCE") == "true"
    else (
        torch.bfloat16
        if os.getenv("USE_BF16_FOR_INFERENCE") == "true"
        else torch.float16
    )
)

print(model_name, adapter_name_or_path, load_in_4bit, data_path, results_path)


def print_gpu_memory(stage: int) -> None:
    """Print the GPU name, its total memory and the peak reserved memory in GB."""
    gpu_stats = torch.cuda.get_device_properties(0)
    reserved = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    total = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
    print(f"({stage}) GPU = {gpu_stats.name}. Max memory = {total} GB.")
    print(f"{reserved} GB of memory reserved.")


print_gpu_memory(1)

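# Load the tokenizer and model, optionally 4-bit quantized, and apply the
# adapter when ADAPTER_NAME_OR_PATH is set.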
model, tokenizer = load_model(
    model_name,
    load_in_4bit=load_in_4bit,
    adapter_name_or_path=adapter_name_or_path,
    using_llama_factory=using_llama_factory,
    dtype=dtype,
)

print_gpu_memory(2)

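# Build the prompt-formatted dataset splits; prompts are Chinese unless
# USE_ENGLISH_DATASETS is true.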
datasets = load_logical_reasoning_dataset(
    data_path,
    tokenizer=tokenizer,
    chinese_prompt=not use_english_datasets,
    using_p1=using_p1,
)

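# An optional positional argument limits evaluation to the first N test
# entries (useful for quick smoke tests).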
if len(sys.argv) > 1:
    num = int(sys.argv[1])
    if num > 0:
        print(f"--- evaluating {num} entries")
        datasets["test"] = datasets["test"].select(range(num))

print_row_details(datasets["test"].to_pandas(), indices=[0, -1])

print(f"Evaluating model: {model_name}")
predictions = eval_model(
    model,
    tokenizer,
    datasets["test"],
    max_new_tokens=max_new_tokens,
    repetition_penalty=repetition_penalty,
)

print_gpu_memory(3)

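# Append the adapter's last path component to the model name so results from
# different adapters get distinct tags.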
if adapter_name_or_path is not None:
    model_name += "/" + adapter_name_or_path.split("/")[-1]

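# The results tag also encodes dtype, 4-bit quantization and llama-factory
# loading, so runs with different settings stay distinguishable.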
save_results(
    f"{model_name}_{dtype}{'_4bit' if load_in_4bit else ''}{'_lf' if using_llama_factory else ''}",
    results_path,
    datasets["test"],
    predictions,
    debug=True,
)

metrics = calc_metrics(datasets["test"]["label"], predictions, debug=True)
print(metrics)
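
# Example invocation (the script filename here is illustrative):
#   python eval_logical_reasoning.py        # evaluate the full test split
#   python eval_logical_reasoning.py 10     # evaluate only the first 10 entries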