"""Pydantic models for a GGUF parser's JSON report: file metadata, model
architecture, tokenizer details, and a resource-usage estimate."""

from typing import Optional

from pydantic import BaseModel, Field


class GgufParser(BaseModel):
    metadata: "Metadata"
    architecture: "Architecture"
    tokenizer: "Tokenizer"
    estimate: "Estimate"


class Metadata(BaseModel):
    type_: str = Field(alias="type")
    architecture: str
    quantization_version: int = Field(alias="quantizationVersion")
    alignment: int
    name: str
    file_type: int = Field(alias="fileType")
    little_endian: bool = Field(alias="littleEndian")
    file_size: int = Field(alias="fileSize")
    size: int
    parameters: int
    bits_per_weight: float = Field(alias="bitsPerWeight")


class Architecture(BaseModel):
    type_: str = Field(alias="type")
    architecture: str
    maximum_context_length: int = Field(alias="maximumContextLength")
    embedding_length: int = Field(alias="embeddingLength")
    block_count: int = Field(alias="blockCount")
    feed_forward_length: int = Field(alias="feedForwardLength")
    attention_head_count: int = Field(alias="attentionHeadCount")
    attention_head_count_kv: int = Field(alias="attentionHeadCountKV")
    attention_layer_norm_rms_epsilon: float = Field(
        alias="attentionLayerNormRMSEpsilon"
    )
    attention_key_length: int = Field(alias="attentionKeyLength")
    attention_value_length: int = Field(alias="attentionValueLength")
    attention_causal: bool = Field(alias="attentionCausal")
    rope_dimension_count: int = Field(alias="ropeDimensionCount")
    rope_frequency_base: int = Field(alias="ropeFrequencyBase")
    vocabulary_length: int = Field(alias="vocabularyLength")
    embedding_gqa: int = Field(alias="embeddingGQA")
    embedding_key_gqa: int = Field(alias="embeddingKeyGQA")
    embedding_value_gqa: int = Field(alias="embeddingValueGQA")


class Tokenizer(BaseModel):
    model: str
    tokens_length: int = Field(alias="tokensLength")
    merges_length: int = Field(alias="mergesLength")
    added_token_length: int = Field(alias="addedTokenLength")
    bos_token_id: int = Field(alias="bosTokenID")
    eos_token_id: int = Field(alias="eosTokenID")
    eot_token_id: int = Field(alias="eotTokenID")
    eom_token_id: int = Field(alias="eomTokenID")
    unknown_token_id: int = Field(alias="unknownTokenID")
    separator_token_id: int = Field(alias="separatorTokenID")
    padding_token_id: int = Field(alias="paddingTokenID")
    tokens_size: int = Field(alias="tokensSize")
    merges_size: int = Field(alias="mergesSize")


class Ram(BaseModel):
    # Memory estimate for a single device; `uma` and `nonuma` appear to be
    # byte counts for unified-memory and discrete-memory systems respectively.
    handle_layers: int = Field(alias="handleLayers")
    handle_last_layer: int = Field(alias="handleLastLayer")
    handle_output_layer: bool = Field(alias="handleOutputLayer")
    remote: bool
    position: int
    uma: int
    nonuma: int


class Item(BaseModel):
    offload_layers: int = Field(alias="offloadLayers")
    full_offloaded: bool = Field(alias="fullOffloaded")
    maximum_tokens_per_second: Optional[float] = Field(
        None, alias="maximumTokensPerSecond"
    )
    ram: "Ram"
    vrams: list["Ram"]


class Estimate(BaseModel):
    items: list["Item"]
    type_: str = Field(alias="type")
    architecture: str
    context_size: int = Field(alias="contextSize")
    flash_attention: bool = Field(alias="flashAttention")
    no_mmap: bool = Field(alias="noMMap")
    embedding_only: bool = Field(alias="embeddingOnly")
    reranking: bool
    distributable: bool
    logical_batch_size: int = Field(alias="logicalBatchSize")
    physical_batch_size: int = Field(alias="physicalBatchSize")
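

# Resolve GgufParser's forward references ("Metadata", "Architecture", ...)
# now that all referenced models are defined; Pydantic v2 would otherwise
# attempt this lazily on first validation.
GgufParser.model_rebuild()


# Minimal usage sketch. Assumption: `gguf-output.json` is a hypothetical
# path to a JSON report whose keys match the camelCase aliases above.
if __name__ == "__main__":
    from pathlib import Path

    raw = Path("gguf-output.json").read_text()
    parsed = GgufParser.model_validate_json(raw)
    print(parsed.metadata.name, parsed.metadata.parameters)
    for item in parsed.estimate.items:
        print(item.offload_layers, item.ram.nonuma, [v.nonuma for v in item.vrams])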