Spaces:
Running
Running
struct llama_cparams { | |
uint32_t n_ctx; // context size used during inference | |
uint32_t n_batch; | |
uint32_t n_ubatch; | |
uint32_t n_seq_max; | |
int n_threads; // number of threads to use for generation | |
int n_threads_batch; // number of threads to use for batch processing | |
float rope_freq_base; | |
float rope_freq_scale; | |
uint32_t n_ctx_orig_yarn; | |
// These hyperparameters are not exposed in GGUF, because all | |
// existing YaRN models use the same values for them. | |
float yarn_ext_factor; | |
float yarn_attn_factor; | |
float yarn_beta_fast; | |
float yarn_beta_slow; | |
float defrag_thold; | |
bool embeddings; | |
bool causal_attn; | |
bool offload_kqv; | |
bool flash_attn; | |
bool no_perf; | |
enum llama_pooling_type pooling_type; | |
ggml_backend_sched_eval_callback cb_eval; | |
void * cb_eval_user_data; | |
}; | |