# GPU-oriented dependencies (note the -gpu packages and the CUDA 12.1 wheel below).
faiss-gpu>=1.7.2
# onnxruntime-gpu is listed for the `unstructured` package; it is pinned to an exact version.
onnxruntime-gpu==1.15.0
auto-gptq>=0.7.1
# optimum is currently disabled (kept here commented out):
#optimum>=1.17.1
# The autoawq package is for CUDA 12.1; for any other CUDA version, build it from source:
# https://github.com/casper-hansen/AutoAWQ?tab=readme-ov-file#build-from-source
autoawq
autoawq-kernels
# exllama is installed from a prebuilt wheel. Per the wheel filename, it targets
# CUDA 12.1 (cu121), CPython 3.10 (cp310) and Linux x86_64 only.
exllama @ https://github.com/jllllll/exllama/releases/download/0.0.18/exllama-0.0.18+cu121-cp310-cp310-linux_x86_64.whl
# flash-attn is currently disabled; see https://github.com/Dao-AILab/flash-attention/issues/453
# flash-attn==2.4.2