michaelfeil
/

ct2fast-e5-large-v2

@@ -2608,17 +2608,37 @@ Speedup inference while reducing memory by 2x-4x using int8 inference in C++ on
 quantized version of [intfloat/e5-large-v2](https://huggingface.co/intfloat/e5-large-v2)
 ```bash
-pip install hf-hub-ctranslate2>=3.0.0 ctranslate2>=3.16.0
 ```
 ```python
 # from transformers import AutoTokenizer
 model_name = "michaelfeil/ct2fast-e5-large-v2"
 from hf_hub_ctranslate2 import CT2SentenceTransformer
 model = CT2SentenceTransformer(
-    model_name, compute_type="int8_float16", device="cuda",
-    repo_contains_ct2=True
 )
 embeddings = model.encode(
     ["I like soccer", "I like tennis", "The eiffel tower is in Paris"],
@@ -2632,7 +2652,7 @@ scores = (embeddings @ embeddings.T) * 100
 ```
 Checkpoint compatible with [ctranslate2>=3.16.0](https://github.com/OpenNMT/CTranslate2)
-and [hf-hub-ctranslate2>=3.0.0](https://github.com/michaelfeil/hf-hub-ctranslate2)
 - `compute_type=int8_float16` for `device="cuda"`
 - `compute_type=int8`  for `device="cpu"`

 quantized version of [intfloat/e5-large-v2](https://huggingface.co/intfloat/e5-large-v2)
 ```bash
+pip install "hf-hub-ctranslate2>=2.11.0" "ctranslate2>=3.16.0"
 ```
 ```python
 # from transformers import AutoTokenizer
 model_name = "michaelfeil/ct2fast-e5-large-v2"
+model_name_orig = "intfloat/e5-large-v2"
+from hf_hub_ctranslate2 import EncoderCT2fromHfHub
+model = EncoderCT2fromHfHub(
+        # load in int8 on CUDA
+        model_name_or_path=model_name,
+        device="cuda",
+        compute_type="int8_float16",
+)
+outputs = model.generate(
+    text=["I like soccer", "I like tennis", "The eiffel tower is in Paris"],
+    max_length=64,
+)
+# perform downstream tasks on outputs
+outputs["pooler_output"]
+outputs["last_hidden_state"]
+outputs["attention_mask"]
+# Alternatively, use the SentenceTransformer mix-in
+# for end-to-end sentence-embedding generation
+# (loads the original model rather than pulling from this repo).
 from hf_hub_ctranslate2 import CT2SentenceTransformer
 model = CT2SentenceTransformer(
+    model_name_orig, compute_type="int8_float16", device="cuda",
 )
 embeddings = model.encode(
     ["I like soccer", "I like tennis", "The eiffel tower is in Paris"],
 ```
 Checkpoint compatible with [ctranslate2>=3.16.0](https://github.com/OpenNMT/CTranslate2)
+and [hf-hub-ctranslate2>=2.11.0](https://github.com/michaelfeil/hf-hub-ctranslate2)
 - `compute_type=int8_float16` for `device="cuda"`
 - `compute_type=int8`  for `device="cpu"`