Update README.md
README.md CHANGED
@@ -111,10 +111,13 @@ def mean_pooling(model_output, attention_mask):
     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
 
-sentences = [
+sentences = [
+    'Save model to a pickle located at `path`',
+    'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
+]
 
-tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-
+tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
 
 encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
 
@@ -135,16 +138,20 @@ from transformers import AutoModel
 from numpy.linalg import norm
 
 cos_sim = lambda a,b: (a @ b.T) / (norm(a)*norm(b))
-model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
-embeddings = model.encode(
-
+model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)
+embeddings = model.encode(
+    [
+        'Save model to a pickle located at `path`',
+        'def save_act(self, path=None): if path is None: path = os.path.join(logger.get_dir(), "model.pkl") with tempfile.TemporaryDirectory() as td: save_variables(os.path.join(td, "model")) arc_name = os.path.join(td, "packed.zip") with zipfile.ZipFile(arc_name, "w") as zipf: for root, dirs, files in os.walk(td): for fname in files: file_path = os.path.join(root, fname) if file_path != arc_name: zipf.write(file_path, os.path.relpath(file_path, td)) with open(arc_name, "rb") as f: model_data = f.read() with open(path, "wb") as f: cloudpickle.dump((model_data, self._act_params), f)',
+    ]
+)
 ```
 
 If you only want to handle shorter sequences, such as 2k, pass the `max_length` parameter to the `encode` function:
 
 ```python
 embeddings = model.encode(
-    ['Very long ...
+    ['Very long ... code'],
     max_length=2048
 )
 ```
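For readers skimming the commit, the snippet below assembles the first hunk's pieces into one runnable example. It is a minimal sketch, not the verbatim README: the `token_embeddings = model_output[0]` line, the `torch.no_grad()` wrapper, and the final L2 normalization sit outside the visible hunk and are assumed from the usual mean-pooling recipe, and the long `save_act` query/code pair is abbreviated.

```python
# Sketch of the transformers workflow edited in the first hunk.
# Assumes the surrounding README's mean_pooling recipe; the forward pass and
# normalization below are not visible in the diff and are standard practice here.
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]  # assumed: first element holds the token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    # Average token embeddings, ignoring padding positions.
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

sentences = [
    'Save model to a pickle located at `path`',
    'def save_act(self, path=None): ...',  # abbreviated; full string in the diff
]

tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-code')
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)

encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = model(**encoded_input)

embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
embeddings = F.normalize(embeddings, p=2, dim=1)  # unit length, so dot product = cosine similarity
```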
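The second hunk takes the shorter route through the `encode` helper that the model's remote code attaches (hence `trust_remote_code=True` in the diff). Below is a compact sketch of that path, reusing the `cos_sim` lambda from the context lines; `max_length=2048` is the cap the README documents for shorter sequences, and the `save_act` string is again abbreviated.

```python
# Sketch of the encode() path from the second hunk. encode() comes from the
# model's remote code and returns numpy arrays here, matching the numpy cos_sim.
from numpy.linalg import norm
from transformers import AutoModel

cos_sim = lambda a, b: (a @ b.T) / (norm(a) * norm(b))

model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-code', trust_remote_code=True)

embeddings = model.encode(
    [
        'Save model to a pickle located at `path`',
        'def save_act(self, path=None): ...',  # abbreviated; full string in the diff
    ],
    max_length=2048,  # optional: truncate inputs to 2k tokens, per the README note
)

# Similarity between the natural-language query and the code snippet.
print(cos_sim(embeddings[0], embeddings[1]))
```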