Spaces:
Running
on
Zero
Running
on
Zero
Changed to nomic
Browse files
app.py
CHANGED
@@ -6,15 +6,16 @@ import spaces
|
|
6 |
import torch
|
7 |
|
8 |
# neuralmind/bert-base-portuguese-cased
|
9 |
-
ModelName = "neuralmind/bert-base-portuguese-cased"
|
10 |
-
model = AutoModel.from_pretrained(ModelName)
|
11 |
-
tokenizer = AutoTokenizer.from_pretrained(ModelName, do_lower_case=False)
|
12 |
-
processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
|
13 |
-
vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
|
14 |
|
15 |
-
|
16 |
-
#
|
17 |
-
#
|
|
|
|
|
|
|
18 |
|
19 |
def mean_pooling(model_output, attention_mask):
|
20 |
token_embeddings = model_output[0]
|
@@ -26,25 +27,25 @@ def TxtEmbed(text):
|
|
26 |
|
27 |
|
28 |
|
29 |
-
input_ids = tokenizer.encode(text, return_tensors='pt')
|
30 |
|
31 |
-
with torch.no_grad():
|
32 |
-
|
33 |
-
|
34 |
-
return (encoded.tolist())[0];
|
35 |
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
|
49 |
|
50 |
|
|
|
6 |
import torch
|
7 |
|
8 |
# neuralmind/bert-base-portuguese-cased
|
9 |
+
#ModelName = "neuralmind/bert-base-portuguese-cased"
|
10 |
+
#model = AutoModel.from_pretrained(ModelName)
|
11 |
+
#tokenizer = AutoTokenizer.from_pretrained(ModelName, do_lower_case=False)
|
|
|
|
|
12 |
|
13 |
+
|
14 |
+
#processor = AutoImageProcessor.from_pretrained("nomic-ai/nomic-embed-vision-v1.5")
|
15 |
+
#vision_model = AutoModel.from_pretrained("nomic-ai/nomic-embed-vision-v1.5", trust_remote_code=True)
|
16 |
+
tokenizer = AutoTokenizer.from_pretrained('nomic-ai/nomic-embed-text-v1.5')
|
17 |
+
text_model = AutoModel.from_pretrained('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
|
18 |
+
text_model.eval()
|
19 |
|
20 |
def mean_pooling(model_output, attention_mask):
|
21 |
token_embeddings = model_output[0]
|
|
|
27 |
|
28 |
|
29 |
|
30 |
+
#input_ids = tokenizer.encode(text, return_tensors='pt')
|
31 |
|
32 |
+
#with torch.no_grad():
|
33 |
+
# outs = model(input_ids)
|
34 |
+
# encoded = outs[0][0, 1:-1] # Ignore [CLS] and [SEP] special tokens
|
35 |
+
#return (encoded.tolist())[0];
|
36 |
|
37 |
|
38 |
+
sentences = [text]
|
39 |
+
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
|
40 |
+
|
41 |
+
with torch.no_grad():
|
42 |
+
model_output = text_model(**encoded_input)
|
43 |
+
|
44 |
+
text_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
|
45 |
+
text_embeddings = F.layer_norm(text_embeddings, normalized_shape=(text_embeddings.shape[1],))
|
46 |
+
text_embeddings = F.normalize(text_embeddings, p=2, dim=1)
|
47 |
+
|
48 |
+
return (text_embeddings.tolist())[0]
|
49 |
|
50 |
|
51 |
|