File size: 1,840 Bytes
1dfccc3 d812385 1dfccc3 628fe8f d812385 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 |
import torch
import numpy as np
# Length threshold, in characters, that is_user_query_valid compares
# len(user_query) against.
MAX_USER_QUERY_LEN = 35
# Example queries for easy access, keyed by a label. is_user_query_valid
# tests membership against the values of this dict, not the keys.
# Note that some of these example texts are longer than MAX_USER_QUERY_LEN.
DEFAULT_QUERIES = {
"Example Query 1": "Who visited microsoft.com on September 18?",
"Example Query 2": "Does Kate has drive ?",
"Example Query 3": "What phone number can be used to contact David Johnson?",
}
def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
    """
    Get mean-pooled representations of given texts in batches.

    Each batch is tokenized with padding and truncation, passed through
    `model` with gradient tracking disabled, and the resulting
    `last_hidden_state` is averaged over dimension 1 using the attention
    mask as weights, so masked-out positions do not contribute.

    Args:
        texts: Sliceable sequence of texts to embed.
        model: Callable that accepts the tokenizer output as keyword
            arguments (plus `output_hidden_states`) and returns an object
            exposing a `last_hidden_state` tensor.
        tokenizer: Callable that returns a mapping of PyTorch tensors
            containing an 'attention_mask' entry.
        batch_size (int): Number of texts processed per forward pass.

    Returns:
        numpy.ndarray: One pooled vector per input text, in input order
        (an empty array when `texts` is empty).
    """
    pooled_rows = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=False)
        hidden = outputs.last_hidden_state
        # Add a trailing axis so the mask broadcasts across the hidden axis.
        mask = inputs['attention_mask'].unsqueeze(-1).float()
        summed = torch.sum(hidden * mask, 1)
        # Clamp the denominator: a row whose mask is all zeros would
        # otherwise compute 0 / 0 and put NaNs into the result.
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        pooled = summed / token_counts
        pooled_rows.extend(pooled.cpu().detach().numpy())
    return np.array(pooled_rows)
def is_user_query_valid(user_query: str) -> bool:
    """
    Report whether `user_query` is outside both accepted categories.

    A query is in an accepted category when it equals one of the values of
    DEFAULT_QUERIES, or when it is not None and its length is at most
    MAX_USER_QUERY_LEN.

    NOTE(review): despite the function name, True is returned for queries
    that are in NEITHER category (None, or over-long and not an example
    query). Confirm how callers interpret the result before changing it.

    Args:
        user_query (str): The input text to be checked.
    Returns:
        bool: True if the `user_query` is not a default query and is either
        None or longer than MAX_USER_QUERY_LEN, False otherwise.
    """
    # Example queries are matched by exact text against the dict values.
    matches_example = user_query in DEFAULT_QUERIES.values()
    # None never counts as fitting within the length limit.
    if user_query is None:
        fits_limit = False
    else:
        fits_limit = len(user_query) <= MAX_USER_QUERY_LEN
    return not (matches_example or fits_limit)
|