import numpy as np
import torch

# Maximum number of characters allowed in a custom user query
MAX_USER_QUERY_LEN = 35

# Predefined example queries
DEFAULT_QUERIES = {
    "Example Query 1": "Who visited microsoft.com on September 18?",
"Example Query 2": "Does Kate has drive ?", |
|
"Example Query 3": "What phone number can be used to contact David Johnson?", |
|
} |
|
|
|
def get_batch_text_representation(texts, model, tokenizer, batch_size=1):
    """
    Compute mean-pooled representations of the given texts, in batches.

    The token embeddings from the model's last hidden state are averaged,
    using the attention mask so that padding tokens are ignored.
    """
    mean_pooled_batch = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs, output_hidden_states=False)
        last_hidden_states = outputs.last_hidden_state

        # Zero out the embeddings of padding tokens before averaging
        input_mask_expanded = (
            inputs["attention_mask"].unsqueeze(-1).expand(last_hidden_states.size()).float()
        )
        sum_embeddings = torch.sum(last_hidden_states * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        mean_pooled = sum_embeddings / sum_mask
        mean_pooled_batch.extend(mean_pooled.detach().cpu().numpy())
    return np.array(mean_pooled_batch)
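
# Hedged usage sketch (this helper is illustrative and not part of the
# original module): it shows how get_batch_text_representation is expected to
# be called with a Hugging Face encoder; the "distilbert-base-uncased"
# checkpoint name is only an example choice.
def example_embed_default_queries(model_name: str = "distilbert-base-uncased") -> np.ndarray:
    """Mean-pool the predefined DEFAULT_QUERIES with a Hugging Face encoder."""
    from transformers import AutoModel, AutoTokenizer  # local import: optional dependency

    example_tokenizer = AutoTokenizer.from_pretrained(model_name)
    example_model = AutoModel.from_pretrained(model_name)
    example_model.eval()

    # One mean-pooled vector per query, shape (num_queries, hidden_size)
    return get_batch_text_representation(
        list(DEFAULT_QUERIES.values()), example_model, example_tokenizer, batch_size=2
    )

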
def is_user_query_valid(user_query: str) -> bool:
    """
    Validate `user_query` against the default queries and the length limit.

    Args:
        user_query (str): The input text to be checked.

    Returns:
        bool: True if the query is not one of the default queries and is
            either missing, empty, or longer than `MAX_USER_QUERY_LEN`
            characters (i.e. it should be rejected); False otherwise.
    """
    # The predefined example queries are always accepted as-is
    is_default_query = user_query in DEFAULT_QUERIES.values()

    # A custom query must be provided, non-empty, and within the length limit
    is_within_length_limit = (
        user_query is not None and 0 < len(user_query) <= MAX_USER_QUERY_LEN
    )

    return not is_default_query and not is_within_length_limit
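
# Illustrative check (added as a sketch, not part of the original module):
# is_user_query_valid returns True when a custom query should be rejected
# (missing, empty, or over the length limit) and False when the query is one
# of the predefined DEFAULT_QUERIES or an acceptable custom query.
if __name__ == "__main__":
    print(is_user_query_valid(DEFAULT_QUERIES["Example Query 1"]))  # False: predefined query
    print(is_user_query_valid("Who is Kate?"))  # False: short custom query
    print(is_user_query_valid("x" * (MAX_USER_QUERY_LEN + 1)))  # True: over the length limit
    print(is_user_query_valid(None))  # True: missing query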