colibri.qdrant / tests /openapi /test_discovery.py
Gouzi Mohaled
Ajout du dossier tests
3932407
import math
import random
import pytest
from .helpers.collection_setup import basic_collection_setup, drop_collection
from .helpers.helpers import request_with_validation
@pytest.fixture(autouse=True, scope='module')
def collection_name_lookup(collection_name):
    """Provide the name of the auxiliary lookup collection.

    The name is the main ``collection_name`` with a ``_lookup`` suffix.
    """
    return "{}_lookup".format(collection_name)
def random_vector(dim=4):
    """Build a list of ``dim`` floats, each drawn from ``random.random()``."""
    components = []
    for _ in range(dim):
        components.append(random.random())
    return components
def random_example(dim=4, min_id=1, max_id=8):
    """Return a random example: either a vector or a point id.

    A single ``random.random()`` draw picks the kind. Below 0.5 the result
    is ``random_vector(dim)``; otherwise it is an integer id drawn with
    ``random.randint(min_id, max_id)`` (both bounds inclusive).
    """
    pick_vector = random.random() < 0.5
    if not pick_vector:
        return random.randint(min_id, max_id)
    return random_vector(dim)
def count_ids_in_examples(context, target) -> int:
    """Count the distinct integer ids used as examples.

    ``context`` is an iterable of mappings with ``"positive"`` and
    ``"negative"`` entries; ``target`` is a single example. Only values
    that are ``int`` instances count as ids; anything else (vectors,
    ``None``) is ignored. Each id is counted once.
    """
    unique_ids = {
        example
        for pair in context
        for example in (pair["positive"], pair["negative"])
        if isinstance(example, int)
    }
    if isinstance(target, int):
        unique_ids.add(target)
    return len(unique_ids)
@pytest.fixture(scope="module", autouse=True)
def setup(on_disk_vectors, collection_name):
    """Create the test collection for this module and drop it afterwards."""
    basic_collection_setup(
        on_disk_vectors=on_disk_vectors,
        collection_name=collection_name,
    )

    # Tests run while the fixture is suspended here; the code below is teardown.
    yield

    drop_collection(collection_name=collection_name)
def test_context(collection_name):
    """Discover with context pairs only, i.e. without a target vector.

    The test expects the result size to be the limit minus the number of
    distinct ids used as context examples, and every score to be at most
    0.0.
    """
    limit = 8
    context = [
        {"positive": random_example(), "negative": random_example()}
        for _ in range(2)
    ]

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body={"context": context, "limit": limit},
    )
    assert response.ok, response.json()

    scored_points = response.json()["result"]
    expected_count = limit - count_ids_in_examples(context, None)
    assert len(scored_points) == expected_count

    # Score in context search relates to loss, so max score for context search is 0.0
    for scored_point in scored_points:
        assert scored_point["score"] <= 0.0
def test_only_target_is_search_with_different_scoring(collection_name):
    """Discover with only a target must rank points like a plain search.

    The same random vector is sent to ``/points/search`` and to
    ``/points/discover``. Both result lists must contain the same ids in
    the same order, while each point's score must differ between the two.
    """
    target = random_vector()

    # Reference ranking from the regular search endpoint
    search_response = request_with_validation(
        api="/collections/{collection_name}/points/search",
        method="POST",
        path_params={"collection_name": collection_name},
        body={"vector": target, "limit": 8},
    )
    assert search_response.ok, search_response.json()
    search_points = search_response.json()["result"]

    # Same vector, this time used as the discovery target
    discover_response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body={"target": target, "limit": 8},
    )
    assert discover_response.ok, discover_response.json()
    discover_points = discover_response.json()["result"]

    assert len(discover_points) == 8

    # Results should be in same order, but different scores
    assert len(search_points) == len(discover_points)
    for position, search_point in enumerate(search_points):
        discover_point = discover_points[position]
        assert search_point["id"] == discover_point["id"]
        assert search_point["score"] != discover_point["score"]
def test_discover_same_context(collection_name):
    """Discover twice with one shared context and two independent targets.

    Only when both target and context are given are we doing discovery.
    With the context fixed, the context part of the score (integer part)
    must be identical for a point in both results. The target part
    (decimal part) must differ, unless both random targets happen to be
    equal.
    """
    target1 = random_example()

    context = [
        {"positive": random_example(), "negative": random_example()},
        {"positive": random_example(), "negative": random_example()},
    ]

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body={
            "target": target1,
            "context": context,
            "limit": 8,
            "params": {
                "exact": True,
            },
        },
    )
    assert response.ok, response.json()

    scored_points1 = response.json()["result"]
    assert len(scored_points1) == 8 - count_ids_in_examples(context, target1)

    target2 = random_example()

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body={
            "target": target2,
            "context": context,
            "limit": 8,
            "params": {
                "exact": True,
            },
        },
    )
    assert response.ok, response.json()

    scored_points2 = response.json()["result"]
    assert len(scored_points2) == 8 - count_ids_in_examples(context, target2)

    # We keep same context, so context part of the score (integer part) should be the same,
    # while target part of the score (decimal part) should be different
    scored_points_2_map = {point["id"]: point for point in scored_points2}

    # Pair points by id, not by rank. Iterating scored_points1 alone (rather
    # than zipping both lists) keeps every point of the first result in play
    # even when the second result is shorter.
    for point1 in scored_points1:
        if point1["id"] not in scored_points_2_map:
            # The point is absent from the second result; nothing to compare.
            continue
        point2 = scored_points_2_map[point1["id"]]

        assert math.floor(point1["score"]) == math.floor(point2["score"])

        target_score1 = point1["score"] - math.floor(point1["score"])
        target_score2 = point2["score"] - math.floor(point2["score"])
        if target1 == target2:
            assert math.isclose(target_score1, target_score2, rel_tol=1e-5)
        else:
            assert not math.isclose(target_score1, target_score2, rel_tol=1e-5)
def test_discover_same_target(collection_name):
    """Discover twice with one shared target and two independent contexts.

    With the target fixed, the context part of the score (integer part)
    may differ between the two results, but the target part (decimal part)
    of any point present in both must match.
    """
    target = random_example()

    # Two contexts of two positive/negative pairs each, drawn up front
    contexts = [
        [
            {"positive": random_example(), "negative": random_example()},
            {"positive": random_example(), "negative": random_example()},
        ]
        for _ in range(2)
    ]

    results = []
    for context in contexts:
        response = request_with_validation(
            api="/collections/{collection_name}/points/discover",
            method="POST",
            path_params={"collection_name": collection_name},
            body={
                "target": target,
                "context": context,
                "limit": 8,
            },
        )
        assert response.ok, response.json()

        scored_points = response.json()["result"]
        assert len(scored_points) == 8 - count_ids_in_examples(context, target)
        results.append(scored_points)

    first_result, second_result = results

    # We keep same target, so context part of the score (integer part) can be different,
    # while target part of the score (decimal part) should be the same
    second_by_id = {point["id"]: point for point in second_result}
    for first_point in first_result:
        if first_point["id"] not in second_by_id:
            continue
        second_point = second_by_id[first_point["id"]]

        first_fraction = first_point["score"] - math.floor(first_point["score"])
        second_fraction = second_point["score"] - math.floor(second_point["score"])
        assert math.isclose(first_fraction, second_fraction, rel_tol=1e-5)
def test_discover_batch(collection_name):
    """Check that batch discovery matches the equivalent single requests.

    Ten random target/context combinations are first sent one by one to
    ``/points/discover``. The same combinations are then sent together to
    ``/points/discover/batch``, and each batch entry must equal the result
    of its single request.
    """
    targets = []
    contexts = []
    single_results = []

    # Singles
    for _ in range(10):
        target = random_example()
        targets.append(target)
        context = [
            {"positive": random_example(), "negative": random_example()},
            {"positive": random_example(), "negative": random_example()},
        ]
        contexts.append(context)

        response = request_with_validation(
            api="/collections/{collection_name}/points/discover",
            method="POST",
            path_params={"collection_name": collection_name},
            body={
                "target": target,
                "context": context,
                "limit": 8,
            },
        )
        assert response.ok, response.json()

        single_results.append(response.json()["result"])

    # Batch
    searches = [
        {
            "target": target,
            "context": context,
            "limit": 8,
        }
        for target, context in zip(targets, contexts)
    ]

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover/batch",
        method="POST",
        path_params={"collection_name": collection_name},
        body={
            "searches": searches,
        },
    )
    # Fail with the server's error payload instead of a KeyError on "result"
    assert response.ok, response.json()

    batch_results = response.json()["result"]

    assert len(single_results) == len(batch_results)
    for single_result, batch_result in zip(single_results, batch_results):
        assert single_result == batch_result
def test_null_offset(collection_name):
    """Discovery must accept an explicit null ``offset`` in the request body."""
    request_body = {
        "target": random_example(),
        "limit": 8,
        # Serialized as JSON null; the request is still expected to succeed
        "offset": None,
    }

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body=request_body,
    )
    assert response.ok, response.json()
def test_discovery_lookup(collection_name, collection_name_lookup):
    """Discovery by id with ``lookup_from`` must equal discovery by vector.

    A separate lookup collection with a named vector ``other`` is recreated
    and filled with two points. Discovering in the main collection with
    those points referenced by id (through ``lookup_from``) must return the
    same result as passing their vectors directly in the context.
    """
    # delete lookup collection if exists
    delete_response = request_with_validation(
        api='/collections/{collection_name}',
        method="DELETE",
        path_params={'collection_name': collection_name_lookup},
    )
    assert delete_response.ok, delete_response.text

    # re-create lookup collection
    lookup_config = {
        "vectors": {
            "other": {
                "size": 4,
                "distance": "Dot",
            }
        }
    }
    create_response = request_with_validation(
        api='/collections/{collection_name}',
        method="PUT",
        path_params={'collection_name': collection_name_lookup},
        body=lookup_config,
    )
    assert create_response.ok, create_response.text

    # insert vectors to lookup collection
    lookup_points = [
        {
            "id": 1,
            "vector": {"other": [1.0, 0.0, 0.0, 0.0]},
        },
        {
            "id": 2,
            "vector": {"other": [0.0, 0.0, 0.0, 2.0]},
        },
    ]
    upsert_response = request_with_validation(
        api='/collections/{collection_name}/points',
        method="PUT",
        path_params={'collection_name': collection_name_lookup},
        query_params={'wait': 'true'},
        body={"points": lookup_points},
    )
    assert upsert_response.ok, upsert_response.text

    # check discover by id + lookup_from
    by_id_body = {
        "target": [0.2, 0.1, 0.9, 0.7],
        "context": [
            {
                "positive": 1,
                "negative": 2
            },
        ],
        "limit": 10,
        "lookup_from": {
            "collection": collection_name_lookup,
            "vector": "other"
        }
    }
    by_id_response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body=by_id_body,
    )
    assert by_id_response.ok, by_id_response.text
    discovery_result_by_id = by_id_response.json()["result"]

    # check discover by vector + lookup_from
    # (same vectors as the lookup points above, passed inline this time)
    by_vector_body = {
        "target": [0.2, 0.1, 0.9, 0.7],
        "context": [
            {
                "positive": [1.0, 0.0, 0.0, 0.0],
                "negative": [0.0, 0.0, 0.0, 2.0]
            },
        ],
        "limit": 10,
    }
    by_vector_response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body=by_vector_body,
    )
    assert by_vector_response.ok, by_vector_response.text
    discovery_result_by_vector = by_vector_response.json()["result"]

    # check if results are the same
    assert discovery_result_by_id == discovery_result_by_vector, f"discovery_result_by_id: {discovery_result_by_id}, discovery_result_by_vector: {discovery_result_by_vector}"