# Spaces: reztilop / colibri.qdrant -- Build error -- File size: 12,295 Bytes
import math
import random

import pytest

from .helpers.collection_setup import basic_collection_setup, drop_collection
from .helpers.helpers import request_with_validation


@pytest.fixture(autouse=True, scope='module')
def collection_name_lookup(collection_name):
    """Return the main collection name with ``_lookup`` appended."""
    return "{}_lookup".format(collection_name)


def random_vector(dim=4):
    """Return a list of ``dim`` floats, each drawn with ``random.random()``."""
    components = []
    for _ in range(dim):
        components.append(random.random())
    return components


def random_example(dim=4, min_id=1, max_id=8):
    """Return either a random vector or a random integer id, chosen by coin flip.

    A draw below 0.5 yields ``random_vector(dim)``; otherwise an integer from
    ``random.randint(min_id, max_id)`` (both bounds inclusive) is returned.
    """
    use_vector = random.random() < 0.5
    return random_vector(dim) if use_vector else random.randint(min_id, max_id)


def count_ids_in_examples(context, target) -> int:
    """Count the distinct integer examples among the context pairs and the target.

    Each context pair contributes its ``"positive"`` and ``"negative"`` entries;
    only values that are ``int`` instances are counted, and duplicates count once.
    """
    candidates = [
        pair[side]
        for pair in context
        for side in ("positive", "negative")
    ]
    candidates.append(target)
    return len({value for value in candidates if isinstance(value, int)})


@pytest.fixture(scope="module", autouse=True)
def setup(on_disk_vectors, collection_name):
    """Set up the collection before the module's tests and drop it afterwards."""
    basic_collection_setup(
        on_disk_vectors=on_disk_vectors,
        collection_name=collection_name,
    )
    yield
    drop_collection(collection_name=collection_name)


# Context is when we don't include a target vector
def test_context(collection_name):
    """Send a discover request with context pairs only and check the result.

    The number of returned points must equal the limit minus the number of
    distinct ids used as examples, and no score may be positive.
    """
    limit = 8
    pairs = [
        {"positive": random_example(), "negative": random_example()}
        for _ in range(2)
    ]
    request_body = {"context": pairs, "limit": limit}

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body=request_body,
    )
    assert response.ok, response.json()

    hits = response.json()["result"]

    distinct_example_ids = count_ids_in_examples(pairs, None)
    assert len(hits) == limit - distinct_example_ids

    # Score in context search relates to loss, so max score for context search is 0.0
    for hit in hits:
        assert hit["score"] <= 0.0


# When we only use target, it should be the exact same as search
def test_only_target_is_search_with_different_scoring(collection_name):
    """Compare plain search against discover called with only a target vector.

    Both requests use the same random vector and limit. The returned ids must
    match position by position, while every pair of scores must differ.
    """
    limit = 8
    query_vector = random_vector()

    # Baseline: regular vector search
    search_response = request_with_validation(
        api="/collections/{collection_name}/points/search",
        method="POST",
        path_params={"collection_name": collection_name},
        body={"vector": query_vector, "limit": limit},
    )
    assert search_response.ok, search_response.json()
    search_hits = search_response.json()["result"]

    # Same vector through the discover endpoint, as target only
    discover_response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body={"target": query_vector, "limit": limit},
    )
    assert discover_response.ok, discover_response.json()
    discover_hits = discover_response.json()["result"]

    assert len(discover_hits) == limit

    # Same ordering is expected, but the scoring differs
    assert len(search_hits) == len(discover_hits)
    for from_search, from_discover in zip(search_hits, discover_hits):
        assert from_search["id"] == from_discover["id"]
        assert from_search["score"] != from_discover["score"]


# Only when we use both target and context, we are doing discovery
def test_discover_same_context(collection_name):
    """Discover twice with one shared context and two independently drawn targets.

    Both requests use exact search. For every point id present in both result
    sets, the integer part of the score must match; the fractional part must
    match only when the two targets happen to be equal.
    """
    target1 = random_example()
    context = [
        {"positive": random_example(), "negative": random_example()},
        {"positive": random_example(), "negative": random_example()},
    ]

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body={
            "target": target1,
            "context": context,
            "limit": 8,
            "params": {
                "exact": True,
            },
        },
    )
    assert response.ok, response.json()

    scored_points1 = response.json()["result"]

    assert len(scored_points1) == 8 - count_ids_in_examples(context, target1)

    target2 = random_example()

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body={
            "target": target2,
            "context": context,
            "limit": 8,
            "params": {
                "exact": True,
            },
        },
    )
    assert response.ok, response.json()

    scored_points2 = response.json()["result"]

    assert len(scored_points2) == 8 - count_ids_in_examples(context, target2)

    # We keep same context, so context part of the score (integer part) should be the same,
    # while target part of the score (decimal part) should be different

    scored_points_2_map = {point["id"]: point for point in scored_points2}

    # Walk the first result set on its own: pairing both lists with zip() would
    # stop at the shorter one and silently skip points that still need checking.
    for point1 in scored_points1:
        point2 = scored_points_2_map.get(point1["id"])
        if point2 is None:
            # Point is missing from the second result set; nothing to compare.
            continue

        assert math.floor(point1["score"]) == math.floor(point2["score"])

        target_score1 = point1["score"] - math.floor(point1["score"])
        target_score2 = point2["score"] - math.floor(point2["score"])
        if target1 == target2:
            assert math.isclose(target_score1, target_score2, rel_tol=1e-5)
        else:
            assert not math.isclose(target_score1, target_score2, rel_tol=1e-5)


def test_discover_same_target(collection_name):
    """Discover twice with one shared target and two independently drawn contexts.

    For every point id present in both result sets, the fractional part of the
    score must be close (relative tolerance 1e-5) between the two runs.
    """
    limit = 8
    shared_target = random_example()

    def make_context():
        return [
            {"positive": random_example(), "negative": random_example()}
            for _ in range(2)
        ]

    def discover(context):
        response = request_with_validation(
            api="/collections/{collection_name}/points/discover",
            method="POST",
            path_params={"collection_name": collection_name},
            body={"target": shared_target, "context": context, "limit": limit},
        )
        assert response.ok, response.json()
        return response.json()["result"]

    def fractional_part(score):
        return score - math.floor(score)

    first_context = make_context()
    second_context = make_context()

    first_hits = discover(first_context)
    assert len(first_hits) == limit - count_ids_in_examples(first_context, shared_target)

    second_hits = discover(second_context)
    assert len(second_hits) == limit - count_ids_in_examples(second_context, shared_target)

    # We keep same target, so context part of the score (integer part) can be different,
    # while target part of the score (decimal part) should be the same
    second_hits_by_id = {hit["id"]: hit for hit in second_hits}

    for first_hit in first_hits:
        second_hit = second_hits_by_id.get(first_hit["id"])
        if second_hit is None:
            continue

        assert math.isclose(
            fractional_part(first_hit["score"]),
            fractional_part(second_hit["score"]),
            rel_tol=1e-5,
        )


def test_discover_batch(collection_name):
    """Check that batch discovery returns the same results as single requests.

    Ten random (target, context) queries are issued one at a time, then the
    same queries are sent in one batch request; results must match pairwise.
    """
    targets = []
    contexts = []
    single_results = []

    # Singles
    for _ in range(10):
        target = random_example()
        targets.append(target)

        context = [
            {"positive": random_example(), "negative": random_example()},
            {"positive": random_example(), "negative": random_example()},
        ]
        contexts.append(context)

        response = request_with_validation(
            api="/collections/{collection_name}/points/discover",
            method="POST",
            path_params={"collection_name": collection_name},
            body={
                "target": target,
                "context": context,
                "limit": 8,
            },
        )
        assert response.ok, response.json()

        single_results.append(response.json()["result"])

    # Batch
    searches = [
        {
            "target": target,
            "context": context,
            "limit": 8,
        }
        for target, context in zip(targets, contexts)
    ]

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover/batch",
        method="POST",
        path_params={"collection_name": collection_name},
        body={
            "searches": searches,
        },
    )
    # Check the status first, like every other request in this file, so that a
    # failed batch call reports the response body.
    assert response.ok, response.json()

    batch_results = response.json()["result"]

    assert len(single_results) == len(batch_results)
    for single_result, batch_result in zip(single_results, batch_results):
        assert single_result == batch_result


def test_null_offset(collection_name):
    """A discover request carrying an explicit null ``offset`` must succeed."""
    request_body = {
        "target": random_example(),
        "limit": 8,
        "offset": None,
    }

    response = request_with_validation(
        api="/collections/{collection_name}/points/discover",
        method="POST",
        path_params={"collection_name": collection_name},
        body=request_body,
    )
    assert response.ok, response.json()


def test_discovery_lookup(collection_name, collection_name_lookup):
    """Compare discover-by-id with ``lookup_from`` against discover-by-vector.

    A lookup collection with a named vector ``other`` is recreated and filled
    with two points. A discover request whose context refers to those points by
    id (resolved through ``lookup_from``) must return the same result as a
    request whose context holds the same vectors inline.
    """
    collection_api = '/collections/{collection_name}'
    points_api = '/collections/{collection_name}/points'
    discover_api = "/collections/{collection_name}/points/discover"

    # delete lookup collection if exists
    delete_response = request_with_validation(
        api=collection_api,
        method="DELETE",
        path_params={'collection_name': collection_name_lookup},
    )
    assert delete_response.ok, delete_response.text

    # re-create lookup collection
    lookup_schema = {
        "vectors": {
            "other": {
                "size": 4,
                "distance": "Dot",
            }
        }
    }
    create_response = request_with_validation(
        api=collection_api,
        method="PUT",
        path_params={'collection_name': collection_name_lookup},
        body=lookup_schema,
    )
    assert create_response.ok, create_response.text

    # insert vectors to lookup collection
    lookup_points = [
        {
            "id": 1,
            "vector": {"other": [1.0, 0.0, 0.0, 0.0]},
        },
        {
            "id": 2,
            "vector": {"other": [0.0, 0.0, 0.0, 2.0]},
        },
    ]
    upsert_response = request_with_validation(
        api=points_api,
        method="PUT",
        path_params={'collection_name': collection_name_lookup},
        query_params={'wait': 'true'},
        body={"points": lookup_points},
    )
    assert upsert_response.ok, upsert_response.text

    # check discover by id + lookup_from
    by_id_body = {
        "target": [0.2, 0.1, 0.9, 0.7],
        "context": [
            {
                "positive": 1,
                "negative": 2
            },
        ],
        "limit": 10,
        "lookup_from": {
            "collection": collection_name_lookup,
            "vector": "other"
        }
    }
    by_id_response = request_with_validation(
        api=discover_api,
        method="POST",
        path_params={"collection_name": collection_name},
        body=by_id_body,
    )
    assert by_id_response.ok, by_id_response.text
    discovery_result_by_id = by_id_response.json()["result"]

    # check discover by vector + lookup_from
    by_vector_body = {
        "target": [0.2, 0.1, 0.9, 0.7],
        "context": [
            {
                "positive": [1.0, 0.0, 0.0, 0.0],
                "negative": [0.0, 0.0, 0.0, 2.0]
            },
        ],
        "limit": 10,
    }
    by_vector_response = request_with_validation(
        api=discover_api,
        method="POST",
        path_params={"collection_name": collection_name},
        body=by_vector_body,
    )
    assert by_vector_response.ok, by_vector_response.text
    discovery_result_by_vector = by_vector_response.json()["result"]

    # check if results are the same
    assert discovery_result_by_id == discovery_result_by_vector, f"discovery_result_by_id: {discovery_result_by_id}, discovery_result_by_vector: {discovery_result_by_vector}"