import pytest
from huggingface_hub import snapshot_download
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS


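# Fetch the prebuilt FAISS index files (*.faiss / *.pkl) from the
# KonradSzafer/index dataset repo on the Hugging Face Hub into index/.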
snapshot_download(
    repo_id="KonradSzafer/index",
    allow_patterns=["*.faiss", "*.pkl"],
    repo_type="dataset",
    local_dir="index/",
)

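# Module-scoped fixtures: the embedding model and index are created once
# and shared by every test in this file.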
@pytest.fixture(scope="module")
def embedding_model() -> HuggingFaceInstructEmbeddings:
    model_name = "hkunlp/instructor-large"
    embed_instruction = "Represent the Hugging Face library documentation"
    query_instruction = "Query the most relevant piece of information from the Hugging Face documentation"
    return HuggingFaceInstructEmbeddings(
        model_name=model_name,
        embed_instruction=embed_instruction,
        query_instruction=query_instruction,
    )

@pytest.fixture(scope="module")
def index_path() -> str:
    return "index/"

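# The embeddings passed to load_local are used to embed queries at search
# time, so they should match the model that built the index.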
@pytest.fixture(scope="module")
def index(embedding_model: HuggingFaceInstructEmbeddings, index_path: str) -> FAISS:
    return FAISS.load_local(index_path, embedding_model)

@pytest.fixture(scope="module")
def query() -> str:
    return "How to use the tokenizer?"

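# Smoke tests: load the index from disk and run a small similarity search,
# checking that results carry text content and a 'source' metadata field.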
def test_load_index(embedding_model: HuggingFaceInstructEmbeddings, index_path: str):
    index = FAISS.load_local(index_path, embedding_model)
    assert index is not None, "Failed to load index"

def test_index_page_content(index, query: str):
    query_docs = index.similarity_search(query=query, k=3)
    assert isinstance(query_docs[0].page_content, str)

def test_index_metadata(index, query: str):
    query_docs = index.similarity_search(query=query, k=3)
    assert isinstance(query_docs[0].metadata['source'], str)