File size: 3,147 Bytes
854f61d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from types import ModuleType

import numpy as np
import openai
from langchain_community.vectorstores import FAISS 
# from sentence_transformers import SentenceTransformer
from hamilton.function_modifiers import config, extract_fields


@config.when(embedding_service="openai")
@extract_fields({"embedding_dimension": int, "embedding_metric": str})
def embedding_config__openai(model_name: str) -> dict:
    """Return the vector size and similarity metric for an OpenAI embedding model.

    Only "text-embedding-ada-002" is recognised; support for further models
    would be added here.

    :param model_name: name of the OpenAI embedding model.
    :return: dict with the keys ``embedding_dimension`` and ``embedding_metric``.
    :raises ValueError: if ``model_name`` is not a recognised model.
    """
    if model_name != "text-embedding-ada-002":
        raise ValueError(f"Invalid `model_name`[{model_name}] for openai was passed.")
    return {"embedding_dimension": 1536, "embedding_metric": "cosine"}



@config.when(embedding_service="sentence_transformer")
@extract_fields({"embedding_dimension": int, "embedding_metric": str})
def embedding_config__sentence_transformer(model_name: str) -> dict:
    """Return the vector size and similarity metric for a SentenceTransformer model.

    Only "multi-qa-MiniLM-L6-cos-v1" is recognised; support for further models
    would be added here.

    :param model_name: name of the SentenceTransformer model.
    :return: dict with the keys ``embedding_dimension`` and ``embedding_metric``.
    :raises ValueError: if ``model_name`` is not a recognised model.
    """
    if model_name != "multi-qa-MiniLM-L6-cos-v1":
        raise ValueError(f"Invalid `model_name`[{model_name}] for SentenceTransformer was passed.")
    return {"embedding_dimension": 384, "embedding_metric": "cosine"}

@config.when(embedding_service="openai")
def embedding_provider__openai(api_key: str) -> ModuleType:
    """Set the OpenAI API key and return the ``openai`` module.

    The key is assigned to ``openai.api_key`` on the imported module itself,
    so it changes for every user of that module in the process.

    :param api_key: value stored on ``openai.api_key``.
    :return: the ``openai`` module.
    """
    openai.api_key = api_key
    return openai


@config.when(embedding_service="openai")
def embeddings__openai(
    embedding_provider: ModuleType,
    text_contents: list[str],
    model_name: str = "text-embedding-ada-002",
) -> list[np.ndarray]:
    """Embed each text with the OpenAI Embeddings API.

    All texts are sent in a single ``Embedding.create`` request and every
    returned embedding is converted to a numpy array.

    NOTE(review): ``Embedding.create(..., engine=...)`` looks like the pre-1.0
    ``openai`` package interface -- confirm against the pinned version.

    reference: https://github.com/openai/openai-cookbook/blob/main/examples/Get_embeddings.ipynb

    :param embedding_provider: object exposing ``Embedding.create`` (annotated
        as a module; presumably the configured ``openai`` module).
    :param text_contents: texts to embed.
    :param model_name: embedding model, passed to the API as ``engine``.
    :return: one array per entry of the response's ``"data"`` list.
    """
    api_result = embedding_provider.Embedding.create(input=text_contents, engine=model_name)
    records = api_result["data"]
    return [np.asarray(record["embedding"]) for record in records]


# @config.when(embedding_service="sentence_transformer")
# def embeddings__sentence_transformer(
#     text_contents: list[str], model_name: str = "multi-qa-MiniLM-L6-cos-v1"
# ) -> list[np.ndarray]:
#     """Convert text to vector representations (embeddings)
#     model card: https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1
#     reference: https://www.sbert.net/examples/applications/computing-embeddings/README.html
#     """
#     embedding_provider = SentenceTransformer(model_name)
#     embeddings = embedding_provider.encode(text_contents, convert_to_numpy=True)
#     return list(embeddings)

def text_contents2(text_contents: list[str]) -> list[str]:
    """Return ``text_contents`` unchanged (the same list object, not a copy)."""
    return text_contents



def data_objects(
    ids: list[int], ideologies: list[str], labels: list[str], embeddings: list[np.ndarray]
) -> list[tuple]:
    """Combine parallel per-item lists into ``(label, embedding, properties)`` tuples.

    :param ids: identifier of each item.
    :param ideologies: ideology of each item.
    :param labels: label of each item.
    :param embeddings: embedding of each item; each must provide ``tolist()``.
    :return: one tuple per item holding the label, the embedding converted
        with ``tolist()``, and a dict with the keys ``id``, ``ideology`` and
        ``label``.
    :raises ValueError: if the four inputs do not all have the same length.
    """
    # Validate explicitly instead of with `assert`: asserts are stripped under
    # `python -O`, and `zip` would otherwise silently drop records when any
    # one of the inputs is shorter than the others.
    lengths = {
        "ids": len(ids),
        "ideologies": len(ideologies),
        "labels": len(labels),
        "embeddings": len(embeddings),
    }
    if len(set(lengths.values())) > 1:
        raise ValueError(f"All inputs must have the same length, got {lengths}.")

    properties = [
        dict(id=item_id, ideology=ideology, label=label)
        for item_id, ideology, label in zip(ids, ideologies, labels)
    ]
    vectors = [vector.tolist() for vector in embeddings]

    return list(zip(labels, vectors, properties))