import time

import numpy as np
from torch import cuda

random_seed = 42

# Use the GPU for PyTorch operations when one is available
if cuda.is_available():
    torch_device = "cuda"
else:
    torch_device = "cpu"

def make_or_load_embeddings(docs, file_list, embeddings_out, embedding_model, embeddings_super_compress, low_resource_mode_opt):
    """Return document embeddings, loading them from disk or generating new ones if none were passed in."""

    # If no embeddings were passed in, load them from file or generate new ones
    if embeddings_out.size == 0:
        print("Embeddings not found. Loading or generating new ones.")

        # Look for a previously saved embeddings file among the input files
        embeddings_file_names = [string for string in file_list if "embedding" in string.lower()]

        if embeddings_file_names:
            embeddings_file_name = embeddings_file_names[0]
            print("Loading embeddings from file.")
            embeddings_out = np.load(embeddings_file_name)['arr_0']

            # 'Super compressed' embedding files (flagged by "compress" in the file name) were multiplied by 100 before saving, so reverse that here
            if "compress" in embeddings_file_name:
                embeddings_out /= 100

        else:
            tic = time.perf_counter()
            print("Starting to embed documents.")

            # In low-resource mode (e.g. CPU only), use a simple sparse model rather than a transformer
            if low_resource_mode_opt == "Yes":
                print("Creating simplified 'sparse' embeddings based on TfIDF")

                # Fit the pipeline to the text data, then transform the documents to embeddings
                embedding_model.fit(docs)
                embeddings_out = embedding_model.transform(docs)

            elif low_resource_mode_opt == "No":
                print("Creating dense embeddings based on transformers model")

                # embeddings_out = embedding_model.encode(sentences=docs, max_length=1024, show_progress_bar=True, batch_size=32)  # For Jina
                embeddings_out = embedding_model.encode(sentences=docs, show_progress_bar=True, batch_size=32)  # For BGE

            toc = time.perf_counter()
            time_out = f"The embedding took {toc - tic:0.1f} seconds"
            print(time_out)

            # If the user has chosen 'super compressed' embedding files to save disk space,
            # round to 3 decimal places and multiply by 100 (reversed on load above)
            if embeddings_super_compress == "Yes":
                embeddings_out = np.round(embeddings_out, 3)
                embeddings_out *= 100

        return embeddings_out

    else:
        print("Found pre-loaded embeddings.")

        return embeddings_out
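
if __name__ == "__main__":
    # Minimal usage sketch (assumptions for illustration, not part of the module
    # above): scikit-learn's TfidfVectorizer stands in for the low-resource
    # embedding model, since that branch only needs fit()/transform().
    from sklearn.feature_extraction.text import TfidfVectorizer

    example_docs = ["first example document", "second example document"]

    embeddings = make_or_load_embeddings(
        docs=example_docs,
        file_list=[],                  # no saved "embedding" file, so new embeddings are generated
        embeddings_out=np.array([]),   # an empty array triggers generation
        embedding_model=TfidfVectorizer(),
        embeddings_super_compress="No",
        low_resource_mode_opt="Yes",
    )
    print("Generated embeddings with shape", embeddings.shape)

    # Round-trip sketch: save embeddings the way the loader above expects them
    # (np.savez stores its first positional array under 'arr_0'), then reload.
    # The file name is hypothetical; "embedding" in it routes to the load branch
    # and "compress" triggers the divide-by-100 rescale.
    saved_name = "example_embeddings_super_compress.npz"
    np.savez(saved_name, np.round(embeddings.toarray(), 3) * 100)

    reloaded = make_or_load_embeddings(
        docs=example_docs,
        file_list=[saved_name],
        embeddings_out=np.array([]),
        embedding_model=TfidfVectorizer(),
        embeddings_super_compress="No",
        low_resource_mode_opt="Yes",
    )
    print("Reloaded embeddings with shape", reloaded.shape)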