File size: 3,983 Bytes
e57805e
 
 
 
 
 
 
 
280dead
e57805e
 
 
 
a6bb8b8
 
e57805e
 
 
 
 
c95f496
e57805e
c95f496
e57805e
44aac25
e57805e
44aac25
 
 
e57805e
44aac25
 
e57805e
44aac25
 
e57805e
44aac25
 
e57805e
 
44aac25
e57805e
44aac25
e57805e
44aac25
e57805e
44aac25
e57805e
44aac25
e57805e
 
44aac25
 
 
e57805e
44aac25
 
 
e57805e
44aac25
e57805e
44aac25
 
e57805e
44aac25
 
e57805e
44aac25
e57805e
44aac25
 
e57805e
44aac25
e57805e
 
 
 
925ff59
2ff895a
e57805e
 
2ff895a
e57805e
 
 
 
 
2ff895a
e57805e
 
 
 
 
 
2ff895a
e57805e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import pandas as pd
import numpy as np
from glob import glob
import os
from tqdm import tqdm
from huggingface_hub import snapshot_download # Download previous embeddings
from huggingface_hub import HfApi # To transact with huggingface.co
import gradio as gr # Create a Gradio interface so spaces doesnt timeout
tqdm.pandas()


#######################################################################################

print("Downloading repo")

# Source dataset on the Hugging Face Hub that holds the existing embeddings
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"
repo_type = "dataset"

# Local directory the snapshot is downloaded into (the current working directory)
local_dir = "."

# Fetch a snapshot of the repository into local_dir
snapshot_download(
    repo_id=repo_id,
    repo_type=repo_type,
    local_dir=local_dir,
)

# #######################################################################################

# # Function to convert dense vector to binary vector
# def dense_to_binary(dense_vector):
#     return np.packbits(np.where(dense_vector >= 0, 1, 0)).tobytes()

# # Gather fp32 files
# floats = glob('data/*.parquet')

# # Create a folder to store binary embeddings
# os.makedirs('binary_embeddings', exist_ok=True)

# # Convert and save each file
# for file in floats:
    

#     print(f"Processing file: {file}")

#     df = pd.read_parquet(file)

#     df['vector'] = df['vector'].progress_apply(dense_to_binary)
    
#     df.to_parquet(f'binary_embeddings/{os.path.basename(file)}')

# #######################################################################################


# # Upload the new embeddings to the repo
# access_token =  os.getenv("HF_API_KEY")
# api = HfApi(token=access_token)

# # Setup transaction details
# repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary"
# repo_type = "dataset"

# api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

# # Subfolder in the repo of the dataset where the file is stored
# folder_in_repo = "data"

# # Path to the folder containing the new embeddings
# embed_folder = "binary_embeddings"

# print(f"Uploading embeddings to {repo_id} from folder {embed_folder}")

# # Upload all files within the folder to the specified repository
# api.upload_folder(repo_id=repo_id, folder_path=embed_folder, path_in_repo=folder_in_repo, repo_type="dataset")

# print("Upload complete")

#######################################################################################

# Function to convert dense vector to binary vector
def dense_to_bmrl(dense_vector, size=512):
    """Binarize the leading ``size`` components of a vector and pack them into bytes.

    Each kept component becomes one bit: 1 when the component is >= 0,
    otherwise 0. The bits are packed eight per byte with ``np.packbits``
    (most significant bit first; a trailing partial byte is zero-padded).

    Args:
        dense_vector: Array-like of numbers (NumPy array, list or tuple).
        size: Number of leading components to keep. Defaults to 512.

    Returns:
        bytes: The packed sign bits of the first ``size`` components.
    """
    # np.asarray lets plain lists/tuples through; comparing a bare list with
    # `>= 0` would raise TypeError. Truncate first so only the kept
    # components are thresholded.
    leading = np.asarray(dense_vector)[:size]
    return np.packbits(leading >= 0).tobytes()

# Gather the fp32 parquet files found under data/
floats = glob('data/*.parquet')

# Make sure the output folder for the converted embeddings exists
os.makedirs('bmrl_embeddings', exist_ok=True)

# Rewrite each file with its 'vector' column converted by dense_to_bmrl,
# keeping the original file name inside bmrl_embeddings/
for parquet_path in floats:
    print(f"Processing file: {parquet_path}")

    df = pd.read_parquet(parquet_path)
    converted = df['vector'].progress_apply(dense_to_bmrl)
    df['vector'] = converted

    output_name = os.path.basename(parquet_path)
    df.to_parquet(f'bmrl_embeddings/{output_name}')

#######################################################################################


# Build a Hub client using the token read from the HF_API_KEY environment variable
access_token = os.getenv("HF_API_KEY")
api = HfApi(token=access_token)

# Destination dataset repository for the converted embeddings
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_bmrl"
repo_type = "dataset"

# Local folder with the converted files, and the folder they are placed in inside the repo
embed_folder = "bmrl_embeddings"
folder_in_repo = "data"

# exist_ok=True makes this a no-op when the repository is already there
api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

print(f"Uploading embeddings to {repo_id} from folder {embed_folder}")

# Upload every file in embed_folder to folder_in_repo of the repository
api.upload_folder(
    repo_id=repo_id,
    repo_type=repo_type,
    folder_path=embed_folder,
    path_in_repo=folder_in_repo,
)

print("Upload complete")

def greet(name, intensity):
    """Return "Hello, <name>" followed by ``int(intensity)`` exclamation marks."""
    salutation = "Hello, " + name
    exclamations = "!" * int(intensity)
    return salutation + exclamations

# Gradio interface around greet: a text input and a slider in, one text output
demo = gr.Interface(fn=greet, inputs=["text", "slider"], outputs=["text"])

# Start the Gradio app
demo.launch()