Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -26,8 +26,7 @@ snapshot_download(repo_id=repo_id, repo_type=repo_type, local_dir=local_dir)
|
|
26 |
|
27 |
# Function to convert dense vector to binary vector
|
28 |
def dense_to_binary(dense_vector):
    """Binarize a dense vector by sign and pack the bits into a uint8 array.

    Every component becomes bit 1 when it is >= 0 and bit 0 otherwise. The
    bits are packed eight per byte with ``np.packbits`` (first component in
    the most significant bit; the last byte is zero-padded).

    Args:
        dense_vector: Array-like of numbers (ndarray, list, tuple, ...).

    Returns:
        numpy.ndarray: 1-D ``uint8`` array holding the packed bits.
    """
    # np.asarray lets plain Python sequences through; `list >= 0` would raise.
    sign_bits = np.asarray(dense_vector) >= 0
    return np.packbits(sign_bits)
|
30 |
-
|
31 |
|
32 |
# Gather fp32 files
|
33 |
floats = glob('data/*.parquet')
|
@@ -47,11 +46,6 @@ for file in floats:
|
|
47 |
|
48 |
df.to_parquet(f'binary_embeddings/{os.path.basename(file)}')
|
49 |
|
50 |
-
# Print the size of the original and binary files
|
51 |
-
original_size = os.path.getsize(file)
|
52 |
-
binary_size = os.path.getsize(f'binary_embeddings/{os.path.basename(file)}')
|
53 |
-
print(f'Original size: {original_size} bytes, Binary size: {binary_size} bytes')
|
54 |
-
|
55 |
#######################################################################################
|
56 |
|
57 |
|
@@ -81,32 +75,27 @@ print("Upload complete")
|
|
81 |
#######################################################################################
|
82 |
|
83 |
# Function to convert dense vector to binary vector
|
84 |
-
def
|
85 |
-
return np.packbits(np.
|
86 |
|
87 |
# Gather fp32 files
|
88 |
-
|
89 |
|
90 |
# Create a folder to store binary embeddings
|
91 |
os.makedirs('bmrl_embeddings', exist_ok=True)
|
92 |
|
93 |
# Convert and save each file
|
94 |
-
for file in
|
95 |
|
96 |
|
97 |
print(f"Processing file: {file}")
|
98 |
|
99 |
df = pd.read_parquet(file)
|
100 |
|
101 |
-
df['vector'] = df['vector'].progress_apply(
|
102 |
|
103 |
df.to_parquet(f'bmrl_embeddings/{os.path.basename(file)}')
|
104 |
|
105 |
-
# Print the size of the original and binary files
|
106 |
-
original_size = os.path.getsize(file)
|
107 |
-
binary_size = os.path.getsize(f'bmrl_embeddings/{os.path.basename(file)}')
|
108 |
-
print(f'Original size: {original_size} bytes, Binary size: {binary_size} bytes')
|
109 |
-
|
110 |
#######################################################################################
|
111 |
|
112 |
|
|
|
26 |
|
27 |
# Function to convert dense vector to binary vector
|
28 |
def dense_to_binary(dense_vector):
    """Binarize a dense vector by sign and return the packed bits as bytes.

    Every component becomes bit 1 when it is >= 0 and bit 0 otherwise. The
    bits are packed eight per byte with ``np.packbits`` (first component in
    the most significant bit; the last byte is zero-padded).

    Args:
        dense_vector: Array-like of numbers (ndarray, list, tuple, ...).

    Returns:
        bytes: The packed bit string, ``ceil(n / 8)`` bytes for ``n`` inputs.
    """
    # np.asarray lets plain Python sequences through; `list >= 0` would raise.
    sign_bits = np.asarray(dense_vector) >= 0
    return np.packbits(sign_bits).tobytes()
|
|
|
30 |
|
31 |
# Gather fp32 files
|
32 |
floats = glob('data/*.parquet')
|
|
|
46 |
|
47 |
df.to_parquet(f'binary_embeddings/{os.path.basename(file)}')
|
48 |
|
|
|
|
|
|
|
|
|
|
|
49 |
#######################################################################################
|
50 |
|
51 |
|
|
|
75 |
#######################################################################################
|
76 |
|
77 |
# Function to convert dense vector to a binary vector truncated to its first `size` dimensions
|
78 |
+
def dense_to_bmrl(binary_vector, size=512):
    """Binarize a vector by sign, keep the first ``size`` bits, pack to bytes.

    Every component becomes bit 1 when it is >= 0 and bit 0 otherwise; only
    the leading ``size`` bits are kept and packed eight per byte with
    ``np.packbits`` (first component in the most significant bit; the last
    byte is zero-padded).

    Args:
        binary_vector: Array-like of numbers. Despite the name, this is the
            vector that gets thresholded at zero; the parameter name is kept
            unchanged so existing keyword callers keep working.
        size: Number of leading dimensions to keep. Defaults to 512.

    Returns:
        bytes: The packed bit string, ``ceil(min(n, size) / 8)`` bytes long.
    """
    # The original body read an undefined name (`dense_vector`) instead of
    # the parameter, so every call failed with NameError.
    sign_bits = np.asarray(binary_vector) >= 0
    return np.packbits(sign_bits[:size]).tobytes()
|
80 |
|
81 |
# Gather fp32 files
|
82 |
+
floats = glob('data/*.parquet')

# Ensure the bmrl_embeddings output folder exists
os.makedirs('bmrl_embeddings', exist_ok=True)

# Re-encode the 'vector' column of every input file and write the result
# to bmrl_embeddings/ under the same base file name
for file in floats:
    print(f"Processing file: {file}")
    df = pd.read_parquet(file)
    # progress_apply is not a stock pandas method; presumably registered
    # elsewhere in the file (e.g. tqdm.pandas()) — confirm
    df = df.assign(vector=df['vector'].progress_apply(dense_to_bmrl))
    df.to_parquet(f'bmrl_embeddings/{os.path.basename(file)}')
|
98 |
|
|
|
|
|
|
|
|
|
|
|
99 |
#######################################################################################
|
100 |
|
101 |
|