Spaces:
Paused
Paused
File size: 3,983 Bytes
e57805e 280dead e57805e a6bb8b8 e57805e c95f496 e57805e c95f496 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 44aac25 e57805e 925ff59 2ff895a e57805e 2ff895a e57805e 2ff895a e57805e 2ff895a e57805e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 |
import pandas as pd
import numpy as np
from glob import glob
import os
from tqdm import tqdm
from huggingface_hub import snapshot_download # Download previous embeddings
from huggingface_hub import HfApi # To transact with huggingface.co
import gradio as gr # Create a Gradio interface so spaces doesn't time out
# Register tqdm with pandas so Series/DataFrame gain `.progress_apply`
# (used by the conversion loop below to show a progress bar).
tqdm.pandas()
#######################################################################################
print("Downloading repo")
# Source dataset on the Hub that holds the fp32 embedding parquet files.
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"
repo_type = "dataset"
# Mirror the snapshot into the current working directory (creates ./data/*.parquet).
local_dir = "."
snapshot_download(
    repo_id=repo_id,
    repo_type=repo_type,
    local_dir=local_dir,
)
# #######################################################################################
# # Function to convert dense vector to binary vector
# def dense_to_binary(dense_vector):
# return np.packbits(np.where(dense_vector >= 0, 1, 0)).tobytes()
# # Gather fp32 files
# floats = glob('data/*.parquet')
# # Create a folder to store binary embeddings
# os.makedirs('binary_embeddings', exist_ok=True)
# # Convert and save each file
# for file in floats:
# print(f"Processing file: {file}")
# df = pd.read_parquet(file)
# df['vector'] = df['vector'].progress_apply(dense_to_binary)
# df.to_parquet(f'binary_embeddings/{os.path.basename(file)}')
# #######################################################################################
# # Upload the new embeddings to the repo
# access_token = os.getenv("HF_API_KEY")
# api = HfApi(token=access_token)
# # Setup transaction details
# repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_binary"
# repo_type = "dataset"
# api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
# # Subfolder in the repo of the dataset where the file is stored
# folder_in_repo = "data"
# # Path to the folder containing the new embeddings
# embed_folder = "binary_embeddings"
# print(f"Uploading embeddings to {repo_id} from folder {embed_folder}")
# # Upload all files within the folder to the specified repository
# api.upload_folder(repo_id=repo_id, folder_path=embed_folder, path_in_repo=folder_in_repo, repo_type="dataset")
# print("Upload complete")
#######################################################################################
# Binarize a dense embedding and keep only its leading bits (binary-MRL style).
def dense_to_bmrl(dense_vector, size=512):
    """Binarize *dense_vector* (1 where the value is >= 0, else 0), keep the
    first `size` bits, and pack them into a bytes object (8 bits per byte)."""
    leading = np.asarray(dense_vector)[:size]
    bits = (leading >= 0).astype(np.uint8)
    return np.packbits(bits).tobytes()
# Collect every fp32 parquet file downloaded above.
float_files = glob('data/*.parquet')
# Output folder for the truncated binary ("bmrl") embeddings.
os.makedirs('bmrl_embeddings', exist_ok=True)
# Rewrite each file with its 'vector' column converted to packed binary bytes.
for parquet_path in float_files:
    print(f"Processing file: {parquet_path}")
    frame = pd.read_parquet(parquet_path)
    frame['vector'] = frame['vector'].progress_apply(dense_to_bmrl)
    frame.to_parquet(f'bmrl_embeddings/{os.path.basename(parquet_path)}')
#######################################################################################
# Push the converted embeddings to their own dataset repo on the Hub.
access_token = os.getenv("HF_API_KEY")  # write-scoped token injected by the Space
api = HfApi(token=access_token)
# Destination dataset for the bmrl embeddings.
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_bmrl"
repo_type = "dataset"
api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
# Remote subfolder that will hold the parquet files.
folder_in_repo = "data"
# Local folder produced by the conversion step above.
embed_folder = "bmrl_embeddings"
print(f"Uploading embeddings to {repo_id} from folder {embed_folder}")
# Upload every file in the local folder to the destination repo.
api.upload_folder(
    repo_id=repo_id,
    folder_path=embed_folder,
    path_in_repo=folder_in_repo,
    repo_type="dataset",
)
print("Upload complete")
def greet(name, intensity):
    """Return a greeting for *name* followed by `int(intensity)` exclamation marks."""
    exclamations = "!" * int(intensity)
    return f"Hello, {name}{exclamations}"
# Minimal Gradio UI: keeping a live interface running prevents the Space
# from timing out while the conversion/upload job above executes.
demo = gr.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
)
# Start the Gradio server (blocks; runs after the one-shot pipeline finishes).
demo.launch()
|