# Build binarized MRL (Matryoshka) versions of arXiv abstract embeddings and
# republish them to the Hugging Face Hub as a new dataset repo.
#
# NOTE(review): this file arrived with all newlines collapsed onto a single
# physical line; it has been reformatted. A large commented-out copy of an
# earlier fp32 -> full-size-binary pipeline was removed — it duplicated the
# live pipeline below with only a different output repo/folder name.

# Standard library
import os
from glob import glob

# Third-party
import gradio as gr  # Gradio interface keeps the Space alive (prevents timeout)
import numpy as np
import pandas as pd
from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm

# Register tqdm with pandas so DataFrame.progress_apply shows a progress bar
tqdm.pandas()

#######################################################################################
print("Downloading repo")

# Source dataset: fp32 (dense) embeddings stored as parquet files under data/
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus"
repo_type = "dataset"

# Download into the current working directory (parquet files land under ./data)
local_dir = "."

snapshot_download(repo_id=repo_id, repo_type=repo_type, local_dir=local_dir)
#######################################################################################
def dense_to_bmrl(dense_vector, size=512):
    """Binarize the first *size* dimensions of a dense embedding (binary MRL).

    Relies on the Matryoshka property of the source model: the leading
    dimensions carry the most information, so truncating to ``size`` before
    sign-binarizing keeps most retrieval quality at a fraction of the bytes.

    Args:
        dense_vector: 1-D array-like of floats (the fp32 embedding).
        size: number of leading dimensions to keep before bit-packing.

    Returns:
        bytes: packed bit-vector, one bit per kept dimension
        (non-negative -> 1, negative -> 0), i.e. size // 8 bytes for
        sizes divisible by 8.
    """
    return np.packbits(np.where(dense_vector >= 0, 1, 0)[:size]).tobytes()


# Gather the fp32 parquet files downloaded from the source dataset
floats = glob('data/*.parquet')

# Folder for the binarized-MRL embeddings
os.makedirs('bmrl_embeddings', exist_ok=True)

# Convert each file: replace the dense vector column with packed bits
for file in floats:
    print(f"Processing file: {file}")
    df = pd.read_parquet(file)
    df['vector'] = df['vector'].progress_apply(dense_to_bmrl)
    df.to_parquet(f'bmrl_embeddings/{os.path.basename(file)}')

#######################################################################################
# Upload the new embeddings to the Hub
access_token = os.getenv("HF_API_KEY")
api = HfApi(token=access_token)

# Destination dataset repo (created on first run, reused afterwards)
repo_id = "bluuebunny/arxiv_abstract_embedding_mxbai_large_v1_milvus_bmrl"
repo_type = "dataset"
api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

# Subfolder in the repo of the dataset where the files are stored
folder_in_repo = "data"

# Local folder containing the new embeddings
embed_folder = "bmrl_embeddings"

print(f"Uploading embeddings to {repo_id} from folder {embed_folder}")

# Upload all files within the folder to the specified repository
api.upload_folder(repo_id=repo_id, folder_path=embed_folder,
                  path_in_repo=folder_in_repo, repo_type="dataset")

print("Upload complete")


def greet(name, intensity):
    """Toy greeter for the keep-alive UI: 'Hello, <name>' + intensity '!' marks."""
    return "Hello, " + name + "!" * int(intensity)


# Minimal Gradio app so the Space keeps serving instead of timing out
demo = gr.Interface(
    fn=greet,
    inputs=["text", "slider"],
    outputs=["text"],
)

demo.launch()