In [1]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [2]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1


In [3]:
from rank_bm25 import BM25Okapi
import numpy as np
from tqdm import tqdm
import os
import pickle
import torch
import glob
import json

from sentence_transformers import SentenceTransformer

In [6]:
def tokenize_doc_to_str(doc: dict) -> str:
    """Flatten a document dict into one space-separated text string.

    Each ``key``/``value`` pair contributes three pieces, in the dict's
    iteration order: the key lower-cased with underscores replaced by
    spaces, a literal ``':'``, and the value (``str(value)`` for
    non-string values). All pieces are joined with single spaces.

    Args:
        doc: Mapping of field name to field value. Keys must be strings
            (``key.lower()`` is called on them).

    Returns:
        The flattened text, e.g. ``{"First_Name": "Ann"}`` becomes
        ``"first name : Ann"``. An empty dict yields ``""``.
    """
    pieces = []
    for key, value in doc.items():
        pieces.append(key.lower().replace("_", " "))
        pieces.append(':')
        # append, not extend: extending a list with a string adds one element
        # per character, which would render "Ann" as "A n n" after the join.
        pieces.append(value if isinstance(value, str) else str(value))
    return ' '.join(pieces)

# Collect the documents from every JSON file whose name ends in '_processed.json'.
base_path = "/kaggle/input/jokerbot-rag/Kaggle_rag_data"
sbert_embeddings_path = "/kaggle/working/sbert_embeddings.pt"

docs = []
# sorted(): glob.glob returns paths in arbitrary, filesystem-dependent order.
# Row i of the saved embeddings corresponds to docs[i], so the file order must
# be deterministic for the embeddings to be matched back to documents later.
for path in sorted(glob.glob(f"{base_path}/*_processed.json")):
    with open(path, 'r', encoding='utf-8') as f:
        # assumes each file holds a JSON array of document dicts — TODO confirm
        docs.extend(json.load(f))

if not docs:
    # Fail here with a clear message instead of encoding and saving an empty corpus.
    raise FileNotFoundError(f"No documents loaded from {base_path}/*_processed.json")

# Initialize SentenceTransformer on the GPU when one is available, otherwise on CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
sbert = SentenceTransformer('sentence-transformers/all-distilroberta-v1', device=device)

# One flattened text string per document, in the same order as `docs`.
str_docs = [tokenize_doc_to_str(doc) for doc in docs]

sbert_embeddings = sbert.encode(str_docs, show_progress_bar=True, convert_to_tensor=True, batch_size=256)
sbert_embeddings = sbert_embeddings.cpu()  # Move to CPU before saving

torch.save(sbert_embeddings, sbert_embeddings_path)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/2938 [00:00<?, ?it/s]

In [6]:
# Reload the full embedding tensor from disk so this cell does not depend on
# the encoding cell having run in the current kernel session.
# weights_only=True limits unpickling to tensor data; the bare torch.load call
# produced a warning in this notebook's recorded output.
sbert_embeddings = torch.load("/kaggle/working/sbert_embeddings.pt", weights_only=True)

# Define the path and split size
output_dir = "/kaggle/working/embeddings_parts"
os.makedirs(output_dir, exist_ok=True)
split_size = 1000  # Number of rows per split
embeddings_size = sbert_embeddings.size(0)

# Remove part files left by an earlier run (e.g. with a different split size or
# a larger corpus) so they cannot be mixed in when the parts are read back.
for stale_path in glob.glob(os.path.join(output_dir, "embeddings_part_*.pt")):
    os.remove(stale_path)

# Split and save the tensor. Part indices are not zero-padded, so readers must
# order the files by their numeric index rather than by file name.
for i in range(0, embeddings_size, split_size):
    end_idx = min(i + split_size, embeddings_size)
    # clone(): a slice is a view sharing the full tensor's storage, and
    # torch.save serializes the whole storage behind a view; cloning keeps
    # each part file limited to its own rows.
    part = sbert_embeddings[i:end_idx].clone()
    torch.save(part, os.path.join(output_dir, f"embeddings_part_{i//split_size}.pt"))

print(f"Saved all parts to {output_dir}")

  sbert_embeddings = torch.load("/kaggle/working/sbert_embeddings.pt")


Saved all parts to /kaggle/working/embeddings_parts


In [7]:
# Load and combine
part_prefix, part_suffix = "embeddings_part_", ".pt"


def _part_index(file_name: str) -> int:
    """Return the integer N from a file name of the form ``embeddings_part_N.pt``."""
    return int(file_name[len(part_prefix):-len(part_suffix)])


part_files = [
    name for name in os.listdir(output_dir)
    if name.startswith(part_prefix)
    and name.endswith(part_suffix)
    and name[len(part_prefix):-len(part_suffix)].isdigit()
]
# Sort by numeric part index. A plain sorted() on the names is lexicographic
# ("embeddings_part_10.pt" < "embeddings_part_2.pt"), which would concatenate
# the parts in the wrong row order while still yielding the right shape.
part_files.sort(key=_part_index)

loaded_parts = [
    torch.load(os.path.join(output_dir, name), weights_only=True)
    for name in part_files
]

reconstructed_embeddings = torch.cat(loaded_parts, dim=0)

# Verify the shape
print("Original shape:", sbert_embeddings.shape)
print("Reconstructed shape:", reconstructed_embeddings.shape)

# A matching shape does not prove the rows are in the original order, so
# compare the contents as well.
assert torch.equal(sbert_embeddings, reconstructed_embeddings), \
    "Reconstructed embeddings differ from the original tensor"

  loaded_parts.append(torch.load(part_path))


Original shape: torch.Size([752114, 768])
Reconstructed shape: torch.Size([752114, 768])


In [4]:
# Delete everything inside the parts directory (the directory itself is kept).
# NOTE(review): this cell's execution count (4) is lower than that of the cell
# that writes the parts (6), so it appears to have been run as a pre-cleanup.
# In top-to-bottom order it sits after the parts are written and before the
# zip cell, so a Restart & Run All would empty the directory right before it
# is archived — consider moving this cell above the split cell.
!rm -rf /kaggle/working/embeddings_parts/*

  pid, fd = os.forkpty()


In [5]:
# Remove any archive left by a previous run before a new one is created.
# presumably needed because zip adds to / updates an existing archive rather
# than recreating it, which could leave stale entries — verify.
# NOTE(review): the zip cell writes embedding_parts.zip relative to the current
# directory; this path matches only if the working directory is /kaggle/working
# — TODO confirm.
!rm -rf /kaggle/working/embedding_parts.zip

In [8]:
!zip -r embedding_parts.zip /kaggle/working/embeddings_parts

  adding: kaggle/working/embeddings_parts/ (stored 0%)
  adding: kaggle/working/embeddings_parts/embeddings_part_271.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_451.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_734.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_387.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_360.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_682.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_481.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_173.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_532.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_578.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_351.pt (deflated 7%)
  adding: kaggle/working/embeddings_parts/embeddings_part_212.pt (deflated 7