In [None]:
%pip install kagglehub
%pip install sacremoses

In [None]:
from pathlib import Path
import os
from pathlib import Path
from transformers import pipeline
from tqdm import tqdm
import pandas as pd
import torch
import kagglehub
import signal

In [None]:
class GracefulExiter:
    """Turn SIGINT / SIGTERM into a cooperative "please stop" flag.

    While an instance is active, both signals call ``exit_gracefully`` which
    only sets ``should_exit``; long-running loops are expected to poll that
    flag and stop at a safe point.

    The handlers are process-wide, so the ones that were installed before are
    remembered and can be put back with ``restore()`` (or automatically by
    using the instance as a context manager). Without that, every later
    interrupt would keep being swallowed by this object.

    Note: ``signal.signal`` may only be called from the main thread of the
    main interpreter, so instances must be created there.
    """

    _SIGNALS = (signal.SIGINT, signal.SIGTERM)

    def __init__(self):
        self.should_exit = False
        # signal.signal() returns the handler it replaces; keep it for restore().
        self._previous_handlers = {}
        for signum in self._SIGNALS:
            self._previous_handlers[signum] = signal.signal(
                signum, self.exit_gracefully
            )

    def exit_gracefully(self, signum, frame):
        """Signal handler: announce the interrupt and raise the stop flag."""
        print(
            "\nReceived interrupt signal. Finishing current work and saving progress..."
        )
        self.should_exit = True

    def restore(self):
        """Reinstall the handlers that were active before this object was created.

        Safe to call more than once. A handler is only put back if this
        object's handler is still the installed one, and only if the previous
        handler is known to Python (``signal.signal`` reports ``None`` for
        handlers that were not installed from Python and cannot reinstall them).
        """
        while self._previous_handlers:
            signum, previous = self._previous_handlers.popitem()
            if previous is None:
                continue
            if signal.getsignal(signum) == self.exit_gracefully:
                signal.signal(signum, previous)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.restore()
        return False

In [None]:
def get_dataset():
    """Download the arXiv metadata and return it interleaved by top-level category.

    Steps:
      1. Download the ``Cornell-University/arxiv`` dataset through kagglehub.
      2. Read the JSON-lines file found in the download directory.
      3. Reduce ``categories`` to the archive of the first listed category
         (e.g. ``"math.AG cs.LG"`` -> ``"math"``).
      4. Reorder rows round-robin: the first paper of every category (in
         alphabetical category order), then the second paper of every
         category, and so on. Within a category the original row order is
         kept. Any prefix of the result is therefore roughly balanced
         across categories.

    Returns:
        pandas.DataFrame with a fresh 0..n-1 index. The original row labels
        are kept in an ``index`` column (result of ``reset_index()``).

    Raises:
        FileNotFoundError: if the download directory contains no files.
    """
    # Download latest version
    path = kagglehub.dataset_download("Cornell-University/arxiv")

    print("Path to dataset files:", path)

    # Directory listing order is unspecified, so choose the file
    # deterministically and prefer JSON over anything else that may be there.
    files = sorted(entry for entry in Path(path).iterdir() if entry.is_file())
    json_files = [entry for entry in files if entry.suffix.lower() == ".json"]
    candidates = json_files or files
    if not candidates:
        raise FileNotFoundError(f"No dataset file found in {path}")
    path_to_dataset = candidates[0]
    data = pd.read_json(path_to_dataset, lines=True)

    # leave only the first common category: "math.AG cs.LG" -> "math.AG" -> "math"
    data["categories"] = [
        category.split()[0].split(".")[0] for category in data["categories"]
    ]

    # Round-robin interleave without a Python-level loop over rows:
    # rank = position of the row inside its own category; sorting by
    # (rank, category) yields rank-0 rows of all categories alphabetically,
    # then rank-1 rows, etc. The (rank, category) pairs are unique, so the
    # order is fully determined.
    sort_keys = pd.DataFrame(
        {
            "rank": data.groupby("categories").cumcount().to_numpy(),
            "category": data["categories"].to_numpy(),
        }
    )
    positions = sort_keys.sort_values(["rank", "category"]).index.to_numpy()

    result_df = data.iloc[positions].reset_index()
    return result_df

In [None]:
def _clean_text(text, max_length=512):
    """Prepare one field for translation: missing/blank -> "[EMPTY]", else stripped text capped at max_length characters."""
    if not isinstance(text, str):
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return "[EMPTY]"
        text = str(text)
    text = text.strip()
    if not text:
        return "[EMPTY]"
    return text[:max_length].rstrip()


def _translate_texts(translator, texts, batch_size, max_length=512):
    """Translate texts batch by batch; always returns one entry per input (None where a batch failed)."""
    cleaned = [_clean_text(text, max_length) for text in texts]
    results = []
    # One translator call per batch so the progress bar reflects real work
    # and a failure only costs the batch it happened in.
    for start in tqdm(range(0, len(cleaned), batch_size), desc="Translating..."):
        batch = cleaned[start : start + batch_size]
        try:
            outputs = list(
                translator(batch, max_length=max_length, batch_size=batch_size)
            )
        except Exception as e:
            print(f"Error: {e}")
            outputs = []
        if len(outputs) != len(batch):
            # Keep rows aligned with the other columns instead of returning
            # a short list that would break DataFrame construction later.
            outputs = [None] * len(batch)
        results.extend(outputs)
    return results


def translate_dataset(
    starting_from=0,
    count=1000,
    batch_size=16,
    save_interval=64,
    dataset=None,
    use_google_drive=False,
):
    """Translate a slice of the dataset from English to Russian and save it as CSV.

    Rows ``starting_from .. starting_from + count`` of ``dataset`` are
    processed in chunks of ``save_interval`` rows. After every chunk the
    cumulative result is checkpointed to
    ``dataset_parts/{start}_{end}_temp.csv`` (the previous checkpoint of this
    run is deleted). At the end the result is written to
    ``dataset_parts/{start}_{end}_final.csv``, or ``..._partial.csv`` if the
    run was interrupted by SIGINT/SIGTERM before all rows were done. In every
    file name ``end`` is ``starting_from`` plus the number of rows covered.

    Args:
        starting_from: position of the first row to translate.
        count: maximum number of rows to translate; clamped to the rows
            actually available after ``starting_from``.
        batch_size: number of texts per translator call.
        save_interval: number of rows per chunk / checkpoint.
        dataset: DataFrame with ``authors``, ``title``, ``abstract`` and
            ``categories`` columns. If ``None``, ``get_dataset()`` is called.
        use_google_drive: accepted for compatibility but currently ignored;
            output always goes to ``./dataset_parts``. (The original Colab
            sketch mounted Google Drive and wrote to
            ``/content/drive/MyDrive/arxiv_translations``.)

    Notes:
        * ``authors``, ``title`` and ``abstract`` are translated;
          ``categories`` is copied unchanged.
        * Each text is truncated to 512 characters before translation.
        * NOTE(review): translated cells hold the raw item returned by the
          pipeline. Per the Transformers docs a translation pipeline returns
          dicts such as ``{"translation_text": ...}``, so the CSV contains
          that object's repr — confirm whether downstream expects plain text.
        * Texts in a batch that failed to translate are stored as missing
          values so the rows stay aligned.
    """
    if dataset is None:
        dataset = get_dataset()

    target_folder = Path("dataset_parts")
    target_folder.mkdir(exist_ok=True)

    # take the necessary interval; the slice may be shorter than requested,
    # so work with the real number of rows from here on
    part_df = dataset.iloc[starting_from : starting_from + count]
    count = len(part_df)

    # GracefulExiter replaces the process-wide handlers; remember the current
    # ones so they can be put back when this function returns or raises.
    handled_signals = (signal.SIGINT, signal.SIGTERM)
    previous_handlers = {signum: signal.getsignal(signum) for signum in handled_signals}

    # to catch keyboard interrupts
    exiter = GracefulExiter()

    try:
        # download the model
        translator = pipeline(
            "translation_en_to_ru",
            model="Helsinki-NLP/opus-mt-en-ru",
            device="cuda" if torch.cuda.is_available() else "cpu",
            torch_dtype="auto",
        )

        translated_chunks = []
        russian_data = pd.DataFrame(
            columns=["authors", "title", "abstract", "categories"]
        )
        previous_temp_file = None

        for chunk_start in range(0, count, save_interval):
            if exiter.should_exit:
                print("Interrupt detected. Saving partial results...")
                break

            chunk_end = min(chunk_start + save_interval, count)
            print(f"Processing records {chunk_start} to {chunk_end}...")

            chunk_df = part_df.iloc[chunk_start:chunk_end]

            translated_chunk = {
                "authors": _translate_texts(
                    translator, chunk_df["authors"].tolist(), batch_size
                ),
                "title": _translate_texts(
                    translator, chunk_df["title"].tolist(), batch_size
                ),
                "abstract": _translate_texts(
                    translator, chunk_df["abstract"].tolist(), batch_size
                ),
                "categories": chunk_df["categories"].tolist(),
            }

            # The signal handler only sets a flag, so a chunk that reaches
            # this point is complete: keep it even if an interrupt arrived
            # meanwhile; the check at the top of the loop stops the run.
            translated_chunks.append(pd.DataFrame(translated_chunk))
            russian_data = pd.concat(translated_chunks, ignore_index=True)

            # save temporary results
            temp_filename = (
                target_folder / f"{starting_from}_{starting_from + chunk_end}_temp.csv"
            )
            russian_data.to_csv(temp_filename, index=False)
            print(f"Saved temporary results to {temp_filename}")

            # removing previous temporary file
            if previous_temp_file is not None and previous_temp_file.exists():
                previous_temp_file.unlink()
                print(f"Removed previous temporary file: {previous_temp_file}")

            previous_temp_file = temp_filename

        finished = len(russian_data) == count

        if finished:
            final_filename = (
                target_folder / f"{starting_from}_{starting_from + count}_final.csv"
            )
            print(f"\nProcessing completed. Saving final results to {final_filename}")
        else:
            # interrupted before all rows were processed
            final_filename = (
                target_folder
                / f"{starting_from}_{starting_from + len(russian_data)}_partial.csv"
            )
            print(f"\nProcess interrupted. Saving partial results to {final_filename}")

        russian_data.to_csv(final_filename, index=False)

        # Remove only this run's checkpoint; other ranges may be in progress
        # in the same folder and must keep theirs.
        if finished:
            if previous_temp_file is not None and previous_temp_file.exists():
                previous_temp_file.unlink()
            print("Temporary files removed.")
    finally:
        for signum, previous in previous_handlers.items():
            # None means the old handler was not installed from Python and
            # cannot be reinstalled through signal.signal().
            if previous is not None and signal.getsignal(signum) is not previous:
                signal.signal(signum, previous)

In [None]:
# Build the dataset once; `df` is passed to translate_dataset() in the next cell.
df = get_dataset()

In [None]:
# Translate the first 50,000 rows of `df`, checkpointing every 512 rows.
translate_dataset(
    dataset=df,
    starting_from=0,
    count=50_000,
    batch_size=128,
    save_interval=512,
)