{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "execution": { "iopub.execute_input": "2025-04-09T09:04:50.582374Z", "iopub.status.busy": "2025-04-09T09:04:50.581446Z", "iopub.status.idle": "2025-04-09T09:04:54.831276Z", "shell.execute_reply": "2025-04-09T09:04:54.829937Z", "shell.execute_reply.started": "2025-04-09T09:04:50.582330Z" }, "id": "POBbLwluCMeK", "outputId": "9589beb5-86c8-4b44-d9bd-cc3316c838c9" }, "outputs": [], "source": [ "%pip install kagglehub\n", "%pip install sacremoses" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-04-09T09:04:54.834196Z", "iopub.status.busy": "2025-04-09T09:04:54.833289Z", "iopub.status.idle": "2025-04-09T09:04:58.835896Z", "shell.execute_reply": "2025-04-09T09:04:58.834641Z", "shell.execute_reply.started": "2025-04-09T09:04:54.834135Z" }, "id": "BwJ36n6vZUB2", "tags": [] }, "outputs": [], "source": [ "from pathlib import Path\n", "import os\n", "from pathlib import Path\n", "from transformers import pipeline\n", "from tqdm import tqdm\n", "import pandas as pd\n", "import torch\n", "import kagglehub\n", "import signal" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2025-04-09T09:04:58.838507Z", "iopub.status.busy": "2025-04-09T09:04:58.837160Z", "iopub.status.idle": "2025-04-09T09:04:58.856737Z", "shell.execute_reply": "2025-04-09T09:04:58.855801Z", "shell.execute_reply.started": "2025-04-09T09:04:58.838466Z" }, "id": "cOIT5Hu5FdT2" }, "outputs": [], "source": [ "class GracefulExiter:\n", " # to catch keyboard interrupts\n", " def __init__(self):\n", " self.should_exit = False\n", " signal.signal(signal.SIGINT, self.exit_gracefully)\n", " signal.signal(signal.SIGTERM, self.exit_gracefully)\n", "\n", " def exit_gracefully(self, signum, frame):\n", " print(\n", " \"\\nReceived interrupt signal. 
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-09T09:04:58.859897Z",
     "iopub.status.busy": "2025-04-09T09:04:58.858860Z",
     "iopub.status.idle": "2025-04-09T09:04:58.886712Z",
     "shell.execute_reply": "2025-04-09T09:04:58.885792Z",
     "shell.execute_reply.started": "2025-04-09T09:04:58.859858Z"
    },
    "id": "Fg9c5cFZZyoG"
   },
   "outputs": [],
   "source": [
    "def get_dataset():\n",
    "    # Download the latest version of the arXiv metadata dump\n",
    "    path = kagglehub.dataset_download(\"Cornell-University/arxiv\")\n",
    "    print(\"Path to dataset files:\", path)\n",
    "\n",
    "    file_name = os.listdir(path)[0]\n",
    "    path_to_dataset = Path(path) / file_name\n",
    "    data = pd.read_json(path_to_dataset, lines=True)\n",
    "\n",
    "    # Keep only the first listed category, then only its top-level part\n",
    "    # (e.g. \"cs.CL stat.ML\" -> \"cs.CL\" -> \"cs\")\n",
    "    data[\"categories\"] = [category.split()[0] for category in data[\"categories\"]]\n",
    "    data[\"categories\"] = [category.split(\".\")[0] for category in data[\"categories\"]]\n",
    "\n",
    "    # Interleave the categories round-robin so that any prefix of the\n",
    "    # resulting frame is roughly balanced across categories\n",
    "    counts = data.groupby(by=\"categories\")[\"title\"].count().sort_index()\n",
    "    unique_categories = counts.index.to_list()\n",
    "\n",
    "    groups_same_category = {\n",
    "        category: data[data[\"categories\"] == category] for category in unique_categories\n",
    "    }\n",
    "\n",
    "    max_group_size = counts.max()\n",
    "\n",
    "    new_df = []\n",
    "    for i in range(max_group_size):\n",
    "        for category in unique_categories:\n",
    "            if i < len(groups_same_category[category]):\n",
    "                new_df.append(groups_same_category[category].iloc[i])\n",
    "\n",
    "    result_df = pd.DataFrame(new_df).reset_index()\n",
    "    return result_df"
   ]
  },
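  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged sketch (toy data, not the arXiv dump) of the round-robin pass in\n",
    "# get_dataset(): any prefix of the interleaved frame covers every category,\n",
    "# so translating only the first N rows still samples all of them.\n",
    "toy = pd.DataFrame(\n",
    "    {\n",
    "        \"title\": [\"a1\", \"a2\", \"a3\", \"b1\", \"c1\", \"c2\"],\n",
    "        \"categories\": [\"astro\", \"astro\", \"astro\", \"bio\", \"cs\", \"cs\"],\n",
    "    }\n",
    ")\n",
    "groups = {category: group for category, group in toy.groupby(\"categories\")}\n",
    "interleaved = []\n",
    "for i in range(max(len(group) for group in groups.values())):\n",
    "    for category in sorted(groups):\n",
    "        if i < len(groups[category]):\n",
    "            interleaved.append(groups[category].iloc[i])\n",
    "print(pd.DataFrame(interleaved)[\"title\"].tolist())\n",
    "# expected: ['a1', 'b1', 'c1', 'a2', 'c2', 'a3']"
   ]
  },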
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-09T09:04:58.889441Z",
     "iopub.status.busy": "2025-04-09T09:04:58.887873Z",
     "iopub.status.idle": "2025-04-09T09:04:58.910755Z",
     "shell.execute_reply": "2025-04-09T09:04:58.909796Z",
     "shell.execute_reply.started": "2025-04-09T09:04:58.889390Z"
    },
    "id": "RqdjPXAk1dyg",
    "tags": []
   },
   "outputs": [],
   "source": [
    "def translate_dataset(\n",
    "    starting_from=0,\n",
    "    count=1000,\n",
    "    batch_size=16,\n",
    "    save_interval=64,\n",
    "    dataset=None,\n",
    "    use_google_drive=False,\n",
    "):\n",
    "    # If a dataset is passed in, use it; otherwise download it first.\n",
    "\n",
    "    # On Colab you can save the files to Google Drive instead, in case the\n",
    "    # session ends before you manage to download everything:\n",
    "    # if use_google_drive:\n",
    "    #     from google.colab import drive\n",
    "    #     drive.mount('/content/drive')\n",
    "    #     target_folder = Path(\"/content/drive/MyDrive/arxiv_translations\")\n",
    "    # else:\n",
    "    #     target_folder = Path(\"russian_dataset\")\n",
    "    # target_folder.mkdir(exist_ok=True)\n",
    "\n",
    "    target_folder = Path(\"dataset_parts\")\n",
    "    target_folder.mkdir(exist_ok=True)\n",
    "\n",
    "    # Catch keyboard interrupts so partial progress can be saved\n",
    "    exiter = GracefulExiter()\n",
    "\n",
    "    if dataset is None:\n",
    "        dataset = get_dataset()\n",
    "    result_df = dataset.copy()\n",
    "\n",
    "    # Download the translation model\n",
    "    translator = pipeline(\n",
    "        \"translation_en_to_ru\",\n",
    "        model=\"Helsinki-NLP/opus-mt-en-ru\",\n",
    "        device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
    "        torch_dtype=\"auto\",\n",
    "    )\n",
    "\n",
    "    def clean_text(text, max_length=512):\n",
    "        if pd.isna(text) or str(text).strip() == \"\":\n",
    "            return \"[EMPTY]\"\n",
    "        return str(text).strip()[:max_length]\n",
    "\n",
    "    def translate_batch(texts, batch_size=batch_size, max_length=512):\n",
    "        results = []\n",
    "        texts = [clean_text(text, max_length) for text in texts]\n",
    "        try:\n",
    "            for out in tqdm(\n",
    "                translator(texts, max_length=max_length, batch_size=batch_size),\n",
    "                total=len(texts),\n",
    "                desc=\"Translating...\",\n",
    "            ):\n",
    "                # The pipeline yields dicts like {\"translation_text\": \"...\"}\n",
    "                results.append(out[\"translation_text\"])\n",
    "        except Exception as e:\n",
    "            print(f\"Error: {e}\")\n",
    "        return results\n",
    "\n",
    "    # Take the requested slice of the dataset\n",
    "    part_df = result_df.iloc[starting_from : starting_from + count]\n",
    "\n",
    "    russian_data = pd.DataFrame(columns=[\"authors\", \"title\", \"abstract\", \"categories\"])\n",
    "\n",
    "    previous_temp_file = None\n",
    "\n",
    "    for chunk_start in range(0, count, save_interval):\n",
    "        if exiter.should_exit:\n",
    "            break\n",
    "\n",
    "        chunk_end = min(chunk_start + save_interval, count)\n",
    "        print(f\"Processing records {chunk_start} to {chunk_end}...\")\n",
    "\n",
    "        chunk_df = part_df.iloc[chunk_start:chunk_end]\n",
    "\n",
    "        translated_chunk = {\n",
    "            \"authors\": translate_batch(chunk_df[\"authors\"].tolist()),\n",
    "            \"title\": translate_batch(chunk_df[\"title\"].tolist()),\n",
    "            \"abstract\": translate_batch(chunk_df[\"abstract\"].tolist()),\n",
    "            \"categories\": chunk_df[\"categories\"].tolist(),\n",
    "        }\n",
    "        # Append the finished chunk before checking for an interrupt,\n",
    "        # so already translated work is never discarded\n",
    "        chunk_df_translated = pd.DataFrame(translated_chunk)\n",
    "        russian_data = pd.concat([russian_data, chunk_df_translated], ignore_index=True)\n",
    "\n",
    "        if exiter.should_exit:\n",
    "            print(\"Interrupt detected. Saving partial results...\")\n",
    "            break\n",
    "\n",
    "        # Save intermediate results\n",
    "        temp_filename = (\n",
    "            target_folder / f\"{starting_from}_{starting_from + chunk_end}_temp.csv\"\n",
    "        )\n",
    "        russian_data.to_csv(temp_filename, index=False)\n",
    "        print(f\"Saved temporary results to {temp_filename}\")\n",
    "\n",
    "        # Remove the previous temporary file\n",
    "        if previous_temp_file is not None and previous_temp_file.exists():\n",
    "            previous_temp_file.unlink()\n",
    "            print(f\"Removed previous temporary file: {previous_temp_file}\")\n",
    "\n",
    "        previous_temp_file = temp_filename\n",
    "\n",
    "    if exiter.should_exit:\n",
    "        # Keyboard interrupt: save whatever was translated so far\n",
    "        final_filename = (\n",
    "            target_folder\n",
    "            / f\"{starting_from}_{starting_from + len(russian_data)}_partial.csv\"\n",
    "        )\n",
    "        print(f\"\\nProcess interrupted. Saving partial results to {final_filename}\")\n",
    "    else:\n",
    "        final_filename = (\n",
    "            target_folder / f\"{starting_from}_{starting_from + count}_final.csv\"\n",
    "        )\n",
    "        print(f\"\\nProcessing completed. Saving final results to {final_filename}\")\n",
    "\n",
    "    russian_data.to_csv(final_filename, index=False)\n",
    "\n",
    "    # Remove temporary files\n",
    "    if not exiter.should_exit:\n",
    "        for temp_file in target_folder.glob(\"*_temp.csv\"):\n",
    "            temp_file.unlink()\n",
    "        print(\"Temporary files removed.\")"
   ]
  },
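  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hedged usage sketch: because every saved file is named\n",
    "# \"<start>_<end>_(temp|partial|final).csv\", an interrupted run can be\n",
    "# resumed from where it stopped. The file name below is hypothetical;\n",
    "# substitute the partial file your own run produced.\n",
    "# saved = pd.read_csv(\"dataset_parts/0_4096_partial.csv\")\n",
    "# translate_dataset(\n",
    "#     starting_from=len(saved),\n",
    "#     count=50_000 - len(saved),\n",
    "#     dataset=df,\n",
    "#     batch_size=128,\n",
    "#     save_interval=512,\n",
    "# )"
   ]
  },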
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2025-04-09T09:04:58.913113Z",
     "iopub.status.busy": "2025-04-09T09:04:58.911808Z"
    }
   },
   "outputs": [],
   "source": [
    "df = get_dataset()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "mlO-3KoY8uT6",
    "outputId": "bb555bc7-6ad4-43ef-d096-06ef01b07525",
    "tags": []
   },
   "outputs": [],
   "source": [
    "translate_dataset(\n",
    "    starting_from=0, count=50_000, dataset=df, batch_size=128, save_interval=512\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "display_name": "DataSphere Kernel",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}