{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Byte-Pair Encoding (BPE) tokenizer training\n",
    "\n",
    "Trains a regex-pre-tokenized BPE vocabulary (GPT-style) on a local text corpus.\n",
    "The regex splits text into chunks first, and merges are never allowed to cross\n",
    "chunk boundaries — this keeps tokens aligned with word/punctuation structure.\n",
    "\n",
    "Run top-to-bottom (Restart Kernel → Run All)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports and configuration (all in one place so the notebook re-runs cleanly).\n",
    "import regex as re  # third-party `regex`: needed for \\p{L}, \\p{N}, possessive ?+/++\n",
    "\n",
    "# NOTE(review): absolute local path — point this at your own corpus.\n",
    "FILE_PATH = \"/Users/mohammad.ibrahim/Desktop/TSAI/combined_text.txt\"\n",
    "VOCAB_SIZE = 1000  # desired final vocabulary size (256 base bytes + merges)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw training text.\n",
    "with open(FILE_PATH, 'r', encoding='utf-8') as file:\n",
    "    text = file.read()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GPT-style pre-tokenization pattern, extended with the Devanagari danda (।)\n",
    "# and bullet (•) so those marks become standalone chunks.\n",
    "pattern = r\"\"\"'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}।•]?+\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}।•]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+|।|•\"\"\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the text into chunks, then encode each chunk to UTF-8 byte values.\n",
    "# `tokens` is a list of lists of ints in 0..255 — one inner list per chunk.\n",
    "text_chunks = re.findall(pattern, text)\n",
    "tokens = [list(ch.encode(\"utf-8\")) for ch in text_chunks]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: every value is already an int byte in 0..255.\n",
    "# (Bug fix: the previous `tokens = list(map(int, tokens))` raised TypeError,\n",
    "# because `tokens` is a list of per-chunk lists, not a flat bytes object —\n",
    "# it only ran before, out of order, against the old `text.encode(\"utf-8\")` version.)\n",
    "assert all(0 <= b < 256 for chunk in tokens for b in chunk)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_stats(ids, counts=None):\n",
    "    \"\"\"Count adjacent-pair frequencies in `ids`.\n",
    "\n",
    "    Pass an existing `counts` dict to accumulate across multiple chunks;\n",
    "    omitting it (the original call signature) starts a fresh count.\n",
    "    \"\"\"\n",
    "    counts = {} if counts is None else counts\n",
    "    for pair in zip(ids, ids[1:]):\n",
    "        counts[pair] = counts.get(pair, 0) + 1\n",
    "    return counts\n",
    "\n",
    "def merge(ids, pair, idx):\n",
    "    \"\"\"Return a new list where every occurrence of `pair` in `ids` is replaced by `idx`.\"\"\"\n",
    "    newids = []\n",
    "    i = 0\n",
    "    while i < len(ids):\n",
    "        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:\n",
    "            newids.append(idx)\n",
    "            i += 2\n",
    "        else:\n",
    "            newids.append(ids[i])\n",
    "            i += 1\n",
    "    return newids\n",
    "\n",
    "# --- training loop ---\n",
    "num_merges = VOCAB_SIZE - 256\n",
    "ids = [list(chunk) for chunk in tokens]  # deep-ish copy; keep chunk structure intact\n",
    "\n",
    "merges = {}  # (int, int) -> int\n",
    "for i in range(num_merges):\n",
    "    # Accumulate pair counts across all chunks; pairs never span two chunks,\n",
    "    # so merges respect the regex pre-tokenization boundaries.\n",
    "    stats = {}\n",
    "    for chunk_ids in ids:\n",
    "        get_stats(chunk_ids, stats)\n",
    "    if not stats:  # corpus too small to merge further\n",
    "        break\n",
    "    pair = max(stats, key=stats.get)\n",
    "    idx = 256 + i\n",
    "    # print(f\"merging {pair} into a new token {idx}\")\n",
    "    ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]\n",
    "    merges[pair] = idx\n",
    "\n",
    "n_tokens = sum(len(chunk) for chunk in tokens)\n",
    "n_ids = sum(len(chunk) for chunk in ids)\n",
    "print(\"tokens length:\", n_tokens)\n",
    "print(\"ids length:\", n_ids)\n",
    "print(f\"compression ratio: {n_tokens / n_ids:.2f}X\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}