{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "collapsed_sections": [ "Z3N2WMYNV-qX" ], "authorship_tag": "ABX9TyOuk8MIfThoeWnRbBQlPf+h", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "gpuClass": "standard" }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "\"Open" ] }, { "cell_type": "code", "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')\n", "project_path = '/content/drive/MyDrive/projects/Stock_Predicter'\n", "%cd $project_path" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Xr3Qozgfktoc", "outputId": "e80033fb-a41f-438f-fc90-60bc0317d5d3" }, "execution_count": 2, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Mounted at /content/drive\n", "/content/drive/MyDrive/projects/Stock_Predicter\n" ] } ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "id": "e8SQqogMQYLh" }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import pandas as pd\n", "import pandas_datareader as web\n", "import datetime as dt\n", "import yfinance as yfin\n", "import tensorflow as tf\n", "import os\n", "import re\n", "\n", "from sklearn.preprocessing import MinMaxScaler\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense, Dropout, LSTM\n" ] }, { "cell_type": "markdown", "source": [ "# Get Data" ], "metadata": { "id": "5vO8pty3VwkG" } }, { "cell_type": "code", "source": [ "# Select a company for now\n", "ticker = 'AAPL'\n", "\n", "start = dt.datetime(2013,1,1)\n", "end = dt.datetime(2023,4,5)" ], "metadata": { "id": "O6dtJpJwS5Eg" }, "execution_count": 93, "outputs": [] }, { "cell_type": "code", "source": [ "yfin.pdr_override()\n", "data = web.data.get_data_yahoo(ticker, start, end)\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LwPyk8Uh-Zz_", "outputId": "63953807-ca2e-4e18-c571-a6bcc4f8db5d" }, "execution_count": 5, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\r[*********************100%***********************] 1 of 1 completed\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Preprocess_data" ], "metadata": { "id": "SSuS9OONV5-a" } }, { "cell_type": "code", "source": [ "def normalize_data(data, relative_to_previous=True, scaler=None):\n", " def substract_to_values(data, value):\n", " df_copy = pd.DataFrame.copy(data)\n", " df_copy[['Open', 'High', 'Low', 'Close', 'Adj Close']] = df_copy[['Open', 'High', 'Low', 'Close', 'Adj Close']] - value\n", " return df_copy\n", " if relative_to_previous:\n", " the_data = pd.DataFrame(substract_to_values(data.iloc[0], data.iloc[0]['Open'])).T\n", " # the_data = substract_to_values(data.iloc[0], data.iloc[0]['Open']).to_frame().T # This is the same as the previous line\n", " for i in range(1,len(data)):\n", " the_data = pd.concat((the_data, substract_to_values(data.iloc[i], data.iloc[i-1]['Close']).to_frame().T))\n", " else:\n", " the_data = pd.DataFrame.copy(data)\n", " \n", " if scaler is None:\n", " # Create the scaler\n", " values = the_data.values\n", " # print('values')\n", " # print(values)\n", " max_value = np.max(values[:,:-1])\n", " # print(max_value)\n", " min_value = np.min(values[:,:-1])\n", " # print(min_value)\n", " max_volume = np.max(values[:,-1])\n", " min_volume = np.min(values[:,-1])\n", " # print(max_volume, min_volume)\n", " def scaler(data):\n", " values = data.values\n", " # print(values)\n", " values[:,:-1] = (values[:,:-1] - min_value) / (max_value-min_value) * 2 - 1\n", " values[:,-1] = (values[:,-1] - min_volume) / (max_volume-min_volume) * 2 - 1\n", " # print(values)\n", " return data\n", " def anti_scaler(values):\n", " decoded_values = (values + 1) * (max_value-min_value) / 2 + min_value \n", " return decoded_values\n", " \n", " normalized_data = scaler(the_data)\n", "\n", " return normalized_data, scaler, anti_scaler\n", "\n", "\n" ], "metadata": { "id": "v9RoqzBvtrOb" }, "execution_count": 111, "outputs": [] }, { "cell_type": "code", "source": [ "norm_data, the_scaler, the_decoder = normalize_data(data, relative_to_previous=True)\n", "#todo: save the_scaler somehow to use in new runtimes" ], "metadata": { "id": "-kgo__Q3hw1_" }, "execution_count": 112, "outputs": [] }, { "cell_type": "code", "source": [ "len(norm_data)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "A1L8giqcsutX", "outputId": "0aaf515b-3835-432c-b882-c2111a221ed4" }, "execution_count": 41, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "2583" ] }, "metadata": {}, "execution_count": 41 } ] }, { "cell_type": "code", "source": [ "prediction_days = 100\n", "\n", "x_train_list = []\n", "y_train_list = []\n", "\n", "for i in range(prediction_days, len(norm_data)):\n", " x_train_list.append(norm_data[i-prediction_days:i])\n", " y_train_list.append(norm_data.iloc[i].values[0:4])\n", "\n", "x_train = np.array(x_train_list)\n", "y_train = np.array(y_train_list)" ], "metadata": { "id": "jMXkRAYFomHM" }, "execution_count": 9, "outputs": [] }, { "cell_type": "code", "source": [ "print(x_train.shape)\n", "print(y_train.shape)\n", "print(x_train.shape[1:])" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "G7oMd1fRyOYt", "outputId": "2094c403-096d-4f3a-9b15-bae0fbb7bf11" }, "execution_count": 10, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(2483, 100, 6)\n", "(2483, 4)\n", "(100, 6)\n" ] } ] }, { "cell_type": "markdown", "source": [ "# Model" ], "metadata": { "id": "Z3N2WMYNV-qX" } }, { "cell_type": "markdown", "source": [ "## Create Model" ], "metadata": { "id": "emDyvzVUp5KJ" } }, { "cell_type": "code", "source": [ "def create_model():\n", " model = Sequential()\n", " # model.add(LSTM(units=112, return_sequences=True, input_shape=(x_train.shape[1:])))\n", " model.add(LSTM(units=112, return_sequences=True, input_shape=(None,x_train.shape[-1],)))\n", " model.add(Dropout(0.2))\n", " model.add(LSTM(units=112, return_sequences=True))\n", " model.add(Dropout(0.2))\n", " model.add(LSTM(units=50))\n", " model.add(Dropout(0.2))\n", " model.add(Dense(units=4))\n", " return model\n", "\n", "model = create_model()\n", "print(model.summary())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "GXhYAKzXVfku", "outputId": "c54da788-6e82-4679-df1f-d3e89a20d228" }, "execution_count": 66, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Model: \"sequential_1\"\n", "_________________________________________________________________\n", " Layer (type) Output Shape Param # \n", "=================================================================\n", " lstm_3 (LSTM) (None, None, 112) 53312 \n", " \n", " dropout_3 (Dropout) (None, None, 112) 0 \n", " \n", " lstm_4 (LSTM) (None, None, 112) 100800 \n", " \n", " dropout_4 (Dropout) (None, None, 112) 0 \n", " \n", " lstm_5 (LSTM) (None, 50) 32600 \n", " \n", " dropout_5 (Dropout) (None, 50) 0 \n", " \n", " dense_1 (Dense) (None, 4) 204 \n", " \n", "=================================================================\n", "Total params: 186,916\n", "Trainable params: 186,916\n", "Non-trainable params: 0\n", "_________________________________________________________________\n", "None\n" ] } ] }, { "cell_type": "code", "source": [ "model.compile(optimizer='adam', loss='mean_squared_error')" ], "metadata": { "id": "ZhoWj_XeXQws" }, "execution_count": 12, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Create checkpoint callback" ], "metadata": { "id": "XU0vc4n8p92L" } }, { "cell_type": "code", "source": [ "# Directory where the checkpoints will be saved\n", "checkpoint_dir = './training_checkpoints_'+dt.datetime.now().strftime(\"%Y%m%d%H%M%S\")\n", "# Name of the checkpoint files\n", "checkpoint_prefix = os.path.join(checkpoint_dir, \"ckpt_epoch{epoch}_loss{loss}\")\n", "\n", "checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(\n", " filepath=checkpoint_prefix,\n", " save_weights_only=True)" ], "metadata": { "id": "M5MBAB1-qCZr" }, "execution_count": 35, "outputs": [] }, { "cell_type": "markdown", "source": [ "## Model Train" ], "metadata": { "id": "65QbfffusPoJ" } }, { "cell_type": "code", "source": [ "print(x_train.shape)\n", "print(y_train.shape)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HDT9XPXHvqyN", "outputId": "60938333-8afe-4b80-9af3-37bca3d67f83" }, "execution_count": 15, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(2483, 100, 6)\n", "(2483, 4)\n" ] } ] }, { "cell_type": "code", "source": [ "y_train[-2]" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "F1wZkJMh3XNH", "outputId": "37a023db-0727-434a-85be-141c3c377907" }, "execution_count": 40, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([ 0.02002301, 0.0391905 , -0.09898045, -0.05744885])" ] }, "metadata": {}, "execution_count": 40 } ] }, { "cell_type": "code", "source": [ "model.fit(x_train, y_train, epochs=25, batch_size=32, callbacks=[checkpoint_callback])\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9Ccc_Ej2TmYO", "outputId": "235efc3b-616b-4e57-fb87-07efcb377e8e" }, "execution_count": 37, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Epoch 1/25\n", "78/78 [==============================] - 31s 395ms/step - loss: 0.0117\n", "Epoch 2/25\n", "78/78 [==============================] - 31s 394ms/step - loss: 0.0111\n", "Epoch 3/25\n", "78/78 [==============================] - 33s 429ms/step - loss: 0.0109\n", "Epoch 4/25\n", "78/78 [==============================] - 31s 396ms/step - loss: 0.0109\n", "Epoch 5/25\n", "78/78 [==============================] - 31s 398ms/step - loss: 0.0108\n", "Epoch 6/25\n", "78/78 [==============================] - 31s 400ms/step - loss: 0.0108\n", "Epoch 7/25\n", "78/78 [==============================] - 32s 405ms/step - loss: 0.0108\n", "Epoch 8/25\n", "78/78 [==============================] - 31s 394ms/step - loss: 0.0108\n", "Epoch 9/25\n", "78/78 [==============================] - 30s 385ms/step - loss: 0.0108\n", "Epoch 10/25\n", "78/78 [==============================] - 30s 385ms/step - loss: 0.0108\n", "Epoch 11/25\n", "78/78 [==============================] - 29s 373ms/step - loss: 0.0108\n", "Epoch 12/25\n", "78/78 [==============================] - 29s 375ms/step - loss: 0.0107\n", "Epoch 13/25\n", "78/78 [==============================] - 30s 383ms/step - loss: 0.0107\n", "Epoch 14/25\n", "78/78 [==============================] - 30s 388ms/step - loss: 0.0107\n", "Epoch 15/25\n", "78/78 [==============================] - 31s 396ms/step - loss: 0.0108\n", "Epoch 16/25\n", "78/78 [==============================] - 30s 379ms/step - loss: 0.0107\n", "Epoch 17/25\n", "78/78 [==============================] - 30s 386ms/step - loss: 0.0107\n", "Epoch 18/25\n", "78/78 [==============================] - 30s 383ms/step - loss: 0.0108\n", "Epoch 19/25\n", "78/78 [==============================] - 30s 382ms/step - loss: 0.0107\n", "Epoch 20/25\n", "78/78 [==============================] - 31s 397ms/step - loss: 0.0107\n", "Epoch 21/25\n", "78/78 [==============================] - 30s 384ms/step - loss: 0.0107\n", "Epoch 22/25\n", "78/78 [==============================] - 30s 381ms/step - loss: 0.0106\n", "Epoch 23/25\n", "78/78 [==============================] - 30s 380ms/step - loss: 0.0106\n", "Epoch 24/25\n", "78/78 [==============================] - 30s 385ms/step - loss: 0.0107\n", "Epoch 25/25\n", "78/78 [==============================] - 30s 383ms/step - loss: 0.0106\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "markdown", "source": [ "# Testing a model" ], "metadata": { "id": "dbSKl47vZvpe" } }, { "cell_type": "code", "source": [ "#print trainings directories to pick one\n", "!ls -d training_checkpoints_*/" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "59CDDB0i4yTx", "outputId": "497ae253-e3ac-47d0-d066-8e508f55782c" }, "execution_count": 49, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "training_checkpoints_20230406041748/\n" ] } ] }, { "cell_type": "code", "source": [ "test_model = create_model()" ], "metadata": { "id": "tpmru7nG9kbW" }, "execution_count": 72, "outputs": [] }, { "cell_type": "code", "source": [ "checkpoint_dir = 'training_checkpoints_20230406041748'\n", "\n", "def load_weights(epoch=None):\n", " if epoch is None:\n", " weights_file = tf.train.latest_checkpoint(checkpoint_dir)\n", " else:\n", " with os.scandir(checkpoint_dir) as entries:\n", " for entry in entries:\n", " if re.search(f'^ckpt_epoch{epoch}_.*\\.index', entry.name):\n", " weights_file = checkpoint_dir + '/'+ entry.name[:-6]\n", "\n", " print(weights_file)\n", " test_model.load_weights(weights_file)\n", " return test_model\n", "\n", "test_model = load_weights()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wQ0JTXsp4VKF", "outputId": "d4b794c9-7a89-4867-d17c-de1f20b9b607" }, "execution_count": 87, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "training_checkpoints_20230406041748/ckpt_epoch25_loss0.01064301934093237\n" ] } ] }, { "cell_type": "code", "source": [ "test_start = dt.date.today() - dt.timedelta(days=200)\n", "test_end = dt.date.today()\n", "\n", "yfin.pdr_override()\n", "test_data = web.data.get_data_yahoo(ticker, test_start, test_end)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Mf4q97pfaSCA", "outputId": "4317ef63-be5e-49ca-fdca-1d5760efbba1" }, "execution_count": 99, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\r[*********************100%***********************] 1 of 1 completed\n" ] } ] }, { "cell_type": "code", "source": [ "test_data_norm, _ = normalize_data(test_data, scaler=the_scaler)" ], "metadata": { "id": "xEG2yEdKC8uy" }, "execution_count": 100, "outputs": [] }, { "cell_type": "code", "source": [ "print(type(test_data_norm))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mhbqRZ6cDhd6", "outputId": "8b40a738-e143-4920-de03-8e8572f4389a" }, "execution_count": 102, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n" ] } ] }, { "cell_type": "code", "source": [ "input_data = np.expand_dims(test_data_norm.values, axis=0)\n", "print(input_data.shape)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "F2bnofchD0xv", "outputId": "0b2261fb-056d-4ec2-a98b-82517d7806f1" }, "execution_count": 104, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "(1, 138, 6)\n" ] } ] }, { "cell_type": "code", "source": [ "results = test_model.predict(input_data, batch_size=1)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "AVYFQZnqEqhx", "outputId": "958d1669-c8bc-4eff-bb66-f25eb4dde011" }, "execution_count": 105, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "1/1 [==============================] - 1s 1s/step\n" ] } ] }, { "cell_type": "code", "source": [ "print(results)\n", "print(the_decoder(results))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FbdX4ulhExsX", "outputId": "14a763ca-0983-41ec-e88b-da796fa4b51a" }, "execution_count": 113, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "[[-0.01962117 0.09634934 -0.10176479 -0.00849891]]\n", "[[-0.06636524 1.3856668 -1.0948591 0.0728941 ]]\n" ] } ] }, { "cell_type": "code", "source": [ "test_data.head()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 237 }, "id": "m0k7toG3E2_9", "outputId": "38ab6e43-1321-4028-9482-8e6687802a7d" }, "execution_count": 107, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Open High Low Close Adj Close \\\n", "Date \n", "2022-09-19 149.309998 154.559998 149.100006 154.479996 153.989029 \n", "2022-09-20 153.399994 158.080002 153.080002 156.899994 156.401352 \n", "2022-09-21 157.339996 158.740005 153.600006 153.720001 153.231461 \n", "2022-09-22 152.380005 154.470001 150.910004 152.740005 152.254578 \n", "2022-09-23 151.190002 151.470001 148.559998 150.429993 149.951904 \n", "\n", " Volume \n", "Date \n", "2022-09-19 81474200 \n", "2022-09-20 107689800 \n", "2022-09-21 101696800 \n", "2022-09-22 86652500 \n", "2022-09-23 96029900 " ], "text/html": [ "\n", "
\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
OpenHighLowCloseAdj CloseVolume
Date
2022-09-19149.309998154.559998149.100006154.479996153.98902981474200
2022-09-20153.399994158.080002153.080002156.899994156.401352107689800
2022-09-21157.339996158.740005153.600006153.720001153.231461101696800
2022-09-22152.380005154.470001150.910004152.740005152.25457886652500
2022-09-23151.190002151.470001148.559998150.429993149.95190496029900
\n", "
\n", " \n", " \n", " \n", "\n", " \n", "
\n", "
\n", " " ] }, "metadata": {}, "execution_count": 107 } ] } ] }