File size: 127,189 Bytes
ad7eafd
1
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"mount_file_id":"1z2Tq7dKp7YLHVFK8dXL2EZn0hBclc6dQ","authorship_tag":"ABX9TyNCgalS8m4obOk+v6xZ1zQQ"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# Initial instructions"],"metadata":{"id":"Do8-wzEtQiVu"}},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"6NUxzlPDIej5","executionInfo":{"status":"ok","timestamp":1688570661655,"user_tz":-210,"elapsed":3937,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"e8d93915-58f9-4b3c-8688-08e8c3c5ce91"},"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: kaggle in /usr/local/lib/python3.10/dist-packages (1.5.13)\n","Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.16.0)\n","Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from kaggle) (2023.5.7)\n","Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.8.2)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.27.1)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from kaggle) (4.65.0)\n","Requirement already satisfied: python-slugify in /usr/local/lib/python3.10/dist-packages (from kaggle) (8.0.1)\n","Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.26.16)\n","Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.10/dist-packages (from python-slugify->kaggle) (1.3)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (3.4)\n"]}],"source":["! pip install kaggle"]},{"cell_type":"code","source":["from google.colab import files"],"metadata":{"id":"9gjOv_WUJBzz","executionInfo":{"status":"ok","timestamp":1688570665466,"user_tz":-210,"elapsed":8,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["files.upload()\n","! mkdir ~/.kaggle\n","! cp kaggle.json ~/.kaggle/\n","! chmod 600 ~/.kaggle/kaggle.json\n","! kaggle datasets download -d rounakbanik/the-movies-dataset"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":125},"id":"cQoNEnDDJOvf","executionInfo":{"status":"ok","timestamp":1688570709193,"user_tz":-210,"elapsed":40889,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"3c8e5f7b-618d-4109-950c-30be7e04569b"},"execution_count":3,"outputs":[{"output_type":"display_data","data":{"text/plain":["<IPython.core.display.HTML object>"],"text/html":["\n","     <input type=\"file\" id=\"files-6731ac4b-bf13-403a-91ee-a79038f4c5ce\" name=\"files[]\" multiple disabled\n","        style=\"border:none\" />\n","     <output id=\"result-6731ac4b-bf13-403a-91ee-a79038f4c5ce\">\n","      Upload widget is only available when the cell has been executed in the\n","      current browser session. Please rerun this cell to enable.\n","      </output>\n","      <script>// Copyright 2017 Google LLC\n","//\n","// Licensed under the Apache License, Version 2.0 (the \"License\");\n","// you may not use this file except in compliance with the License.\n","// You may obtain a copy of the License at\n","//\n","//      http://www.apache.org/licenses/LICENSE-2.0\n","//\n","// Unless required by applicable law or agreed to in writing, software\n","// distributed under the License is distributed on an \"AS IS\" BASIS,\n","// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n","// See the License for the specific language governing permissions and\n","// limitations under the License.\n","\n","/**\n"," * @fileoverview Helpers for google.colab Python module.\n"," */\n","(function(scope) {\n","function span(text, styleAttributes = {}) {\n","  const element = document.createElement('span');\n","  element.textContent = text;\n","  for (const key of Object.keys(styleAttributes)) {\n","    element.style[key] = styleAttributes[key];\n","  }\n","  return element;\n","}\n","\n","// Max number of bytes which will be uploaded at a time.\n","const MAX_PAYLOAD_SIZE = 100 * 1024;\n","\n","function _uploadFiles(inputId, outputId) {\n","  const steps = uploadFilesStep(inputId, outputId);\n","  const outputElement = document.getElementById(outputId);\n","  // Cache steps on the outputElement to make it available for the next call\n","  // to uploadFilesContinue from Python.\n","  outputElement.steps = steps;\n","\n","  return _uploadFilesContinue(outputId);\n","}\n","\n","// This is roughly an async generator (not supported in the browser yet),\n","// where there are multiple asynchronous steps and the Python side is going\n","// to poll for completion of each step.\n","// This uses a Promise to block the python side on completion of each step,\n","// then passes the result of the previous step as the input to the next step.\n","function _uploadFilesContinue(outputId) {\n","  const outputElement = document.getElementById(outputId);\n","  const steps = outputElement.steps;\n","\n","  const next = steps.next(outputElement.lastPromiseValue);\n","  return Promise.resolve(next.value.promise).then((value) => {\n","    // Cache the last promise value to make it available to the next\n","    // step of the generator.\n","    outputElement.lastPromiseValue = value;\n","    return next.value.response;\n","  });\n","}\n","\n","/**\n"," * Generator function which is called between each async step of the upload\n"," * process.\n"," * @param {string} inputId Element ID of the input file picker element.\n"," * @param {string} outputId Element ID of the output display.\n"," * @return {!Iterable<!Object>} Iterable of next steps.\n"," */\n","function* uploadFilesStep(inputId, outputId) {\n","  const inputElement = document.getElementById(inputId);\n","  inputElement.disabled = false;\n","\n","  const outputElement = document.getElementById(outputId);\n","  outputElement.innerHTML = '';\n","\n","  const pickedPromise = new Promise((resolve) => {\n","    inputElement.addEventListener('change', (e) => {\n","      resolve(e.target.files);\n","    });\n","  });\n","\n","  const cancel = document.createElement('button');\n","  inputElement.parentElement.appendChild(cancel);\n","  cancel.textContent = 'Cancel upload';\n","  const cancelPromise = new Promise((resolve) => {\n","    cancel.onclick = () => {\n","      resolve(null);\n","    };\n","  });\n","\n","  // Wait for the user to pick the files.\n","  const files = yield {\n","    promise: Promise.race([pickedPromise, cancelPromise]),\n","    response: {\n","      action: 'starting',\n","    }\n","  };\n","\n","  cancel.remove();\n","\n","  // Disable the input element since further picks are not allowed.\n","  inputElement.disabled = true;\n","\n","  if (!files) {\n","    return {\n","      response: {\n","        action: 'complete',\n","      }\n","    };\n","  }\n","\n","  for (const file of files) {\n","    const li = document.createElement('li');\n","    li.append(span(file.name, {fontWeight: 'bold'}));\n","    li.append(span(\n","        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n","        `last modified: ${\n","            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n","                                    'n/a'} - `));\n","    const percent = span('0% done');\n","    li.appendChild(percent);\n","\n","    outputElement.appendChild(li);\n","\n","    const fileDataPromise = new Promise((resolve) => {\n","      const reader = new FileReader();\n","      reader.onload = (e) => {\n","        resolve(e.target.result);\n","      };\n","      reader.readAsArrayBuffer(file);\n","    });\n","    // Wait for the data to be ready.\n","    let fileData = yield {\n","      promise: fileDataPromise,\n","      response: {\n","        action: 'continue',\n","      }\n","    };\n","\n","    // Use a chunked sending to avoid message size limits. See b/62115660.\n","    let position = 0;\n","    do {\n","      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n","      const chunk = new Uint8Array(fileData, position, length);\n","      position += length;\n","\n","      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n","      yield {\n","        response: {\n","          action: 'append',\n","          file: file.name,\n","          data: base64,\n","        },\n","      };\n","\n","      let percentDone = fileData.byteLength === 0 ?\n","          100 :\n","          Math.round((position / fileData.byteLength) * 100);\n","      percent.textContent = `${percentDone}% done`;\n","\n","    } while (position < fileData.byteLength);\n","  }\n","\n","  // All done.\n","  yield {\n","    response: {\n","      action: 'complete',\n","    }\n","  };\n","}\n","\n","scope.google = scope.google || {};\n","scope.google.colab = scope.google.colab || {};\n","scope.google.colab._files = {\n","  _uploadFiles,\n","  _uploadFilesContinue,\n","};\n","})(self);\n","</script> "]},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Saving kaggle.json to kaggle.json\n","Downloading the-movies-dataset.zip to /content\n"," 96% 219M/228M [00:02<00:00, 121MB/s]\n","100% 228M/228M [00:02<00:00, 102MB/s]\n"]}]},{"cell_type":"code","source":["import os"],"metadata":{"id":"ArIWM5DHqXYZ","executionInfo":{"status":"ok","timestamp":1688570797771,"user_tz":-210,"elapsed":371,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":5,"outputs":[]},{"cell_type":"code","source":["if not os.path.isdir('/content/drive/MyDrive/Rec/data/raw') :\n","    os.mkdir('/content/drive/MyDrive/Rec/data/raw')"],"metadata":{"id":"HaEx9oF3Li7f"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["! unzip '/content/the-movies-dataset.zip' -d '/content/drive/MyDrive/Rec/data/raw'"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pLeGJcfvLwwJ","executionInfo":{"status":"ok","timestamp":1688565261896,"user_tz":-210,"elapsed":11516,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"449b2179-ff5a-4ac8-f9c5-07efc6a7e7c6"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Archive:  /content/the-movies-dataset.zip\n","  inflating: /content/drive/MyDrive/Rec/data/raw/credits.csv  \n","  inflating: /content/drive/MyDrive/Rec/data/raw/keywords.csv  \n","  inflating: /content/drive/MyDrive/Rec/data/raw/links.csv  \n","  inflating: /content/drive/MyDrive/Rec/data/raw/links_small.csv  \n","  inflating: /content/drive/MyDrive/Rec/data/raw/movies_metadata.csv  \n","  inflating: /content/drive/MyDrive/Rec/data/raw/ratings.csv  \n","  inflating: /content/drive/MyDrive/Rec/data/raw/ratings_small.csv  \n"]}]},{"cell_type":"code","source":["raw_dir = '/content/drive/MyDrive/Rec/data/raw'"],"metadata":{"id":"PAWyXXGxMVIX","executionInfo":{"status":"ok","timestamp":1688570776985,"user_tz":-210,"elapsed":496,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":4,"outputs":[]},{"cell_type":"markdown","source":["# Needed imports"],"metadata":{"id":"6rPjGeurQdnV"}},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","import json\n","import csv\n","import matplotlib.pyplot as plt\n","from ast import literal_eval"],"metadata":{"id":"tx4gvgqgMxgo","executionInfo":{"status":"ok","timestamp":1688570807449,"user_tz":-210,"elapsed":390,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":6,"outputs":[]},{"cell_type":"markdown","source":["# Credits dataset"],"metadata":{"id":"GA6noW_jSjRc"}},{"cell_type":"code","source":["credits = pd.read_csv(os.path.join(raw_dir, 'credits.csv'))\n","print('shape of credits dataset: {}'.format(credits.shape))\n","credits.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":161},"id":"ZaOZPI1NQcg5","executionInfo":{"status":"ok","timestamp":1688547894409,"user_tz":-210,"elapsed":3235,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"1bc46c8b-c287-49bb-bc9c-b832e85b1e8f"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of credits dataset: (45476, 3)\n"]},{"output_type":"execute_result","data":{"text/plain":["                                                cast  \\\n","0  [{'cast_id': 14, 'character': 'Woody (voice)',...   \n","1  [{'cast_id': 1, 'character': 'Alan Parrish', '...   \n","2  [{'cast_id': 2, 'character': 'Max Goldman', 'c...   \n","\n","                                                crew     id  \n","0  [{'credit_id': '52fe4284c3a36847f8024f49', 'de...    862  \n","1  [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...   8844  \n","2  [{'credit_id': '52fe466a9251416c75077a89', 'de...  15602  "],"text/html":["\n","  <div id=\"df-af558189-001b-41a5-a507-f131b8c3ab89\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>cast</th>\n","      <th>crew</th>\n","      <th>id</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>[{'cast_id': 14, 'character': 'Woody (voice)',...</td>\n","      <td>[{'credit_id': '52fe4284c3a36847f8024f49', 'de...</td>\n","      <td>862</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>[{'cast_id': 1, 'character': 'Alan Parrish', '...</td>\n","      <td>[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...</td>\n","      <td>8844</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>[{'cast_id': 2, 'character': 'Max Goldman', 'c...</td>\n","      <td>[{'credit_id': '52fe466a9251416c75077a89', 'de...</td>\n","      <td>15602</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-af558189-001b-41a5-a507-f131b8c3ab89')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-af558189-001b-41a5-a507-f131b8c3ab89 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-af558189-001b-41a5-a507-f131b8c3ab89');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":10}]},{"cell_type":"code","source":["credits.isnull().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"6of-O22CTBD7","executionInfo":{"status":"ok","timestamp":1688547995732,"user_tz":-210,"elapsed":473,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"6da185d8-74eb-4701-c341-8304a241aa94"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["cast    0\n","crew    0\n","id      0\n","dtype: int64"]},"metadata":{},"execution_count":11}]},{"cell_type":"code","source":["credits.isna().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tfhS7WLITas6","executionInfo":{"status":"ok","timestamp":1688548005437,"user_tz":-210,"elapsed":27,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"aa448602-e8af-456a-c269-6dcc16337a5e"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["cast    0\n","crew    0\n","id      0\n","dtype: int64"]},"metadata":{},"execution_count":12}]},{"cell_type":"code","source":["credits.nunique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8gj4gQr2Tc_H","executionInfo":{"status":"ok","timestamp":1688548048490,"user_tz":-210,"elapsed":1036,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"2911f641-04bd-4573-8260-65327de68818"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["cast    43019\n","crew    44669\n","id      45432\n","dtype: int64"]},"metadata":{},"execution_count":14}]},{"cell_type":"code","source":["credits.drop_duplicates(inplace=True)"],"metadata":{"id":"shIqe3YbTiCw"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Extracting information from json strings"],"metadata":{"id":"XvobXUNzVxFf"}},{"cell_type":"code","source":["def get_text(text, obj='name'):\n","    text = literal_eval(text)\n","    if len(text) == 1:\n","        for i in text:\n","            return i[obj]\n","    else:\n","        s = []\n","        for i in text:\n","            s.append(str(i[obj]))\n","        return ', '.join(s)"],"metadata":{"id":"wmybX3cqTsrm","executionInfo":{"status":"ok","timestamp":1688571301495,"user_tz":-210,"elapsed":495,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":28,"outputs":[]},{"cell_type":"code","source":["credits['name_crew'] = credits['crew'].apply(get_text, obj = \"name\")\n","credits['department_crew'] = credits['crew'].apply(get_text, obj = \"department\")\n","credits['gender_crew'] = credits['crew'].apply(get_text, obj = \"gender\")\n","credits['job_crew'] = credits['crew'].apply(get_text, obj = \"job\")\n","credits['profile_path_crew'] = credits['crew'].apply(get_text, obj = \"profile_path\")\n","credits['id_crew'] = credits['crew'].apply(get_text, obj = \"id\")"],"metadata":{"id":"NIfflprHV4mT"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["credits['name_cast'] = credits['cast'].apply(get_text, obj = \"name\")\n","credits['order_cast'] = credits['cast'].apply(get_text, obj = \"order\")\n","credits['gender_cast'] = credits['cast'].apply(get_text, obj = \"gender\")\n","credits['credit_id_cast'] = credits['cast'].apply(get_text, obj = \"credit_id\")\n","credits['profile_path_cast'] = credits['cast'].apply(get_text, obj = \"profile_path\")\n","credits['id_cast'] = credits['cast'].apply(get_text, obj = \"id\")\n","credits['character_cast'] = credits['cast'].apply(get_text, obj = \"character\")"],"metadata":{"id":"6mVD2jb9V9Gy"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Constructing the new dataframe and saving that"],"metadata":{"id":"lDgPbPtyYJrG"}},{"cell_type":"code","source":["clean_credits = credits.drop([\"crew\", \"cast\"], axis=1)"],"metadata":{"id":"zGjkFiWEWicR"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["Checking the new dataframe"],"metadata":{"id":"QeO-YhWTYUvz"}},{"cell_type":"code","source":["print ('shape of the cleaned Credits dataset: {}'.format(clean_credits.shape))\n","clean_credits.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":572},"id":"k9Iqhry2Xsyt","executionInfo":{"status":"ok","timestamp":1688549213175,"user_tz":-210,"elapsed":473,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"65d54e2b-4828-4937-dada-233a2dd45011"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of the cleaned Credits dataset: (45439, 14)\n"]},{"output_type":"execute_result","data":{"text/plain":["      id                                          name_crew  \\\n","0    862  John Lasseter, Joss Whedon, Andrew Stanton, Jo...   \n","1   8844  Larry J. Franco, Jonathan Hensleigh, James Hor...   \n","2  15602  Howard Deutch, Mark Steven Johnson, Mark Steve...   \n","\n","                                     department_crew  \\\n","0  Directing, Writing, Writing, Writing, Writing,...   \n","1  Production, Writing, Sound, Directing, Editing...   \n","2                  Directing, Writing, Writing, Crew   \n","\n","                                         gender_crew  \\\n","0  2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2...   \n","1     2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2   \n","2                                         2, 2, 2, 2   \n","\n","                                            job_crew  \\\n","0  Director, Screenplay, Screenplay, Screenplay, ...   \n","1  Executive Producer, Screenplay, Original Music...   \n","2      Director, Characters, Writer, Sound Recordist   \n","\n","                                   profile_path_crew  \\\n","0  /7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg, /dTiVsuaTVTe...   \n","1  None, /l1c4UFD3g0HVWj5f0CxXAvMAGiT.jpg, /oLOtX...   \n","2  /68Vae1HkU1NxQZ6KEmuxIpno7c9.jpg, /6trChNn3o2b...   \n","\n","                                             id_crew  \\\n","0  7879, 12891, 7, 12892, 12893, 12894, 12895, 12...   \n","1  511, 876, 1729, 4945, 4951, 4952, 8023, 9967, ...   \n","2                       26502, 16837, 16837, 1551320   \n","\n","                                           name_cast  \\\n","0  Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...   \n","1  Robin Williams, Jonathan Hyde, Kirsten Dunst, ...   \n","2  Walter Matthau, Jack Lemmon, Ann-Margret, Soph...   \n","\n","                                          order_cast  \\\n","0           0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12   \n","1  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...   \n","2                                0, 1, 2, 3, 4, 5, 6   \n","\n","                                         gender_cast  \\\n","0              2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2   \n","1  2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, 0...   \n","2                                2, 2, 1, 1, 1, 2, 2   \n","\n","                                      credit_id_cast  \\\n","0  52fe4284c3a36847f8024f95, 52fe4284c3a36847f802...   \n","1  52fe44bfc3a36847f80a7c73, 52fe44bfc3a36847f80a...   \n","2  52fe466a9251416c75077a8d, 52fe466a9251416c7507...   \n","\n","                                   profile_path_cast  \\\n","0  /pQFoyx7rp09CJTAb932F2g8Nlho.jpg, /uX2xVf6pMmP...   \n","1  /sojtJyIV3lkUeThD7A2oHNm8183.jpg, /7il5D76vx6Q...   \n","2  /xJVkvprOnzP5Zdh5y63y8HHniDZ.jpg, /chZmNRYMtqk...   \n","\n","                                             id_cast  \\\n","0  31, 12898, 7167, 12899, 12900, 7907, 8873, 111...   \n","1  2157, 8537, 205, 145151, 5149, 10739, 58563, 1...   \n","2         6837, 3151, 13567, 16757, 589, 16523, 7166   \n","\n","                                      character_cast  \n","0  Woody (voice), Buzz Lightyear (voice), Mr. Pot...  \n","1  Alan Parrish, Samuel Alan Parrish / Van Pelt, ...  \n","2  Max Goldman, John Gustafson, Ariel Gustafson, ...  "],"text/html":["\n","  <div id=\"df-b061b4ac-c086-4304-9e17-fe2a649fdb5f\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>name_crew</th>\n","      <th>department_crew</th>\n","      <th>gender_crew</th>\n","      <th>job_crew</th>\n","      <th>profile_path_crew</th>\n","      <th>id_crew</th>\n","      <th>name_cast</th>\n","      <th>order_cast</th>\n","      <th>gender_cast</th>\n","      <th>credit_id_cast</th>\n","      <th>profile_path_cast</th>\n","      <th>id_cast</th>\n","      <th>character_cast</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>862</td>\n","      <td>John Lasseter, Joss Whedon, Andrew Stanton, Jo...</td>\n","      <td>Directing, Writing, Writing, Writing, Writing,...</td>\n","      <td>2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2...</td>\n","      <td>Director, Screenplay, Screenplay, Screenplay, ...</td>\n","      <td>/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg, /dTiVsuaTVTe...</td>\n","      <td>7879, 12891, 7, 12892, 12893, 12894, 12895, 12...</td>\n","      <td>Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...</td>\n","      <td>0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12</td>\n","      <td>2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2</td>\n","      <td>52fe4284c3a36847f8024f95, 52fe4284c3a36847f802...</td>\n","      <td>/pQFoyx7rp09CJTAb932F2g8Nlho.jpg, /uX2xVf6pMmP...</td>\n","      <td>31, 12898, 7167, 12899, 12900, 7907, 8873, 111...</td>\n","      <td>Woody (voice), Buzz Lightyear (voice), Mr. Pot...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>8844</td>\n","      <td>Larry J. Franco, Jonathan Hensleigh, James Hor...</td>\n","      <td>Production, Writing, Sound, Directing, Editing...</td>\n","      <td>2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2</td>\n","      <td>Executive Producer, Screenplay, Original Music...</td>\n","      <td>None, /l1c4UFD3g0HVWj5f0CxXAvMAGiT.jpg, /oLOtX...</td>\n","      <td>511, 876, 1729, 4945, 4951, 4952, 8023, 9967, ...</td>\n","      <td>Robin Williams, Jonathan Hyde, Kirsten Dunst, ...</td>\n","      <td>0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...</td>\n","      <td>2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, 0...</td>\n","      <td>52fe44bfc3a36847f80a7c73, 52fe44bfc3a36847f80a...</td>\n","      <td>/sojtJyIV3lkUeThD7A2oHNm8183.jpg, /7il5D76vx6Q...</td>\n","      <td>2157, 8537, 205, 145151, 5149, 10739, 58563, 1...</td>\n","      <td>Alan Parrish, Samuel Alan Parrish / Van Pelt, ...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>15602</td>\n","      <td>Howard Deutch, Mark Steven Johnson, Mark Steve...</td>\n","      <td>Directing, Writing, Writing, Crew</td>\n","      <td>2, 2, 2, 2</td>\n","      <td>Director, Characters, Writer, Sound Recordist</td>\n","      <td>/68Vae1HkU1NxQZ6KEmuxIpno7c9.jpg, /6trChNn3o2b...</td>\n","      <td>26502, 16837, 16837, 1551320</td>\n","      <td>Walter Matthau, Jack Lemmon, Ann-Margret, Soph...</td>\n","      <td>0, 1, 2, 3, 4, 5, 6</td>\n","      <td>2, 2, 1, 1, 1, 2, 2</td>\n","      <td>52fe466a9251416c75077a8d, 52fe466a9251416c7507...</td>\n","      <td>/xJVkvprOnzP5Zdh5y63y8HHniDZ.jpg, /chZmNRYMtqk...</td>\n","      <td>6837, 3151, 13567, 16757, 589, 16523, 7166</td>\n","      <td>Max Goldman, John Gustafson, Ariel Gustafson, ...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b061b4ac-c086-4304-9e17-fe2a649fdb5f')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-b061b4ac-c086-4304-9e17-fe2a649fdb5f button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-b061b4ac-c086-4304-9e17-fe2a649fdb5f');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":20}]},{"cell_type":"markdown","source":["Saving the new .csv dataset"],"metadata":{"id":"CR5anOZZYlBe"}},{"cell_type":"code","source":["clean_credits.to_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_credits.csv', index=False)"],"metadata":{"id":"yto7-njpYD7U"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Keywords dataset"],"metadata":{"id":"emyNitcyZPCr"}},{"cell_type":"code","source":["keywords = pd.read_csv(os.path.join(raw_dir, \"keywords.csv\"))\n","print('shape of raw Keywores dataset: {}'.format(keywords.shape))\n","keywords.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":161},"id":"4JqG86_-ZKHn","executionInfo":{"status":"ok","timestamp":1688549658847,"user_tz":-210,"elapsed":511,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"7a279461-cc46-4f3b-f211-2a02492658f2"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of raw Keywores dataset: (46419, 2)\n"]},{"output_type":"execute_result","data":{"text/plain":["      id                                           keywords\n","0    862  [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n","1   8844  [{'id': 10090, 'name': 'board game'}, {'id': 1...\n","2  15602  [{'id': 1495, 'name': 'fishing'}, {'id': 12392..."],"text/html":["\n","  <div id=\"df-8ece7329-852a-47fe-8d22-9752d988430f\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>keywords</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>862</td>\n","      <td>[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>8844</td>\n","      <td>[{'id': 10090, 'name': 'board game'}, {'id': 1...</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>15602</td>\n","      <td>[{'id': 1495, 'name': 'fishing'}, {'id': 12392...</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8ece7329-852a-47fe-8d22-9752d988430f')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-8ece7329-852a-47fe-8d22-9752d988430f button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-8ece7329-852a-47fe-8d22-9752d988430f');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":24}]},{"cell_type":"code","source":["keywords.isna().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tLkNXoSJZmDE","executionInfo":{"status":"ok","timestamp":1688549694116,"user_tz":-210,"elapsed":486,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"fb4852d3-563a-4833-8674-eeaab7c29bbf"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["id          0\n","keywords    0\n","dtype: int64"]},"metadata":{},"execution_count":25}]},{"cell_type":"code","source":["keywords.isnull().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9tyQTNKPZ5V0","executionInfo":{"status":"ok","timestamp":1688549730364,"user_tz":-210,"elapsed":963,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"9d0fcf3b-3aad-46e8-f36e-2c77038f5b3f"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["id          0\n","keywords    0\n","dtype: int64"]},"metadata":{},"execution_count":26}]},{"cell_type":"code","source":["keywords.nunique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"u2jgxSs_aB9x","executionInfo":{"status":"ok","timestamp":1688549740316,"user_tz":-210,"elapsed":604,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"ba5dbee0-d701-46c0-d2e8-341d806794ad"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["id          45432\n","keywords    25989\n","dtype: int64"]},"metadata":{},"execution_count":27}]},{"cell_type":"code","source":["keywords.drop_duplicates(inplace=True)\n","print('shape of dataset after dropping duplicates: {}'.format(keywords.shape))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"BqPLBOeyaEif","executionInfo":{"status":"ok","timestamp":1688549823949,"user_tz":-210,"elapsed":429,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"d7a7d607-69cd-4367-f302-d384bb90bdcd"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of dataset after dropping duplicates: (45432, 2)\n"]}]},{"cell_type":"markdown","source":["## Extracting information from json strings"],"metadata":{"id":"CEea-KSLaqI2"}},{"cell_type":"code","source":["keywords['name_keywords'] = keywords['keywords'].apply(get_text, obj = \"name\")\n","keywords['id_keywords'] = keywords['keywords'].apply(get_text, obj = \"id\")"],"metadata":{"id":"_W1h0U4-aWYX"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Constructing new dataframe and saving that"],"metadata":{"id":"KMGfBLW7a4wi"}},{"cell_type":"code","source":["clean_keywords = keywords.drop([\"keywords\"], axis=1)"],"metadata":{"id":"JUuH5tmxazAW"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["Checking the new dataset"],"metadata":{"id":"hwYzBGyjbZMN"}},{"cell_type":"code","source":["print('shape of the cleaned data: {}'.format(clean_keywords.shape))\n","clean_keywords.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":161},"id":"QsOVt41MbLCz","executionInfo":{"status":"ok","timestamp":1688550078229,"user_tz":-210,"elapsed":575,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"6a400dc8-6ea3-4dca-f034-6ca0f1643fc0"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of the cleaned data: (45432, 3)\n"]},{"output_type":"execute_result","data":{"text/plain":["      id                                      name_keywords  \\\n","0    862  jealousy, toy, boy, friendship, friends, rival...   \n","1   8844  board game, disappearance, based on children's...   \n","2  15602  fishing, best friend, duringcreditsstinger, ol...   \n","\n","                                         id_keywords  \n","0  931, 4290, 5202, 6054, 9713, 9823, 165503, 170...  \n","1         10090, 10941, 15101, 33467, 158086, 158091  \n","2                        1495, 12392, 179431, 208510  "],"text/html":["\n","  <div id=\"df-66979cc2-b808-444f-bd2b-bd46f8759b8c\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>name_keywords</th>\n","      <th>id_keywords</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>862</td>\n","      <td>jealousy, toy, boy, friendship, friends, rival...</td>\n","      <td>931, 4290, 5202, 6054, 9713, 9823, 165503, 170...</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>8844</td>\n","      <td>board game, disappearance, based on children's...</td>\n","      <td>10090, 10941, 15101, 33467, 158086, 158091</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>15602</td>\n","      <td>fishing, best friend, duringcreditsstinger, ol...</td>\n","      <td>1495, 12392, 179431, 208510</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-66979cc2-b808-444f-bd2b-bd46f8759b8c')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-66979cc2-b808-444f-bd2b-bd46f8759b8c button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-66979cc2-b808-444f-bd2b-bd46f8759b8c');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":32}]},{"cell_type":"markdown","source":["Saving the cleaned dataset as a .csv file"],"metadata":{"id":"W0YT8YV3bcf_"}},{"cell_type":"code","source":["clean_keywords.to_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_keywords.csv', index=False)\n"],"metadata":{"id":"M9f5_VeibXC-"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Metadata dataset"],"metadata":{"id":"SKlrgI-Ub1SP"}},{"cell_type":"code","source":["metadata = pd.read_csv(os.path.join(raw_dir, 'movies_metadata.csv'))\n","print('shape of the raw metadata dataset: {}'.format(metadata.shape))\n","metadata.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":567},"id":"MwDtrtVJbvM5","executionInfo":{"status":"ok","timestamp":1688571617483,"user_tz":-210,"elapsed":1283,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"3b0db80d-e8e3-4bdf-dce4-edbfd4addabe"},"execution_count":35,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of the raw metadata dataset: (45466, 24)\n"]},{"output_type":"stream","name":"stderr","text":["<ipython-input-35-d8520c3ee68a>:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n","  metadata = pd.read_csv(os.path.join(raw_dir, 'movies_metadata.csv'))\n"]},{"output_type":"execute_result","data":{"text/plain":["   adult                              belongs_to_collection    budget  \\\n","0  False  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   \n","1  False                                                NaN  65000000   \n","2  False  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   \n","\n","                                              genres  \\\n","0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...   \n","1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   \n","2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...   \n","\n","                               homepage     id    imdb_id original_language  \\\n","0  http://toystory.disney.com/toy-story    862  tt0114709                en   \n","1                                   NaN   8844  tt0113497                en   \n","2                                   NaN  15602  tt0113228                en   \n","\n","     original_title                                           overview  ...  \\\n","0         Toy Story  Led by Woody, Andy's toys live happily in his ...  ...   \n","1           Jumanji  When siblings Judy and Peter discover an encha...  ...   \n","2  Grumpier Old Men  A family wedding reignites the ancient feud be...  ...   \n","\n","  release_date      revenue runtime  \\\n","0   1995-10-30  373554033.0    81.0   \n","1   1995-12-15  262797249.0   104.0   \n","2   1995-12-22          0.0   101.0   \n","\n","                                    spoken_languages    status  \\\n","0           [{'iso_639_1': 'en', 'name': 'English'}]  Released   \n","1  [{'iso_639_1': 'en', 'name': 'English'}, {'iso...  Released   \n","2           [{'iso_639_1': 'en', 'name': 'English'}]  Released   \n","\n","                                             tagline             title  video  \\\n","0                                                NaN         Toy Story  False   \n","1          Roll the dice and unleash the excitement!           Jumanji  False   \n","2  Still Yelling. Still Fighting. Still Ready for...  Grumpier Old Men  False   \n","\n","  vote_average vote_count  \n","0          7.7     5415.0  \n","1          6.9     2413.0  \n","2          6.5       92.0  \n","\n","[3 rows x 24 columns]"],"text/html":["\n","  <div id=\"df-1f24f91a-2b29-4c28-b1e3-65fcc0c08c84\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>adult</th>\n","      <th>belongs_to_collection</th>\n","      <th>budget</th>\n","      <th>genres</th>\n","      <th>homepage</th>\n","      <th>id</th>\n","      <th>imdb_id</th>\n","      <th>original_language</th>\n","      <th>original_title</th>\n","      <th>overview</th>\n","      <th>...</th>\n","      <th>release_date</th>\n","      <th>revenue</th>\n","      <th>runtime</th>\n","      <th>spoken_languages</th>\n","      <th>status</th>\n","      <th>tagline</th>\n","      <th>title</th>\n","      <th>video</th>\n","      <th>vote_average</th>\n","      <th>vote_count</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>False</td>\n","      <td>{'id': 10194, 'name': 'Toy Story Collection', ...</td>\n","      <td>30000000</td>\n","      <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n","      <td>http://toystory.disney.com/toy-story</td>\n","      <td>862</td>\n","      <td>tt0114709</td>\n","      <td>en</td>\n","      <td>Toy Story</td>\n","      <td>Led by Woody, Andy's toys live happily in his ...</td>\n","      <td>...</td>\n","      <td>1995-10-30</td>\n","      <td>373554033.0</td>\n","      <td>81.0</td>\n","      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n","      <td>Released</td>\n","      <td>NaN</td>\n","      <td>Toy Story</td>\n","      <td>False</td>\n","      <td>7.7</td>\n","      <td>5415.0</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>False</td>\n","      <td>NaN</td>\n","      <td>65000000</td>\n","      <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n","      <td>NaN</td>\n","      <td>8844</td>\n","      <td>tt0113497</td>\n","      <td>en</td>\n","      <td>Jumanji</td>\n","      <td>When siblings Judy and Peter discover an encha...</td>\n","      <td>...</td>\n","      <td>1995-12-15</td>\n","      <td>262797249.0</td>\n","      <td>104.0</td>\n","      <td>[{'iso_639_1': 'en', 'name': 'English'}, {'iso...</td>\n","      <td>Released</td>\n","      <td>Roll the dice and unleash the excitement!</td>\n","      <td>Jumanji</td>\n","      <td>False</td>\n","      <td>6.9</td>\n","      <td>2413.0</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>False</td>\n","      <td>{'id': 119050, 'name': 'Grumpy Old Men Collect...</td>\n","      <td>0</td>\n","      <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n","      <td>NaN</td>\n","      <td>15602</td>\n","      <td>tt0113228</td>\n","      <td>en</td>\n","      <td>Grumpier Old Men</td>\n","      <td>A family wedding reignites the ancient feud be...</td>\n","      <td>...</td>\n","      <td>1995-12-22</td>\n","      <td>0.0</td>\n","      <td>101.0</td>\n","      <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n","      <td>Released</td>\n","      <td>Still Yelling. Still Fighting. Still Ready for...</td>\n","      <td>Grumpier Old Men</td>\n","      <td>False</td>\n","      <td>6.5</td>\n","      <td>92.0</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>3 rows × 24 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-1f24f91a-2b29-4c28-b1e3-65fcc0c08c84')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-1f24f91a-2b29-4c28-b1e3-65fcc0c08c84 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-1f24f91a-2b29-4c28-b1e3-65fcc0c08c84');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":35}]},{"cell_type":"code","source":["metadata.isna().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Kt9tI1iydiXd","executionInfo":{"status":"ok","timestamp":1688571624293,"user_tz":-210,"elapsed":398,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"41a87985-be28-49d6-86c1-d86fbe49b2f5"},"execution_count":36,"outputs":[{"output_type":"execute_result","data":{"text/plain":["adult                        0\n","belongs_to_collection    40972\n","budget                       0\n","genres                       0\n","homepage                 37684\n","id                           0\n","imdb_id                     17\n","original_language           11\n","original_title               0\n","overview                   954\n","popularity                   5\n","poster_path                386\n","production_companies         3\n","production_countries         3\n","release_date                87\n","revenue                      6\n","runtime                    263\n","spoken_languages             6\n","status                      87\n","tagline                  25054\n","title                        6\n","video                        6\n","vote_average                 6\n","vote_count                   6\n","dtype: int64"]},"metadata":{},"execution_count":36}]},{"cell_type":"code","source":["metadata.isnull().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9ASDp8eTkg4h","executionInfo":{"status":"ok","timestamp":1688571627733,"user_tz":-210,"elapsed":685,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"2751d9b1-f910-4de2-de33-6ff6bee31bd5"},"execution_count":37,"outputs":[{"output_type":"execute_result","data":{"text/plain":["adult                        0\n","belongs_to_collection    40972\n","budget                       0\n","genres                       0\n","homepage                 37684\n","id                           0\n","imdb_id                     17\n","original_language           11\n","original_title               0\n","overview                   954\n","popularity                   5\n","poster_path                386\n","production_companies         3\n","production_countries         3\n","release_date                87\n","revenue                      6\n","runtime                    263\n","spoken_languages             6\n","status                      87\n","tagline                  25054\n","title                        6\n","video                        6\n","vote_average                 6\n","vote_count                   6\n","dtype: int64"]},"metadata":{},"execution_count":37}]},{"cell_type":"code","source":["metadata.nunique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yYM0Ud3Tlpjw","executionInfo":{"status":"ok","timestamp":1688571629398,"user_tz":-210,"elapsed":363,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"b1aa8de1-0749-466c-8dd9-7a2dae7dcdeb"},"execution_count":38,"outputs":[{"output_type":"execute_result","data":{"text/plain":["adult                        5\n","belongs_to_collection     1698\n","budget                    1226\n","genres                    4069\n","homepage                  7673\n","id                       45436\n","imdb_id                  45417\n","original_language           92\n","original_title           43373\n","overview                 44307\n","popularity               44176\n","poster_path              45024\n","production_companies     22708\n","production_countries      2393\n","release_date             17336\n","revenue                   6863\n","runtime                    353\n","spoken_languages          1931\n","status                       6\n","tagline                  20283\n","title                    42277\n","video                        2\n","vote_average                92\n","vote_count                1820\n","dtype: int64"]},"metadata":{},"execution_count":38}]},{"cell_type":"markdown","source":["Histogram of the dataset"],"metadata":{"id":"m7T_-wt1mFvu"}},{"cell_type":"code","source":["metadata.hist(bins=50, figsize=(10, 7))\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":622},"id":"rLGE24QNl17Y","executionInfo":{"status":"ok","timestamp":1688571633442,"user_tz":-210,"elapsed":1657,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"de36782e-7d6f-4664-f10f-36d695f7675c"},"execution_count":39,"outputs":[{"output_type":"display_data","data":{"text/plain":["<Figure size 1000x700 with 4 Axes>"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"markdown","source":["## Removing entries with numeric name of comopanies or countries"],"metadata":{"id":"uTNnaW8Jmr3U"}},{"cell_type":"code","source":["def is_string_number(string):\n","    try:\n","        float(string)\n","        return True\n","    except ValueError:\n","        return False"],"metadata":{"id":"-sB4JFQnmCxW","executionInfo":{"status":"ok","timestamp":1688571638129,"user_tz":-210,"elapsed":595,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":40,"outputs":[]},{"cell_type":"code","source":["for i in range(metadata.shape[0]):\n","    if is_string_number(metadata.loc[i, 'production_countries']) or is_string_number(metadata.loc[i, 'production_companies']):\n","        metadata.drop(index=i, inplace=True)\n","metadata.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"7EZfyxccmqMQ","executionInfo":{"status":"ok","timestamp":1688571640517,"user_tz":-210,"elapsed":784,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"564f7e66-ea73-4755-cd9e-9e55e50707d4"},"execution_count":41,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(45460, 24)"]},"metadata":{},"execution_count":41}]},{"cell_type":"code","source":["metadata.drop_duplicates(inplace=True)\n","metadata.dropna(subset=['production_companies', 'production_countries'], inplace=True)"],"metadata":{"id":"vtrzBhjznD7p","executionInfo":{"status":"ok","timestamp":1688571642826,"user_tz":-210,"elapsed":363,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":42,"outputs":[]},{"cell_type":"code","source":["metadata = metadata.fillna(\"NaN value\")\n"],"metadata":{"id":"q7mDM3zAnlH_","executionInfo":{"status":"ok","timestamp":1688571644612,"user_tz":-210,"elapsed":398,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":43,"outputs":[]},{"cell_type":"markdown","source":["## Extracting information from json values"],"metadata":{"id":"J952IGoVrwJA"}},{"cell_type":"code","source":["def get_text_nan(text, obj=\"name\"):\n","    if(text == \"NaN value\"):\n","         return np.nan\n","    elif (isinstance(literal_eval(str(text)), float)):\n","        return np.nan\n","    else:\n","        text = literal_eval(str(text))\n","        for i in text:\n","            if(i == obj):\n","                return (text[i])"],"metadata":{"id":"C3ElqfyRrgK8","executionInfo":{"status":"ok","timestamp":1688571653225,"user_tz":-210,"elapsed":739,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":44,"outputs":[]},{"cell_type":"code","source":["def text_getter(text, obj='name'):\n","    if(text == 'NaN value'):\n","         return np.nan\n","    elif (isinstance(literal_eval(str(text)), float)):\n","        return np.nan\n","    elif(isinstance(literal_eval(str(text)), list)):\n","        s = []\n","        for i in text:\n","            s.append(str(i[obj]))\n","        return ', '.join(s)\n","    else:\n","        text = literal_eval(str(text))\n","        for i in text:\n","            if(i == obj):\n","                return (text[i])"],"metadata":{"id":"sPZP1mKRrhnY","executionInfo":{"status":"ok","timestamp":1688571654839,"user_tz":-210,"elapsed":6,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":45,"outputs":[]},{"cell_type":"markdown","source":["'belongs to collection' column"],"metadata":{"id":"kvoi_xr8sw5X"}},{"cell_type":"code","source":["metadata['name_belongs_to_collection'] = metadata['belongs_to_collection'].apply(get_text_nan ,obj = \"name\")\n","metadata['id_belongs_to_collection'] = metadata['belongs_to_collection'].apply(get_text_nan ,obj = \"id\")\n","metadata['poster_path_belongs_to_collection'] = metadata['belongs_to_collection'].apply(get_text_nan ,obj = \"poster_path\")\n","metadata['backdrop_path_belongs_to_collection'] = metadata['belongs_to_collection'].apply(get_text_nan ,obj = \"backdrop_path\")"],"metadata":{"id":"9-ZGGNdfsFKw","executionInfo":{"status":"ok","timestamp":1688571660522,"user_tz":-210,"elapsed":933,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":46,"outputs":[]},{"cell_type":"markdown","source":["'genres' column"],"metadata":{"id":"B-8-Qd9dtIcv"}},{"cell_type":"code","source":["metadata['name_genres'] = metadata['genres'].apply(get_text ,obj = \"name\")\n","metadata['id_genres'] = metadata['genres'].apply(get_text ,obj = \"id\")"],"metadata":{"id":"QJp5U6BGsu0Z","executionInfo":{"status":"ok","timestamp":1688571665947,"user_tz":-210,"elapsed":2373,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":47,"outputs":[]},{"cell_type":"markdown","source":["'production countries' column"],"metadata":{"id":"GLFk9tGQtM4_"}},{"cell_type":"code","source":["metadata['name_production_countries'] = metadata['production_countries'].apply(get_text ,obj = \"name\")\n","metadata['iso_3166_1_production_companies'] = metadata['production_countries'].apply(get_text ,obj = \"iso_3166_1\")"],"metadata":{"id":"viAxv4kGsvbD","executionInfo":{"status":"ok","timestamp":1688571671111,"user_tz":-210,"elapsed":1435,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":48,"outputs":[]},{"cell_type":"markdown","source":["'production companies' column"],"metadata":{"id":"VDodd_DPtbrz"}},{"cell_type":"code","source":["metadata['name_production_companies'] = metadata['production_companies'].apply(get_text ,obj = \"name\")\n","metadata['id_production_companies'] = metadata['production_companies'].apply(get_text ,obj = \"id\")\n"],"metadata":{"id":"9fyeJVlftaEU","executionInfo":{"status":"ok","timestamp":1688571676262,"user_tz":-210,"elapsed":1791,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":49,"outputs":[]},{"cell_type":"code","source":["metadata = metadata.replace('NaN value', np.nan)"],"metadata":{"id":"VkkYNztRuf42","executionInfo":{"status":"ok","timestamp":1688571907439,"user_tz":-210,"elapsed":579,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":52,"outputs":[]},{"cell_type":"markdown","source":["## Constructing the new dataframe and saving the new dataset"],"metadata":{"id":"8-Z-JMlCtlon"}},{"cell_type":"code","source":["clean_metadata = metadata.drop([\"spoken_languages\" ,\"genres\" ,\"production_countries\" ,\"belongs_to_collection\"] ,axis=1)\n","print('shape of the cleaned dataset: {}'.format(clean_metadata.shape))\n","clean_metadata.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":532},"id":"zveKnpTPtfiA","executionInfo":{"status":"ok","timestamp":1688571914195,"user_tz":-210,"elapsed":578,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"8b7df7c5-f07e-418b-9dfe-de0f92557bb3"},"execution_count":53,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of the cleaned dataset: (45447, 30)\n"]},{"output_type":"execute_result","data":{"text/plain":["   adult    budget                              homepage     id    imdb_id  \\\n","0  False  30000000  http://toystory.disney.com/toy-story    862  tt0114709   \n","1  False  65000000                                   NaN   8844  tt0113497   \n","2  False         0                                   NaN  15602  tt0113228   \n","\n","  original_language    original_title  \\\n","0                en         Toy Story   \n","1                en           Jumanji   \n","2                en  Grumpier Old Men   \n","\n","                                            overview popularity  \\\n","0  Led by Woody, Andy's toys live happily in his ...  21.946943   \n","1  When siblings Judy and Peter discover an encha...  17.015539   \n","2  A family wedding reignites the ancient feud be...    11.7129   \n","\n","                        poster_path  ... name_belongs_to_collection  \\\n","0  /rhIRbceoE9lR4veEXuwCC2wARtG.jpg  ...       Toy Story Collection   \n","1  /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg  ...                        NaN   \n","2  /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg  ...  Grumpy Old Men Collection   \n","\n","  id_belongs_to_collection  poster_path_belongs_to_collection  \\\n","0                  10194.0   /7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg   \n","1                      NaN                                NaN   \n","2                 119050.0   /nLvUdqgPgm3F85NMCii9gVFUcet.jpg   \n","\n","   backdrop_path_belongs_to_collection                 name_genres  \\\n","0     /9FBwqcd9IRruEDUrTdcaafOMKUq.jpg   Animation, Comedy, Family   \n","1                                  NaN  Adventure, Fantasy, Family   \n","2     /hypTnLot2z8wpFS7qwsQHW1uV8u.jpg             Romance, Comedy   \n","\n","       id_genres name_production_countries  iso_3166_1_production_companies  \\\n","0  16, 35, 10751  United States of America                               US   \n","1  12, 14, 10751  United States of America                               US   \n","2      10749, 35  United States of America                               US   \n","\n","                           name_production_companies  id_production_companies  \n","0                            Pixar Animation Studios                        3  \n","1  TriStar Pictures, Teitler Film, Interscope Com...         559, 2550, 10201  \n","2                       Warner Bros., Lancaster Gate              6194, 19464  \n","\n","[3 rows x 30 columns]"],"text/html":["\n","  <div id=\"df-c8e90d56-a50e-4f9c-b919-c9d653aa1110\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>adult</th>\n","      <th>budget</th>\n","      <th>homepage</th>\n","      <th>id</th>\n","      <th>imdb_id</th>\n","      <th>original_language</th>\n","      <th>original_title</th>\n","      <th>overview</th>\n","      <th>popularity</th>\n","      <th>poster_path</th>\n","      <th>...</th>\n","      <th>name_belongs_to_collection</th>\n","      <th>id_belongs_to_collection</th>\n","      <th>poster_path_belongs_to_collection</th>\n","      <th>backdrop_path_belongs_to_collection</th>\n","      <th>name_genres</th>\n","      <th>id_genres</th>\n","      <th>name_production_countries</th>\n","      <th>iso_3166_1_production_companies</th>\n","      <th>name_production_companies</th>\n","      <th>id_production_companies</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>False</td>\n","      <td>30000000</td>\n","      <td>http://toystory.disney.com/toy-story</td>\n","      <td>862</td>\n","      <td>tt0114709</td>\n","      <td>en</td>\n","      <td>Toy Story</td>\n","      <td>Led by Woody, Andy's toys live happily in his ...</td>\n","      <td>21.946943</td>\n","      <td>/rhIRbceoE9lR4veEXuwCC2wARtG.jpg</td>\n","      <td>...</td>\n","      <td>Toy Story Collection</td>\n","      <td>10194.0</td>\n","      <td>/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg</td>\n","      <td>/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg</td>\n","      <td>Animation, Comedy, Family</td>\n","      <td>16, 35, 10751</td>\n","      <td>United States of America</td>\n","      <td>US</td>\n","      <td>Pixar Animation Studios</td>\n","      <td>3</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>False</td>\n","      <td>65000000</td>\n","      <td>NaN</td>\n","      <td>8844</td>\n","      <td>tt0113497</td>\n","      <td>en</td>\n","      <td>Jumanji</td>\n","      <td>When siblings Judy and Peter discover an encha...</td>\n","      <td>17.015539</td>\n","      <td>/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>Adventure, Fantasy, Family</td>\n","      <td>12, 14, 10751</td>\n","      <td>United States of America</td>\n","      <td>US</td>\n","      <td>TriStar Pictures, Teitler Film, Interscope Com...</td>\n","      <td>559, 2550, 10201</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>False</td>\n","      <td>0</td>\n","      <td>NaN</td>\n","      <td>15602</td>\n","      <td>tt0113228</td>\n","      <td>en</td>\n","      <td>Grumpier Old Men</td>\n","      <td>A family wedding reignites the ancient feud be...</td>\n","      <td>11.7129</td>\n","      <td>/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg</td>\n","      <td>...</td>\n","      <td>Grumpy Old Men Collection</td>\n","      <td>119050.0</td>\n","      <td>/nLvUdqgPgm3F85NMCii9gVFUcet.jpg</td>\n","      <td>/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg</td>\n","      <td>Romance, Comedy</td>\n","      <td>10749, 35</td>\n","      <td>United States of America</td>\n","      <td>US</td>\n","      <td>Warner Bros., Lancaster Gate</td>\n","      <td>6194, 19464</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>3 rows × 30 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c8e90d56-a50e-4f9c-b919-c9d653aa1110')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-c8e90d56-a50e-4f9c-b919-c9d653aa1110 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-c8e90d56-a50e-4f9c-b919-c9d653aa1110');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":53}]},{"cell_type":"code","source":["clean_metadata.to_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_metadata.csv', index=False)"],"metadata":{"id":"bFZ03yWWuFTO","executionInfo":{"status":"ok","timestamp":1688571930421,"user_tz":-210,"elapsed":1566,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":54,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"LiikvR3euR2Z"},"execution_count":null,"outputs":[]}]}