Spaces:
Sleeping
Sleeping
File size: 127,189 Bytes
ad7eafd |
1 |
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"mount_file_id":"1z2Tq7dKp7YLHVFK8dXL2EZn0hBclc6dQ","authorship_tag":"ABX9TyNCgalS8m4obOk+v6xZ1zQQ"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# Initial instructions"],"metadata":{"id":"Do8-wzEtQiVu"}},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"6NUxzlPDIej5","executionInfo":{"status":"ok","timestamp":1688570661655,"user_tz":-210,"elapsed":3937,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"e8d93915-58f9-4b3c-8688-08e8c3c5ce91"},"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: kaggle in /usr/local/lib/python3.10/dist-packages (1.5.13)\n","Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.16.0)\n","Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from kaggle) (2023.5.7)\n","Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.8.2)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.27.1)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from kaggle) (4.65.0)\n","Requirement already satisfied: python-slugify in /usr/local/lib/python3.10/dist-packages (from kaggle) (8.0.1)\n","Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.26.16)\n","Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.10/dist-packages (from python-slugify->kaggle) (1.3)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from 
requests->kaggle) (3.4)\n"]}],"source":["! pip install kaggle"]},{"cell_type":"code","source":["from google.colab import files"],"metadata":{"id":"9gjOv_WUJBzz","executionInfo":{"status":"ok","timestamp":1688570665466,"user_tz":-210,"elapsed":8,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["files.upload()\n","! mkdir ~/.kaggle\n","! cp kaggle.json ~/.kaggle/\n","! chmod 600 ~/.kaggle/kaggle.json\n","! kaggle datasets download -d rounakbanik/the-movies-dataset"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":125},"id":"cQoNEnDDJOvf","executionInfo":{"status":"ok","timestamp":1688570709193,"user_tz":-210,"elapsed":40889,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"3c8e5f7b-618d-4109-950c-30be7e04569b"},"execution_count":3,"outputs":[{"output_type":"display_data","data":{"text/plain":["<IPython.core.display.HTML object>"],"text/html":["\n"," <input type=\"file\" id=\"files-6731ac4b-bf13-403a-91ee-a79038f4c5ce\" name=\"files[]\" multiple disabled\n"," style=\"border:none\" />\n"," <output id=\"result-6731ac4b-bf13-403a-91ee-a79038f4c5ce\">\n"," Upload widget is only available when the cell has been executed in the\n"," current browser session. 
Please rerun this cell to enable.\n"," </output>\n"," <script>// Copyright 2017 Google LLC\n","//\n","// Licensed under the Apache License, Version 2.0 (the \"License\");\n","// you may not use this file except in compliance with the License.\n","// You may obtain a copy of the License at\n","//\n","// http://www.apache.org/licenses/LICENSE-2.0\n","//\n","// Unless required by applicable law or agreed to in writing, software\n","// distributed under the License is distributed on an \"AS IS\" BASIS,\n","// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n","// See the License for the specific language governing permissions and\n","// limitations under the License.\n","\n","/**\n"," * @fileoverview Helpers for google.colab Python module.\n"," */\n","(function(scope) {\n","function span(text, styleAttributes = {}) {\n"," const element = document.createElement('span');\n"," element.textContent = text;\n"," for (const key of Object.keys(styleAttributes)) {\n"," element.style[key] = styleAttributes[key];\n"," }\n"," return element;\n","}\n","\n","// Max number of bytes which will be uploaded at a time.\n","const MAX_PAYLOAD_SIZE = 100 * 1024;\n","\n","function _uploadFiles(inputId, outputId) {\n"," const steps = uploadFilesStep(inputId, outputId);\n"," const outputElement = document.getElementById(outputId);\n"," // Cache steps on the outputElement to make it available for the next call\n"," // to uploadFilesContinue from Python.\n"," outputElement.steps = steps;\n","\n"," return _uploadFilesContinue(outputId);\n","}\n","\n","// This is roughly an async generator (not supported in the browser yet),\n","// where there are multiple asynchronous steps and the Python side is going\n","// to poll for completion of each step.\n","// This uses a Promise to block the python side on completion of each step,\n","// then passes the result of the previous step as the input to the next step.\n","function _uploadFilesContinue(outputId) {\n"," const 
outputElement = document.getElementById(outputId);\n"," const steps = outputElement.steps;\n","\n"," const next = steps.next(outputElement.lastPromiseValue);\n"," return Promise.resolve(next.value.promise).then((value) => {\n"," // Cache the last promise value to make it available to the next\n"," // step of the generator.\n"," outputElement.lastPromiseValue = value;\n"," return next.value.response;\n"," });\n","}\n","\n","/**\n"," * Generator function which is called between each async step of the upload\n"," * process.\n"," * @param {string} inputId Element ID of the input file picker element.\n"," * @param {string} outputId Element ID of the output display.\n"," * @return {!Iterable<!Object>} Iterable of next steps.\n"," */\n","function* uploadFilesStep(inputId, outputId) {\n"," const inputElement = document.getElementById(inputId);\n"," inputElement.disabled = false;\n","\n"," const outputElement = document.getElementById(outputId);\n"," outputElement.innerHTML = '';\n","\n"," const pickedPromise = new Promise((resolve) => {\n"," inputElement.addEventListener('change', (e) => {\n"," resolve(e.target.files);\n"," });\n"," });\n","\n"," const cancel = document.createElement('button');\n"," inputElement.parentElement.appendChild(cancel);\n"," cancel.textContent = 'Cancel upload';\n"," const cancelPromise = new Promise((resolve) => {\n"," cancel.onclick = () => {\n"," resolve(null);\n"," };\n"," });\n","\n"," // Wait for the user to pick the files.\n"," const files = yield {\n"," promise: Promise.race([pickedPromise, cancelPromise]),\n"," response: {\n"," action: 'starting',\n"," }\n"," };\n","\n"," cancel.remove();\n","\n"," // Disable the input element since further picks are not allowed.\n"," inputElement.disabled = true;\n","\n"," if (!files) {\n"," return {\n"," response: {\n"," action: 'complete',\n"," }\n"," };\n"," }\n","\n"," for (const file of files) {\n"," const li = document.createElement('li');\n"," li.append(span(file.name, {fontWeight: 
'bold'}));\n"," li.append(span(\n"," `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n"," `last modified: ${\n"," file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n"," 'n/a'} - `));\n"," const percent = span('0% done');\n"," li.appendChild(percent);\n","\n"," outputElement.appendChild(li);\n","\n"," const fileDataPromise = new Promise((resolve) => {\n"," const reader = new FileReader();\n"," reader.onload = (e) => {\n"," resolve(e.target.result);\n"," };\n"," reader.readAsArrayBuffer(file);\n"," });\n"," // Wait for the data to be ready.\n"," let fileData = yield {\n"," promise: fileDataPromise,\n"," response: {\n"," action: 'continue',\n"," }\n"," };\n","\n"," // Use a chunked sending to avoid message size limits. See b/62115660.\n"," let position = 0;\n"," do {\n"," const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n"," const chunk = new Uint8Array(fileData, position, length);\n"," position += length;\n","\n"," const base64 = btoa(String.fromCharCode.apply(null, chunk));\n"," yield {\n"," response: {\n"," action: 'append',\n"," file: file.name,\n"," data: base64,\n"," },\n"," };\n","\n"," let percentDone = fileData.byteLength === 0 ?\n"," 100 :\n"," Math.round((position / fileData.byteLength) * 100);\n"," percent.textContent = `${percentDone}% done`;\n","\n"," } while (position < fileData.byteLength);\n"," }\n","\n"," // All done.\n"," yield {\n"," response: {\n"," action: 'complete',\n"," }\n"," };\n","}\n","\n","scope.google = scope.google || {};\n","scope.google.colab = scope.google.colab || {};\n","scope.google.colab._files = {\n"," _uploadFiles,\n"," _uploadFilesContinue,\n","};\n","})(self);\n","</script> "]},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Saving kaggle.json to kaggle.json\n","Downloading the-movies-dataset.zip to /content\n"," 96% 219M/228M [00:02<00:00, 121MB/s]\n","100% 228M/228M [00:02<00:00, 102MB/s]\n"]}]},{"cell_type":"code","source":["import 
os"],"metadata":{"id":"ArIWM5DHqXYZ","executionInfo":{"status":"ok","timestamp":1688570797771,"user_tz":-210,"elapsed":371,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":5,"outputs":[]},{"cell_type":"code","source":["if not os.path.isdir('/content/drive/MyDrive/Rec/data/raw') :\n"," os.mkdir('/content/drive/MyDrive/Rec/data/raw')"],"metadata":{"id":"HaEx9oF3Li7f"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["! unzip '/content/the-movies-dataset.zip' -d '/content/drive/MyDrive/Rec/data/raw'"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pLeGJcfvLwwJ","executionInfo":{"status":"ok","timestamp":1688565261896,"user_tz":-210,"elapsed":11516,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"449b2179-ff5a-4ac8-f9c5-07efc6a7e7c6"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Archive: /content/the-movies-dataset.zip\n"," inflating: /content/drive/MyDrive/Rec/data/raw/credits.csv \n"," inflating: /content/drive/MyDrive/Rec/data/raw/keywords.csv \n"," inflating: /content/drive/MyDrive/Rec/data/raw/links.csv \n"," inflating: /content/drive/MyDrive/Rec/data/raw/links_small.csv \n"," inflating: /content/drive/MyDrive/Rec/data/raw/movies_metadata.csv \n"," inflating: /content/drive/MyDrive/Rec/data/raw/ratings.csv \n"," inflating: /content/drive/MyDrive/Rec/data/raw/ratings_small.csv \n"]}]},{"cell_type":"code","source":["raw_dir = '/content/drive/MyDrive/Rec/data/raw'"],"metadata":{"id":"PAWyXXGxMVIX","executionInfo":{"status":"ok","timestamp":1688570776985,"user_tz":-210,"elapsed":496,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":4,"outputs":[]},{"cell_type":"markdown","source":["# Needed imports"],"metadata":{"id":"6rPjGeurQdnV"}},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","import json\n","import csv\n","import 
matplotlib.pyplot as plt\n","from ast import literal_eval"],"metadata":{"id":"tx4gvgqgMxgo","executionInfo":{"status":"ok","timestamp":1688570807449,"user_tz":-210,"elapsed":390,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":6,"outputs":[]},{"cell_type":"markdown","source":["# Credits dataset"],"metadata":{"id":"GA6noW_jSjRc"}},{"cell_type":"code","source":["credits = pd.read_csv(os.path.join(raw_dir, 'credits.csv'))\n","print('shape of credits dataset: {}'.format(credits.shape))\n","credits.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":161},"id":"ZaOZPI1NQcg5","executionInfo":{"status":"ok","timestamp":1688547894409,"user_tz":-210,"elapsed":3235,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"1bc46c8b-c287-49bb-bc9c-b832e85b1e8f"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of credits dataset: (45476, 3)\n"]},{"output_type":"execute_result","data":{"text/plain":[" cast \\\n","0 [{'cast_id': 14, 'character': 'Woody (voice)',... \n","1 [{'cast_id': 1, 'character': 'Alan Parrish', '... \n","2 [{'cast_id': 2, 'character': 'Max Goldman', 'c... \n","\n"," crew id \n","0 [{'credit_id': '52fe4284c3a36847f8024f49', 'de... 862 \n","1 [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de... 8844 \n","2 [{'credit_id': '52fe466a9251416c75077a89', 'de... 
15602 "],"text/html":["\n"," <div id=\"df-af558189-001b-41a5-a507-f131b8c3ab89\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>cast</th>\n"," <th>crew</th>\n"," <th>id</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>[{'cast_id': 14, 'character': 'Woody (voice)',...</td>\n"," <td>[{'credit_id': '52fe4284c3a36847f8024f49', 'de...</td>\n"," <td>862</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>[{'cast_id': 1, 'character': 'Alan Parrish', '...</td>\n"," <td>[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...</td>\n"," <td>8844</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>[{'cast_id': 2, 'character': 'Max Goldman', 'c...</td>\n"," <td>[{'credit_id': '52fe466a9251416c75077a89', 'de...</td>\n"," <td>15602</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-af558189-001b-41a5-a507-f131b8c3ab89')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 
18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-af558189-001b-41a5-a507-f131b8c3ab89 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-af558189-001b-41a5-a507-f131b8c3ab89');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":10}]},{"cell_type":"code","source":["credits.isnull().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"6of-O22CTBD7","executionInfo":{"status":"ok","timestamp":1688547995732,"user_tz":-210,"elapsed":473,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"6da185d8-74eb-4701-c341-8304a241aa94"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["cast 0\n","crew 0\n","id 0\n","dtype: int64"]},"metadata":{},"execution_count":11}]},{"cell_type":"code","source":["credits.isna().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tfhS7WLITas6","executionInfo":{"status":"ok","timestamp":1688548005437,"user_tz":-210,"elapsed":27,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"aa448602-e8af-456a-c269-6dcc16337a5e"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["cast 0\n","crew 0\n","id 0\n","dtype: int64"]},"metadata":{},"execution_count":12}]},{"cell_type":"code","source":["credits.nunique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8gj4gQr2Tc_H","executionInfo":{"status":"ok","timestamp":1688548048490,"user_tz":-210,"elapsed":1036,"user":{"displayName":"Amir Hossein 
Karami","userId":"12632705231641967217"}},"outputId":"2911f641-04bd-4573-8260-65327de68818"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["cast 43019\n","crew 44669\n","id 45432\n","dtype: int64"]},"metadata":{},"execution_count":14}]},{"cell_type":"code","source":["credits.drop_duplicates(inplace=True)"],"metadata":{"id":"shIqe3YbTiCw"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Extracting information from json strings"],"metadata":{"id":"XvobXUNzVxFf"}},{"cell_type":"code","source":["def get_text(text, obj='name'):\n"," text = literal_eval(text)\n"," if len(text) == 1:\n"," for i in text:\n"," return i[obj]\n"," else:\n"," s = []\n"," for i in text:\n"," s.append(str(i[obj]))\n"," return ', '.join(s)"],"metadata":{"id":"wmybX3cqTsrm","executionInfo":{"status":"ok","timestamp":1688571301495,"user_tz":-210,"elapsed":495,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":28,"outputs":[]},{"cell_type":"code","source":["credits['name_crew'] = credits['crew'].apply(get_text, obj = \"name\")\n","credits['department_crew'] = credits['crew'].apply(get_text, obj = \"department\")\n","credits['gender_crew'] = credits['crew'].apply(get_text, obj = \"gender\")\n","credits['job_crew'] = credits['crew'].apply(get_text, obj = \"job\")\n","credits['profile_path_crew'] = credits['crew'].apply(get_text, obj = \"profile_path\")\n","credits['id_crew'] = credits['crew'].apply(get_text, obj = \"id\")"],"metadata":{"id":"NIfflprHV4mT"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["credits['name_cast'] = credits['cast'].apply(get_text, obj = \"name\")\n","credits['order_cast'] = credits['cast'].apply(get_text, obj = \"order\")\n","credits['gender_cast'] = credits['cast'].apply(get_text, obj = \"gender\")\n","credits['credit_id_cast'] = credits['cast'].apply(get_text, obj = \"credit_id\")\n","credits['profile_path_cast'] = 
credits['cast'].apply(get_text, obj = \"profile_path\")\n","credits['id_cast'] = credits['cast'].apply(get_text, obj = \"id\")\n","credits['character_cast'] = credits['cast'].apply(get_text, obj = \"character\")"],"metadata":{"id":"6mVD2jb9V9Gy"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Constructing the new dataframe and saving it"],"metadata":{"id":"lDgPbPtyYJrG"}},{"cell_type":"code","source":["clean_credits = credits.drop([\"crew\", \"cast\"], axis=1)"],"metadata":{"id":"zGjkFiWEWicR"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["Checking the new dataframe"],"metadata":{"id":"QeO-YhWTYUvz"}},{"cell_type":"code","source":["print ('shape of the cleaned Credits dataset: {}'.format(clean_credits.shape))\n","clean_credits.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":572},"id":"k9Iqhry2Xsyt","executionInfo":{"status":"ok","timestamp":1688549213175,"user_tz":-210,"elapsed":473,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"65d54e2b-4828-4937-dada-233a2dd45011"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of the cleaned Credits dataset: (45439, 14)\n"]},{"output_type":"execute_result","data":{"text/plain":[" id name_crew \\\n","0 862 John Lasseter, Joss Whedon, Andrew Stanton, Jo... \n","1 8844 Larry J. Franco, Jonathan Hensleigh, James Hor... \n","2 15602 Howard Deutch, Mark Steven Johnson, Mark Steve... \n","\n"," department_crew \\\n","0 Directing, Writing, Writing, Writing, Writing,... \n","1 Production, Writing, Sound, Directing, Editing... \n","2 Directing, Writing, Writing, Crew \n","\n"," gender_crew \\\n","0 2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2... \n","1 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2 \n","2 2, 2, 2, 2 \n","\n"," job_crew \\\n","0 Director, Screenplay, Screenplay, Screenplay, ... \n","1 Executive Producer, Screenplay, Original Music... 
\n","2 Director, Characters, Writer, Sound Recordist \n","\n"," profile_path_crew \\\n","0 /7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg, /dTiVsuaTVTe... \n","1 None, /l1c4UFD3g0HVWj5f0CxXAvMAGiT.jpg, /oLOtX... \n","2 /68Vae1HkU1NxQZ6KEmuxIpno7c9.jpg, /6trChNn3o2b... \n","\n"," id_crew \\\n","0 7879, 12891, 7, 12892, 12893, 12894, 12895, 12... \n","1 511, 876, 1729, 4945, 4951, 4952, 8023, 9967, ... \n","2 26502, 16837, 16837, 1551320 \n","\n"," name_cast \\\n","0 Tom Hanks, Tim Allen, Don Rickles, Jim Varney,... \n","1 Robin Williams, Jonathan Hyde, Kirsten Dunst, ... \n","2 Walter Matthau, Jack Lemmon, Ann-Margret, Soph... \n","\n"," order_cast \\\n","0 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 \n","1 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ... \n","2 0, 1, 2, 3, 4, 5, 6 \n","\n"," gender_cast \\\n","0 2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2 \n","1 2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, 0... \n","2 2, 2, 1, 1, 1, 2, 2 \n","\n"," credit_id_cast \\\n","0 52fe4284c3a36847f8024f95, 52fe4284c3a36847f802... \n","1 52fe44bfc3a36847f80a7c73, 52fe44bfc3a36847f80a... \n","2 52fe466a9251416c75077a8d, 52fe466a9251416c7507... \n","\n"," profile_path_cast \\\n","0 /pQFoyx7rp09CJTAb932F2g8Nlho.jpg, /uX2xVf6pMmP... \n","1 /sojtJyIV3lkUeThD7A2oHNm8183.jpg, /7il5D76vx6Q... \n","2 /xJVkvprOnzP5Zdh5y63y8HHniDZ.jpg, /chZmNRYMtqk... \n","\n"," id_cast \\\n","0 31, 12898, 7167, 12899, 12900, 7907, 8873, 111... \n","1 2157, 8537, 205, 145151, 5149, 10739, 58563, 1... \n","2 6837, 3151, 13567, 16757, 589, 16523, 7166 \n","\n"," character_cast \n","0 Woody (voice), Buzz Lightyear (voice), Mr. Pot... \n","1 Alan Parrish, Samuel Alan Parrish / Van Pelt, ... \n","2 Max Goldman, John Gustafson, Ariel Gustafson, ... 
"],"text/html":["\n"," <div id=\"df-b061b4ac-c086-4304-9e17-fe2a649fdb5f\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>name_crew</th>\n"," <th>department_crew</th>\n"," <th>gender_crew</th>\n"," <th>job_crew</th>\n"," <th>profile_path_crew</th>\n"," <th>id_crew</th>\n"," <th>name_cast</th>\n"," <th>order_cast</th>\n"," <th>gender_cast</th>\n"," <th>credit_id_cast</th>\n"," <th>profile_path_cast</th>\n"," <th>id_cast</th>\n"," <th>character_cast</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>862</td>\n"," <td>John Lasseter, Joss Whedon, Andrew Stanton, Jo...</td>\n"," <td>Directing, Writing, Writing, Writing, Writing,...</td>\n"," <td>2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 0, 0, 2...</td>\n"," <td>Director, Screenplay, Screenplay, Screenplay, ...</td>\n"," <td>/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg, /dTiVsuaTVTe...</td>\n"," <td>7879, 12891, 7, 12892, 12893, 12894, 12895, 12...</td>\n"," <td>Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...</td>\n"," <td>0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12</td>\n"," <td>2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2</td>\n"," <td>52fe4284c3a36847f8024f95, 52fe4284c3a36847f802...</td>\n"," <td>/pQFoyx7rp09CJTAb932F2g8Nlho.jpg, /uX2xVf6pMmP...</td>\n"," <td>31, 12898, 7167, 12899, 12900, 7907, 8873, 111...</td>\n"," <td>Woody (voice), Buzz Lightyear (voice), Mr. Pot...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>8844</td>\n"," <td>Larry J. 
Franco, Jonathan Hensleigh, James Hor...</td>\n"," <td>Production, Writing, Sound, Directing, Editing...</td>\n"," <td>2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2</td>\n"," <td>Executive Producer, Screenplay, Original Music...</td>\n"," <td>None, /l1c4UFD3g0HVWj5f0CxXAvMAGiT.jpg, /oLOtX...</td>\n"," <td>511, 876, 1729, 4945, 4951, 4952, 8023, 9967, ...</td>\n"," <td>Robin Williams, Jonathan Hyde, Kirsten Dunst, ...</td>\n"," <td>0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...</td>\n"," <td>2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, 0...</td>\n"," <td>52fe44bfc3a36847f80a7c73, 52fe44bfc3a36847f80a...</td>\n"," <td>/sojtJyIV3lkUeThD7A2oHNm8183.jpg, /7il5D76vx6Q...</td>\n"," <td>2157, 8537, 205, 145151, 5149, 10739, 58563, 1...</td>\n"," <td>Alan Parrish, Samuel Alan Parrish / Van Pelt, ...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>15602</td>\n"," <td>Howard Deutch, Mark Steven Johnson, Mark Steve...</td>\n"," <td>Directing, Writing, Writing, Crew</td>\n"," <td>2, 2, 2, 2</td>\n"," <td>Director, Characters, Writer, Sound Recordist</td>\n"," <td>/68Vae1HkU1NxQZ6KEmuxIpno7c9.jpg, /6trChNn3o2b...</td>\n"," <td>26502, 16837, 16837, 1551320</td>\n"," <td>Walter Matthau, Jack Lemmon, Ann-Margret, Soph...</td>\n"," <td>0, 1, 2, 3, 4, 5, 6</td>\n"," <td>2, 2, 1, 1, 1, 2, 2</td>\n"," <td>52fe466a9251416c75077a8d, 52fe466a9251416c7507...</td>\n"," <td>/xJVkvprOnzP5Zdh5y63y8HHniDZ.jpg, /chZmNRYMtqk...</td>\n"," <td>6837, 3151, 13567, 16757, 589, 16523, 7166</td>\n"," <td>Max Goldman, John Gustafson, Ariel Gustafson, ...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-b061b4ac-c086-4304-9e17-fe2a649fdb5f')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 
5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-b061b4ac-c086-4304-9e17-fe2a649fdb5f button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-b061b4ac-c086-4304-9e17-fe2a649fdb5f');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":20}]},{"cell_type":"markdown","source":["Saving the new .csv dataset"],"metadata":{"id":"CR5anOZZYlBe"}},{"cell_type":"code","source":["clean_credits.to_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_credits.csv', index=False)"],"metadata":{"id":"yto7-njpYD7U"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Keywords dataset"],"metadata":{"id":"emyNitcyZPCr"}},{"cell_type":"code","source":["keywords = pd.read_csv(os.path.join(raw_dir, \"keywords.csv\"))\n","print('shape of raw Keywords dataset: {}'.format(keywords.shape))\n","keywords.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":161},"id":"4JqG86_-ZKHn","executionInfo":{"status":"ok","timestamp":1688549658847,"user_tz":-210,"elapsed":511,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"7a279461-cc46-4f3b-f211-2a02492658f2"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of raw Keywords dataset: (46419, 2)\n"]},{"output_type":"execute_result","data":{"text/plain":[" id keywords\n","0 862 [{'id': 931, 'name': 'jealousy'}, {'id': 4290,...\n","1 8844 [{'id': 10090, 'name': 'board game'}, {'id': 1...\n","2 15602 [{'id': 1495, 'name': 'fishing'}, {'id': 12392..."],"text/html":["\n"," <div id=\"df-8ece7329-852a-47fe-8d22-9752d988430f\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," 
vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>keywords</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>862</td>\n"," <td>[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>8844</td>\n"," <td>[{'id': 10090, 'name': 'board game'}, {'id': 1...</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>15602</td>\n"," <td>[{'id': 1495, 'name': 'fishing'}, {'id': 12392...</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-8ece7329-852a-47fe-8d22-9752d988430f')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," 
}\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-8ece7329-852a-47fe-8d22-9752d988430f button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-8ece7329-852a-47fe-8d22-9752d988430f');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":24}]},{"cell_type":"code","source":["keywords.isna().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"tLkNXoSJZmDE","executionInfo":{"status":"ok","timestamp":1688549694116,"user_tz":-210,"elapsed":486,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"fb4852d3-563a-4833-8674-eeaab7c29bbf"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["id 0\n","keywords 0\n","dtype: int64"]},"metadata":{},"execution_count":25}]},{"cell_type":"code","source":["keywords.isnull().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9tyQTNKPZ5V0","executionInfo":{"status":"ok","timestamp":1688549730364,"user_tz":-210,"elapsed":963,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"9d0fcf3b-3aad-46e8-f36e-2c77038f5b3f"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["id 0\n","keywords 0\n","dtype: int64"]},"metadata":{},"execution_count":26}]},{"cell_type":"code","source":["keywords.nunique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"u2jgxSs_aB9x","executionInfo":{"status":"ok","timestamp":1688549740316,"user_tz":-210,"elapsed":604,"user":{"displayName":"Amir Hossein 
Karami","userId":"12632705231641967217"}},"outputId":"ba5dbee0-d701-46c0-d2e8-341d806794ad"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["id 45432\n","keywords 25989\n","dtype: int64"]},"metadata":{},"execution_count":27}]},{"cell_type":"code","source":["keywords.drop_duplicates(inplace=True)\n","print('shape of dataset after dropping duplicates: {}'.format(keywords.shape))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"BqPLBOeyaEif","executionInfo":{"status":"ok","timestamp":1688549823949,"user_tz":-210,"elapsed":429,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"d7a7d607-69cd-4367-f302-d384bb90bdcd"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of dataset after dropping duplicates: (45432, 2)\n"]}]},{"cell_type":"markdown","source":["## Extracting information from json strings"],"metadata":{"id":"CEea-KSLaqI2"}},{"cell_type":"code","source":["keywords['name_keywords'] = keywords['keywords'].apply(get_text, obj = \"name\")\n","keywords['id_keywords'] = keywords['keywords'].apply(get_text, obj = \"id\")"],"metadata":{"id":"_W1h0U4-aWYX"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["## Constructing new dataframe and saving that"],"metadata":{"id":"KMGfBLW7a4wi"}},{"cell_type":"code","source":["clean_keywords = keywords.drop([\"keywords\"], axis=1)"],"metadata":{"id":"JUuH5tmxazAW"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["Checking the new dataset"],"metadata":{"id":"hwYzBGyjbZMN"}},{"cell_type":"code","source":["print('shape of the cleaned data: {}'.format(clean_keywords.shape))\n","clean_keywords.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":161},"id":"QsOVt41MbLCz","executionInfo":{"status":"ok","timestamp":1688550078229,"user_tz":-210,"elapsed":575,"user":{"displayName":"Amir Hossein 
Karami","userId":"12632705231641967217"}},"outputId":"6a400dc8-6ea3-4dca-f034-6ca0f1643fc0"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of the cleaned data: (45432, 3)\n"]},{"output_type":"execute_result","data":{"text/plain":[" id name_keywords \\\n","0 862 jealousy, toy, boy, friendship, friends, rival... \n","1 8844 board game, disappearance, based on children's... \n","2 15602 fishing, best friend, duringcreditsstinger, ol... \n","\n"," id_keywords \n","0 931, 4290, 5202, 6054, 9713, 9823, 165503, 170... \n","1 10090, 10941, 15101, 33467, 158086, 158091 \n","2 1495, 12392, 179431, 208510 "],"text/html":["\n"," <div id=\"df-66979cc2-b808-444f-bd2b-bd46f8759b8c\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>id</th>\n"," <th>name_keywords</th>\n"," <th>id_keywords</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>862</td>\n"," <td>jealousy, toy, boy, friendship, friends, rival...</td>\n"," <td>931, 4290, 5202, 6054, 9713, 9823, 165503, 170...</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>8844</td>\n"," <td>board game, disappearance, based on children's...</td>\n"," <td>10090, 10941, 15101, 33467, 158086, 158091</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>15602</td>\n"," <td>fishing, best friend, duringcreditsstinger, ol...</td>\n"," <td>1495, 12392, 179431, 208510</td>\n"," </tr>\n"," </tbody>\n","</table>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-66979cc2-b808-444f-bd2b-bd46f8759b8c')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," 
style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-66979cc2-b808-444f-bd2b-bd46f8759b8c button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 
'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-66979cc2-b808-444f-bd2b-bd46f8759b8c');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":32}]},{"cell_type":"markdown","source":["Saving the cleaned dataset as a .csv file"],"metadata":{"id":"W0YT8YV3bcf_"}},{"cell_type":"code","source":["clean_keywords.to_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_keywords.csv', index=False)\n"],"metadata":{"id":"M9f5_VeibXC-"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Metadata dataset"],"metadata":{"id":"SKlrgI-Ub1SP"}},{"cell_type":"code","source":["metadata = pd.read_csv(os.path.join(raw_dir, 'movies_metadata.csv'))\n","print('shape of the raw metadata dataset: {}'.format(metadata.shape))\n","metadata.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":567},"id":"MwDtrtVJbvM5","executionInfo":{"status":"ok","timestamp":1688571617483,"user_tz":-210,"elapsed":1283,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"3b0db80d-e8e3-4bdf-dce4-edbfd4addabe"},"execution_count":35,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of the raw metadata dataset: (45466, 
24)\n"]},{"output_type":"stream","name":"stderr","text":["<ipython-input-35-d8520c3ee68a>:1: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.\n"," metadata = pd.read_csv(os.path.join(raw_dir, 'movies_metadata.csv'))\n"]},{"output_type":"execute_result","data":{"text/plain":[" adult belongs_to_collection budget \\\n","0 False {'id': 10194, 'name': 'Toy Story Collection', ... 30000000 \n","1 False NaN 65000000 \n","2 False {'id': 119050, 'name': 'Grumpy Old Men Collect... 0 \n","\n"," genres \\\n","0 [{'id': 16, 'name': 'Animation'}, {'id': 35, '... \n","1 [{'id': 12, 'name': 'Adventure'}, {'id': 14, '... \n","2 [{'id': 10749, 'name': 'Romance'}, {'id': 35, ... \n","\n"," homepage id imdb_id original_language \\\n","0 http://toystory.disney.com/toy-story 862 tt0114709 en \n","1 NaN 8844 tt0113497 en \n","2 NaN 15602 tt0113228 en \n","\n"," original_title overview ... \\\n","0 Toy Story Led by Woody, Andy's toys live happily in his ... ... \n","1 Jumanji When siblings Judy and Peter discover an encha... ... \n","2 Grumpier Old Men A family wedding reignites the ancient feud be... ... \n","\n"," release_date revenue runtime \\\n","0 1995-10-30 373554033.0 81.0 \n","1 1995-12-15 262797249.0 104.0 \n","2 1995-12-22 0.0 101.0 \n","\n"," spoken_languages status \\\n","0 [{'iso_639_1': 'en', 'name': 'English'}] Released \n","1 [{'iso_639_1': 'en', 'name': 'English'}, {'iso... Released \n","2 [{'iso_639_1': 'en', 'name': 'English'}] Released \n","\n"," tagline title video \\\n","0 NaN Toy Story False \n","1 Roll the dice and unleash the excitement! Jumanji False \n","2 Still Yelling. Still Fighting. Still Ready for... 
Grumpier Old Men False \n","\n"," vote_average vote_count \n","0 7.7 5415.0 \n","1 6.9 2413.0 \n","2 6.5 92.0 \n","\n","[3 rows x 24 columns]"],"text/html":["\n"," <div id=\"df-1f24f91a-2b29-4c28-b1e3-65fcc0c08c84\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>adult</th>\n"," <th>belongs_to_collection</th>\n"," <th>budget</th>\n"," <th>genres</th>\n"," <th>homepage</th>\n"," <th>id</th>\n"," <th>imdb_id</th>\n"," <th>original_language</th>\n"," <th>original_title</th>\n"," <th>overview</th>\n"," <th>...</th>\n"," <th>release_date</th>\n"," <th>revenue</th>\n"," <th>runtime</th>\n"," <th>spoken_languages</th>\n"," <th>status</th>\n"," <th>tagline</th>\n"," <th>title</th>\n"," <th>video</th>\n"," <th>vote_average</th>\n"," <th>vote_count</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>False</td>\n"," <td>{'id': 10194, 'name': 'Toy Story Collection', ...</td>\n"," <td>30000000</td>\n"," <td>[{'id': 16, 'name': 'Animation'}, {'id': 35, '...</td>\n"," <td>http://toystory.disney.com/toy-story</td>\n"," <td>862</td>\n"," <td>tt0114709</td>\n"," <td>en</td>\n"," <td>Toy Story</td>\n"," <td>Led by Woody, Andy's toys live happily in his ...</td>\n"," <td>...</td>\n"," <td>1995-10-30</td>\n"," <td>373554033.0</td>\n"," <td>81.0</td>\n"," <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n"," <td>Released</td>\n"," <td>NaN</td>\n"," <td>Toy Story</td>\n"," <td>False</td>\n"," <td>7.7</td>\n"," <td>5415.0</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," <td>False</td>\n"," <td>NaN</td>\n"," <td>65000000</td>\n"," <td>[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...</td>\n"," 
<td>NaN</td>\n"," <td>8844</td>\n"," <td>tt0113497</td>\n"," <td>en</td>\n"," <td>Jumanji</td>\n"," <td>When siblings Judy and Peter discover an encha...</td>\n"," <td>...</td>\n"," <td>1995-12-15</td>\n"," <td>262797249.0</td>\n"," <td>104.0</td>\n"," <td>[{'iso_639_1': 'en', 'name': 'English'}, {'iso...</td>\n"," <td>Released</td>\n"," <td>Roll the dice and unleash the excitement!</td>\n"," <td>Jumanji</td>\n"," <td>False</td>\n"," <td>6.9</td>\n"," <td>2413.0</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>False</td>\n"," <td>{'id': 119050, 'name': 'Grumpy Old Men Collect...</td>\n"," <td>0</td>\n"," <td>[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...</td>\n"," <td>NaN</td>\n"," <td>15602</td>\n"," <td>tt0113228</td>\n"," <td>en</td>\n"," <td>Grumpier Old Men</td>\n"," <td>A family wedding reignites the ancient feud be...</td>\n"," <td>...</td>\n"," <td>1995-12-22</td>\n"," <td>0.0</td>\n"," <td>101.0</td>\n"," <td>[{'iso_639_1': 'en', 'name': 'English'}]</td>\n"," <td>Released</td>\n"," <td>Still Yelling. Still Fighting. 
Still Ready for...</td>\n"," <td>Grumpier Old Men</td>\n"," <td>False</td>\n"," <td>6.5</td>\n"," <td>92.0</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>3 rows × 24 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-1f24f91a-2b29-4c28-b1e3-65fcc0c08c84')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," 
document.querySelector('#df-1f24f91a-2b29-4c28-b1e3-65fcc0c08c84 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-1f24f91a-2b29-4c28-b1e3-65fcc0c08c84');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":35}]},{"cell_type":"code","source":["metadata.isna().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"Kt9tI1iydiXd","executionInfo":{"status":"ok","timestamp":1688571624293,"user_tz":-210,"elapsed":398,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"41a87985-be28-49d6-86c1-d86fbe49b2f5"},"execution_count":36,"outputs":[{"output_type":"execute_result","data":{"text/plain":["adult 0\n","belongs_to_collection 40972\n","budget 0\n","genres 0\n","homepage 37684\n","id 0\n","imdb_id 17\n","original_language 11\n","original_title 0\n","overview 954\n","popularity 5\n","poster_path 386\n","production_companies 3\n","production_countries 3\n","release_date 87\n","revenue 6\n","runtime 263\n","spoken_languages 6\n","status 87\n","tagline 25054\n","title 6\n","video 6\n","vote_average 6\n","vote_count 6\n","dtype: 
int64"]},"metadata":{},"execution_count":36}]},{"cell_type":"code","source":["metadata.isnull().sum()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"9ASDp8eTkg4h","executionInfo":{"status":"ok","timestamp":1688571627733,"user_tz":-210,"elapsed":685,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"2751d9b1-f910-4de2-de33-6ff6bee31bd5"},"execution_count":37,"outputs":[{"output_type":"execute_result","data":{"text/plain":["adult 0\n","belongs_to_collection 40972\n","budget 0\n","genres 0\n","homepage 37684\n","id 0\n","imdb_id 17\n","original_language 11\n","original_title 0\n","overview 954\n","popularity 5\n","poster_path 386\n","production_companies 3\n","production_countries 3\n","release_date 87\n","revenue 6\n","runtime 263\n","spoken_languages 6\n","status 87\n","tagline 25054\n","title 6\n","video 6\n","vote_average 6\n","vote_count 6\n","dtype: int64"]},"metadata":{},"execution_count":37}]},{"cell_type":"code","source":["metadata.nunique()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"yYM0Ud3Tlpjw","executionInfo":{"status":"ok","timestamp":1688571629398,"user_tz":-210,"elapsed":363,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"b1aa8de1-0749-466c-8dd9-7a2dae7dcdeb"},"execution_count":38,"outputs":[{"output_type":"execute_result","data":{"text/plain":["adult 5\n","belongs_to_collection 1698\n","budget 1226\n","genres 4069\n","homepage 7673\n","id 45436\n","imdb_id 45417\n","original_language 92\n","original_title 43373\n","overview 44307\n","popularity 44176\n","poster_path 45024\n","production_companies 22708\n","production_countries 2393\n","release_date 17336\n","revenue 6863\n","runtime 353\n","spoken_languages 1931\n","status 6\n","tagline 20283\n","title 42277\n","video 2\n","vote_average 92\n","vote_count 1820\n","dtype: int64"]},"metadata":{},"execution_count":38}]},{"cell_type":"markdown","source":["Histogram of the 
dataset"],"metadata":{"id":"m7T_-wt1mFvu"}},{"cell_type":"code","source":["metadata.hist(bins=50, figsize=(10, 7))\n","plt.show()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":622},"id":"rLGE24QNl17Y","executionInfo":{"status":"ok","timestamp":1688571633442,"user_tz":-210,"elapsed":1657,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"de36782e-7d6f-4664-f10f-36d695f7675c"},"execution_count":39,"outputs":[{"output_type":"display_data","data":{"text/plain":["<Figure size 1000x700 with 4 Axes>"],"image/png":"\n"},"metadata":{}}]},{"cell_type":"markdown","source":["## Removing entries with numeric names of companies or countries"],"metadata":{"id":"uTNnaW8Jmr3U"}},{"cell_type":"code","source":["def is_string_number(string):\n"," try:\n"," float(string)\n"," return True\n"," except ValueError:\n"," return False"],"metadata":{"id":"-sB4JFQnmCxW","executionInfo":{"status":"ok","timestamp":1688571638129,"user_tz":-210,"elapsed":595,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":40,"outputs":[]},{"cell_type":"code","source":["for i in range(metadata.shape[0]):\n"," if is_string_number(metadata.loc[i, 'production_countries']) or is_string_number(metadata.loc[i, 'production_companies']):\n"," metadata.drop(index=i, inplace=True)\n","metadata.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"7EZfyxccmqMQ","executionInfo":{"status":"ok","timestamp":1688571640517,"user_tz":-210,"elapsed":784,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"564f7e66-ea73-4755-cd9e-9e55e50707d4"},"execution_count":41,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(45460, 24)"]},"metadata":{},"execution_count":41}]},{"cell_type":"code","source":["metadata.drop_duplicates(inplace=True)\n","metadata.dropna(subset=['production_companies', 'production_countries'], 
inplace=True)"],"metadata":{"id":"vtrzBhjznD7p","executionInfo":{"status":"ok","timestamp":1688571642826,"user_tz":-210,"elapsed":363,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":42,"outputs":[]},{"cell_type":"code","source":["metadata = metadata.fillna(\"NaN value\")\n"],"metadata":{"id":"q7mDM3zAnlH_","executionInfo":{"status":"ok","timestamp":1688571644612,"user_tz":-210,"elapsed":398,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":43,"outputs":[]},{"cell_type":"markdown","source":["## Extracting information from json values"],"metadata":{"id":"J952IGoVrwJA"}},{"cell_type":"code","source":["def get_text_nan(text, obj=\"name\"):\n","    \"\"\"Return the value stored under key `obj` in a stringified dict.\n","\n","    Returns np.nan for the 'NaN value' placeholder and for cells that parse\n","    to a float; returns None when `obj` is not a key of the parsed value.\n","    \"\"\"\n","    if text == \"NaN value\":\n","        return np.nan\n","    # Parse once and reuse the result instead of re-evaluating the literal.\n","    parsed = literal_eval(str(text))\n","    if isinstance(parsed, float):\n","        return np.nan\n","    for key in parsed:\n","        if key == obj:\n","            return parsed[key]"],"metadata":{"id":"C3ElqfyRrgK8","executionInfo":{"status":"ok","timestamp":1688571653225,"user_tz":-210,"elapsed":739,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":44,"outputs":[]},{"cell_type":"code","source":["def text_getter(text, obj='name'):\n","    \"\"\"Extract field `obj` from a stringified dict or list of dicts.\n","\n","    A list yields the ', '-joined string values of `obj` across its items;\n","    a dict yields the value under `obj` (None if the key is absent). The\n","    'NaN value' placeholder and float-valued cells yield np.nan.\n","    \"\"\"\n","    if text == 'NaN value':\n","        return np.nan\n","    # Parse once and reuse the result instead of re-evaluating the literal.\n","    parsed = literal_eval(str(text))\n","    if isinstance(parsed, float):\n","        return np.nan\n","    if isinstance(parsed, list):\n","        # Walk the parsed list; iterating the raw string would yield characters.\n","        return ', '.join(str(item[obj]) for item in parsed)\n","    for key in parsed:\n","        if key == obj:\n","            return parsed[key]"],"metadata":{"id":"sPZP1mKRrhnY","executionInfo":{"status":"ok","timestamp":1688571654839,"user_tz":-210,"elapsed":6,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":45,"outputs":[]},{"cell_type":"markdown","source":["'belongs to collection' 
column"],"metadata":{"id":"kvoi_xr8sw5X"}},{"cell_type":"code","source":["metadata['name_belongs_to_collection'] = metadata['belongs_to_collection'].apply(get_text_nan ,obj = \"name\")\n","metadata['id_belongs_to_collection'] = metadata['belongs_to_collection'].apply(get_text_nan ,obj = \"id\")\n","metadata['poster_path_belongs_to_collection'] = metadata['belongs_to_collection'].apply(get_text_nan ,obj = \"poster_path\")\n","metadata['backdrop_path_belongs_to_collection'] = metadata['belongs_to_collection'].apply(get_text_nan ,obj = \"backdrop_path\")"],"metadata":{"id":"9-ZGGNdfsFKw","executionInfo":{"status":"ok","timestamp":1688571660522,"user_tz":-210,"elapsed":933,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":46,"outputs":[]},{"cell_type":"markdown","source":["'genres' column"],"metadata":{"id":"B-8-Qd9dtIcv"}},{"cell_type":"code","source":["metadata['name_genres'] = metadata['genres'].apply(get_text ,obj = \"name\")\n","metadata['id_genres'] = metadata['genres'].apply(get_text ,obj = \"id\")"],"metadata":{"id":"QJp5U6BGsu0Z","executionInfo":{"status":"ok","timestamp":1688571665947,"user_tz":-210,"elapsed":2373,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":47,"outputs":[]},{"cell_type":"markdown","source":["'production countries' column"],"metadata":{"id":"GLFk9tGQtM4_"}},{"cell_type":"code","source":["metadata['name_production_countries'] = metadata['production_countries'].apply(get_text ,obj = \"name\")\n","metadata['iso_3166_1_production_companies'] = metadata['production_countries'].apply(get_text ,obj = \"iso_3166_1\")"],"metadata":{"id":"viAxv4kGsvbD","executionInfo":{"status":"ok","timestamp":1688571671111,"user_tz":-210,"elapsed":1435,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":48,"outputs":[]},{"cell_type":"markdown","source":["'production companies' 
column"],"metadata":{"id":"VDodd_DPtbrz"}},{"cell_type":"code","source":["metadata['name_production_companies'] = metadata['production_companies'].apply(get_text ,obj = \"name\")\n","metadata['id_production_companies'] = metadata['production_companies'].apply(get_text ,obj = \"id\")\n"],"metadata":{"id":"9fyeJVlftaEU","executionInfo":{"status":"ok","timestamp":1688571676262,"user_tz":-210,"elapsed":1791,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":49,"outputs":[]},{"cell_type":"code","source":["metadata = metadata.replace('NaN value', np.nan)"],"metadata":{"id":"VkkYNztRuf42","executionInfo":{"status":"ok","timestamp":1688571907439,"user_tz":-210,"elapsed":579,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":52,"outputs":[]},{"cell_type":"markdown","source":["## Constructing the new dataframe and saving the new dataset"],"metadata":{"id":"8-Z-JMlCtlon"}},{"cell_type":"code","source":["clean_metadata = metadata.drop([\"spoken_languages\" ,\"genres\" ,\"production_countries\" ,\"belongs_to_collection\"] ,axis=1)\n","print('shape of the cleaned dataset: {}'.format(clean_metadata.shape))\n","clean_metadata.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":532},"id":"zveKnpTPtfiA","executionInfo":{"status":"ok","timestamp":1688571914195,"user_tz":-210,"elapsed":578,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"8b7df7c5-f07e-418b-9dfe-de0f92557bb3"},"execution_count":53,"outputs":[{"output_type":"stream","name":"stdout","text":["shape of the cleaned dataset: (45447, 30)\n"]},{"output_type":"execute_result","data":{"text/plain":[" adult budget homepage id imdb_id \\\n","0 False 30000000 http://toystory.disney.com/toy-story 862 tt0114709 \n","1 False 65000000 NaN 8844 tt0113497 \n","2 False 0 NaN 15602 tt0113228 \n","\n"," original_language original_title \\\n","0 en Toy Story \n","1 en Jumanji \n","2 en 
Grumpier Old Men \n","\n"," overview popularity \\\n","0 Led by Woody, Andy's toys live happily in his ... 21.946943 \n","1 When siblings Judy and Peter discover an encha... 17.015539 \n","2 A family wedding reignites the ancient feud be... 11.7129 \n","\n"," poster_path ... name_belongs_to_collection \\\n","0 /rhIRbceoE9lR4veEXuwCC2wARtG.jpg ... Toy Story Collection \n","1 /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg ... NaN \n","2 /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg ... Grumpy Old Men Collection \n","\n"," id_belongs_to_collection poster_path_belongs_to_collection \\\n","0 10194.0 /7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg \n","1 NaN NaN \n","2 119050.0 /nLvUdqgPgm3F85NMCii9gVFUcet.jpg \n","\n"," backdrop_path_belongs_to_collection name_genres \\\n","0 /9FBwqcd9IRruEDUrTdcaafOMKUq.jpg Animation, Comedy, Family \n","1 NaN Adventure, Fantasy, Family \n","2 /hypTnLot2z8wpFS7qwsQHW1uV8u.jpg Romance, Comedy \n","\n"," id_genres name_production_countries iso_3166_1_production_companies \\\n","0 16, 35, 10751 United States of America US \n","1 12, 14, 10751 United States of America US \n","2 10749, 35 United States of America US \n","\n"," name_production_companies id_production_companies \n","0 Pixar Animation Studios 3 \n","1 TriStar Pictures, Teitler Film, Interscope Com... 
559, 2550, 10201 \n","2 Warner Bros., Lancaster Gate 6194, 19464 \n","\n","[3 rows x 30 columns]"],"text/html":["\n"," <div id=\"df-c8e90d56-a50e-4f9c-b919-c9d653aa1110\">\n"," <div class=\"colab-df-container\">\n"," <div>\n","<style scoped>\n"," .dataframe tbody tr th:only-of-type {\n"," vertical-align: middle;\n"," }\n","\n"," .dataframe tbody tr th {\n"," vertical-align: top;\n"," }\n","\n"," .dataframe thead th {\n"," text-align: right;\n"," }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n"," <thead>\n"," <tr style=\"text-align: right;\">\n"," <th></th>\n"," <th>adult</th>\n"," <th>budget</th>\n"," <th>homepage</th>\n"," <th>id</th>\n"," <th>imdb_id</th>\n"," <th>original_language</th>\n"," <th>original_title</th>\n"," <th>overview</th>\n"," <th>popularity</th>\n"," <th>poster_path</th>\n"," <th>...</th>\n"," <th>name_belongs_to_collection</th>\n"," <th>id_belongs_to_collection</th>\n"," <th>poster_path_belongs_to_collection</th>\n"," <th>backdrop_path_belongs_to_collection</th>\n"," <th>name_genres</th>\n"," <th>id_genres</th>\n"," <th>name_production_countries</th>\n"," <th>iso_3166_1_production_companies</th>\n"," <th>name_production_companies</th>\n"," <th>id_production_companies</th>\n"," </tr>\n"," </thead>\n"," <tbody>\n"," <tr>\n"," <th>0</th>\n"," <td>False</td>\n"," <td>30000000</td>\n"," <td>http://toystory.disney.com/toy-story</td>\n"," <td>862</td>\n"," <td>tt0114709</td>\n"," <td>en</td>\n"," <td>Toy Story</td>\n"," <td>Led by Woody, Andy's toys live happily in his ...</td>\n"," <td>21.946943</td>\n"," <td>/rhIRbceoE9lR4veEXuwCC2wARtG.jpg</td>\n"," <td>...</td>\n"," <td>Toy Story Collection</td>\n"," <td>10194.0</td>\n"," <td>/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg</td>\n"," <td>/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg</td>\n"," <td>Animation, Comedy, Family</td>\n"," <td>16, 35, 10751</td>\n"," <td>United States of America</td>\n"," <td>US</td>\n"," <td>Pixar Animation Studios</td>\n"," <td>3</td>\n"," </tr>\n"," <tr>\n"," <th>1</th>\n"," 
<td>False</td>\n"," <td>65000000</td>\n"," <td>NaN</td>\n"," <td>8844</td>\n"," <td>tt0113497</td>\n"," <td>en</td>\n"," <td>Jumanji</td>\n"," <td>When siblings Judy and Peter discover an encha...</td>\n"," <td>17.015539</td>\n"," <td>/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg</td>\n"," <td>...</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>NaN</td>\n"," <td>Adventure, Fantasy, Family</td>\n"," <td>12, 14, 10751</td>\n"," <td>United States of America</td>\n"," <td>US</td>\n"," <td>TriStar Pictures, Teitler Film, Interscope Com...</td>\n"," <td>559, 2550, 10201</td>\n"," </tr>\n"," <tr>\n"," <th>2</th>\n"," <td>False</td>\n"," <td>0</td>\n"," <td>NaN</td>\n"," <td>15602</td>\n"," <td>tt0113228</td>\n"," <td>en</td>\n"," <td>Grumpier Old Men</td>\n"," <td>A family wedding reignites the ancient feud be...</td>\n"," <td>11.7129</td>\n"," <td>/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg</td>\n"," <td>...</td>\n"," <td>Grumpy Old Men Collection</td>\n"," <td>119050.0</td>\n"," <td>/nLvUdqgPgm3F85NMCii9gVFUcet.jpg</td>\n"," <td>/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg</td>\n"," <td>Romance, Comedy</td>\n"," <td>10749, 35</td>\n"," <td>United States of America</td>\n"," <td>US</td>\n"," <td>Warner Bros., Lancaster Gate</td>\n"," <td>6194, 19464</td>\n"," </tr>\n"," </tbody>\n","</table>\n","<p>3 rows × 30 columns</p>\n","</div>\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c8e90d56-a50e-4f9c-b919-c9d653aa1110')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n"," \n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n"," <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 
0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n"," </svg>\n"," </button>\n"," \n"," <style>\n"," .colab-df-container {\n"," display:flex;\n"," flex-wrap:wrap;\n"," gap: 12px;\n"," }\n","\n"," .colab-df-convert {\n"," background-color: #E8F0FE;\n"," border: none;\n"," border-radius: 50%;\n"," cursor: pointer;\n"," display: none;\n"," fill: #1967D2;\n"," height: 32px;\n"," padding: 0 0 0 0;\n"," width: 32px;\n"," }\n","\n"," .colab-df-convert:hover {\n"," background-color: #E2EBFA;\n"," box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n"," fill: #174EA6;\n"," }\n","\n"," [theme=dark] .colab-df-convert {\n"," background-color: #3B4455;\n"," fill: #D2E3FC;\n"," }\n","\n"," [theme=dark] .colab-df-convert:hover {\n"," background-color: #434B5C;\n"," box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n"," filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n"," fill: #FFFFFF;\n"," }\n"," </style>\n","\n"," <script>\n"," const buttonEl =\n"," document.querySelector('#df-c8e90d56-a50e-4f9c-b919-c9d653aa1110 button.colab-df-convert');\n"," buttonEl.style.display =\n"," google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n"," async function convertToInteractive(key) {\n"," const element = document.querySelector('#df-c8e90d56-a50e-4f9c-b919-c9d653aa1110');\n"," const dataTable =\n"," await google.colab.kernel.invokeFunction('convertToInteractive',\n"," [key], {});\n"," if (!dataTable) return;\n","\n"," const docLinkHtml = 'Like what you see? 
Visit the ' +\n"," '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n"," + ' to learn more about interactive tables.';\n"," element.innerHTML = '';\n"," dataTable['output_type'] = 'display_data';\n"," await google.colab.output.renderOutput(dataTable, element);\n"," const docLink = document.createElement('div');\n"," docLink.innerHTML = docLinkHtml;\n"," element.appendChild(docLink);\n"," }\n"," </script>\n"," </div>\n"," </div>\n"," "]},"metadata":{},"execution_count":53}]},{"cell_type":"code","source":["clean_metadata.to_csv('/content/drive/MyDrive/Rec/data/cleaned/clean_metadata.csv', index=False)"],"metadata":{"id":"bFZ03yWWuFTO","executionInfo":{"status":"ok","timestamp":1688571930421,"user_tz":-210,"elapsed":1566,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":54,"outputs":[]},{"cell_type":"code","source":[],"metadata":{"id":"LiikvR3euR2Z"},"execution_count":null,"outputs":[]}]} |