File size: 55,724 Bytes
ad7eafd
1
{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"collapsed_sections":["fQcMREQvThwU","ugf5R7Ihi2eU","fmm6lJZH27-5","8SMtZaf6EkMD","uYO9OW7sXZMF"],"mount_file_id":"1nkLBMUOcoheh7EH5xe3uev4aZqAvPdVG","authorship_tag":"ABX9TyNIByCmZPZzgkENHepyEbKv"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["# Initial instructions"],"metadata":{"id":"fQcMREQvThwU"}},{"cell_type":"code","source":["! pip install kaggle"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"opfjSdEKbA6q","executionInfo":{"status":"ok","timestamp":1688566779143,"user_tz":-210,"elapsed":4727,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"9a722f41-7e7f-48e5-b73a-a6d3766296e4"},"execution_count":1,"outputs":[{"output_type":"stream","name":"stdout","text":["Requirement already satisfied: kaggle in /usr/local/lib/python3.10/dist-packages (1.5.13)\n","Requirement already satisfied: six>=1.10 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.16.0)\n","Requirement already satisfied: certifi in /usr/local/lib/python3.10/dist-packages (from kaggle) (2023.5.7)\n","Requirement already satisfied: python-dateutil in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.8.2)\n","Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from kaggle) (2.27.1)\n","Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from kaggle) (4.65.0)\n","Requirement already satisfied: python-slugify in /usr/local/lib/python3.10/dist-packages (from kaggle) (8.0.1)\n","Requirement already satisfied: urllib3 in /usr/local/lib/python3.10/dist-packages (from kaggle) (1.26.16)\n","Requirement already satisfied: text-unidecode>=1.3 in /usr/local/lib/python3.10/dist-packages (from python-slugify->kaggle) (1.3)\n","Requirement already satisfied: charset-normalizer~=2.0.0 in 
/usr/local/lib/python3.10/dist-packages (from requests->kaggle) (2.0.12)\n","Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->kaggle) (3.4)\n"]}]},{"cell_type":"code","source":["from google.colab import files"],"metadata":{"id":"9gjOv_WUJBzz","executionInfo":{"status":"ok","timestamp":1688566817902,"user_tz":-210,"elapsed":467,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":2,"outputs":[]},{"cell_type":"code","source":["files.upload()\n","! mkdir ~/.kaggle\n","! cp kaggle.json ~/.kaggle/\n","! chmod 600 ~/.kaggle/kaggle.json\n","! kaggle datasets download -d rounakbanik/the-movies-dataset"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":401},"id":"cQoNEnDDJOvf","executionInfo":{"status":"error","timestamp":1688566832628,"user_tz":-210,"elapsed":12771,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"26b37601-be8f-4ba9-a01c-1e21bba10e66"},"execution_count":3,"outputs":[{"output_type":"display_data","data":{"text/plain":["<IPython.core.display.HTML object>"],"text/html":["\n","     <input type=\"file\" id=\"files-b2ffe85c-4b2c-4a7a-82b0-5fa9df40ca2d\" name=\"files[]\" multiple disabled\n","        style=\"border:none\" />\n","     <output id=\"result-b2ffe85c-4b2c-4a7a-82b0-5fa9df40ca2d\">\n","      Upload widget is only available when the cell has been executed in the\n","      current browser session. 
Please rerun this cell to enable.\n","      </output>\n","      <script>// Copyright 2017 Google LLC\n","//\n","// Licensed under the Apache License, Version 2.0 (the \"License\");\n","// you may not use this file except in compliance with the License.\n","// You may obtain a copy of the License at\n","//\n","//      http://www.apache.org/licenses/LICENSE-2.0\n","//\n","// Unless required by applicable law or agreed to in writing, software\n","// distributed under the License is distributed on an \"AS IS\" BASIS,\n","// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n","// See the License for the specific language governing permissions and\n","// limitations under the License.\n","\n","/**\n"," * @fileoverview Helpers for google.colab Python module.\n"," */\n","(function(scope) {\n","function span(text, styleAttributes = {}) {\n","  const element = document.createElement('span');\n","  element.textContent = text;\n","  for (const key of Object.keys(styleAttributes)) {\n","    element.style[key] = styleAttributes[key];\n","  }\n","  return element;\n","}\n","\n","// Max number of bytes which will be uploaded at a time.\n","const MAX_PAYLOAD_SIZE = 100 * 1024;\n","\n","function _uploadFiles(inputId, outputId) {\n","  const steps = uploadFilesStep(inputId, outputId);\n","  const outputElement = document.getElementById(outputId);\n","  // Cache steps on the outputElement to make it available for the next call\n","  // to uploadFilesContinue from Python.\n","  outputElement.steps = steps;\n","\n","  return _uploadFilesContinue(outputId);\n","}\n","\n","// This is roughly an async generator (not supported in the browser yet),\n","// where there are multiple asynchronous steps and the Python side is going\n","// to poll for completion of each step.\n","// This uses a Promise to block the python side on completion of each step,\n","// then passes the result of the previous step as the input to the next step.\n","function 
_uploadFilesContinue(outputId) {\n","  const outputElement = document.getElementById(outputId);\n","  const steps = outputElement.steps;\n","\n","  const next = steps.next(outputElement.lastPromiseValue);\n","  return Promise.resolve(next.value.promise).then((value) => {\n","    // Cache the last promise value to make it available to the next\n","    // step of the generator.\n","    outputElement.lastPromiseValue = value;\n","    return next.value.response;\n","  });\n","}\n","\n","/**\n"," * Generator function which is called between each async step of the upload\n"," * process.\n"," * @param {string} inputId Element ID of the input file picker element.\n"," * @param {string} outputId Element ID of the output display.\n"," * @return {!Iterable<!Object>} Iterable of next steps.\n"," */\n","function* uploadFilesStep(inputId, outputId) {\n","  const inputElement = document.getElementById(inputId);\n","  inputElement.disabled = false;\n","\n","  const outputElement = document.getElementById(outputId);\n","  outputElement.innerHTML = '';\n","\n","  const pickedPromise = new Promise((resolve) => {\n","    inputElement.addEventListener('change', (e) => {\n","      resolve(e.target.files);\n","    });\n","  });\n","\n","  const cancel = document.createElement('button');\n","  inputElement.parentElement.appendChild(cancel);\n","  cancel.textContent = 'Cancel upload';\n","  const cancelPromise = new Promise((resolve) => {\n","    cancel.onclick = () => {\n","      resolve(null);\n","    };\n","  });\n","\n","  // Wait for the user to pick the files.\n","  const files = yield {\n","    promise: Promise.race([pickedPromise, cancelPromise]),\n","    response: {\n","      action: 'starting',\n","    }\n","  };\n","\n","  cancel.remove();\n","\n","  // Disable the input element since further picks are not allowed.\n","  inputElement.disabled = true;\n","\n","  if (!files) {\n","    return {\n","      response: {\n","        action: 'complete',\n","      }\n","    };\n","  
}\n","\n","  for (const file of files) {\n","    const li = document.createElement('li');\n","    li.append(span(file.name, {fontWeight: 'bold'}));\n","    li.append(span(\n","        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n","        `last modified: ${\n","            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n","                                    'n/a'} - `));\n","    const percent = span('0% done');\n","    li.appendChild(percent);\n","\n","    outputElement.appendChild(li);\n","\n","    const fileDataPromise = new Promise((resolve) => {\n","      const reader = new FileReader();\n","      reader.onload = (e) => {\n","        resolve(e.target.result);\n","      };\n","      reader.readAsArrayBuffer(file);\n","    });\n","    // Wait for the data to be ready.\n","    let fileData = yield {\n","      promise: fileDataPromise,\n","      response: {\n","        action: 'continue',\n","      }\n","    };\n","\n","    // Use a chunked sending to avoid message size limits. 
See b/62115660.\n","    let position = 0;\n","    do {\n","      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n","      const chunk = new Uint8Array(fileData, position, length);\n","      position += length;\n","\n","      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n","      yield {\n","        response: {\n","          action: 'append',\n","          file: file.name,\n","          data: base64,\n","        },\n","      };\n","\n","      let percentDone = fileData.byteLength === 0 ?\n","          100 :\n","          Math.round((position / fileData.byteLength) * 100);\n","      percent.textContent = `${percentDone}% done`;\n","\n","    } while (position < fileData.byteLength);\n","  }\n","\n","  // All done.\n","  yield {\n","    response: {\n","      action: 'complete',\n","    }\n","  };\n","}\n","\n","scope.google = scope.google || {};\n","scope.google.colab = scope.google.colab || {};\n","scope.google.colab._files = {\n","  _uploadFiles,\n","  _uploadFilesContinue,\n","};\n","})(self);\n","</script> "]},"metadata":{}},{"output_type":"error","ename":"KeyboardInterrupt","evalue":"ignored","traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)","\u001b[0;32m<ipython-input-3-dd9d5815f8a5>\u001b[0m in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfiles\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' mkdir ~/.kaggle'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m 
\u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' cp kaggle.json ~/.kaggle/'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' chmod 600 ~/.kaggle/kaggle.json'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msystem\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m' kaggle datasets download -d rounakbanik/the-movies-dataset'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/files.py\u001b[0m in \u001b[0;36mupload\u001b[0;34m()\u001b[0m\n\u001b[1;32m     67\u001b[0m   \"\"\"\n\u001b[1;32m     68\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 69\u001b[0;31m   \u001b[0muploaded_files\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_upload_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmultiple\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     70\u001b[0m   \u001b[0;31m# Mapping from original filename to filename as saved locally.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     71\u001b[0m   \u001b[0mlocal_filenames\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/files.py\u001b[0m in \u001b[0;36m_upload_files\u001b[0;34m(multiple)\u001b[0m\n\u001b[1;32m    151\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    152\u001b[0m   \u001b[0;31m# First result is always an indication that the file picker has completed.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 153\u001b[0;31m   result = _output.eval_js(\n\u001b[0m\u001b[1;32m    154\u001b[0m       'google.colab._files._uploadFiles(\"{input_id}\", \"{output_id}\")'.format(\n\u001b[1;32m    155\u001b[0m           \u001b[0minput_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minput_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moutput_id\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_id\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/output/_js.py\u001b[0m in \u001b[0;36meval_js\u001b[0;34m(script, ignore_result, timeout_sec)\u001b[0m\n\u001b[1;32m     38\u001b[0m   \u001b[0;32mif\u001b[0m \u001b[0mignore_result\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     39\u001b[0m     \u001b[0;32mreturn\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m   \u001b[0;32mreturn\u001b[0m \u001b[0m_message\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_reply_from_input\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrequest_id\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtimeout_sec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     41\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     42\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;32m/usr/local/lib/python3.10/dist-packages/google/colab/_message.py\u001b[0m in \u001b[0;36mread_reply_from_input\u001b[0;34m(message_id, timeout_sec)\u001b[0m\n\u001b[1;32m     94\u001b[0m     \u001b[0mreply\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_read_next_input_message\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     95\u001b[0m     
\u001b[0;32mif\u001b[0m \u001b[0mreply\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0m_NOT_READY\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mreply\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdict\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 96\u001b[0;31m       \u001b[0mtime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0.025\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     97\u001b[0m       \u001b[0;32mcontinue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     98\u001b[0m     if (\n","\u001b[0;31mKeyboardInterrupt\u001b[0m: "]}]},{"cell_type":"code","source":["import os\n","\n","# makedirs (not mkdir): a fresh runtime has no '/content/data' yet, so the\n","# parent must be created too; exist_ok=True keeps the cell safe to re-run.\n","os.makedirs('/content/data/cleaned', exist_ok=True)"],"metadata":{"id":"HaEx9oF3Li7f"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["! 
unzip '/content/the-movies-dataset.zip' -d '/content/data/raw'"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"pLeGJcfvLwwJ","executionInfo":{"status":"ok","timestamp":1688546148833,"user_tz":-210,"elapsed":10746,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"debf1aaf-4932-47f9-b076-49457210b967"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Archive:  /content/the-movies-dataset.zip\n","  inflating: /content/data/raw/credits.csv  \n","  inflating: /content/data/raw/keywords.csv  \n","  inflating: /content/data/raw/links.csv  \n","  inflating: /content/data/raw/links_small.csv  \n","  inflating: /content/data/raw/movies_metadata.csv  \n","  inflating: /content/data/raw/ratings.csv  \n","  inflating: /content/data/raw/ratings_small.csv  \n"]}]},{"cell_type":"markdown","source":["# Needed Imports"],"metadata":{"id":"a0M_2XYkbY_O"}},{"cell_type":"code","source":["import numpy as np\n","import pandas as pd\n","import matplotlib.pyplot as plt\n","import re"],"metadata":{"id":"z2eLwbMCbbxB","executionInfo":{"status":"ok","timestamp":1688584736915,"user_tz":-210,"elapsed":1969,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":4,"outputs":[]},{"cell_type":"markdown","source":["# Load datasets"],"metadata":{"id":"ugf5R7Ihi2eU"}},{"cell_type":"code","source":["# Single place to change when the project folder moves.\n","DATA_DIR = '/content/drive/MyDrive/Rec/data'\n","\n","metadata = pd.read_csv(DATA_DIR + '/cleaned/clean_metadata.csv')\n","credits = pd.read_csv(DATA_DIR + '/cleaned/clean_credits.csv')\n","keywords = pd.read_csv(DATA_DIR + '/cleaned/clean_keywords.csv')\n","\n","# Only the non-null TMDB ids are kept, as an integer Series; loading and\n","# filtering in one expression avoids rebinding `links` from a DataFrame\n","# to a Series.\n","links = pd.read_csv(DATA_DIR + '/raw/links.csv')['tmdbId'].dropna().astype('int')"],"metadata":{"id":"pW6Sw6dCi4aX","executionInfo":{"status":"ok","timestamp":1688574375594,"user_tz":-210,"elapsed":3626,"user":{"displayName":"Amir Hossein 
Karami","userId":"12632705231641967217"}}},"execution_count":28,"outputs":[]},{"cell_type":"code","source":["print('shape: {}'.format(metadata.shape))\n","print('columns: \\n {}'.format(metadata.columns))\n","metadata.head(3)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":723},"id":"N_nTAGZUjZTc","executionInfo":{"status":"ok","timestamp":1688571998020,"user_tz":-210,"elapsed":809,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"3d810160-7989-4473-f969-bddaeea13eb7"},"execution_count":22,"outputs":[{"output_type":"stream","name":"stdout","text":["shape: (45447, 30)\n","columns: \n"," Index(['adult', 'budget', 'homepage', 'id', 'imdb_id', 'original_language',\n","       'original_title', 'overview', 'popularity', 'poster_path',\n","       'production_companies', 'release_date', 'revenue', 'runtime', 'status',\n","       'tagline', 'title', 'video', 'vote_average', 'vote_count',\n","       'name_belongs_to_collection', 'id_belongs_to_collection',\n","       'poster_path_belongs_to_collection',\n","       'backdrop_path_belongs_to_collection', 'name_genres', 'id_genres',\n","       'name_production_countries', 'iso_3166_1_production_companies',\n","       'name_production_companies', 'id_production_companies'],\n","      dtype='object')\n"]},{"output_type":"execute_result","data":{"text/plain":["   adult    budget                              homepage     id    imdb_id  \\\n","0  False  30000000  http://toystory.disney.com/toy-story    862  tt0114709   \n","1  False  65000000                                   NaN   8844  tt0113497   \n","2  False         0                                   NaN  15602  tt0113228   \n","\n","  original_language    original_title  \\\n","0                en         Toy Story   \n","1                en           Jumanji   \n","2                en  Grumpier Old Men   \n","\n","                                            overview  popularity  \\\n","0  Led by Woody, Andy's toys 
live happily in his ...   21.946943   \n","1  When siblings Judy and Peter discover an encha...   17.015539   \n","2  A family wedding reignites the ancient feud be...   11.712900   \n","\n","                        poster_path  ... name_belongs_to_collection  \\\n","0  /rhIRbceoE9lR4veEXuwCC2wARtG.jpg  ...       Toy Story Collection   \n","1  /vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg  ...                        NaN   \n","2  /6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg  ...  Grumpy Old Men Collection   \n","\n","  id_belongs_to_collection  poster_path_belongs_to_collection  \\\n","0                  10194.0   /7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg   \n","1                      NaN                                NaN   \n","2                 119050.0   /nLvUdqgPgm3F85NMCii9gVFUcet.jpg   \n","\n","   backdrop_path_belongs_to_collection                 name_genres  \\\n","0     /9FBwqcd9IRruEDUrTdcaafOMKUq.jpg   Animation, Comedy, Family   \n","1                                  NaN  Adventure, Fantasy, Family   \n","2     /hypTnLot2z8wpFS7qwsQHW1uV8u.jpg             Romance, Comedy   \n","\n","       id_genres name_production_countries  iso_3166_1_production_companies  \\\n","0  16, 35, 10751  United States of America                               US   \n","1  12, 14, 10751  United States of America                               US   \n","2      10749, 35  United States of America                               US   \n","\n","                           name_production_companies  id_production_companies  \n","0                            Pixar Animation Studios                        3  \n","1  TriStar Pictures, Teitler Film, Interscope Com...         
559, 2550, 10201  \n","2                       Warner Bros., Lancaster Gate              6194, 19464  \n","\n","[3 rows x 30 columns]"],"text/html":["\n","  <div id=\"df-3fa5403e-5fb8-409e-8a96-2f930c77c93e\">\n","    <div class=\"colab-df-container\">\n","      <div>\n","<style scoped>\n","    .dataframe tbody tr th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>adult</th>\n","      <th>budget</th>\n","      <th>homepage</th>\n","      <th>id</th>\n","      <th>imdb_id</th>\n","      <th>original_language</th>\n","      <th>original_title</th>\n","      <th>overview</th>\n","      <th>popularity</th>\n","      <th>poster_path</th>\n","      <th>...</th>\n","      <th>name_belongs_to_collection</th>\n","      <th>id_belongs_to_collection</th>\n","      <th>poster_path_belongs_to_collection</th>\n","      <th>backdrop_path_belongs_to_collection</th>\n","      <th>name_genres</th>\n","      <th>id_genres</th>\n","      <th>name_production_countries</th>\n","      <th>iso_3166_1_production_companies</th>\n","      <th>name_production_companies</th>\n","      <th>id_production_companies</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>False</td>\n","      <td>30000000</td>\n","      <td>http://toystory.disney.com/toy-story</td>\n","      <td>862</td>\n","      <td>tt0114709</td>\n","      <td>en</td>\n","      <td>Toy Story</td>\n","      <td>Led by Woody, Andy's toys live happily in his ...</td>\n","      <td>21.946943</td>\n","      <td>/rhIRbceoE9lR4veEXuwCC2wARtG.jpg</td>\n","      <td>...</td>\n","      <td>Toy Story Collection</td>\n","      <td>10194.0</td>\n","      
<td>/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg</td>\n","      <td>/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg</td>\n","      <td>Animation, Comedy, Family</td>\n","      <td>16, 35, 10751</td>\n","      <td>United States of America</td>\n","      <td>US</td>\n","      <td>Pixar Animation Studios</td>\n","      <td>3</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>False</td>\n","      <td>65000000</td>\n","      <td>NaN</td>\n","      <td>8844</td>\n","      <td>tt0113497</td>\n","      <td>en</td>\n","      <td>Jumanji</td>\n","      <td>When siblings Judy and Peter discover an encha...</td>\n","      <td>17.015539</td>\n","      <td>/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg</td>\n","      <td>...</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>NaN</td>\n","      <td>Adventure, Fantasy, Family</td>\n","      <td>12, 14, 10751</td>\n","      <td>United States of America</td>\n","      <td>US</td>\n","      <td>TriStar Pictures, Teitler Film, Interscope Com...</td>\n","      <td>559, 2550, 10201</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>False</td>\n","      <td>0</td>\n","      <td>NaN</td>\n","      <td>15602</td>\n","      <td>tt0113228</td>\n","      <td>en</td>\n","      <td>Grumpier Old Men</td>\n","      <td>A family wedding reignites the ancient feud be...</td>\n","      <td>11.712900</td>\n","      <td>/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg</td>\n","      <td>...</td>\n","      <td>Grumpy Old Men Collection</td>\n","      <td>119050.0</td>\n","      <td>/nLvUdqgPgm3F85NMCii9gVFUcet.jpg</td>\n","      <td>/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg</td>\n","      <td>Romance, Comedy</td>\n","      <td>10749, 35</td>\n","      <td>United States of America</td>\n","      <td>US</td>\n","      <td>Warner Bros., Lancaster Gate</td>\n","      <td>6194, 19464</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>3 rows Γ— 30 columns</p>\n","</div>\n","      <button class=\"colab-df-convert\" 
onclick=\"convertToInteractive('df-3fa5403e-5fb8-409e-8a96-2f930c77c93e')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          
document.querySelector('#df-3fa5403e-5fb8-409e-8a96-2f930c77c93e button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-3fa5403e-5fb8-409e-8a96-2f930c77c93e');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? Visit the ' +\n","            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","            + ' to learn more about interactive tables.';\n","          element.innerHTML = '';\n","          dataTable['output_type'] = 'display_data';\n","          await google.colab.output.renderOutput(dataTable, element);\n","          const docLink = document.createElement('div');\n","          docLink.innerHTML = docLinkHtml;\n","          element.appendChild(docLink);\n","        }\n","      </script>\n","    </div>\n","  </div>\n","  "]},"metadata":{},"execution_count":22}]},{"cell_type":"code","source":["a = metadata['original_language'].value_counts()\n","a[a>10]"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"QPmkUByIlo1G","executionInfo":{"status":"ok","timestamp":1688572010282,"user_tz":-210,"elapsed":438,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"de2a56b6-96a2-461c-f4bb-56bd552c714e"},"execution_count":23,"outputs":[{"output_type":"execute_result","data":{"text/plain":["en    32260\n","fr     2437\n","it     1529\n","ja     1349\n","de     1079\n","es      994\n","ru      826\n","hi      508\n","ko      444\n","zh      409\n","sv      384\n","pt      316\n","cn      313\n","fi      295\n","nl      248\n","da      
224\n","pl      219\n","tr      150\n","cs      130\n","el      113\n","no      106\n","fa      101\n","hu      100\n","ta       78\n","th       75\n","he       67\n","sr       63\n","ro       57\n","te       45\n","ar       39\n","ml       36\n","xx       33\n","hr       29\n","bn       29\n","mr       25\n","et       24\n","is       24\n","tl       23\n","id       20\n","lv       18\n","ka       18\n","sl       17\n","uk       16\n","bs       14\n","ca       12\n","Name: original_language, dtype: int64"]},"metadata":{},"execution_count":23}]},{"cell_type":"markdown","source":["# Constructing the dataset used for item based recommendation"],"metadata":{"id":"fmm6lJZH27-5"}},{"cell_type":"code","source":["# Cast the join key to int in both lookup tables before merging on it.\n","for lookup_table in (keywords, credits):\n","    lookup_table['id'] = lookup_table['id'].astype('int')"],"metadata":{"id":"qHk24Ai_l_tH","executionInfo":{"status":"ok","timestamp":1688574139845,"user_tz":-210,"elapsed":589,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":24,"outputs":[]},{"cell_type":"code","source":["# Inner-join credits, then keywords, onto the movie metadata by movie id.\n","metadata = metadata.merge(credits, on='id').merge(keywords, on='id')"],"metadata":{"id":"2MNqsMX13JZJ","executionInfo":{"status":"ok","timestamp":1688574651033,"user_tz":-210,"elapsed":503,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":32,"outputs":[]},{"cell_type":"markdown","source":["## Use only the available movies in TMDB"],"metadata":{"id":"32SR5rG64Vxy"}},{"cell_type":"code","source":["# Rows whose id appears among the TMDB ids loaded from links.csv.\n","has_tmdb_link = metadata['id'].isin(links)\n","rec_data = metadata.loc[has_tmdb_link].copy()\n","rec_data.shape"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"GLHc1UW-3NDu","executionInfo":{"status":"ok","timestamp":1688574670536,"user_tz":-210,"elapsed":593,"user":{"displayName":"Amir Hossein 
Karami","userId":"12632705231641967217"}},"outputId":"f66c62c1-7700-4a92-e249-56e145731564"},"execution_count":33,"outputs":[{"output_type":"execute_result","data":{"text/plain":["(45459, 45)"]},"metadata":{},"execution_count":33}]},{"cell_type":"code","source":["rec_data = rec_data.drop_duplicates(subset='id')"],"metadata":{"id":"hoaS3X9ma9F-","executionInfo":{"status":"ok","timestamp":1688583554915,"user_tz":-210,"elapsed":579,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":140,"outputs":[]},{"cell_type":"markdown","source":["## Adding 'director' column"],"metadata":{"id":"Ax8jTRG05rWa"}},{"cell_type":"code","source":["def is_float(string):\n","    \"\"\"Return True when float(string) succeeds, False otherwise.\"\"\"\n","    try:\n","        float(string)\n","    except (TypeError, ValueError):\n","        # float(None) / float([]) raise TypeError rather than ValueError.\n","        return False\n","    return True"],"metadata":{"id":"JorU4WBB40kq","executionInfo":{"status":"ok","timestamp":1688574823282,"user_tz":-210,"elapsed":505,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":34,"outputs":[]},{"cell_type":"code","source":["def directors_names(job_crew, index, names=None):\n","    \"\"\"Return the ', '-joined crew names whose job is 'Director'.\n","\n","    job_crew: ', '-separated job titles of one row (non-string when missing).\n","    index: row label used to read 'name_crew' from the global rec_data\n","        when names is not supplied.\n","    names: optional ', '-separated crew names in the same order as job_crew.\n","    Returns np.nan when a field is missing or no director can be resolved.\n","    \"\"\"\n","    if not isinstance(job_crew, str):\n","        return np.nan\n","    jobs = job_crew.split(', ')\n","    if 'Director' not in jobs:\n","        return np.nan\n","    if names is None:\n","        # Look the names up only once a director is known to exist.\n","        names = rec_data.loc[index, 'name_crew']\n","    if not isinstance(names, str):\n","        return np.nan\n","    # zip pairs each job with the name at the same position and stops at the\n","    # shorter list, so a names field with fewer entries than job_crew no\n","    # longer raises IndexError.\n","    directors = [\n","        name\n","        for job, name in zip(jobs, names.split(', '))\n","        if job == 'Director'\n","    ]\n","    return ', '.join(directors) if directors else np.nan"],"metadata":{"id":"4Wv7jSsX5wS7","executionInfo":{"status":"ok","timestamp":1688575844013,"user_tz":-210,"elapsed":467,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}}},"execution_count":37,"outputs":[]},{"cell_type":"code","source":["for i in rec_data.index:\n","    rec_data.loc[i, 'director'] = directors_names(rec_data.loc[i, 'job_crew'], 
# %% Adding 'director' column
# Build the whole column in one assignment instead of writing one cell per
# `.loc` call: same values, far fewer DataFrame writes, and the column is
# created even when rec_data has no rows.
rec_data['director'] = [
    directors_names(job_crew, row_label)
    for row_label, job_crew in rec_data['job_crew'].items()
]
rec_data['director']

# %% Generating the final dataset
# Keep only the columns that feed the text description.
rec_data = rec_data[['id', 'original_language', 'overview', 'tagline', 'title', 'name_genres', 'name_cast', 'name_keywords', 'director']]

# %%
# Missing values first become empty strings ...
rec_data = rec_data.replace(np.nan, '')

# %%
# ... and every empty string (originally missing or originally blank) is then
# given an explicit placeholder.
rec_data = rec_data.replace('', 'Not mentioned')
def clean_text(text):
    """Normalise whitespace and strip most punctuation from ``text``.

    Steps, in order:
      1. newlines, tabs and carriage returns become single spaces;
      2. the mojibake sequence "Γ‚" + non-breaking space is removed and any
         remaining non-breaking space becomes a regular space;
      3. every character that is not a comma, period, colon, hyphen,
         whitespace or word character is dropped; underscores are dropped too.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text, or "" when ``text`` is not a string (e.g. NaN or
        None); that case also prints "Cannot clean non-existent text".

    Only the non-string case is handled leniently. Any other failure (for
    instance ``re`` not being imported) now propagates instead of being
    reported as missing text and silently blanking the description.
    """
    if not isinstance(text, str):
        print("Cannot clean non-existent text")
        return ""

    # Remove new lines and tabs
    clean = text.replace("\n", " ").replace("\t", " ").replace("\r", " ")

    # Non-breaking space: drop the mis-decoded form, normalise the plain one
    clean = clean.replace("Γ‚\xa0", "")
    clean = clean.replace("\xa0", " ")

    # Keep , . : - plus whitespace and word characters; drop everything else
    # (underscore is a word character, so it is listed explicitly).
    return re.sub(r'([^,.:\s\w\-]|_)+', '', clean)
# %%
# Clean every generated description in place.
rec_data['description'] = rec_data['description'].apply(clean_text)

# %%
# Keep one row per movie id with just the fields the recommender reads.
final_rec_data = rec_data[['id', 'title', 'description']]
final_rec_data = final_rec_data.drop_duplicates(subset='id')

# %%
final_rec_data.shape

# %% Saving the dataset
final_rec_data.to_csv('/content/drive/MyDrive/Rec/data/cleaned/descriptions.csv', index=False)

# %% REC
# keep_default_na=False: every saved text field is a real string, so values
# such as "NA", "None" or "null" (possible movie titles) must not be parsed
# back as NaN when the file is reloaded.
d = pd.read_csv(
    '/content/drive/MyDrive/Rec/data/cleaned/descriptions.csv',
    keep_default_na=False,
)
# %%
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Unigram + bigram TF-IDF over the cleaned descriptions, English stop words removed.
tfidf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')
tfidf_matrix = tfidf.fit_transform(d['description'])
# The full pairwise similarity matrix is not materialised here; recommender()
# computes the similarities of one row against all rows on demand.


# %%
def recommender(title, num_recommends):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value to look up in ``d['title']``; when several rows share the
        title, the first one is used.
    num_recommends : int
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        Titles from ``d`` ordered by decreasing cosine similarity between
        TF-IDF description vectors, never including the queried row itself.

    Raises
    ------
    IndexError
        If no row of ``d`` has the requested title.
    """
    # Positional row numbers (not index labels), because they address rows of
    # tfidf_matrix and are later passed to d.iloc.
    matches = (d['title'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No movie titled {title!r} in the descriptions table")
    idx = int(matches[0])

    scores = cosine_similarity(tfidf_matrix[idx], tfidf_matrix)[0]
    ranked = sorted(range(len(scores)), key=lambda pos: scores[pos], reverse=True)

    # Exclude the queried row explicitly: with tied scores (identical
    # descriptions, or an all-zero vector) it is not guaranteed to rank first.
    movie_indices = [pos for pos in ranked if pos != idx][:num_recommends]
    return d.iloc[movie_indices]['title']
pip install scikit-surprise"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"nx9G3cXEkbCZ","executionInfo":{"status":"ok","timestamp":1688598659233,"user_tz":-210,"elapsed":51093,"user":{"displayName":"Amir Hossein Karami","userId":"12632705231641967217"}},"outputId":"3433f6f6-58f4-4625-8ad7-3b565166eee0"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Collecting scikit-surprise\n","  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)\n","\u001b[?25l     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/772.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m772.0/772.0 kB\u001b[0m \u001b[31m43.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h  Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n","Requirement already satisfied: joblib>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise) (1.2.0)\n","Requirement already satisfied: numpy>=1.17.3 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise) (1.22.4)\n","Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-surprise) (1.10.1)\n","Building wheels for collected packages: scikit-surprise\n","  Building wheel for scikit-surprise (setup.py) ... 
# %%
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse

SEED = 42                    # fixes the train/test split and the SVD initialisation
MIN_PREDICTED_RATING = 4.8   # an item is recommended when its estimate reaches this

# load data from a CSV file
data = pd.read_csv('/content/drive/MyDrive/Rec/data/raw/ratings_small.csv')

# define the Reader object; take the rating scale from the data itself so
# estimates are clipped to the range that actually occurs in the file
reader = Reader(rating_scale=(float(data['rating'].min()), float(data['rating'].max())))

# load the data into the Dataset object
dataset = Dataset.load_from_df(data[['userId', 'movieId', 'rating']], reader)

# split the data into training and testing sets (seeded for reproducibility)
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=SEED)

# define the SVD algorithm
algo = SVD(n_factors=100, n_epochs=20, random_state=SEED)

# train the algorithm on the training set
algo.fit(trainset)

# make predictions on the testing set
predictions = algo.test(testset)

# evaluate the performance of the algorithm; verbose=False because rmse()
# otherwise prints the score itself and the value would appear twice
rmse_score = rmse(predictions, verbose=False)
print('RMSE:', rmse_score)

# make recommendations for a given user
user_id = 24256
already_rated = set(data.loc[data['userId'] == user_id, 'movieId'])
if not already_rated:
    # Without any ratings for this user the estimates are not personalised,
    # so an empty recommendation list is the likely outcome.
    print(f'User {user_id} has no ratings in the data; estimates are not personalised.')

items_to_recommend = [
    item_id
    for item_id in data['movieId'].unique()
    if item_id not in already_rated
    and algo.predict(user_id, item_id).est >= MIN_PREDICTED_RATING
]
print('Items to recommend:', items_to_recommend)
class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>movieId</th>\n","      <th>imdb_id</th>\n","      <th>tmdbId</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>1</td>\n","      <td>114709</td>\n","      <td>862.0</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>2</td>\n","      <td>113497</td>\n","      <td>8844.0</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>3</td>\n","      <td>113228</td>\n","      <td>15602.0</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>4</td>\n","      <td>114885</td>\n","      <td>31357.0</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>5</td>\n","      <td>113041</td>\n","      <td>11862.0</td>\n","    </tr>\n","  </tbody>\n","</table>\n","</div>\n","      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-e4283956-5412-49ab-9168-53e860d30c68')\"\n","              title=\"Convert this dataframe to an interactive table.\"\n","              style=\"display:none;\">\n","        \n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","       width=\"24px\">\n","    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n","    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n","  </svg>\n","      </button>\n","      \n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      flex-wrap:wrap;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      
background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","      <script>\n","        const buttonEl =\n","          document.querySelector('#df-e4283956-5412-49ab-9168-53e860d30c68 button.colab-df-convert');\n","        buttonEl.style.display =\n","          google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","        async function convertToInteractive(key) {\n","          const element = document.querySelector('#df-e4283956-5412-49ab-9168-53e860d30c68');\n","          const dataTable =\n","            await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                     [key], {});\n","          if (!dataTable) return;\n","\n","          const docLinkHtml = 'Like what you see? 
# %%
# Integer key on the links side of the join.
links['imdb_id'] = links['imdb_id'].astype(int)

# %%
# `2 in cr['id']` tests the Series *index labels*, not the stored ids;
# compare against the values instead.
# NOTE(review): assumes cr['id'] holds integers -- TODO confirm its dtype.
cr['id'].isin([2]).any()

# %%
# The plain merge fails with "You are trying to merge on object and int64
# columns": cr['imdb_id'] is object dtype while links['imdb_id'] is int64.
# NOTE(review): presumably cr['imdb_id'] holds 'tt'-prefixed strings such as
# 'tt0114709' -- confirm against where `cr` is built.
cr_imdb_numeric = pd.to_numeric(
    cr['imdb_id'].astype(str).str.replace(r'^tt', '', regex=True),
    errors='coerce',
)
cr = (
    cr.assign(imdb_id=cr_imdb_numeric)
    .dropna(subset=['imdb_id'])      # unparseable ids could never match links
    .astype({'imdb_id': 'int64'})
    .merge(links, on='imdb_id')
)