{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "c04aac25-dad2-4b5b-b7e9-6102add4febb", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns" ] }, { "cell_type": "code", "execution_count": 2, "id": "6b397e38-4659-4205-9716-f72f20e5d865", "metadata": {}, "outputs": [], "source": [ "movies = pd.read_csv('data/tmdb_5000_movies.csv')\n", "credits = pd.read_csv('data/tmdb_5000_credits.csv')" ] }, { "cell_type": "code", "execution_count": 3, "id": "c3ad363c-de60-4f3f-829f-75dc4ec20b37", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
budgetgenreshomepageidkeywordsoriginal_languageoriginal_titleoverviewpopularityproduction_companiesproduction_countriesrelease_daterevenueruntimespoken_languagesstatustaglinetitlevote_averagevote_count
0237000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.avatarmovie.com/19995[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...enAvatarIn the 22nd century, a paraplegic Marine is di...150.437577[{\"name\": \"Ingenious Film Partners\", \"id\": 289...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...2009-12-102787965087162.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso...ReleasedEnter the World of Pandora.Avatar7.211800
1300000000[{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"...http://disney.go.com/disneypictures/pirates/285[{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na...enPirates of the Caribbean: At World's EndCaptain Barbossa, long believed to be dead, ha...139.082615[{\"name\": \"Walt Disney Pictures\", \"id\": 2}, {\"...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...2007-05-19961000000169.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedAt the end of the world, the adventure begins.Pirates of the Caribbean: At World's End6.94500
2245000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://www.sonypictures.com/movies/spectre/206647[{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name...enSpectreA cryptic message from Bond’s past sends him o...107.376788[{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam...[{\"iso_3166_1\": \"GB\", \"name\": \"United Kingdom\"...2015-10-26880674609148.0[{\"iso_639_1\": \"fr\", \"name\": \"Fran\\u00e7ais\"},...ReleasedA Plan No One EscapesSpectre6.34466
3250000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam...http://www.thedarkknightrises.com/49026[{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,...enThe Dark Knight RisesFollowing the death of District Attorney Harve...112.312950[{\"name\": \"Legendary Pictures\", \"id\": 923}, {\"...[{\"iso_3166_1\": \"US\", \"name\": \"United States o...2012-07-161084939099165.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedThe Legend EndsThe Dark Knight Rises7.69106
4260000000[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...http://movies.disney.com/john-carter49529[{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":...enJohn CarterJohn Carter is a war-weary, former military ca...43.926995[{\"name\": \"Walt Disney Pictures\", \"id\": 2}][{\"iso_3166_1\": \"US\", \"name\": \"United States o...2012-03-07284139100132.0[{\"iso_639_1\": \"en\", \"name\": \"English\"}]ReleasedLost in our world, found in another.John Carter6.12124
\n", "
" ], "text/plain": [ " budget genres \\\n", "0 237000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", "1 300000000 [{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"... \n", "2 245000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", "3 250000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam... \n", "4 260000000 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", "\n", " homepage id \\\n", "0 http://www.avatarmovie.com/ 19995 \n", "1 http://disney.go.com/disneypictures/pirates/ 285 \n", "2 http://www.sonypictures.com/movies/spectre/ 206647 \n", "3 http://www.thedarkknightrises.com/ 49026 \n", "4 http://movies.disney.com/john-carter 49529 \n", "\n", " keywords original_language \\\n", "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... en \n", "1 [{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na... en \n", "2 [{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name... en \n", "3 [{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,... en \n", "4 [{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":... en \n", "\n", " original_title \\\n", "0 Avatar \n", "1 Pirates of the Caribbean: At World's End \n", "2 Spectre \n", "3 The Dark Knight Rises \n", "4 John Carter \n", "\n", " overview popularity \\\n", "0 In the 22nd century, a paraplegic Marine is di... 150.437577 \n", "1 Captain Barbossa, long believed to be dead, ha... 139.082615 \n", "2 A cryptic message from Bond’s past sends him o... 107.376788 \n", "3 Following the death of District Attorney Harve... 112.312950 \n", "4 John Carter is a war-weary, former military ca... 43.926995 \n", "\n", " production_companies \\\n", "0 [{\"name\": \"Ingenious Film Partners\", \"id\": 289... \n", "1 [{\"name\": \"Walt Disney Pictures\", \"id\": 2}, {\"... \n", "2 [{\"name\": \"Columbia Pictures\", \"id\": 5}, {\"nam... \n", "3 [{\"name\": \"Legendary Pictures\", \"id\": 923}, {\"... 
\n", "4 [{\"name\": \"Walt Disney Pictures\", \"id\": 2}] \n", "\n", " production_countries release_date revenue \\\n", "0 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 2009-12-10 2787965087 \n", "1 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 2007-05-19 961000000 \n", "2 [{\"iso_3166_1\": \"GB\", \"name\": \"United Kingdom\"... 2015-10-26 880674609 \n", "3 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 2012-07-16 1084939099 \n", "4 [{\"iso_3166_1\": \"US\", \"name\": \"United States o... 2012-03-07 284139100 \n", "\n", " runtime spoken_languages status \\\n", "0 162.0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}, {\"iso... Released \n", "1 169.0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "2 148.0 [{\"iso_639_1\": \"fr\", \"name\": \"Fran\\u00e7ais\"},... Released \n", "3 165.0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "4 132.0 [{\"iso_639_1\": \"en\", \"name\": \"English\"}] Released \n", "\n", " tagline \\\n", "0 Enter the World of Pandora. \n", "1 At the end of the world, the adventure begins. \n", "2 A Plan No One Escapes \n", "3 The Legend Ends \n", "4 Lost in our world, found in another. \n", "\n", " title vote_average vote_count \n", "0 Avatar 7.2 11800 \n", "1 Pirates of the Caribbean: At World's End 6.9 4500 \n", "2 Spectre 6.3 4466 \n", "3 The Dark Knight Rises 7.6 9106 \n", "4 John Carter 6.1 2124 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 4, "id": "2d4d19ed-104f-46c1-96c9-5ffd94cf8b15", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitlecastcrew
019995Avatar[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...
1285Pirates of the Caribbean: At World's End[{\"cast_id\": 4, \"character\": \"Captain Jack Spa...[{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de...
2206647Spectre[{\"cast_id\": 1, \"character\": \"James Bond\", \"cr...[{\"credit_id\": \"54805967c3a36829b5002c41\", \"de...
349026The Dark Knight Rises[{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba...[{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de...
449529John Carter[{\"cast_id\": 5, \"character\": \"John Carter\", \"c...[{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de...
\n", "
" ], "text/plain": [ " movie_id title \\\n", "0 19995 Avatar \n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "\n", " cast \\\n", "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \n", "1 [{\"cast_id\": 4, \"character\": \"Captain Jack Spa... \n", "2 [{\"cast_id\": 1, \"character\": \"James Bond\", \"cr... \n", "3 [{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba... \n", "4 [{\"cast_id\": 5, \"character\": \"John Carter\", \"c... \n", "\n", " crew \n", "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... \n", "1 [{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de... \n", "2 [{\"credit_id\": \"54805967c3a36829b5002c41\", \"de... \n", "3 [{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de... \n", "4 [{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de... " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "credits.head()" ] }, { "cell_type": "code", "execution_count": 5, "id": "1203ff17-269a-4641-8045-5a16e204460c", "metadata": {}, "outputs": [], "source": [ "movies = movies.merge(credits, on='title')" ] }, { "cell_type": "code", "execution_count": 6, "id": "1764ac22-2898-453b-a19c-da36b619a2ce", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 4809 entries, 0 to 4808\n", "Data columns (total 23 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 budget 4809 non-null int64 \n", " 1 genres 4809 non-null object \n", " 2 homepage 1713 non-null object \n", " 3 id 4809 non-null int64 \n", " 4 keywords 4809 non-null object \n", " 5 original_language 4809 non-null object \n", " 6 original_title 4809 non-null object \n", " 7 overview 4806 non-null object \n", " 8 popularity 4809 non-null float64\n", " 9 production_companies 4809 non-null object \n", " 10 production_countries 4809 non-null object \n", " 11 release_date 4808 non-null 
object \n", " 12 revenue 4809 non-null int64 \n", " 13 runtime 4807 non-null float64\n", " 14 spoken_languages 4809 non-null object \n", " 15 status 4809 non-null object \n", " 16 tagline 3965 non-null object \n", " 17 title 4809 non-null object \n", " 18 vote_average 4809 non-null float64\n", " 19 vote_count 4809 non-null int64 \n", " 20 movie_id 4809 non-null int64 \n", " 21 cast 4809 non-null object \n", " 22 crew 4809 non-null object \n", "dtypes: float64(3), int64(5), object(15)\n", "memory usage: 864.2+ KB\n" ] } ], "source": [ "movies.info()" ] }, { "cell_type": "code", "execution_count": 7, "id": "5f039dca-eeae-44cd-8cab-4ea1e28c2b62", "metadata": {}, "outputs": [], "source": [ "movies = movies[['movie_id', 'title', 'genres', 'overview', 'keywords', 'cast', 'crew']]" ] }, { "cell_type": "code", "execution_count": 8, "id": "533b1e16-c9d3-42b0-82bc-80a4d4fdc029", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitlegenresoverviewkeywordscastcrew
019995Avatar[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...In the 22nd century, a paraplegic Marine is di...[{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":...[{\"cast_id\": 242, \"character\": \"Jake Sully\", \"...[{\"credit_id\": \"52fe48009251416c750aca23\", \"de...
1285Pirates of the Caribbean: At World's End[{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"...Captain Barbossa, long believed to be dead, ha...[{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na...[{\"cast_id\": 4, \"character\": \"Captain Jack Spa...[{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de...
2206647Spectre[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...A cryptic message from Bond’s past sends him o...[{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name...[{\"cast_id\": 1, \"character\": \"James Bond\", \"cr...[{\"credit_id\": \"54805967c3a36829b5002c41\", \"de...
349026The Dark Knight Rises[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam...Following the death of District Attorney Harve...[{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,...[{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba...[{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de...
449529John Carter[{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam...John Carter is a war-weary, former military ca...[{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":...[{\"cast_id\": 5, \"character\": \"John Carter\", \"c...[{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de...
\n", "
" ], "text/plain": [ " movie_id title \\\n", "0 19995 Avatar \n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "\n", " genres \\\n", "0 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", "1 [{\"id\": 12, \"name\": \"Adventure\"}, {\"id\": 14, \"... \n", "2 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", "3 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 80, \"nam... \n", "4 [{\"id\": 28, \"name\": \"Action\"}, {\"id\": 12, \"nam... \n", "\n", " overview \\\n", "0 In the 22nd century, a paraplegic Marine is di... \n", "1 Captain Barbossa, long believed to be dead, ha... \n", "2 A cryptic message from Bond’s past sends him o... \n", "3 Following the death of District Attorney Harve... \n", "4 John Carter is a war-weary, former military ca... \n", "\n", " keywords \\\n", "0 [{\"id\": 1463, \"name\": \"culture clash\"}, {\"id\":... \n", "1 [{\"id\": 270, \"name\": \"ocean\"}, {\"id\": 726, \"na... \n", "2 [{\"id\": 470, \"name\": \"spy\"}, {\"id\": 818, \"name... \n", "3 [{\"id\": 849, \"name\": \"dc comics\"}, {\"id\": 853,... \n", "4 [{\"id\": 818, \"name\": \"based on novel\"}, {\"id\":... \n", "\n", " cast \\\n", "0 [{\"cast_id\": 242, \"character\": \"Jake Sully\", \"... \n", "1 [{\"cast_id\": 4, \"character\": \"Captain Jack Spa... \n", "2 [{\"cast_id\": 1, \"character\": \"James Bond\", \"cr... \n", "3 [{\"cast_id\": 2, \"character\": \"Bruce Wayne / Ba... \n", "4 [{\"cast_id\": 5, \"character\": \"John Carter\", \"c... \n", "\n", " crew \n", "0 [{\"credit_id\": \"52fe48009251416c750aca23\", \"de... \n", "1 [{\"credit_id\": \"52fe4232c3a36847f800b579\", \"de... \n", "2 [{\"credit_id\": \"54805967c3a36829b5002c41\", \"de... \n", "3 [{\"credit_id\": \"52fe4781c3a36847f81398c3\", \"de... \n", "4 [{\"credit_id\": \"52fe479ac3a36847f813eaa3\", \"de... 
" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.head()" ] }, { "cell_type": "code", "execution_count": 9, "id": "309f68b0-46b9-492e-bf0c-605fc91e1e57", "metadata": {}, "outputs": [], "source": [ "movies = movies.dropna()" ] }, { "cell_type": "code", "execution_count": 10, "id": "8a22d175-b884-43f5-b013-5fd549100b19", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "movie_id 0\n", "title 0\n", "genres 0\n", "overview 0\n", "keywords 0\n", "cast 0\n", "crew 0\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 11, "id": "d9f5ccca-3a68-43cd-841e-97e8fdd7f68f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 12, "id": "517bee9d-a24c-4763-943c-9d81ad7307af", "metadata": {}, "outputs": [], "source": [ "import ast\n", "\n", "def get_name(string):\n", " List = []\n", " for i in ast.literal_eval(string):\n", " List.append(i['name'])\n", " return List\n", "\n", "\n", "def get_cast(string):\n", " List = []\n", " for i in ast.literal_eval(string):\n", " List.append(i['name'])\n", " if len(List)==3:\n", " return List\n", " return List\n", "\n", "\n", "def get_director_name(string):\n", " List = []\n", " for i in ast.literal_eval(string):\n", " if i['job']=='Director':\n", " List.append(i['name'])\n", " return List\n", " return List" ] }, { "cell_type": "code", "execution_count": 13, "id": "0be843da-f3f6-43b9-8ef6-da85b57775a5", "metadata": {}, "outputs": [], "source": [ "movies['genres'] = movies['genres'].apply(get_name)\n", "movies['keywords'] = movies['keywords'].apply(get_name)" ] }, { "cell_type": "code", "execution_count": 14, "id": "a0503645-58d4-4d19-9a9f-5414f8130e24", "metadata": {}, "outputs": [], 
"source": [ "movies['cast'] = movies['cast'].apply(get_cast)" ] }, { "cell_type": "code", "execution_count": 15, "id": "b9b60133-4d3f-4197-af98-b188fa9ccf4c", "metadata": {}, "outputs": [], "source": [ "movies['crew'] = movies['crew'].apply(get_director_name)" ] }, { "cell_type": "code", "execution_count": 16, "id": "607f9f9b-f4aa-4e56-a74a-e8dcf351c38f", "metadata": {}, "outputs": [], "source": [ "movies['overview'] = movies['overview'].apply(lambda x: x.split())" ] }, { "cell_type": "code", "execution_count": 17, "id": "f0e93004-ae79-4b40-9323-7cb9fd9908bf", "metadata": {}, "outputs": [], "source": [ "movies['genres'] = movies['genres'].apply(lambda x: [i.replace(' ', '') for i in x])\n", "movies['overview'] = movies['overview'].apply(lambda x: [i.replace(' ', '') for i in x])\n", "movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(' ', '') for i in x])\n", "movies['cast'] = movies['cast'].apply(lambda x: [i.replace(' ', '') for i in x])\n", "movies['crew'] = movies['crew'].apply(lambda x: [i.replace(' ', '') for i in x])" ] }, { "cell_type": "code", "execution_count": 18, "id": "0019446c-147f-493c-a0c7-6679b13b5b5d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "movie_id 0\n", "title 0\n", "genres 0\n", "overview 0\n", "keywords 0\n", "cast 0\n", "crew 0\n", "dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 19, "id": "d6b495e5-1d34-46fc-9d15-9385c15e0840", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitlegenresoverviewkeywordscastcrew
019995Avatar[Action, Adventure, Fantasy, ScienceFiction][In, the, 22nd, century,, a, paraplegic, Marin...[cultureclash, future, spacewar, spacecolony, ...[SamWorthington, ZoeSaldana, SigourneyWeaver][JamesCameron]
1285Pirates of the Caribbean: At World's End[Adventure, Fantasy, Action][Captain, Barbossa,, long, believed, to, be, d...[ocean, drugabuse, exoticisland, eastindiatrad...[JohnnyDepp, OrlandoBloom, KeiraKnightley][GoreVerbinski]
2206647Spectre[Action, Adventure, Crime][A, cryptic, message, from, Bond’s, past, send...[spy, basedonnovel, secretagent, sequel, mi6, ...[DanielCraig, ChristophWaltz, LéaSeydoux][SamMendes]
349026The Dark Knight Rises[Action, Crime, Drama, Thriller][Following, the, death, of, District, Attorney...[dccomics, crimefighter, terrorist, secretiden...[ChristianBale, MichaelCaine, GaryOldman][ChristopherNolan]
449529John Carter[Action, Adventure, ScienceFiction][John, Carter, is, a, war-weary,, former, mili...[basedonnovel, mars, medallion, spacetravel, p...[TaylorKitsch, LynnCollins, SamanthaMorton][AndrewStanton]
........................
48049367El Mariachi[Action, Crime, Thriller][El, Mariachi, just, wants, to, play, his, gui...[unitedstates–mexicobarrier, legs, arms, paper...[CarlosGallardo, JaimedeHoyos, PeterMarquardt][RobertRodriguez]
480572766Newlyweds[Comedy, Romance][A, newlywed, couple's, honeymoon, is, upended...[][EdwardBurns, KerryBishé, MarshaDietlein][EdwardBurns]
4806231617Signed, Sealed, Delivered[Comedy, Drama, Romance, TVMovie][\"Signed,, Sealed,, Delivered\", introduces, a,...[date, loveatfirstsight, narration, investigat...[EricMabius, KristinBooth, CrystalLowe][ScottSmith]
4807126186Shanghai Calling[][When, ambitious, New, York, attorney, Sam, is...[][DanielHenney, ElizaCoupe, BillPaxton][DanielHsia]
480825975My Date with Drew[Documentary][Ever, since, the, second, grade, when, he, fi...[obsession, camcorder, crush, dreamgirl][DrewBarrymore, BrianHerzlinger, CoreyFeldman][BrianHerzlinger]
\n", "

4806 rows × 7 columns

\n", "
" ], "text/plain": [ " movie_id title \\\n", "0 19995 Avatar \n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "... ... ... \n", "4804 9367 El Mariachi \n", "4805 72766 Newlyweds \n", "4806 231617 Signed, Sealed, Delivered \n", "4807 126186 Shanghai Calling \n", "4808 25975 My Date with Drew \n", "\n", " genres \\\n", "0 [Action, Adventure, Fantasy, ScienceFiction] \n", "1 [Adventure, Fantasy, Action] \n", "2 [Action, Adventure, Crime] \n", "3 [Action, Crime, Drama, Thriller] \n", "4 [Action, Adventure, ScienceFiction] \n", "... ... \n", "4804 [Action, Crime, Thriller] \n", "4805 [Comedy, Romance] \n", "4806 [Comedy, Drama, Romance, TVMovie] \n", "4807 [] \n", "4808 [Documentary] \n", "\n", " overview \\\n", "0 [In, the, 22nd, century,, a, paraplegic, Marin... \n", "1 [Captain, Barbossa,, long, believed, to, be, d... \n", "2 [A, cryptic, message, from, Bond’s, past, send... \n", "3 [Following, the, death, of, District, Attorney... \n", "4 [John, Carter, is, a, war-weary,, former, mili... \n", "... ... \n", "4804 [El, Mariachi, just, wants, to, play, his, gui... \n", "4805 [A, newlywed, couple's, honeymoon, is, upended... \n", "4806 [\"Signed,, Sealed,, Delivered\", introduces, a,... \n", "4807 [When, ambitious, New, York, attorney, Sam, is... \n", "4808 [Ever, since, the, second, grade, when, he, fi... \n", "\n", " keywords \\\n", "0 [cultureclash, future, spacewar, spacecolony, ... \n", "1 [ocean, drugabuse, exoticisland, eastindiatrad... \n", "2 [spy, basedonnovel, secretagent, sequel, mi6, ... \n", "3 [dccomics, crimefighter, terrorist, secretiden... \n", "4 [basedonnovel, mars, medallion, spacetravel, p... \n", "... ... \n", "4804 [unitedstates–mexicobarrier, legs, arms, paper... \n", "4805 [] \n", "4806 [date, loveatfirstsight, narration, investigat... 
\n", "4807 [] \n", "4808 [obsession, camcorder, crush, dreamgirl] \n", "\n", " cast crew \n", "0 [SamWorthington, ZoeSaldana, SigourneyWeaver] [JamesCameron] \n", "1 [JohnnyDepp, OrlandoBloom, KeiraKnightley] [GoreVerbinski] \n", "2 [DanielCraig, ChristophWaltz, LéaSeydoux] [SamMendes] \n", "3 [ChristianBale, MichaelCaine, GaryOldman] [ChristopherNolan] \n", "4 [TaylorKitsch, LynnCollins, SamanthaMorton] [AndrewStanton] \n", "... ... ... \n", "4804 [CarlosGallardo, JaimedeHoyos, PeterMarquardt] [RobertRodriguez] \n", "4805 [EdwardBurns, KerryBishé, MarshaDietlein] [EdwardBurns] \n", "4806 [EricMabius, KristinBooth, CrystalLowe] [ScottSmith] \n", "4807 [DanielHenney, ElizaCoupe, BillPaxton] [DanielHsia] \n", "4808 [DrewBarrymore, BrianHerzlinger, CoreyFeldman] [BrianHerzlinger] \n", "\n", "[4806 rows x 7 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies" ] }, { "cell_type": "code", "execution_count": 20, "id": "63cb68c3-8bcb-413b-bca2-f2ab12f6bcb2", "metadata": {}, "outputs": [], "source": [ "movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']\n", "movies = movies[['movie_id', 'title', 'tags']]" ] }, { "cell_type": "code", "execution_count": 21, "id": "3eccb9ec-d8af-4cb6-99f5-256f8722d643", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\thaku\\AppData\\Local\\Temp\\ipykernel_7988\\2153568569.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " movies['tags'] = movies['tags'].apply(lambda x: \" \".join(x))\n" ] } ], "source": [ "movies['tags'] = movies['tags'].apply(lambda x: \" \".join(x))" ] }, { "cell_type": "code", "execution_count": 
22, "id": "2d415c55-fdcf-4d26-9feb-2eb20659831c", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\thaku\\AppData\\Local\\Temp\\ipykernel_7988\\3982405354.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " movies['tags'] = movies['tags'].apply(lambda x: x.lower())\n" ] } ], "source": [ "movies['tags'] = movies['tags'].apply(lambda x: x.lower())" ] }, { "cell_type": "code", "execution_count": 23, "id": "0b5ebd6d-6e36-4fbd-9fb3-3e933a27bfc1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.iloc[0]['tags']" ] }, { "cell_type": "code", "execution_count": 24, "id": "bf04370b-e001-4271-8c96-f4280ab66bec", "metadata": { "scrolled": true }, "outputs": [ { "ename": "TypeError", "evalue": "PorterStemmer.stem() missing 1 required positional argument: 'word'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", "Cell \u001b[1;32mIn[24], line 12\u001b[0m\n\u001b[0;32m 9\u001b[0m 
List\u001b[38;5;241m.\u001b[39mappend(ps\u001b[38;5;241m.\u001b[39mstem(i))\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(List)\n\u001b[1;32m---> 12\u001b[0m movies[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtags\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mmovies\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtags\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstem\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\series.py:4924\u001b[0m, in \u001b[0;36mSeries.apply\u001b[1;34m(self, func, convert_dtype, args, by_row, **kwargs)\u001b[0m\n\u001b[0;32m 4789\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mapply\u001b[39m(\n\u001b[0;32m 4790\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m 4791\u001b[0m func: AggFuncType,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4796\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[0;32m 4797\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m DataFrame \u001b[38;5;241m|\u001b[39m Series:\n\u001b[0;32m 4798\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[0;32m 4799\u001b[0m \u001b[38;5;124;03m Invoke function on values of Series.\u001b[39;00m\n\u001b[0;32m 4800\u001b[0m \n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 4915\u001b[0m \u001b[38;5;124;03m dtype: float64\u001b[39;00m\n\u001b[0;32m 4916\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m 4917\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mSeriesApply\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 4918\u001b[0m \u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4919\u001b[0m \u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4920\u001b[0m \u001b[43m \u001b[49m\u001b[43mconvert_dtype\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4921\u001b[0m \u001b[43m \u001b[49m\u001b[43mby_row\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mby_row\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4922\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[0;32m 4923\u001b[0m \u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m-> 4924\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\apply.py:1427\u001b[0m, in \u001b[0;36mSeriesApply.apply\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1424\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_compat()\n\u001b[0;32m 1426\u001b[0m \u001b[38;5;66;03m# self.func is Callable\u001b[39;00m\n\u001b[1;32m-> 1427\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mapply_standard\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\apply.py:1507\u001b[0m, in \u001b[0;36mSeriesApply.apply_standard\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m 1501\u001b[0m \u001b[38;5;66;03m# row-wise access\u001b[39;00m\n\u001b[0;32m 1502\u001b[0m \u001b[38;5;66;03m# apply doesn't have a `na_action` keyword and for backward compat 
reasons\u001b[39;00m\n\u001b[0;32m 1503\u001b[0m \u001b[38;5;66;03m# we need to give `na_action=\"ignore\"` for categorical data.\u001b[39;00m\n\u001b[0;32m 1504\u001b[0m \u001b[38;5;66;03m# TODO: remove the `na_action=\"ignore\"` when that default has been changed in\u001b[39;00m\n\u001b[0;32m 1505\u001b[0m \u001b[38;5;66;03m# Categorical (GH51645).\u001b[39;00m\n\u001b[0;32m 1506\u001b[0m action \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mignore\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(obj\u001b[38;5;241m.\u001b[39mdtype, CategoricalDtype) \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m-> 1507\u001b[0m mapped \u001b[38;5;241m=\u001b[39m \u001b[43mobj\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_map_values\u001b[49m\u001b[43m(\u001b[49m\n\u001b[0;32m 1508\u001b[0m \u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcurried\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43maction\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconvert_dtype\u001b[49m\n\u001b[0;32m 1509\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1511\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(mapped) \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(mapped[\u001b[38;5;241m0\u001b[39m], ABCSeries):\n\u001b[0;32m 1512\u001b[0m \u001b[38;5;66;03m# GH#43986 Need to do list(mapped) in order to get treated as nested\u001b[39;00m\n\u001b[0;32m 1513\u001b[0m \u001b[38;5;66;03m# See also GH#25959 regarding EA support\u001b[39;00m\n\u001b[0;32m 1514\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
obj\u001b[38;5;241m.\u001b[39m_constructor_expanddim(\u001b[38;5;28mlist\u001b[39m(mapped), index\u001b[38;5;241m=\u001b[39mobj\u001b[38;5;241m.\u001b[39mindex)\n", "File \u001b[1;32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\base.py:921\u001b[0m, in \u001b[0;36mIndexOpsMixin._map_values\u001b[1;34m(self, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 918\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(arr, ExtensionArray):\n\u001b[0;32m 919\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m arr\u001b[38;5;241m.\u001b[39mmap(mapper, na_action\u001b[38;5;241m=\u001b[39mna_action)\n\u001b[1;32m--> 921\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43malgorithms\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_array\u001b[49m\u001b[43m(\u001b[49m\u001b[43marr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mna_action\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mna_action\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[1;32m~\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\pandas\\core\\algorithms.py:1743\u001b[0m, in \u001b[0;36mmap_array\u001b[1;34m(arr, mapper, na_action, convert)\u001b[0m\n\u001b[0;32m 1741\u001b[0m values \u001b[38;5;241m=\u001b[39m arr\u001b[38;5;241m.\u001b[39mastype(\u001b[38;5;28mobject\u001b[39m, copy\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m 1742\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m na_action \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m-> 1743\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[43mlib\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmap_infer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmapper\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mconvert\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 1744\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 1745\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m lib\u001b[38;5;241m.\u001b[39mmap_infer_mask(\n\u001b[0;32m 1746\u001b[0m values, mapper, mask\u001b[38;5;241m=\u001b[39misna(values)\u001b[38;5;241m.\u001b[39mview(np\u001b[38;5;241m.\u001b[39muint8), convert\u001b[38;5;241m=\u001b[39mconvert\n\u001b[0;32m 1747\u001b[0m )\n", "File \u001b[1;32mlib.pyx:2972\u001b[0m, in \u001b[0;36mpandas._libs.lib.map_infer\u001b[1;34m()\u001b[0m\n", "Cell \u001b[1;32mIn[24], line 9\u001b[0m, in \u001b[0;36mstem\u001b[1;34m(text)\u001b[0m\n\u001b[0;32m 7\u001b[0m List \u001b[38;5;241m=\u001b[39m []\n\u001b[0;32m 8\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m i \u001b[38;5;129;01min\u001b[39;00m text\u001b[38;5;241m.\u001b[39msplit():\n\u001b[1;32m----> 9\u001b[0m List\u001b[38;5;241m.\u001b[39mappend(\u001b[43mps\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstem\u001b[49m\u001b[43m(\u001b[49m\u001b[43mi\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[0;32m 10\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m \u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;241m.\u001b[39mjoin(List)\n", "\u001b[1;31mTypeError\u001b[0m: PorterStemmer.stem() missing 1 required positional argument: 'word'" ] } ], "source": [ "# import nltk\n", "# from nltk.stem.porter import PorterStemmer\n", "\n", "# ps = PorterStemmer\n", "\n", "# def stem(text):\n", "# List = []\n", "# for i in text.split():\n", "# List.append(ps.stem(i))\n", "# return \" \".join(List)\n", "\n", "# movies['tags'] = movies['tags'].apply(stem)" ] }, { "cell_type": 
"code", "execution_count": 25, "id": "a0ee92b1-3376-432e-9d9c-2ee37a236d9d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movie_idtitletags
019995Avatarin the 22nd century, a paraplegic marine is di...
1285Pirates of the Caribbean: At World's Endcaptain barbossa, long believed to be dead, ha...
2206647Spectrea cryptic message from bond’s past sends him o...
349026The Dark Knight Risesfollowing the death of district attorney harve...
449529John Carterjohn carter is a war-weary, former military ca...
............
48049367El Mariachiel mariachi just wants to play his guitar and ...
480572766Newlywedsa newlywed couple's honeymoon is upended by th...
4806231617Signed, Sealed, Delivered\"signed, sealed, delivered\" introduces a dedic...
4807126186Shanghai Callingwhen ambitious new york attorney sam is sent t...
480825975My Date with Drewever since the second grade when he first saw ...
\n", "

4806 rows × 3 columns

\n", "
" ], "text/plain": [ " movie_id title \\\n", "0 19995 Avatar \n", "1 285 Pirates of the Caribbean: At World's End \n", "2 206647 Spectre \n", "3 49026 The Dark Knight Rises \n", "4 49529 John Carter \n", "... ... ... \n", "4804 9367 El Mariachi \n", "4805 72766 Newlyweds \n", "4806 231617 Signed, Sealed, Delivered \n", "4807 126186 Shanghai Calling \n", "4808 25975 My Date with Drew \n", "\n", " tags \n", "0 in the 22nd century, a paraplegic marine is di... \n", "1 captain barbossa, long believed to be dead, ha... \n", "2 a cryptic message from bond’s past sends him o... \n", "3 following the death of district attorney harve... \n", "4 john carter is a war-weary, former military ca... \n", "... ... \n", "4804 el mariachi just wants to play his guitar and ... \n", "4805 a newlywed couple's honeymoon is upended by th... \n", "4806 \"signed, sealed, delivered\" introduces a dedic... \n", "4807 when ambitious new york attorney sam is sent t... \n", "4808 ever since the second grade when he first saw ... 
\n", "\n", "[4806 rows x 3 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies" ] }, { "cell_type": "code", "execution_count": 26, "id": "9f521536-b865-49f5-8996-dcbc7982e641", "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "cv = CountVectorizer(max_features=5000, stop_words='english')" ] }, { "cell_type": "code", "execution_count": 27, "id": "2a802077-3472-476f-a649-c40158bc97cb", "metadata": {}, "outputs": [], "source": [ "vectors = cv.fit_transform(movies['tags']).toarray()" ] }, { "cell_type": "code", "execution_count": 28, "id": "3aed858d-ed14-4840-ac00-ce11406db70b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " ...,\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0],\n", " [0, 0, 0, ..., 0, 0, 0]], dtype=int64)" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectors" ] }, { "cell_type": "code", "execution_count": 30, "id": "174f33cc-34dd-4c8d-86c8-ed7e9a7a01c7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['000' '007' '10' ... 'zone' 'zoo' 'zooeydeschanel']\n" ] } ], "source": [ "print(cv.get_feature_names_out())" ] }, { "cell_type": "code", "execution_count": 31, "id": "10a5bb71-f03c-4b16-ab13-f7c70b8d7dc7", "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 32, "id": "b693799b-7b86-41cf-8299-3e109dd2cc48", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0.08964215, 0.05976143, ..., 0.02519763, 0.02817181,\n", " 0. ],\n", " [0.08964215, 1. , 0.0625 , ..., 0.02635231, 0. ,\n", " 0. ],\n", " [0.05976143, 0.0625 , 1. , ..., 0.02635231, 0. ,\n", " 0. ],\n", " ...,\n", " [0.02519763, 0.02635231, 0.02635231, ..., 1. 
, 0.0745356 ,\n", " 0.04836508],\n", " [0.02817181, 0. , 0. , ..., 0.0745356 , 1. ,\n", " 0.05407381],\n", " [0. , 0. , 0. , ..., 0.04836508, 0.05407381,\n", " 1. ]])" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_cos_sim = cosine_similarity(vectors)\n", "movies_cos_sim" ] }, { "cell_type": "code", "execution_count": 33, "id": "f2b04fdc-7764-4fe7-a401-63833468fd85", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(4806, 4806)" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies_cos_sim.shape" ] }, { "cell_type": "code", "execution_count": 101, "id": "21306d5b-1bce-4508-a8ed-85e41be6af95", "metadata": {}, "outputs": [], "source": [ "def recommend(movie, top_n=20):\n", "    \"\"\"Return the titles of the movies most similar to `movie`.\n", "\n", "    The title is matched exactly against movies['title']; candidates are ranked\n", "    by their row of the precomputed `movies_cos_sim` matrix, best match first.\n", "\n", "    Parameters\n", "    ----------\n", "    movie : str\n", "        Title to look up (exact, case-sensitive match).\n", "    top_n : int, default 20\n", "        Maximum number of similar titles to return.\n", "\n", "    Returns\n", "    -------\n", "    list of str or str\n", "        Up to `top_n` titles, or the string 'movie not found in dataset' when\n", "        no row has that title.\n", "    \"\"\"\n", "    title_matches = (movies['title'] == movie).to_numpy()\n", "    if not title_matches.any():\n", "        return 'movie not found in dataset'\n", "    # Row *position* of the first match. The index label must not be used here:\n", "    # movies has 4806 rows but labels running up to 4808, while movies_cos_sim\n", "    # is addressed purely by position, so labels can point at the wrong movie.\n", "    position = int(title_matches.argmax())\n", "    ranked_positions = movies_cos_sim[position].argsort()[::-1]\n", "    # Exclude the query movie explicitly instead of assuming it sorts first.\n", "    ranked_positions = ranked_positions[ranked_positions != position]\n", "    return movies.iloc[ranked_positions[:top_n]]['title'].tolist()" ] }, { "cell_type": "code", "execution_count": 133, "id": "be12e60c-1a10-4cb3-a9cd-b3fe1174f91b", "metadata": {}, "outputs": [ { "name": "stdin", "output_type": "stream", "text": [ "movie : My Fault\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "m\n", "o\n", "v\n", "i\n", "e\n", " \n", "n\n", "o\n", "t\n", " \n", "f\n", "o\n", "u\n", "n\n", "d\n", " \n", "i\n", "n\n", " \n", "d\n", "a\n", "t\n", "a\n", "s\n", "e\n", "t\n", "----------------------------------------------------------------------------------------------\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ "movie : God Father\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "m\n", "o\n", "v\n", "i\n", "e\n", " \n", "n\n", "o\n", "t\n", " \n", "f\n", "o\n", "u\n", "n\n", "d\n", " \n", "i\n", "n\n", " \n", "d\n", "a\n", "t\n", "a\n", "s\n", "e\n", "t\n", 
"----------------------------------------------------------------------------------------------\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ "movie : Godfather\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "m\n", "o\n", "v\n", "i\n", "e\n", " \n", "n\n", "o\n", "t\n", " \n", "f\n", "o\n", "u\n", "n\n", "d\n", " \n", "i\n", "n\n", " \n", "d\n", "a\n", "t\n", "a\n", "s\n", "e\n", "t\n", "----------------------------------------------------------------------------------------------\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ "movie : The Godfather\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Desert Dancer\n", "Take the Lead\n", "Step Up 2: The Streets\n", "Center Stage\n", "Step Up\n", "Footloose\n", "ABCD (Any Body Can Dance)\n", "Step Up Revolution\n", "Tango\n", "Dancin' It's On\n", "Love Me Tender\n", "Sweet Charity\n", "Black Swan\n", "Sunday School Musical\n", "Peaceful Warrior\n", "Mao's Last Dancer\n", "Mr. Holland's Opus\n", "Yentl\n", "Honey\n", "Rize\n", "----------------------------------------------------------------------------------------------\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ "movie : The Fault in Our Stars\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Easy Money\n", "The Slaughter Rule\n", "Blood Ties\n", "Runner Runner\n", "The Gambler\n", "Hardball\n", "Gridiron Gang\n", "The Rainmaker\n", "Amidst the Devil's Wings\n", "Nine Queens\n", "Casino\n", "The Legend of Bagger Vance\n", "My Big Fat Greek Wedding\n", "Mi America\n", "Blue Like Jazz\n", "Ong Bak 2\n", "Auto Focus\n", "Stonewall\n", "Killer Joe\n", "Jesus' Son\n", "----------------------------------------------------------------------------------------------\n" ] }, { "name": "stdin", "output_type": "stream", "text": [ "movie : \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "function ended\n" ] } ], "source": [ "while True:\n", "    movie = input('movie : ')\n", 
"    if not movie:\n", "        # An empty entry ends the session.\n", "        print('function ended')\n", "        break\n", "    result = recommend(movie)\n", "    if isinstance(result, str):\n", "        # recommend() returns a plain message string for unknown titles;\n", "        # print it whole rather than iterating over its characters.\n", "        print(result)\n", "    else:\n", "        for title in result:\n", "            print(title)\n", "    print('----------------------------------------------------------------------------------------------')" ] }, { "cell_type": "code", "execution_count": 136, "id": "afc6fb23-bdc3-47df-9c22-18d303d97d81", "metadata": {}, "outputs": [], "source": [ "import pickle\n", "\n", "# Context managers close the file handles even if pickling fails.\n", "with open('movies_dict.pkl', 'wb') as movies_file:\n", "    pickle.dump(movies.to_dict(), movies_file)" ] }, { "cell_type": "code", "execution_count": 144, "id": "d5d7a4a7-df6e-4ed6-a04b-5a4b8ba08550", "metadata": {}, "outputs": [], "source": [ "with open('movies_cos_sim.pkl', 'wb') as sim_file:\n", "    pickle.dump(movies_cos_sim, sim_file)\n", "\n", "# Reload the file that was just written to check that it round-trips.\n", "with open('movies_cos_sim.pkl', 'rb') as sim_file:\n", "    x = pickle.load(sim_file)\n", "x" ] }, { "cell_type": "code", "execution_count": 145, "id": "738cd08d-2232-4008-84c8-9e90f9320698", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1. , 0.08964215, 0.05976143, ..., 0.02519763, 0.02817181,\n", " 0. ],\n", " [0.08964215, 1. , 0.0625 , ..., 0.02635231, 0. ,\n", " 0. ],\n", " [0.05976143, 0.0625 , 1. , ..., 0.02635231, 0. ,\n", " 0. ],\n", " ...,\n", " [0.02519763, 0.02635231, 0.02635231, ..., 1. , 0.0745356 ,\n", " 0.04836508],\n", " [0.02817181, 0. , 0. , ..., 0.0745356 , 1. ,\n", " 0.05407381],\n", " [0. , 0. , 0. , ..., 0.04836508, 0.05407381,\n", " 1. ]])" ] }, "execution_count": 145, "metadata": {}, "output_type": "execute_result" } ], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1c150443-6419-4b0a-ab1a-92a3952c1bab", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }