{
"cells": [
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
"# Checking for available files in the folder"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['bollywood_full_1950-2019.csv', 'MovieGenre.csv', 'Top_10000_Movies.csv']"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dirc = 'data'\n",
"os.listdir(dirc)"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" Imdb Link | \n",
" Title | \n",
" IMDB Score | \n",
" Genre | \n",
" Poster | \n",
"
\n",
" \n",
" \n",
" \n",
" 10164 | \n",
" 437407 | \n",
" http://www.imdb.com/title/tt437407 | \n",
" Parineeta (2005) | \n",
" 7.3 | \n",
" Drama|Musical|Romance | \n",
" https://images-na.ssl-images-amazon.com/images... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId Imdb Link Title \\\n",
"10164 437407 http://www.imdb.com/title/tt437407 Parineeta (2005) \n",
"\n",
" IMDB Score Genre \\\n",
"10164 7.3 Drama|Musical|Romance \n",
"\n",
" Poster \n",
"10164 https://images-na.ssl-images-amazon.com/images... "
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('data/MovieGenre.csv', encoding='latin-1')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.sample()"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(40108, 6)"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 40108 entries, 0 to 40107\n",
"Data columns (total 6 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 imdbId 40108 non-null int64 \n",
" 1 Imdb Link 40108 non-null object \n",
" 2 Title 40108 non-null object \n",
" 3 IMDB Score 40060 non-null float64\n",
" 4 Genre 39963 non-null object \n",
" 5 Poster 39383 non-null object \n",
"dtypes: float64(1), int64(1), object(4)\n",
"memory usage: 1.8+ MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### DATA CLEANING :"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"593"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.duplicated().sum()"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"df.drop_duplicates(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"imdbId 0\n",
"Imdb Link 0\n",
"Title 0\n",
"IMDB Score 48\n",
"Genre 145\n",
"Poster 724\n",
"dtype: int64"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
"df.dropna(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(38654, 6)"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"# Take an independent copy: `pop` gets new columns later, and assigning to a\n",
"# bare boolean-mask slice of `df` is the SettingWithCopy pattern.\n",
"pop = df[df[\"IMDB Score\"] > 7].copy()"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://images-na.ssl-images-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_UX182_CR0,0,182,268_AL_.jpg'"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.loc[0,\"Poster\"]"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(12345, 6)"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop.shape"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" Imdb Link | \n",
" Title | \n",
" IMDB Score | \n",
" Genre | \n",
" Poster | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 114709 | \n",
" http://www.imdb.com/title/tt114709 | \n",
" Toy Story (1995) | \n",
" 8.3 | \n",
" Animation|Adventure|Comedy | \n",
" https://images-na.ssl-images-amazon.com/images... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId Imdb Link Title IMDB Score \\\n",
"0 114709 http://www.imdb.com/title/tt114709 Toy Story (1995) 8.3 \n",
"\n",
" Genre \\\n",
"0 Animation|Adventure|Comedy \n",
"\n",
" Poster \n",
"0 https://images-na.ssl-images-amazon.com/images... "
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" imdbId | \n",
" Imdb Link | \n",
" Title | \n",
" IMDB Score | \n",
" Genre | \n",
" Poster | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 114709 | \n",
" http://www.imdb.com/title/tt114709 | \n",
" Toy Story (1995) | \n",
" 8.3 | \n",
" Animation|Adventure|Comedy | \n",
" https://images-na.ssl-images-amazon.com/images... | \n",
" 1995 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" imdbId Imdb Link Title IMDB Score \\\n",
"0 114709 http://www.imdb.com/title/tt114709 Toy Story (1995) 8.3 \n",
"\n",
" Genre \\\n",
"0 Animation|Adventure|Comedy \n",
"\n",
" Poster year \n",
"0 https://images-na.ssl-images-amazon.com/images... 1995 "
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop[\"year\"] = pop[\"Title\"].apply(lambda x: x[-5:-1])\n",
"pop.head(1)"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(11987, 7)"
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only rows whose extracted year is purely digits. A single vectorised\n",
"# mask replaces dropping rows one at a time while iterating over the column.\n",
"pop = pop[pop[\"year\"].astype(str).str.isdigit()].copy()\n",
"pop.shape"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"pop.reset_index(drop=True, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 1995\n",
"1 1995\n",
"2 1995\n",
"3 1995\n",
"4 1995\n",
" ... \n",
"11982 2016\n",
"11983 2009\n",
"11984 1967\n",
"11985 2014\n",
"11986 2015\n",
"Name: year, Length: 11987, dtype: int64"
]
},
"execution_count": 82,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# pd.to_numeric returns a new Series; assign it back so the column really\n",
"# becomes numeric instead of staying as strings.\n",
"pop[\"year\"] = pd.to_numeric(pop[\"year\"])\n",
"pop[\"year\"]"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'2017'"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"max(pop[\"year\"])"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"l = []\n",
"def year(df):\n",
"    \"\"\"Append the row's label to the module-level list `l` if its year is after 1999.\n",
"\n",
"    Written to be passed to `DataFrame.apply(..., axis=1)`: `df` is then a single\n",
"    row and `df.name` is that row's index label. Always returns None; the useful\n",
"    result is the side effect on `l`.\n",
"    \"\"\"\n",
"    try:\n",
"        release_year = int(df['year'])\n",
"    except (TypeError, ValueError):\n",
"        # Year cannot be parsed as an integer: skip this row. Other errors are\n",
"        # allowed to surface rather than being hidden by a bare `except`.\n",
"        return\n",
"    if release_year > 1999:\n",
"        l.append(df.name)"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 None\n",
"1 None\n",
"2 None\n",
"3 None\n",
"4 None\n",
" ... \n",
"11982 None\n",
"11983 None\n",
"11984 None\n",
"11985 None\n",
"11986 None\n",
"Length: 11987, dtype: object"
]
},
"execution_count": 85,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pop.apply(year, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"# `l` holds index labels (collected from `df.name`), so select by label with .loc\n",
"# rather than by position; .copy() gives `movies` its own data for later edits.\n",
"movies = pop.loc[l, :].copy()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5431, 7)"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.shape"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"movies.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" index | \n",
" imdbId | \n",
" Imdb Link | \n",
" Title | \n",
" IMDB Score | \n",
" Genre | \n",
" Poster | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 61 | \n",
" 1455 | \n",
" 221073 | \n",
" http://www.imdb.com/title/tt221073 | \n",
" Chopper (2000) | \n",
" 7.2 | \n",
" Biography|Crime|Drama | \n",
" https://images-na.ssl-images-amazon.com/images... | \n",
" 2000 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" index imdbId Imdb Link Title \\\n",
"61 1455 221073 http://www.imdb.com/title/tt221073 Chopper (2000) \n",
"\n",
" IMDB Score Genre \\\n",
"61 7.2 Biography|Crime|Drama \n",
"\n",
" Poster year \n",
"61 https://images-na.ssl-images-amazon.com/images... 2000 "
]
},
"execution_count": 89,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.sample()"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Title | \n",
" Genre | \n",
" Poster | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 3687 | \n",
" The Amazing Screw-On Head (2006) | \n",
" Animation|Short|Action | \n",
" https://images-na.ssl-images-amazon.com/images... | \n",
" 2006 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Title Genre \\\n",
"3687 The Amazing Screw-On Head (2006) Animation|Short|Action \n",
"\n",
" Poster year \n",
"3687 https://images-na.ssl-images-amazon.com/images... 2006 "
]
},
"execution_count": 90,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.drop(columns=[\"index\", \"imdbId\", \"Imdb Link\", \"IMDB Score\"], inplace=True)\n",
"movies.sample()"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Title 0\n",
"Genre 0\n",
"Poster 0\n",
"year 0\n",
"dtype: int64"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Title | \n",
" Genre | \n",
" Poster | \n",
" year | \n",
"
\n",
" \n",
" \n",
" \n",
" 2165 | \n",
" War Horse (2011) | \n",
" [Drama, War] | \n",
" https://images-na.ssl-images-amazon.com/images... | \n",
" 2011 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Title Genre \\\n",
"2165 War Horse (2011) [Drama, War] \n",
"\n",
" Poster year \n",
"2165 https://images-na.ssl-images-amazon.com/images... 2011 "
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies[\"Genre\"] = movies[\"Genre\"].apply(lambda row: str(row).split(\"|\"))\n",
"movies.sample()"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"movies[\"tags\"] = movies[\"Title\"].apply(lambda x: str(x)[:-6].split())"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"movies[\"tags\"] = movies[\"tags\"] + movies[\"Genre\"]\n",
"movies[\"tags\"] = movies[\"tags\"].apply(lambda x: (\" \".join(x)).lower())"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Title | \n",
" Genre | \n",
" Poster | \n",
" year | \n",
" tags | \n",
"
\n",
" \n",
" \n",
" \n",
" 312 | \n",
" Revolution OS (2001) | \n",
" [Documentary, Comedy] | \n",
" https://images-na.ssl-images-amazon.com/images... | \n",
" 2001 | \n",
" revolution os documentary comedy | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Title Genre \\\n",
"312 Revolution OS (2001) [Documentary, Comedy] \n",
"\n",
" Poster year \\\n",
"312 https://images-na.ssl-images-amazon.com/images... 2001 \n",
"\n",
" tags \n",
"312 revolution os documentary comedy "
]
},
"execution_count": 95,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.sample()"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"from pickle import load\n",
"# NOTE: pickle can execute arbitrary code on load; only open files you trust.\n",
"# The context manager closes the file handle once loading finishes.\n",
"with open(\"movie_info.pkl\", \"rb\") as fh:\n",
"    bolly = load(fh)"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" poster_path | \n",
" original_title | \n",
" year_of_release | \n",
" genres | \n",
" imdb_rating | \n",
" imdb_votes | \n",
" summary | \n",
" tags | \n",
"
\n",
" \n",
" \n",
" \n",
" 1572 | \n",
" https://upload.wikimedia.org/wikipedia/en/thum... | \n",
" Yaadein... | \n",
" 2001 | \n",
" [Drama, Musical, Romance] | \n",
" 4.4 | \n",
" 3034.0 | \n",
" [Raj, Singh, Puri, is, best, friends, with, L.... | \n",
" drama music romanc raj singh puri is best frie... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" poster_path original_title \\\n",
"1572 https://upload.wikimedia.org/wikipedia/en/thum... Yaadein... \n",
"\n",
" year_of_release genres imdb_rating imdb_votes \\\n",
"1572 2001 [Drama, Musical, Romance] 4.4 3034.0 \n",
"\n",
" summary \\\n",
"1572 [Raj, Singh, Puri, is, best, friends, with, L.... \n",
"\n",
" tags \n",
"1572 drama music romanc raj singh puri is best frie... "
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bolly = pd.DataFrame(bolly)\n",
"bolly.sample()"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Title | \n",
" Genre | \n",
" Poster | \n",
" year | \n",
" tags | \n",
"
\n",
" \n",
" \n",
" \n",
" 1512 | \n",
" Ai no mukidashi (2008) | \n",
" [Action, Comedy, Drama] | \n",
" https://images-na.ssl-images-amazon.com/images... | \n",
" 2008 | \n",
" ai no mukidashi action comedy drama | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Title Genre \\\n",
"1512 Ai no mukidashi (2008) [Action, Comedy, Drama] \n",
"\n",
" Poster year \\\n",
"1512 https://images-na.ssl-images-amazon.com/images... 2008 \n",
"\n",
" tags \n",
"1512 ai no mukidashi action comedy drama "
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.sample()"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Poster | \n",
" Title | \n",
" year | \n",
" Genre | \n",
" imdb_rating | \n",
" imdb_votes | \n",
" summary | \n",
" tags | \n",
"
\n",
" \n",
" \n",
" \n",
" 1490 | \n",
" https://upload.wikimedia.org/wikipedia/commons... | \n",
" Kaante | \n",
" 2002 | \n",
" [Action, Crime, Drama] | \n",
" 6.6 | \n",
" 4267.0 | \n",
" [Six, bank, robbers, trying, to, pull, off, th... | \n",
" action crime drama on or about 12may00 a truck... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Poster Title year \\\n",
"1490 https://upload.wikimedia.org/wikipedia/commons... Kaante 2002 \n",
"\n",
" Genre imdb_rating imdb_votes \\\n",
"1490 [Action, Crime, Drama] 6.6 4267.0 \n",
"\n",
" summary \\\n",
"1490 [Six, bank, robbers, trying, to, pull, off, th... \n",
"\n",
" tags \n",
"1490 action crime drama on or about 12may00 a truck... "
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bolly.rename(columns={\"poster_path\" : \"Poster\", \"original_title\" : \"Title\", \n",
" \"year_of_release\" : \"year\", \"genres\" : \"Genre\"}, inplace=True)\n",
"\n",
"bolly.sample()"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
"bolly.shape \n",
"bolly.reset_index(inplace=True)\n",
"movies.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"new = pd.concat([bolly, movies]).drop(columns=[\"imdb_rating\",\"imdb_votes\",\"summary\"])"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"new.reset_index(inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5431, 6)"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"movies.shape"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"level_0 0\n",
"index 0\n",
"Poster 0\n",
"Title 0\n",
"year 0\n",
"Genre 0\n",
"tags 0\n",
"dtype: int64"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"new.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 7118 entries, 0 to 7117\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 level_0 7118 non-null int64 \n",
" 1 index 7118 non-null int64 \n",
" 2 Poster 7118 non-null object\n",
" 3 Title 7118 non-null object\n",
" 4 year 7118 non-null object\n",
" 5 Genre 7118 non-null object\n",
" 6 tags 7118 non-null object\n",
"dtypes: int64(2), object(5)\n",
"memory usage: 389.4+ KB\n"
]
}
],
"source": [
"new.info()"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
"new[\"year\"] = pd.to_numeric(new[\"year\"])"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 7118 entries, 0 to 7117\n",
"Data columns (total 7 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 level_0 7118 non-null int64 \n",
" 1 index 7118 non-null int64 \n",
" 2 Poster 7118 non-null object\n",
" 3 Title 7118 non-null object\n",
" 4 year 7118 non-null int64 \n",
" 5 Genre 7118 non-null object\n",
" 6 tags 7118 non-null object\n",
"dtypes: int64(3), object(4)\n",
"memory usage: 389.4+ KB\n"
]
}
],
"source": [
"new.info()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### ENCODING :"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"cv = CountVectorizer(max_features=5000, stop_words='english')"
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
"vectors = cv.fit_transform(new[\"tags\"]).toarray()"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7118, 5000)"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vectors.shape"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"most_common_words = cv.get_feature_names_out()"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 000 | \n",
" 10 | \n",
" 100 | \n",
" 10th | \n",
" 11 | \n",
" 11th | \n",
" 12 | \n",
" 13 | \n",
" 13th | \n",
" 14 | \n",
" ... | \n",
" zero | \n",
" zinda | \n",
" zindagi | \n",
" zinta | \n",
" zone | \n",
" zooni | \n",
" zoya | \n",
" zubeida | \n",
" zulfi | \n",
" ìê | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 7113 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 7114 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 7115 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 7116 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 7117 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
7118 rows × 5000 columns
\n",
"
"
],
"text/plain": [
" 000 10 100 10th 11 11th 12 13 13th 14 ... zero zinda \\\n",
"0 0 0 0 0 1 0 0 0 0 0 ... 0 0 \n",
"1 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"2 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"3 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"4 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"... ... .. ... ... .. ... .. .. ... .. ... ... ... \n",
"7113 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"7114 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"7115 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"7116 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"7117 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n",
"\n",
" zindagi zinta zone zooni zoya zubeida zulfi ìê \n",
"0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 \n",
"... ... ... ... ... ... ... ... .. \n",
"7113 0 0 0 0 0 0 0 0 \n",
"7114 0 0 0 0 0 0 0 0 \n",
"7115 0 0 0 0 0 0 0 0 \n",
"7116 0 0 0 0 0 0 0 0 \n",
"7117 0 0 0 0 0 0 0 0 \n",
"\n",
"[7118 rows x 5000 columns]"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cs = pd.DataFrame(vectors, columns=most_common_words)\n",
"# cs.to_csv(\"data\\common_words.csv\")\n",
"cs"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"#### RECOMMENDATION ON THE BASIS OF COSINE SIMILARITY BETWEEN VECTORS :"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
"similarity = cosine_similarity(vectors)"
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
"def recommended_movies(mov):\n",
"    \"\"\"Print the titles of the five movies most similar to `mov`.\n",
"\n",
"    `mov` must equal a value in `new[\"Title\"]` exactly. Scores are read from the\n",
"    row of `similarity` at that movie's index in `new`.\n",
"\n",
"    Raises:\n",
"        IndexError: if no row of `new` has that title.\n",
"    \"\"\"\n",
"    matches = new.index[new[\"Title\"] == mov]\n",
"    if len(matches) == 0:\n",
"        raise IndexError(f\"no movie titled {mov!r} found in `new`\")\n",
"    idx = matches[0]\n",
"    corr = similarity[idx]\n",
"\n",
"    # Remove the query movie by position instead of slicing off the top hit:\n",
"    # other rows can tie with (or edge past) its self-score, which previously\n",
"    # let the query appear in its own recommendations.\n",
"    ranked = sorted(enumerate(corr), reverse=True, key=lambda x: x[1])\n",
"    rec = [pair for pair in ranked if pair[0] != idx][:5]\n",
"\n",
"    for i in rec:\n",
"        print(new.iloc[i[0]].Title)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Melancholian 3 huonetta (2004)\n",
"The War on Democracy (2007)\n",
"My First War (2008)\n",
"Shooting War (2000)\n",
"Armadillo (2010)\n"
]
}
],
"source": [
"recommended_movies(\"Armadillo (2010)\")"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
"from pickle import dump,load"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
"# dump(new, open(\"hollywood.pkl\", 'wb'))"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"# dump(similarity, open(\"sim_hollywood.pkl\", 'wb'))"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"# NOTE: pickle can execute arbitrary code on load; only open files you trust.\n",
"# The context manager closes the file handle once loading finishes.\n",
"with open(\"hollywood.pkl\", 'rb') as fh:\n",
"    data2 = load(fh)\n",
"holly = pd.DataFrame(data2)\n"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" level_0 | \n",
" index | \n",
" Poster | \n",
" Title | \n",
" year | \n",
" Genre | \n",
" tags | \n",
"
\n",
" \n",
" \n",
" \n",
" 551 | \n",
" 551 | \n",
" 551 | \n",
" https://upload.wikimedia.org/wikipedia/en/thum... | \n",
" ROAR: Tigers of the Sundarbans | \n",
" 2014 | \n",
" [Action, Adventure, Sci-Fi] | \n",
" action adventur sci-fi after hi photojournalis... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" level_0 index Poster \\\n",
"551 551 551 https://upload.wikimedia.org/wikipedia/en/thum... \n",
"\n",
" Title year Genre \\\n",
"551 ROAR: Tigers of the Sundarbans 2014 [Action, Adventure, Sci-Fi] \n",
"\n",
" tags \n",
"551 action adventur sci-fi after hi photojournalis... "
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"holly.sample()"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7118, 7)"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"holly.shape"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" level_0 | \n",
" index | \n",
" Poster | \n",
" Title | \n",
" year | \n",
" Genre | \n",
" tags | \n",
"
\n",
" \n",
" \n",
" \n",
" 4913 | \n",
" 3226 | \n",
" 3226 | \n",
" http://ia.media-imdb.com/images/G/01/imdb/imag... | \n",
" Slow Southern Steel (2010) | \n",
" 2010 | \n",
" [Documentary, History, Music] | \n",
" slow southern steel documentary history music | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" level_0 index Poster \\\n",
"4913 3226 3226 http://ia.media-imdb.com/images/G/01/imdb/imag... \n",
"\n",
" Title year Genre \\\n",
"4913 Slow Southern Steel (2010) 2010 [Documentary, History, Music] \n",
"\n",
" tags \n",
"4913 slow southern steel documentary history music "
]
},
"execution_count": 123,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"holly.sample()"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Int64Index([0], dtype='int64')"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"holly[holly[\"Title\"] == \"Uri: The Surgical Strike\"].index"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"('Avatar (2009)',\n",
" 'https://images-na.ssl-images-amazon.com/images/M/MV5BMTYwOTEwNjAzMl5BMl5BanBnXkFtZTcwODc5MTUwMw@@._V1_UX182_CR0,0,182,268_AL_.jpg',\n",
" 'Action Adventure Fantasy',\n",
" 2009,\n",
" ' ',\n",
" 3235)"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Look up one title and collect the fields shown for it.\n",
"idx = holly.index[holly[\"Title\"] == \"Avatar (2009)\"][0]\n",
"row = holly.iloc[idx]\n",
"p = row.Poster\n",
"l = row.Title\n",
"c = \" \".join(row.Genre)\n",
"y = row.year\n",
"s = \" \"\n",
"l,p,c,y,s,idx"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" level_0 | \n",
" index | \n",
" Poster | \n",
" Title | \n",
" year | \n",
" Genre | \n",
" tags | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" https://upload.wikimedia.org/wikipedia/en/thum... | \n",
" Uri: The Surgical Strike | \n",
" 2019 | \n",
" [Action, Drama, War] | \n",
" action drama war divid over five chapter the f... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" level_0 index Poster \\\n",
"0 0 0 https://upload.wikimedia.org/wikipedia/en/thum... \n",
"\n",
" Title year Genre \\\n",
"0 Uri: The Surgical Strike 2019 [Action, Drama, War] \n",
"\n",
" tags \n",
"0 action drama war divid over five chapter the f... "
]
},
"execution_count": 126,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"holly[holly[\"Title\"] == \"Uri: The Surgical Strike\"]"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"level_0 1548\n",
"index 1548\n",
"Poster https://upload.wikimedia.org/wikipedia/en/thum...\n",
"Title Pyaasa\n",
"year 2002\n",
"Genre [Drama]\n",
"tags drama suraj thakur (aftab shivdasani) is one o...\n",
"Name: 1548, dtype: object"
]
},
"execution_count": 127,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"holly.iloc[1548]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "706654849fe4d07e215a38f448ee8e5d780794e2be3793e11d37ab3169b306ae"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}