{ "cells": [ { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [], "source": [ "# Checking for available files in the folder" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['bollywood_full_1950-2019.csv', 'MovieGenre.csv', 'Top_10000_Movies.csv']" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dirc = 'data'\n", "os.listdir(dirc)" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdImdb LinkTitleIMDB ScoreGenrePoster
10164437407http://www.imdb.com/title/tt437407Parineeta (2005)7.3Drama|Musical|Romancehttps://images-na.ssl-images-amazon.com/images...
\n", "
" ], "text/plain": [ " imdbId Imdb Link Title \\\n", "10164 437407 http://www.imdb.com/title/tt437407 Parineeta (2005) \n", "\n", " IMDB Score Genre \\\n", "10164 7.3 Drama|Musical|Romance \n", "\n", " Poster \n", "10164 https://images-na.ssl-images-amazon.com/images... " ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('data/MovieGenre.csv', encoding='latin-1')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df.sample()" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(40108, 6)" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 40108 entries, 0 to 40107\n", "Data columns (total 6 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 imdbId 40108 non-null int64 \n", " 1 Imdb Link 40108 non-null object \n", " 2 Title 40108 non-null object \n", " 3 IMDB Score 40060 non-null float64\n", " 4 Genre 39963 non-null object \n", " 5 Poster 39383 non-null object \n", "dtypes: float64(1), int64(1), object(4)\n", "memory usage: 1.8+ MB\n" ] } ], "source": [ "df.info()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### DATA CLEANING :" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "593" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.duplicated().sum()" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "df.drop_duplicates(inplace=True)" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "imdbId 0\n", "Imdb Link 0\n", "Title 0\n", "IMDB Score 48\n", "Genre 145\n", "Poster 724\n", "dtype: int64" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "df.dropna(inplace=True)" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(38654, 6)" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "pop = df[df[\"IMDB Score\"] > 7]" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'https://images-na.ssl-images-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_UX182_CR0,0,182,268_AL_.jpg'" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.loc[0,\"Poster\"]" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(12345, 6)" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop.shape" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdImdb LinkTitleIMDB ScoreGenrePoster
0114709http://www.imdb.com/title/tt114709Toy Story (1995)8.3Animation|Adventure|Comedyhttps://images-na.ssl-images-amazon.com/images...
\n", "
" ], "text/plain": [ " imdbId Imdb Link Title IMDB Score \\\n", "0 114709 http://www.imdb.com/title/tt114709 Toy Story (1995) 8.3 \n", "\n", " Genre \\\n", "0 Animation|Adventure|Comedy \n", "\n", " Poster \n", "0 https://images-na.ssl-images-amazon.com/images... " ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop.head(1)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
imdbIdImdb LinkTitleIMDB ScoreGenrePosteryear
0114709http://www.imdb.com/title/tt114709Toy Story (1995)8.3Animation|Adventure|Comedyhttps://images-na.ssl-images-amazon.com/images...1995
\n", "
" ], "text/plain": [ " imdbId Imdb Link Title IMDB Score \\\n", "0 114709 http://www.imdb.com/title/tt114709 Toy Story (1995) 8.3 \n", "\n", " Genre \\\n", "0 Animation|Adventure|Comedy \n", "\n", " Poster year \n", "0 https://images-na.ssl-images-amazon.com/images... 1995 " ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop[\"year\"] = pop[\"Title\"].apply(lambda x: x[-5:-1])\n", "pop.head(1)" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(11987, 7)" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "for i in pop[\"year\"]:\n", " if not str(i).isdigit():\n", " pop.drop(index = pop[pop[\"year\"]==i].index[0], inplace=True)\n", "pop.shape" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [], "source": [ "pop.reset_index(drop=True, inplace=True)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 1995\n", "1 1995\n", "2 1995\n", "3 1995\n", "4 1995\n", " ... \n", "11982 2016\n", "11983 2009\n", "11984 1967\n", "11985 2014\n", "11986 2015\n", "Name: year, Length: 11987, dtype: int64" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.to_numeric(pop[\"year\"])" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'2017'" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "max(pop[\"year\"])" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "l = []\n", "def year(df) : \n", " try : \n", " if int(df['year']) > 1999 : \n", " l.append(df.name)\n", " except : \n", " pass" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 None\n", "1 None\n", "2 None\n", "3 None\n", "4 None\n", " ... \n", "11982 None\n", "11983 None\n", "11984 None\n", "11985 None\n", "11986 None\n", "Length: 11987, dtype: object" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pop.apply(year, axis=1)" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "movies = pop.iloc[l,:]" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5431, 7)" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.shape" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "movies.reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
indeximdbIdImdb LinkTitleIMDB ScoreGenrePosteryear
611455221073http://www.imdb.com/title/tt221073Chopper (2000)7.2Biography|Crime|Dramahttps://images-na.ssl-images-amazon.com/images...2000
\n", "
" ], "text/plain": [ " index imdbId Imdb Link Title \\\n", "61 1455 221073 http://www.imdb.com/title/tt221073 Chopper (2000) \n", "\n", " IMDB Score Genre \\\n", "61 7.2 Biography|Crime|Drama \n", "\n", " Poster year \n", "61 https://images-na.ssl-images-amazon.com/images... 2000 " ] }, "execution_count": 89, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.sample()" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleGenrePosteryear
3687The Amazing Screw-On Head (2006)Animation|Short|Actionhttps://images-na.ssl-images-amazon.com/images...2006
\n", "
" ], "text/plain": [ " Title Genre \\\n", "3687 The Amazing Screw-On Head (2006) Animation|Short|Action \n", "\n", " Poster year \n", "3687 https://images-na.ssl-images-amazon.com/images... 2006 " ] }, "execution_count": 90, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.drop(columns=[\"index\", \"imdbId\", \"Imdb Link\", \"IMDB Score\"], inplace=True)\n", "movies.sample()" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Title 0\n", "Genre 0\n", "Poster 0\n", "year 0\n", "dtype: int64" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleGenrePosteryear
2165War Horse (2011)[Drama, War]https://images-na.ssl-images-amazon.com/images...2011
\n", "
" ], "text/plain": [ " Title Genre \\\n", "2165 War Horse (2011) [Drama, War] \n", "\n", " Poster year \n", "2165 https://images-na.ssl-images-amazon.com/images... 2011 " ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies[\"Genre\"] = movies[\"Genre\"].apply(lambda row: str(row).split(\"|\"))\n", "movies.sample()" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "movies[\"tags\"] = movies[\"Title\"].apply(lambda x: str(x)[:-6].split())" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "movies[\"tags\"] = movies[\"tags\"] + movies[\"Genre\"]\n", "movies[\"tags\"] = movies[\"tags\"].apply(lambda x: (\" \".join(x)).lower())" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleGenrePosteryeartags
312Revolution OS (2001)[Documentary, Comedy]https://images-na.ssl-images-amazon.com/images...2001revolution os documentary comedy
\n", "
" ], "text/plain": [ " Title Genre \\\n", "312 Revolution OS (2001) [Documentary, Comedy] \n", "\n", " Poster year \\\n", "312 https://images-na.ssl-images-amazon.com/images... 2001 \n", "\n", " tags \n", "312 revolution os documentary comedy " ] }, "execution_count": 95, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.sample()" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "from pickle import load\n", "bolly = load(open(\"movie_info.pkl\",\"rb\"))" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
poster_pathoriginal_titleyear_of_releasegenresimdb_ratingimdb_votessummarytags
1572https://upload.wikimedia.org/wikipedia/en/thum...Yaadein...2001[Drama, Musical, Romance]4.43034.0[Raj, Singh, Puri, is, best, friends, with, L....drama music romanc raj singh puri is best frie...
\n", "
" ], "text/plain": [ " poster_path original_title \\\n", "1572 https://upload.wikimedia.org/wikipedia/en/thum... Yaadein... \n", "\n", " year_of_release genres imdb_rating imdb_votes \\\n", "1572 2001 [Drama, Musical, Romance] 4.4 3034.0 \n", "\n", " summary \\\n", "1572 [Raj, Singh, Puri, is, best, friends, with, L.... \n", "\n", " tags \n", "1572 drama music romanc raj singh puri is best frie... " ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bolly = pd.DataFrame(bolly)\n", "bolly.sample()" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
TitleGenrePosteryeartags
1512Ai no mukidashi (2008)[Action, Comedy, Drama]https://images-na.ssl-images-amazon.com/images...2008ai no mukidashi action comedy drama
\n", "
" ], "text/plain": [ " Title Genre \\\n", "1512 Ai no mukidashi (2008) [Action, Comedy, Drama] \n", "\n", " Poster year \\\n", "1512 https://images-na.ssl-images-amazon.com/images... 2008 \n", "\n", " tags \n", "1512 ai no mukidashi action comedy drama " ] }, "execution_count": 98, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.sample()" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PosterTitleyearGenreimdb_ratingimdb_votessummarytags
1490https://upload.wikimedia.org/wikipedia/commons...Kaante2002[Action, Crime, Drama]6.64267.0[Six, bank, robbers, trying, to, pull, off, th...action crime drama on or about 12may00 a truck...
\n", "
" ], "text/plain": [ " Poster Title year \\\n", "1490 https://upload.wikimedia.org/wikipedia/commons... Kaante 2002 \n", "\n", " Genre imdb_rating imdb_votes \\\n", "1490 [Action, Crime, Drama] 6.6 4267.0 \n", "\n", " summary \\\n", "1490 [Six, bank, robbers, trying, to, pull, off, th... \n", "\n", " tags \n", "1490 action crime drama on or about 12may00 a truck... " ] }, "execution_count": 99, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bolly.rename(columns={\"poster_path\" : \"Poster\", \"original_title\" : \"Title\", \n", " \"year_of_release\" : \"year\", \"genres\" : \"Genre\"}, inplace=True)\n", "\n", "bolly.sample()" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "bolly.shape \n", "bolly.reset_index(inplace=True)\n", "movies.reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "new = pd.concat([bolly, movies]).drop(columns=[\"imdb_rating\",\"imdb_votes\",\"summary\"])" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "new.reset_index(inplace=True)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5431, 6)" ] }, "execution_count": 103, "metadata": {}, "output_type": "execute_result" } ], "source": [ "movies.shape" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "level_0 0\n", "index 0\n", "Poster 0\n", "Title 0\n", "year 0\n", "Genre 0\n", "tags 0\n", "dtype: int64" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new.isnull().sum()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 7118 entries, 0 to 7117\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 level_0 7118 non-null int64 \n", " 1 index 7118 non-null int64 \n", " 2 Poster 7118 non-null object\n", " 3 Title 7118 non-null object\n", " 4 year 7118 non-null object\n", " 5 Genre 7118 non-null object\n", " 6 tags 7118 non-null object\n", "dtypes: int64(2), object(5)\n", "memory usage: 389.4+ KB\n" ] } ], "source": [ "new.info()" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "new[\"year\"] = pd.to_numeric(new[\"year\"])" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 7118 entries, 0 to 7117\n", "Data columns (total 7 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 level_0 7118 non-null int64 \n", " 1 index 7118 non-null int64 \n", " 2 Poster 7118 non-null object\n", " 3 Title 7118 non-null object\n", " 4 year 7118 non-null int64 \n", " 5 Genre 7118 non-null object\n", " 6 tags 7118 non-null object\n", "dtypes: int64(3), object(4)\n", "memory usage: 389.4+ KB\n" ] } ], "source": [ "new.info()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### ENCODING :" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "cv = CountVectorizer(max_features=5000, stop_words='english')" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "vectors = cv.fit_transform(new[\"tags\"]).toarray()" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7118, 5000)" ] }, "execution_count": 110, "metadata": {}, "output_type": "execute_result" } ], "source": [ "vectors.shape" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [], "source": [ "most_common_words = cv.get_feature_names_out()" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0001010010th1111th121313th14...zerozindazindagizintazonezoonizoyazubeidazulfiìê
00000100000...0000000000
10000000000...0000000000
20000000000...0000000000
30000000000...0000000000
40000000000...0000000000
..................................................................
71130000000000...0000000000
71140000000000...0000000000
71150000000000...0000000000
71160000000000...0000000000
71170000000000...0000000000
\n", "

7118 rows × 5000 columns

\n", "
" ], "text/plain": [ " 000 10 100 10th 11 11th 12 13 13th 14 ... zero zinda \\\n", "0 0 0 0 0 1 0 0 0 0 0 ... 0 0 \n", "1 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "2 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "3 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "4 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "... ... .. ... ... .. ... .. .. ... .. ... ... ... \n", "7113 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "7114 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "7115 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "7116 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "7117 0 0 0 0 0 0 0 0 0 0 ... 0 0 \n", "\n", " zindagi zinta zone zooni zoya zubeida zulfi ìê \n", "0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 \n", "... ... ... ... ... ... ... ... .. \n", "7113 0 0 0 0 0 0 0 0 \n", "7114 0 0 0 0 0 0 0 0 \n", "7115 0 0 0 0 0 0 0 0 \n", "7116 0 0 0 0 0 0 0 0 \n", "7117 0 0 0 0 0 0 0 0 \n", "\n", "[7118 rows x 5000 columns]" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cs = pd.DataFrame(vectors, columns=most_common_words)\n", "# cs.to_csv(\"data\\common_words.csv\")\n", "cs" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "#### RECOMMENDATION ON THE BASIS OF DISTANCE BETWEEN VECTORS :" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics.pairwise import cosine_similarity" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [], "source": [ "similarity = cosine_similarity(vectors)" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [], "source": [ "def recommended_movies(mov):\n", " idx = new[new[\"Title\"] == mov].index[0]\n", " corr = similarity[idx]\n", " rec = sorted(list(enumerate(corr)), reverse=True, key=lambda x: x[1])[1:6]\n", "\n", " for i in rec:\n", " print(new.iloc[i[0]].Title)" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Melancholian 3 huonetta (2004)\n", "The War on Democracy (2007)\n", "My First War (2008)\n", "Shooting War (2000)\n", "Armadillo (2010)\n" ] } ], "source": [ "recommended_movies(\"Armadillo (2010)\")" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "from pickle import dump,load" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [], "source": [ "# dump(new, open(\"hollywood.pkl\", 'wb'))" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "# dump(similarity, open(\"sim_hollywood.pkl\", 'wb'))" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "data2 = load(open(\"hollywood.pkl\", 'rb'))\n", "holly = pd.DataFrame(data2)\n" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
level_0indexPosterTitleyearGenretags
551551551https://upload.wikimedia.org/wikipedia/en/thum...ROAR: Tigers of the Sundarbans2014[Action, Adventure, Sci-Fi]action adventur sci-fi after hi photojournalis...
\n", "
" ], "text/plain": [ " level_0 index Poster \\\n", "551 551 551 https://upload.wikimedia.org/wikipedia/en/thum... \n", "\n", " Title year Genre \\\n", "551 ROAR: Tigers of the Sundarbans 2014 [Action, Adventure, Sci-Fi] \n", "\n", " tags \n", "551 action adventur sci-fi after hi photojournalis... " ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "holly.sample()" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7118, 7)" ] }, "execution_count": 122, "metadata": {}, "output_type": "execute_result" } ], "source": [ "holly.shape" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
level_0indexPosterTitleyearGenretags
491332263226http://ia.media-imdb.com/images/G/01/imdb/imag...Slow Southern Steel (2010)2010[Documentary, History, Music]slow southern steel documentary history music
\n", "
" ], "text/plain": [ " level_0 index Poster \\\n", "4913 3226 3226 http://ia.media-imdb.com/images/G/01/imdb/imag... \n", "\n", " Title year Genre \\\n", "4913 Slow Southern Steel (2010) 2010 [Documentary, History, Music] \n", "\n", " tags \n", "4913 slow southern steel documentary history music " ] }, "execution_count": 123, "metadata": {}, "output_type": "execute_result" } ], "source": [ "holly.sample()" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Int64Index([0], dtype='int64')" ] }, "execution_count": 124, "metadata": {}, "output_type": "execute_result" } ], "source": [ "holly[holly[\"Title\"] == \"Uri: The Surgical Strike\"].index" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('Avatar (2009)',\n", " 'https://images-na.ssl-images-amazon.com/images/M/MV5BMTYwOTEwNjAzMl5BMl5BanBnXkFtZTcwODc5MTUwMw@@._V1_UX182_CR0,0,182,268_AL_.jpg',\n", " 'Action Adventure Fantasy',\n", " 2009,\n", " ' ',\n", " 3235)" ] }, "execution_count": 125, "metadata": {}, "output_type": "execute_result" } ], "source": [ "idx = holly[holly[\"Title\"] == \"Avatar (2009)\"].index[0]\n", "p = holly.iloc[idx].Poster\n", "l = holly.iloc[idx].Title\n", "c = \" \".join([holly.iloc[idx].Genre][0])\n", "y = holly.iloc[idx].year\n", "s = \" \"\n", "l,p,c,y,s,idx" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
level_0indexPosterTitleyearGenretags
000https://upload.wikimedia.org/wikipedia/en/thum...Uri: The Surgical Strike2019[Action, Drama, War]action drama war divid over five chapter the f...
\n", "
" ], "text/plain": [ " level_0 index Poster \\\n", "0 0 0 https://upload.wikimedia.org/wikipedia/en/thum... \n", "\n", " Title year Genre \\\n", "0 Uri: The Surgical Strike 2019 [Action, Drama, War] \n", "\n", " tags \n", "0 action drama war divid over five chapter the f... " ] }, "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ "holly[holly[\"Title\"] == \"Uri: The Surgical Strike\"]" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "level_0 1548\n", "index 1548\n", "Poster https://upload.wikimedia.org/wikipedia/en/thum...\n", "Title Pyaasa\n", "year 2002\n", "Genre [Drama]\n", "tags drama suraj thakur (aftab shivdasani) is one o...\n", "Name: 1548, dtype: object" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "holly.iloc[1548]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "706654849fe4d07e215a38f448ee8e5d780794e2be3793e11d37ab3169b306ae" } } }, "nbformat": 4, "nbformat_minor": 2 }