{ "cells": [ { "cell_type": "markdown", "id": "30487fe4-5659-41fa-b2a7-7ca9d677b169", "metadata": {}, "source": [ "# Prediction:" ] }, { "cell_type": "code", "execution_count": 34, "id": "86ad082a-d600-493f-aac4-310e520cfe84", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of the loaded model: 0.4909329829172142\n" ] } ], "source": [ "import pandas as pd\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "from string import punctuation\n", "import re\n", "import joblib\n", "from sklearn.metrics import accuracy_score\n", "\n", "# Load data\n", "df = pd.read_csv('./data.csv', usecols=['title', 'news'])\n", "\n", "# Download necessary NLTK data\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "nltk.download('wordnet')\n", "nltk.download('omw-1.4')\n", "\n", "# Define stop words\n", "stop_words = set(stopwords.words('english'))\n", "\n", "# Text cleaning function\n", "def clean_text(text):\n", " # Tokenize and lowercase\n", " words = word_tokenize(text.lower())\n", " # Remove stop words, punctuation, and digits\n", " words = [\n", " word for word in words\n", " if word not in stop_words and word not in punctuation and not re.search(r'\\d', word)\n", " ]\n", " # Rejoin words\n", " return ' '.join(words)\n", "\n", "# Apply cleaning function to the 'title' column\n", "df['title'] = df['title'].apply(clean_text)\n", "\n", "# Define features (X) and labels (y)\n", "X = df['title'] # Use the cleaned titles as features\n", "y = df['news'].apply(lambda x: 1 if x == 'fox' else 0) # Convert 'news' to binary labels\n", "\n", "# Load the saved vectorizer and model\n", "loaded_vectorizer = joblib.load('vectorizer_bong.pkl')\n", "loaded_model = joblib.load('naive_bayes_model.pkl')\n", "\n", "# Transform the text data using the loaded vectorizer\n", "X_test = loaded_vectorizer.transform(X)\n", "\n", "# Use the loaded model to make predictions\n", "y_pred_loaded = loaded_model.predict(X_test)\n", "\n", "# Evaluate the model\n", "accuracy = accuracy_score(y, y_pred_loaded)\n", "print(\"Accuracy of the loaded model:\", accuracy)" ] }, { "cell_type": "markdown", "id": "eda06eb9-cde6-4370-b189-3d06ffe505c8", "metadata": {}, "source": [ "## Data Processing" ] }, { "cell_type": "code", "execution_count": 35, "id": "23e35ed8-027e-42f8-8902-43591ef7e34e", "metadata": {}, "outputs": [], "source": [ "!pip install geopy > delete.txt\n", "!pip install datasets > delete.txt\n", "!pip install torch torchvision datasets > delete.txt\n", "!pip install huggingface_hub > delete.txt\n", "!pip install pyhocon > delete.txt\n", "!pip install transformers > delete.txt\n", "!rm delete.txt" ] }, { "cell_type": "code", "execution_count": 36, "id": "8d91a73a-cde9-48a6-9d2a-83715ad64a12", "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import warnings\n", "warnings.filterwarnings(\"ignore\") \n", "from sklearn.model_selection import train_test_split\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "from datetime import datetime\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "from string import punctuation\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "import nltk\n", "pd.options.display.max_colwidth = None" ] }, { "cell_type": "code", "execution_count": 37, "id": "d6e65e5d-df2b-4db6-b866-6c473ed85919", "metadata": {}, "outputs": [], "source": [ "df=pd.read_csv('./data.csv',usecols=['title', 'news'])" ] }, { "cell_type": "code", "execution_count": 38, "id": "6c291504-9f0c-46cc-aaee-9ce274ccaafc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlenews
0Jack Carr recalls Gen. Eisenhower's D-Day memo about 'great and noble undertaking'fox
1Bruce Willis, Demi Moore avoided doing one thing while co-parenting, daughter saysfox
2Blinken meets Qatar PM, says Israeli actions are not 'retaliation,' but 'defending the lives of its people'fox
3Emily Blunt says her ‘toes curl’ when people tell her their kids want to act: 'I want to say, don’t do it!'fox
4'The View' co-host, CNN commentator Ana Navarro to host night 2 of Democratic National Conventionfox
\n", "
" ], "text/plain": [ " title \\\n", "0 Jack Carr recalls Gen. Eisenhower's D-Day memo about 'great and noble undertaking' \n", "1 Bruce Willis, Demi Moore avoided doing one thing while co-parenting, daughter says \n", "2 Blinken meets Qatar PM, says Israeli actions are not 'retaliation,' but 'defending the lives of its people' \n", "3 Emily Blunt says her ‘toes curl’ when people tell her their kids want to act: 'I want to say, don’t do it!' \n", "4 'The View' co-host, CNN commentator Ana Navarro to host night 2 of Democratic National Convention \n", "\n", " news \n", "0 fox \n", "1 fox \n", "2 fox \n", "3 fox \n", "4 fox " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 39, "id": "7106c263-be37-4037-bdfc-8c8308e895fc", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt_tab to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt_tab is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n" ] } ], "source": [ "nltk.download('stopwords')\n", "nltk.download('punkt_tab')\n", "nltk.download('wordnet')\n", "nltk.download('omw-1.4')\n", "\n", "stop_words = set(stopwords.words('english'))\n", "\n", "def clean_text(text):\n", " # Remove punctuation\n", " words = word_tokenize(text.lower())\n", " # Remove stop words and punctuation\n", " words = [\n", " word for word in words\n", " if word.lower() not in stop_words and word not in punctuation and not re.search(r'\\d', word)\n", " ]\n", " \n", " # Rejoin words\n", " return ' '.join(words)\n", "\n", "# Apply the function to the 'title' column\n", "df['title'] = df['title'].apply(clean_text)" ] }, { "cell_type": "code", "execution_count": 40, "id": "91f557c2-1aab-43ee-b464-f3ef7a33556d", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlenews
0jack carr recalls gen. eisenhower 's d-day memo 'great noble undertakingfox
1bruce willis demi moore avoided one thing co-parenting daughter saysfox
2blinken meets qatar pm says israeli actions 'retaliation 'defending lives peoplefox
3emily blunt says ‘ toes curl ’ people tell kids want act want say ’fox
4'the view co-host cnn commentator ana navarro host night democratic national conventionfox
.........
3800trump 's lawyers seek post-election day delay court fight immunity decision fallout interference casenbc
3801treat acne scars hyperpigmentation according expertsnbc
3802best vegetarian vegan meal delivery services according expertsnbc
3803trump says presidential civilian award 'better top military honor whose recipients 'dead 'hit bulletsnbc
3804best white elephant secret santa gift ideasnbc
\n", "

3805 rows × 2 columns

\n", "
" ], "text/plain": [ " title \\\n", "0 jack carr recalls gen. eisenhower 's d-day memo 'great noble undertaking \n", "1 bruce willis demi moore avoided one thing co-parenting daughter says \n", "2 blinken meets qatar pm says israeli actions 'retaliation 'defending lives people \n", "3 emily blunt says ‘ toes curl ’ people tell kids want act want say ’ \n", "4 'the view co-host cnn commentator ana navarro host night democratic national convention \n", "... ... \n", "3800 trump 's lawyers seek post-election day delay court fight immunity decision fallout interference case \n", "3801 treat acne scars hyperpigmentation according experts \n", "3802 best vegetarian vegan meal delivery services according experts \n", "3803 trump says presidential civilian award 'better top military honor whose recipients 'dead 'hit bullets \n", "3804 best white elephant secret santa gift ideas \n", "\n", " news \n", "0 fox \n", "1 fox \n", "2 fox \n", "3 fox \n", "4 fox \n", "... ... \n", "3800 nbc \n", "3801 nbc \n", "3802 nbc \n", "3803 nbc \n", "3804 nbc \n", "\n", "[3805 rows x 2 columns]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 41, "id": "e01e6fd5-9314-4000-b467-dd718bc77acb", "metadata": {}, "outputs": [], "source": [ "train_df, test_df = train_test_split(df, test_size=0.2, random_state=41)\n", "X_train = train_df['title']\n", "y_train = train_df['news']\n", "X_test = test_df['title']\n", "y_test = test_df['news']\n", "y_train = y_train.apply(lambda x: 1 if x == 'fox' else 0)\n", "y_test = y_test.apply(lambda x: 1 if x == 'fox' else 0)\n", "accuracy_scores={}" ] }, { "cell_type": "markdown", "id": "0d65ad3b-9185-4462-a6fe-1f33ede24ce2", "metadata": {}, "source": [ "## Word Embedding" ] }, { "cell_type": "markdown", "id": "e267400b-0c90-496d-b392-fe651525150a", "metadata": {}, "source": [ "### TF-IDF/ Bag of worsd/Bag of ngrams\n" ] }, { "cell_type": "code", "execution_count": 42, "id": "12c31464-013a-4221-81d6-7d81ddb6c931", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Feature Names: ['aapi' 'aaron' 'abandon' ... 'zuckerberg' 'zyn' 'zzz']\n", "7471\n", "Feature Names: ['aapi' 'aaron' 'abandon' ... 'zuckerberg' 'zyn' 'zzz']\n", "7471\n", "Feature Names: ['aapi' 'aapi owned' 'aaron' ... 'zyn maker' 'zzz' 'zzz amazon']\n", "29715\n", "Vectorizer saved successfully.\n" ] } ], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "\n", "vectorizer = TfidfVectorizer(max_features=10000) # Adjust max_features if needed\n", "X_train_tfidf = vectorizer.fit_transform(X_train)\n", "X_test_tfidf = vectorizer.transform(X_test)\n", "print(\"Feature Names:\", vectorizer.get_feature_names_out())\n", "print(len(vectorizer.vocabulary_))\n", "\n", "vectorizer = CountVectorizer() # Adjust max_features if needed\n", "X_train_bow = vectorizer.fit_transform(X_train)\n", "X_test_bow = vectorizer.transform(X_test)\n", "print(\"Feature Names:\", vectorizer.get_feature_names_out())\n", "print(len(vectorizer.vocabulary_))\n", "\n", "vectorizer = CountVectorizer(ngram_range=(1, 2)) # Adjust max_features&ngram_range if needed\n", "# Fit and transform the training data, then transform the test data\n", "X_train_bong = vectorizer.fit_transform(X_train)\n", "X_test_bong = vectorizer.transform(X_test)\n", "# Display the feature names (words and n-grams)\n", "print(\"Feature Names:\", vectorizer.get_feature_names_out())\n", "print(len(vectorizer.vocabulary_))\n", "joblib.dump(vectorizer, 'vectorizer_bong.pkl')\n", "print(\"Vectorizer saved successfully.\")\n" ] }, { "cell_type": "markdown", "id": "d9c46662-7c07-4a4f-b2dc-803a39153a46", "metadata": {}, "source": [ "### Word2Vec" ] }, { "cell_type": "code", "execution_count": 43, "id": "1564acd6-ea3b-4292-8aad-94cc07e179ac", "metadata": {}, "outputs": [], "source": [ "from gensim.models import KeyedVectors\n", "#install scipy<1.13 to be compatible for gensim\n", "#!pip install \"scipy<1.13\"\n", "word2vec_model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True,limit=500000)\n", "\n", "# Tokenize your text data\n", "X_train_tokenized = [sentence.split() for sentence in X_train]\n", "X_test_tokenized = [sentence.split() for sentence in X_test]\n", "\n", "# Define a function to average word vectors for each sentence\n", "def average_word_vectors(sentence, model, vector_size):\n", " words = [word for word in sentence if word in model]\n", " return np.mean(model[words], axis=0)\n", "# Apply average word vectors on training and test sets\n", "X_train_word2vec = np.array([average_word_vectors(sentence, word2vec_model, 300) for sentence in X_train_tokenized])\n", "X_test_word2vec = np.array([average_word_vectors(sentence, word2vec_model, 300) for sentence in X_test_tokenized])" ] }, { "cell_type": "markdown", "id": "be0674db-2da7-4347-8552-25ce83fd4337", "metadata": {}, "source": [ "### Glove" ] }, { "cell_type": "code", "execution_count": 44, "id": "311c869b-a8cf-4185-9619-9c768ac05e8b", "metadata": {}, "outputs": [], "source": [ "glove_file = \"./glove.6B.100d.txt\"\n", "\n", "# Load GloVe embeddings\n", "def load_glove_embeddings(file_path):\n", " embeddings = {}\n", " with open(file_path, \"r\", encoding=\"utf-8\") as f:\n", " for line in f:\n", " values = line.strip().split(' ')\n", " word = values[0]\n", " try:\n", " vector = np.asarray(values[1:], dtype=\"float32\")\n", " embeddings[word] = vector\n", " except ValueError:\n", " print(f\"Skipping line with invalid vector for word: {word}\")\n", " return embeddings\n", "glove_embeddings = load_glove_embeddings(glove_file)" ] }, { "cell_type": "code", "execution_count": 45, "id": "425dc076-76cc-45fd-93c4-23ab6aea6cd0", "metadata": {}, "outputs": [], "source": [ "\n", "def sentence_to_glove(sentence, embeddings):\n", " words = sentence.split()\n", " vectors = [embeddings[word] for word in words if word in embeddings]\n", " if vectors:\n", " return np.mean(vectors, axis=0)\n", " else:\n", " return np.zeros(300) # Return a zero vector if no words are in embeddings\n", "\n", "# Transform an entire dataset (e.g., train and test sets)\n", "X_train_glove = np.array([sentence_to_glove(sentence, glove_embeddings) for sentence in X_train])\n", "X_test_glove = np.array([sentence_to_glove(sentence, glove_embeddings) for sentence in X_test])\n", "\n", "glove_embeddings = load_glove_embeddings(glove_file)" ] }, { "cell_type": "markdown", "id": "84133d6c-d0d1-4ae4-8912-afed2401675d", "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "markdown", "id": "6f54068e-2318-4174-b2a7-38583c2b141f", "metadata": {}, "source": [ "### LogisticRegression" ] }, { "cell_type": "code", "execution_count": 46, "id": "5cc91140-2e3c-477f-8d7e-b8eed08941db", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tfidf With Logistic Regression\n", "Accuracy: 0.7963\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.79 0.72 0.75 330\n", " 1 0.80 0.85 0.83 431\n", "\n", " accuracy 0.80 761\n", " macro avg 0.80 0.79 0.79 761\n", "weighted avg 0.80 0.80 0.79 761\n", "\n", "Bag Of Word With Logistic Regression\n", "Accuracy: 0.7911\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.75 0.78 0.76 330\n", " 1 0.83 0.80 0.81 431\n", "\n", " accuracy 0.79 761\n", " macro avg 0.79 0.79 0.79 761\n", "weighted avg 0.79 0.79 0.79 761\n", "\n", "Bag Of N Grams With Logistic Regression\n", "Accuracy: 0.8003\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.76 0.79 0.77 330\n", " 1 0.83 0.81 0.82 431\n", "\n", " accuracy 0.80 761\n", " macro avg 0.80 0.80 0.80 761\n", "weighted avg 0.80 0.80 0.80 761\n", "\n", "Word2Vec With Logistic Regression\n", "Accuracy: 0.7293\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.70 0.66 0.68 330\n", " 1 0.75 0.78 0.77 431\n", "\n", " accuracy 0.73 761\n", " macro avg 0.72 0.72 0.72 761\n", "weighted avg 0.73 0.73 0.73 761\n", "\n", "GloVe With Logistic Regression\n", "Accuracy: 0.7122\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.66 0.68 0.67 330\n", " 1 0.75 0.74 0.74 431\n", "\n", " accuracy 0.71 761\n", " macro avg 0.71 0.71 0.71 761\n", "weighted avg 0.71 0.71 0.71 761\n", "\n" ] } ], "source": [ "# Logistic Regression with TF-IDF\n", "model = LogisticRegression()\n", "model.fit(X_train_tfidf, y_train)\n", "y_pred = model.predict(X_test_tfidf)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Tfidf With Logistic Regression\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n", "accuracy_scores[\"Tfidf With Logistic Regression\"] = accuracy\n", "\n", "# Logistic Regression with Bag of Words\n", "model = LogisticRegression()\n", "model.fit(X_train_bow, y_train)\n", "y_pred = model.predict(X_test_bow)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Bag Of Word With Logistic Regression\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n", "accuracy_scores[\"Bag Of Word With Logistic Regression\"] = accuracy\n", "\n", "# Logistic Regression with Bag of N-Grams\n", "model = LogisticRegression()\n", "model.fit(X_train_bong, y_train)\n", "y_pred = model.predict(X_test_bong)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Bag Of N Grams With Logistic Regression\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n", "accuracy_scores[\"Bag Of N Grams With Logistic Regression\"] = accuracy\n", "\n", "# Logistic Regression with Word2Vec\n", "model = LogisticRegression()\n", "model.fit(X_train_word2vec, y_train)\n", "y_pred = model.predict(X_test_word2vec)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"Word2Vec With Logistic Regression\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n", "accuracy_scores[\"Word2Vec With Logistic Regression\"] = accuracy\n", "\n", "# Logistic Regression with GloVe\n", "model = LogisticRegression()\n", "model.fit(X_train_glove, y_train)\n", "y_pred = model.predict(X_test_glove)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(\"GloVe With Logistic Regression\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred))\n", "accuracy_scores[\"GloVe With Logistic Regression\"] = accuracy" ] }, { "cell_type": "markdown", "id": "c67027b3-faaf-4255-8fa6-f3bf8202a39a", "metadata": {}, "source": [ "### Decision Tree" ] }, { "cell_type": "code", "execution_count": 47, "id": "ad692f26-6935-4c5c-adca-2a5f38ae311e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TFIDF with Decision Tree\n", "Accuracy: 0.7411\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.67 0.79 0.73 330\n", " 1 0.81 0.71 0.76 431\n", "\n", " accuracy 0.74 761\n", " macro avg 0.74 0.75 0.74 761\n", "weighted avg 0.75 0.74 0.74 761\n", "\n", "Bag of Words with Decision Tree\n", "Accuracy: 0.7503\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.71 0.71 0.71 330\n", " 1 0.78 0.78 0.78 431\n", "\n", " accuracy 0.75 761\n", " macro avg 0.75 0.75 0.75 761\n", "weighted avg 0.75 0.75 0.75 761\n", "\n", "Bag of N-grams with Decision Tree\n", "Accuracy: 0.7661\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.75 0.70 0.72 330\n", " 1 0.78 0.82 0.80 431\n", "\n", " accuracy 0.77 761\n", " macro avg 0.76 0.76 0.76 761\n", "weighted avg 0.77 0.77 0.77 761\n", "\n", "Word2Vec with Decision Tree\n", "Accuracy: 0.5782\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.51 0.55 0.53 330\n", " 1 0.63 0.60 0.62 431\n", "\n", " accuracy 0.58 761\n", " macro avg 0.57 0.57 0.57 761\n", "weighted avg 0.58 0.58 0.58 761\n", "\n", "GloVe with Decision Tree\n", "Accuracy: 0.6386\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.58 0.63 0.60 330\n", " 1 0.69 0.65 0.67 431\n", "\n", " accuracy 0.64 761\n", " macro avg 0.64 0.64 0.64 761\n", "weighted avg 0.64 0.64 0.64 761\n", "\n" ] } ], "source": [ "from sklearn.tree import DecisionTreeClassifier\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "# Decision Tree with TF-IDF\n", "dt_model = DecisionTreeClassifier()\n", "dt_model.fit(X_train_tfidf, y_train)\n", "y_pred_dt = dt_model.predict(X_test_tfidf)\n", "accuracy = accuracy_score(y_test, y_pred_dt)\n", "print(\"TFIDF with Decision Tree\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_dt))\n", "accuracy_scores[\"TFIDF with Decision Tree\"] = accuracy\n", "\n", "# Decision Tree with Bag of Words\n", "dt_model = DecisionTreeClassifier()\n", "dt_model.fit(X_train_bow, y_train)\n", "y_pred_dt_bow = dt_model.predict(X_test_bow)\n", "accuracy = accuracy_score(y_test, y_pred_dt_bow)\n", "print(\"Bag of Words with Decision Tree\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_dt_bow))\n", "accuracy_scores[\"Bag of Words with Decision Tree\"] = accuracy\n", "\n", "# Decision Tree with Bag of N-grams\n", "dt_model = DecisionTreeClassifier()\n", "dt_model.fit(X_train_bong, y_train)\n", "y_pred_dt_bong = dt_model.predict(X_test_bong)\n", "accuracy = accuracy_score(y_test, y_pred_dt_bong)\n", "print(\"Bag of N-grams with Decision Tree\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_dt_bong))\n", "accuracy_scores[\"Bag of N-grams with Decision Tree\"] = accuracy\n", "\n", "# Decision Tree with Word2Vec\n", "dt_model = DecisionTreeClassifier()\n", "dt_model.fit(X_train_word2vec, y_train)\n", "y_pred_dt_word2vec = dt_model.predict(X_test_word2vec)\n", "accuracy = accuracy_score(y_test, y_pred_dt_word2vec)\n", "print(\"Word2Vec with Decision Tree\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_dt_word2vec))\n", "accuracy_scores[\"Word2Vec With DecisionTree\"] = accuracy\n", "\n", "# Decision Tree with GloVe\n", "dt_model = DecisionTreeClassifier()\n", "dt_model.fit(X_train_glove, y_train)\n", "y_pred_dt_glove = dt_model.predict(X_test_glove)\n", "accuracy = accuracy_score(y_test, y_pred_dt_glove)\n", "print(\"GloVe with Decision Tree\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_dt_glove))\n", "accuracy_scores[\"GloVe with Decision Tree\"] = accuracy" ] }, { "cell_type": "markdown", "id": "97b8ce80-3405-4605-b7ee-fdd4cc5f7981", "metadata": {}, "source": [ "### Random Forest" ] }, { "cell_type": "code", "execution_count": 48, "id": "23e5725e-cfb3-40f3-b120-6bfac8054b00", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TFIDF with Random Forest\n", "Accuracy: 0.7792\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.74 0.75 0.75 330\n", " 1 0.81 0.80 0.80 431\n", "\n", " accuracy 0.78 761\n", " macro avg 0.78 0.78 0.78 761\n", "weighted avg 0.78 0.78 0.78 761\n", "\n", "Bag of Words with Random Forest\n", "Accuracy: 0.7963\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.76 0.78 0.77 330\n", " 1 0.83 0.81 0.82 431\n", "\n", " accuracy 0.80 761\n", " macro avg 0.79 0.79 0.79 761\n", "weighted avg 0.80 0.80 0.80 761\n", "\n", "Bag of N-grams with Random Forest\n", "Accuracy: 0.7911\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.74 0.79 0.77 330\n", " 1 0.83 0.79 0.81 431\n", "\n", " accuracy 0.79 761\n", " macro avg 0.79 0.79 0.79 761\n", "weighted avg 0.79 0.79 0.79 761\n", "\n", "Word2Vec with Random Forest\n", "Accuracy: 0.7148\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.67 0.68 0.68 330\n", " 1 0.75 0.74 0.75 431\n", "\n", " accuracy 0.71 761\n", " macro avg 0.71 0.71 0.71 761\n", "weighted avg 0.72 0.71 0.72 761\n", "\n", "GloVe with Random Forest\n", "Accuracy: 0.7293\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.70 0.66 0.68 330\n", " 1 0.75 0.78 0.77 431\n", "\n", " accuracy 0.73 761\n", " macro avg 0.72 0.72 0.72 761\n", "weighted avg 0.73 0.73 0.73 761\n", "\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "# Random Forest with TF-IDF\n", "rf_model = RandomForestClassifier()\n", "rf_model.fit(X_train_tfidf, y_train)\n", "y_pred_rf = rf_model.predict(X_test_tfidf)\n", "accuracy = accuracy_score(y_test, y_pred_rf)\n", "print(\"TFIDF with Random Forest\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_rf))\n", "accuracy_scores[\"TFIDF with Random Forest\"] = accuracy\n", "\n", "# Random Forest with Bag of Words\n", "rf_model = RandomForestClassifier()\n", "rf_model.fit(X_train_bow, y_train)\n", "y_pred_rf_bow = rf_model.predict(X_test_bow)\n", "accuracy = accuracy_score(y_test, y_pred_rf_bow)\n", "print(\"Bag of Words with Random Forest\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_rf_bow))\n", "accuracy_scores[\"Bag of Words with Random Forest\"] = accuracy\n", "\n", "# Random Forest with Bag of N-grams\n", "rf_model = RandomForestClassifier()\n", "rf_model.fit(X_train_bong, y_train)\n", "y_pred_rf_bong = rf_model.predict(X_test_bong)\n", "accuracy = accuracy_score(y_test, y_pred_rf_bong)\n", "print(\"Bag of N-grams with Random Forest\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_rf_bong))\n", "accuracy_scores[\"Bag of N-grams with Random Forest\"] = accuracy\n", "\n", "# Random Forest with Word2Vec\n", "rf_model = RandomForestClassifier()\n", "rf_model.fit(X_train_word2vec, y_train)\n", "y_pred_rf_w2c = rf_model.predict(X_test_word2vec)\n", "accuracy = accuracy_score(y_test, y_pred_rf_w2c)\n", "print(\"Word2Vec with Random Forest\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_rf_w2c))\n", "accuracy_scores[\"Word2Vec With RandomForest\"] = accuracy\n", "\n", "# Random Forest with GloVe\n", "rf_model = RandomForestClassifier()\n", "rf_model.fit(X_train_glove, y_train)\n", "y_pred_rf_glove = rf_model.predict(X_test_glove)\n", "accuracy = accuracy_score(y_test, y_pred_rf_glove)\n", "print(\"GloVe with Random Forest\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_rf_glove))\n", "accuracy_scores[\"GloVe with Random Forest\"] = accuracy" ] }, { "cell_type": "markdown", "id": "aee4f2d2-6d9d-4921-917c-417138843e15", "metadata": {}, "source": [ "### SVM" ] }, { "cell_type": "code", "execution_count": 49, "id": "502902cb-23fa-4a33-b244-ee7d9e24a6d8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TFIDF with SVM\n", "Accuracy: 0.8095\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.82 0.72 0.77 330\n", " 1 0.80 0.88 0.84 431\n", "\n", " accuracy 0.81 761\n", " macro avg 0.81 0.80 0.80 761\n", "weighted avg 0.81 0.81 0.81 761\n", "\n", "Bag of Words with SVM\n", "Accuracy: 0.7832\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.70 0.86 0.78 330\n", " 1 0.87 0.72 0.79 431\n", "\n", " accuracy 0.78 761\n", " macro avg 0.79 0.79 0.78 761\n", "weighted avg 0.80 0.78 0.78 761\n", "\n", "Bag of N-grams with SVM\n", "Accuracy: 0.5742\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.50 0.97 0.66 330\n", " 1 0.91 0.27 0.42 431\n", "\n", " accuracy 0.57 761\n", " macro avg 0.71 0.62 0.54 761\n", "weighted avg 0.74 0.57 0.53 761\n", "\n", "Word2Vec with SVM\n", "Accuracy: 0.7792\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.76 0.71 0.74 330\n", " 1 0.79 0.83 0.81 431\n", "\n", " accuracy 0.78 761\n", " macro avg 0.78 0.77 0.77 761\n", "weighted avg 0.78 0.78 0.78 761\n", "\n", "GloVe with SVM\n", "Accuracy: 0.7622\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.75 0.68 0.71 330\n", " 1 0.77 0.83 0.80 431\n", "\n", " accuracy 0.76 761\n", " macro avg 0.76 0.75 0.75 761\n", "weighted avg 0.76 0.76 0.76 761\n", "\n" ] } ], "source": [ "from sklearn.svm import SVC\n", "from sklearn.metrics import accuracy_score, classification_report\n", "\n", "# SVM with TFIDF\n", "svm_model = SVC()\n", "svm_model.fit(X_train_tfidf, y_train)\n", "y_pred_svm = svm_model.predict(X_test_tfidf)\n", "accuracy = accuracy_score(y_test, y_pred_svm)\n", "print(\"TFIDF with SVM\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_svm))\n", "accuracy_scores[\"TFIDF with SVM\"] = accuracy\n", "\n", "# SVM with Bag of Words\n", "svm_model = SVC()\n", "svm_model.fit(X_train_bow, y_train)\n", "y_pred_svm_bow = svm_model.predict(X_test_bow)\n", "accuracy = accuracy_score(y_test, y_pred_svm_bow)\n", "print(\"Bag of Words with SVM\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_svm_bow))\n", "accuracy_scores[\"Bag of Words with SVM\"] = accuracy\n", "\n", "# SVM with Bag of N-grams\n", "svm_model = SVC()\n", "svm_model.fit(X_train_bong, y_train)\n", "y_pred_svm_bong = svm_model.predict(X_test_bong)\n", "accuracy = accuracy_score(y_test, y_pred_svm_bong)\n", "print(\"Bag of N-grams with SVM\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_svm_bong))\n", "accuracy_scores[\"Bag of N-grams with SVM\"] = accuracy\n", "\n", "# SVM with Word2Vec\n", "svm_model = SVC()\n", "svm_model.fit(X_train_word2vec, y_train)\n", "y_pred_svm_w2v = svm_model.predict(X_test_word2vec)\n", "accuracy = accuracy_score(y_test, y_pred_svm_w2v)\n", "print(\"Word2Vec with SVM\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_svm_w2v))\n", "accuracy_scores[\"Word2Vec With SVM\"] = accuracy\n", "\n", "# SVM with GloVeBag of Words with Naive Bayes\n", "svm_model = SVC()\n", "svm_model.fit(X_train_glove, y_train)\n", "y_pred_svm_glove = svm_model.predict(X_test_glove)\n", "accuracy = accuracy_score(y_test, y_pred_svm_glove)\n", "print(\"GloVe with SVM\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_svm_glove))\n", "accuracy_scores[\"GloVe with SVM\"] = accuracy" ] }, { "cell_type": "markdown", "id": "15e26973-193c-45df-8496-18a10b874690", "metadata": {}, "source": [ "### Naive Bayes" ] }, { "cell_type": "code", "execution_count": 50, "id": "11ae28fd-b517-47e0-94c5-f4e7f420e9c9", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TFIDF with Naive Bayes\n", "Accuracy: 0.8029\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.78 0.75 0.77 330\n", " 1 0.82 0.84 0.83 431\n", "\n", " accuracy 0.80 761\n", " macro avg 0.80 0.80 0.80 761\n", "weighted avg 0.80 0.80 0.80 761\n", "\n", "Bag of Words with Naive Bayes\n", "Accuracy: 0.8055\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.77 0.78 0.78 330\n", " 1 0.83 0.83 0.83 431\n", "\n", " accuracy 0.81 761\n", " macro avg 0.80 0.80 0.80 761\n", "weighted avg 0.81 0.81 0.81 761\n", "\n", "Bag of N-grams with Naive Bayes\n", "Accuracy: 0.8200\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.81 0.77 0.79 330\n", " 1 0.83 0.86 0.84 431\n", "\n", " accuracy 0.82 761\n", " macro avg 0.82 0.81 0.82 761\n", "weighted avg 0.82 0.82 0.82 761\n", "\n", "Word2Vec with Naive Bayes (GaussianNB)\n", "Accuracy: 0.6728\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.63 0.59 0.61 330\n", " 1 0.70 0.74 0.72 431\n", "\n", " accuracy 0.67 761\n", " macro avg 0.67 0.66 0.66 761\n", "weighted avg 0.67 0.67 0.67 761\n", "\n", "GloVe with Naive Bayes (GaussianNB)\n", "Accuracy: 0.6689\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " 0 0.63 0.56 0.59 330\n", " 1 0.69 0.75 0.72 431\n", "\n", " accuracy 0.67 761\n", " macro avg 0.66 0.66 0.66 761\n", "weighted avg 0.67 0.67 0.67 761\n", "\n" ] } ], "source": [ "from sklearn.naive_bayes import MultinomialNB, GaussianNB\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from sklearn.metrics import precision_score, recall_score\n", "# Naive Bayes with TF-IDF\n", "nb_model = MultinomialNB()\n", "nb_model.fit(X_train_tfidf, y_train)\n", "y_pred_nb_tfidf = nb_model.predict(X_test_tfidf)\n", "accuracy = accuracy_score(y_test, y_pred_nb_tfidf)\n", "print(\"TFIDF with Naive Bayes\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_nb_tfidf))\n", "accuracy_scores[\"TFIDF with Naive Bayes\"] = accuracy\n", "\n", "# Naive Bayes with Bag of Words\n", "nb_model = MultinomialNB()\n", "nb_model.fit(X_train_bow, y_train)\n", "y_pred_nb_bow = nb_model.predict(X_test_bow)\n", "accuracy = accuracy_score(y_test, y_pred_nb_bow)\n", "print(\"Bag of Words with Naive Bayes\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_nb_bow))\n", "accuracy_scores[\"Bag of Words with Naive Bayes\"] = accuracy\n", "\n", "# Naive Bayes with Bag of N-grams\n", "nb_model = MultinomialNB()\n", "nb_model.fit(X_train_bong, y_train)\n", "y_pred_nb_bong = nb_model.predict(X_test_bong)\n", "accuracy = accuracy_score(y_test, y_pred_nb_bong)\n", "print(\"Bag of N-grams with Naive Bayes\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_nb_bong))\n", "accuracy_scores[\"Bag of N-grams with Naive Bayes\"] = accuracy\n", "precision = precision_score(y_test, y_pred_nb_bong, average='weighted')\n", "recall = recall_score(y_test, y_pred_nb_bong, average='weighted')\n", "\n", "\n", "joblib.dump(nb_model, 'naive_bayes_model.pkl')\n", "\n", "\n", "\n", "# Naive Bayes with Word2Vec (using GaussianNB)\n", "nb_model = GaussianNB()\n", "nb_model.fit(X_train_word2vec, y_train)\n", "y_pred_nb_w2v = nb_model.predict(X_test_word2vec)\n", "accuracy = accuracy_score(y_test, y_pred_nb_w2v)\n", "print(\"Word2Vec with Naive Bayes (GaussianNB)\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_nb_w2v))\n", "accuracy_scores[\"Word2Vec with Naive Bayes\"] = accuracy\n", "\n", "# Naive Bayes with GloVe (using GaussianNB)\n", "nb_model = GaussianNB()\n", "nb_model.fit(X_train_glove, y_train)\n", "y_pred_nb_glove = nb_model.predict(X_test_glove)\n", "accuracy = accuracy_score(y_test, y_pred_nb_glove)\n", "print(\"GloVe with Naive Bayes (GaussianNB)\")\n", "print(f\"Accuracy: {accuracy:.4f}\")\n", "print(\"Classification Report:\\n\", classification_report(y_test, y_pred_nb_glove))\n", "accuracy_scores[\"GloVe with Naive Bayes\"] = accuracy" ] }, { "cell_type": "code", "execution_count": null, "id": "3bf8cec4-17ee-4342-a43b-163fd1b36d68", "metadata": {}, "outputs": [], "source": [ "\n" ] }, { "cell_type": "markdown", "id": "2a7e35c1-1444-4098-8866-450af4216066", "metadata": {}, "source": [ "## LSTM" ] }, { "cell_type": "code", "execution_count": 51, "id": "5852a896-7b5d-42dc-845d-3e58aa7007c4", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/7\n", "96/96 [==============================] - 2s 18ms/step - loss: 0.6869 - accuracy: 0.5345 - val_loss: 0.6719 - val_accuracy: 0.5848\n", "Epoch 2/7\n", "96/96 [==============================] - 1s 7ms/step - loss: 0.6759 - accuracy: 0.5700 - val_loss: 0.6655 - val_accuracy: 0.5926\n", "Epoch 3/7\n", "96/96 [==============================] - 0s 3ms/step - loss: 0.6712 - accuracy: 0.5700 - val_loss: 0.6686 - val_accuracy: 0.5940\n", "Epoch 4/7\n", "96/96 [==============================] - 0s 3ms/step - loss: 0.6711 - accuracy: 0.5834 - val_loss: 0.6692 - val_accuracy: 0.5598\n", "Epoch 5/7\n", "96/96 [==============================] - 0s 3ms/step - loss: 0.6671 - accuracy: 0.5949 - val_loss: 0.6579 - val_accuracy: 0.6018\n", "Epoch 6/7\n", "96/96 [==============================] - 0s 3ms/step - loss: 0.6683 - accuracy: 0.5880 - val_loss: 0.6681 - val_accuracy: 0.6150\n", "Epoch 7/7\n", "96/96 [==============================] - 0s 3ms/step - loss: 0.6654 - accuracy: 0.5920 - val_loss: 0.6621 - val_accuracy: 0.6084\n", "24/24 [==============================] - 0s 1ms/step - loss: 0.6621 - accuracy: 0.6084\n", "Test Loss: 0.6620948314666748\n", "Test Accuracy: 0.6084100008010864\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from keras_preprocessing.sequence import pad_sequences\n", "from keras.models import Sequential\n", "from keras.layers import Embedding, LSTM, Dense, Dropout\n", "from gensim.models import Word2Vec\n", "from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalMaxPooling1D\n", "\n", "\n", "# Encode labels\n", "le = LabelEncoder()\n", "df['news'] = le.fit_transform(df['news'])\n", "\n", "# Tokenize the titles\n", "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(df['title'])\n", "sequences = tokenizer.texts_to_sequences(df['title'])\n", "word_index = tokenizer.word_index\n", "\n", "# Padding sequences\n", "max_len = max(len(x) for x in sequences)\n", "X = pad_sequences(sequences, maxlen=max_len, padding='post')\n", "y = df['news'].values\n", "\n", "# Split the data\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", "# Train Word2Vec model on the training set only\n", "sentences_train = [df['title'][i].split() for i in range(len(X_train))]\n", "word2vec_model = Word2Vec(sentences_train, vector_size=100, window=5, min_count=1, workers=4)\n", "\n", "# Create embedding matrix\n", "embedding_matrix = np.zeros((len(word_index) + 1, 100))\n", "for word, i in word_index.items():\n", " if word in word2vec_model.wv:\n", " embedding_matrix[i] = word2vec_model.wv[word]\n", "#-------------------------------\n", "# Updated LSTM Model\n", "# Simple LSTM Model\n", "model = Sequential()\n", "\n", "# Embedding layer with pre-trained Word2Vec weights\n", "model.add(Embedding(input_dim=len(word_index) + 1, \n", " output_dim=100, \n", " weights=[embedding_matrix], \n", " input_length=max_len, \n", " trainable=False))\n", "\n", "# Single LSTM layer\n", "model.add(LSTM(64))\n", "\n", "# Dropout for regularization\n", "model.add(Dropout(0.3))\n", "\n", "# Output layer\n", "model.add(Dense(1, activation='sigmoid')) # Binary classification\n", "\n", "# Compile the model\n", "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", "#---------------------\n", "\n", "history=model.fit(X_train, y_train, epochs=7, batch_size=32, validation_data=(X_test, y_test))\n", "\n", "# Evaluate the model\n", "loss, accuracy = model.evaluate(X_test, y_test)\n", "print(f\"Test Loss: {loss}\")\n", "print(f\"Test Accuracy: {accuracy}\")\n", "accuracy_scores['Word2Vec With LSTM']=accuracy\n", "def plot_metrics(history):\n", " # Plot loss\n", " plt.figure(figsize=(12, 5))\n", " plt.subplot(1, 2, 1)\n", " plt.plot(history.history['loss'], label='Training Loss')\n", " plt.plot(history.history['val_loss'], label='Validation Loss')\n", " plt.title('Loss over Epochs')\n", " plt.xlabel('Epochs')\n", " plt.ylabel('Loss')\n", " plt.legend()\n", "\n", " # Plot accuracy\n", " plt.subplot(1, 2, 2)\n", " plt.plot(history.history['accuracy'], label='Training Accuracy')\n", " plt.plot(history.history['val_accuracy'], label='Validation Accuracy')\n", " plt.title('Accuracy over Epochs')\n", " plt.xlabel('Epochs')\n", " plt.ylabel('Accuracy')\n", " plt.legend()\n", "\n", " plt.tight_layout()\n", " plt.show()\n", "\n", "# Call the plot_metrics function\n", "plot_metrics(history)" ] }, { "cell_type": "code", "execution_count": 52, "id": "01cbb05d-cb1e-4002-8894-a34725cfd66a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/4\n", "96/96 [==============================] - 2s 7ms/step - loss: 0.6274 - accuracy: 0.6393 - val_loss: 0.5472 - val_accuracy: 0.7214\n", "Epoch 2/4\n", "96/96 [==============================] - 0s 3ms/step - loss: 0.5293 - accuracy: 0.7418 - val_loss: 0.5232 - val_accuracy: 0.7490\n", "Epoch 3/4\n", "96/96 [==============================] - 0s 3ms/step - loss: 0.4808 - accuracy: 0.7727 - val_loss: 0.4907 - val_accuracy: 0.7582\n", "Epoch 4/4\n", "96/96 [==============================] - 0s 3ms/step - loss: 0.4403 - accuracy: 0.8003 - val_loss: 0.4851 - val_accuracy: 0.7582\n", "24/24 [==============================] - 0s 1ms/step - loss: 0.4851 - accuracy: 0.7582\n", "Test Loss: 0.48506996035575867\n", "Test Accuracy: 0.7582128643989563\n" ] }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.preprocessing import LabelEncoder\n", "from tensorflow.keras.preprocessing.text import Tokenizer\n", "from keras_preprocessing.sequence import pad_sequences\n", "from keras.models import Sequential\n", "from keras.layers import Embedding, LSTM, Dense, Dropout\n", "\n", "# Load your dataset (assuming df is your DataFrame with 'title' and 'news' columns)\n", "# df = pd.read_csv('your_dataset.csv')\n", "\n", "# Encode labels\n", "le = LabelEncoder()\n", "df['news'] = le.fit_transform(df['news'])\n", "\n", "# Tokenize the titles\n", "tokenizer = Tokenizer()\n", "tokenizer.fit_on_texts(df['title'])\n", "sequences = tokenizer.texts_to_sequences(df['title'])\n", "word_index = tokenizer.word_index\n", "\n", "# Padding sequences\n", "max_len = max(len(x) for x in sequences)\n", "X = pad_sequences(sequences, maxlen=max_len, padding='post')\n", "y = df['news'].values\n", "\n", "# Split the data\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", "\n", "# Load GloVe embeddings\n", "embedding_dim = 100\n", "glove_path = \"./glove.6B.100d.txt\" # Update this path with the correct location of your GloVe file\n", "\n", "# Create an embedding index\n", "embedding_index = {}\n", "with open(glove_path, \"r\", encoding=\"utf-8\") as f:\n", " for line in f:\n", " values = line.split()\n", " word = values[0]\n", " coefs = np.asarray(values[1:], dtype=\"float32\")\n", " embedding_index[word] = coefs\n", "\n", "# Create the embedding matrix\n", "embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))\n", "for word, i in word_index.items():\n", " embedding_vector = embedding_index.get(word)\n", " if embedding_vector is not None:\n", " embedding_matrix[i] = embedding_vector\n", "\n", "# Build the simple LSTM model\n", "model = Sequential()\n", "\n", "# Embedding layer with pre-trained GloVe weights\n", "model.add(Embedding(input_dim=len(word_index) + 1,\n", " output_dim=embedding_dim,\n", " weights=[embedding_matrix],\n", " input_length=max_len,\n", " trainable=False))\n", "\n", "# Single LSTM layer\n", "model.add(LSTM(64))\n", "\n", "# Dropout for regularization\n", "model.add(Dropout(0.3))\n", "\n", "# Output layer\n", "model.add(Dense(1, activation='sigmoid')) # Binary classification\n", "\n", "# Compile the model\n", "model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", "\n", "# Train the model\n", "history=model.fit(X_train, y_train, epochs=4, batch_size=32, validation_data=(X_test, y_test))\n", "\n", "# Evaluate the model\n", "loss, accuracy = model.evaluate(X_test, y_test)\n", "print(f\"Test Loss: {loss}\")\n", "print(f\"Test Accuracy: {accuracy}\")\n", "accuracy_scores['Glove With LSTM']=accuracy\n", "def plot_metrics(history):\n", " # Plot loss\n", " plt.figure(figsize=(12, 5))\n", " plt.subplot(1, 2, 1)\n", " plt.plot(history.history['loss'], label='Training Loss')\n", " plt.plot(history.history['val_loss'], label='Validation Loss')\n", " plt.title('Loss over Epochs')\n", " plt.xlabel('Epochs')\n", " plt.ylabel('Loss')\n", " plt.legend()\n", "\n", " # Plot accuracy\n", " plt.subplot(1, 2, 2)\n", " plt.plot(history.history['accuracy'], label='Training Accuracy')\n", " plt.plot(history.history['val_accuracy'], label='Validation Accuracy')\n", " plt.title('Accuracy over Epochs')\n", " plt.xlabel('Epochs')\n", " plt.ylabel('Accuracy')\n", " plt.legend()\n", "\n", " plt.tight_layout()\n", " plt.show()\n", "\n", "# Call the plot_metrics function\n", "plot_metrics(history)" ] }, { "cell_type": "code", "execution_count": 53, "id": "4eba3a75-8e33-4453-abaa-526413c02f4c", "metadata": {}, "outputs": [], "source": [ "ranked_models = sorted(accuracy_scores.items(), key=lambda x: x[1], reverse=True)" ] }, { "cell_type": "code", "execution_count": 54, "id": "a883f144-71ef-48c2-af43-00590f811235", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('Bag of N-grams with Naive Bayes', 0.8199737187910644),\n", " ('TFIDF with SVM', 0.80946123521682),\n", " ('Bag of Words with Naive Bayes', 0.8055190538764783),\n", " ('TFIDF with Naive Bayes', 0.8028909329829172),\n", " ('Bag Of N Grams With Logistic Regression', 0.8002628120893561),\n", " ('Tfidf With Logistic Regression', 0.7963206307490145),\n", " ('Bag of Words with Random Forest', 0.7963206307490145),\n", " ('Bag Of Word With Logistic Regression', 0.7910643889618922),\n", " ('Bag of N-grams with Random Forest', 0.7910643889618922),\n", " ('Bag of Words with SVM', 0.783180026281209),\n", " ('TFIDF with Random Forest', 0.7792378449408672),\n", " ('Word2Vec With SVM', 0.7792378449408672),\n", " ('Bag of N-grams with Decision Tree', 0.7660972404730617),\n", " ('GloVe with SVM', 0.7621550591327201),\n", " ('Glove With LSTM', 0.7582128643989563),\n", " ('Bag of Words with Decision Tree', 0.7503285151116952),\n", " ('TFIDF with Decision Tree', 0.7411300919842313),\n", " ('Word2Vec With Logistic Regression', 0.7293035479632063),\n", " ('GloVe with Random Forest', 0.7293035479632063),\n", " ('Word2Vec With RandomForest', 0.7148488830486203),\n", " ('GloVe With Logistic Regression', 0.7122207621550591),\n", " ('Word2Vec with Naive Bayes', 0.6727989487516426),\n", " ('GloVe with Naive Bayes', 0.668856767411301),\n", " ('GloVe with Decision Tree', 0.6386333771353482),\n", " ('Word2Vec With LSTM', 0.6084100008010864),\n", " ('Word2Vec With DecisionTree', 0.5781865965834428),\n", " ('Bag of N-grams with SVM', 0.5742444152431012)]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ranked_models" ] }, { "cell_type": "code", "execution_count": null, "id": "6e9e9a35-8788-451a-a1f8-d1b33506699b", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.15" } }, "nbformat": 4, "nbformat_minor": 5 }