{ "cells": [ { "cell_type": "markdown", "id": "30487fe4-5659-41fa-b2a7-7ca9d677b169", "metadata": {}, "source": [ "# Prediction:" ] }, { "cell_type": "code", "execution_count": 34, "id": "86ad082a-d600-493f-aac4-310e520cfe84", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package punkt to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package punkt is already up-to-date!\n", "[nltk_data] Downloading package wordnet to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package omw-1.4 to\n", "[nltk_data] C:\\Users\\26656\\AppData\\Roaming\\nltk_data...\n", "[nltk_data] Package omw-1.4 is already up-to-date!\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Accuracy of the loaded model: 0.4909329829172142\n" ] } ], "source": [ "import pandas as pd\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "from string import punctuation\n", "import re\n", "import joblib\n", "from sklearn.metrics import accuracy_score\n", "\n", "# Load data\n", "df = pd.read_csv('./data.csv', usecols=['title', 'news'])\n", "\n", "# Download necessary NLTK data\n", "nltk.download('stopwords')\n", "nltk.download('punkt')\n", "nltk.download('wordnet')\n", "nltk.download('omw-1.4')\n", "\n", "# Define stop words\n", "stop_words = set(stopwords.words('english'))\n", "\n", "# Text cleaning function\n", "def clean_text(text):\n", " # Tokenize and lowercase\n", " words = word_tokenize(text.lower())\n", " # Remove stop words, punctuation, and digits\n", " words = [\n", " word for word in words\n", " if word not in stop_words and word not in punctuation and not re.search(r'\\d', word)\n", " ]\n", " # Rejoin words\n", " return ' '.join(words)\n", "\n", "# Apply cleaning function to the 'title' column\n", "df['title'] = df['title'].apply(clean_text)\n", "\n", "# Define features (X) and labels (y)\n", "X = df['title'] # Use the cleaned titles as features\n", "y = df['news'].apply(lambda x: 1 if x == 'fox' else 0) # Convert 'news' to binary labels\n", "\n", "# Load the saved vectorizer and model\n", "loaded_vectorizer = joblib.load('vectorizer_bong.pkl')\n", "loaded_model = joblib.load('naive_bayes_model.pkl')\n", "\n", "# Transform the text data using the loaded vectorizer\n", "X_test = loaded_vectorizer.transform(X)\n", "\n", "# Use the loaded model to make predictions\n", "y_pred_loaded = loaded_model.predict(X_test)\n", "\n", "# Evaluate the model\n", "accuracy = accuracy_score(y, y_pred_loaded)\n", "print(\"Accuracy of the loaded model:\", accuracy)" ] }, { "cell_type": "markdown", "id": "eda06eb9-cde6-4370-b189-3d06ffe505c8", "metadata": {}, "source": [ "## Data Processing" ] }, { "cell_type": "code", "execution_count": 35, "id": "23e35ed8-027e-42f8-8902-43591ef7e34e", "metadata": {}, "outputs": [], "source": [ "!pip install geopy > delete.txt\n", "!pip install datasets > delete.txt\n", "!pip install torch torchvision datasets > delete.txt\n", "!pip install huggingface_hub > delete.txt\n", "!pip install pyhocon > delete.txt\n", "!pip install transformers > delete.txt\n", "!rm delete.txt" ] }, { "cell_type": "code", "execution_count": 36, "id": "8d91a73a-cde9-48a6-9d2a-83715ad64a12", "metadata": {}, "outputs": [], "source": [ "%matplotlib inline\n", "import warnings\n", "warnings.filterwarnings(\"ignore\") \n", "from sklearn.model_selection import train_test_split\n", "import numpy as np\n", "import pandas as pd\n", "import re\n", "from datetime import datetime\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "\n", "from string import punctuation\n", "from nltk.corpus import stopwords\n", "from nltk.tokenize import word_tokenize\n", "import nltk\n", "pd.options.display.max_colwidth = None" ] }, { "cell_type": "code", "execution_count": 37, "id": "d6e65e5d-df2b-4db6-b866-6c473ed85919", "metadata": {}, "outputs": [], "source": [ "df=pd.read_csv('./data.csv',usecols=['title', 'news'])" ] }, { "cell_type": "code", "execution_count": 38, "id": "6c291504-9f0c-46cc-aaee-9ce274ccaafc", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | title | \n", "news | \n", "
---|---|---|
0 | \n", "Jack Carr recalls Gen. Eisenhower's D-Day memo about 'great and noble undertaking' | \n", "fox | \n", "
1 | \n", "Bruce Willis, Demi Moore avoided doing one thing while co-parenting, daughter says | \n", "fox | \n", "
2 | \n", "Blinken meets Qatar PM, says Israeli actions are not 'retaliation,' but 'defending the lives of its people' | \n", "fox | \n", "
3 | \n", "Emily Blunt says her ‘toes curl’ when people tell her their kids want to act: 'I want to say, don’t do it!' | \n", "fox | \n", "
4 | \n", "'The View' co-host, CNN commentator Ana Navarro to host night 2 of Democratic National Convention | \n", "fox | \n", "
\n", " | title | \n", "news | \n", "
---|---|---|
0 | \n", "jack carr recalls gen. eisenhower 's d-day memo 'great noble undertaking | \n", "fox | \n", "
1 | \n", "bruce willis demi moore avoided one thing co-parenting daughter says | \n", "fox | \n", "
2 | \n", "blinken meets qatar pm says israeli actions 'retaliation 'defending lives people | \n", "fox | \n", "
3 | \n", "emily blunt says ‘ toes curl ’ people tell kids want act want say ’ | \n", "fox | \n", "
4 | \n", "'the view co-host cnn commentator ana navarro host night democratic national convention | \n", "fox | \n", "
... | \n", "... | \n", "... | \n", "
3800 | \n", "trump 's lawyers seek post-election day delay court fight immunity decision fallout interference case | \n", "nbc | \n", "
3801 | \n", "treat acne scars hyperpigmentation according experts | \n", "nbc | \n", "
3802 | \n", "best vegetarian vegan meal delivery services according experts | \n", "nbc | \n", "
3803 | \n", "trump says presidential civilian award 'better top military honor whose recipients 'dead 'hit bullets | \n", "nbc | \n", "
3804 | \n", "best white elephant secret santa gift ideas | \n", "nbc | \n", "
3805 rows × 2 columns
\n", "