import os
import re
from typing import Dict, Tuple
from warnings import filterwarnings

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from newsclassifier.config.config import Cfg, logger
from torch.utils.data import Dataset
from transformers import RobertaTokenizer

filterwarnings("ignore")


def load_dataset(filepath: str, print_i: int = 0) -> pd.DataFrame:
    """load data from source into a Pandas DataFrame.

    Args:
        filepath (str): file location.
        print_i (int): number of rows to print (0 prints nothing).

    Returns:
        pd.DataFrame: Pandas DataFrame of the data.
    """
    logger.info("Loading Data.")
    df = pd.read_csv(filepath)
    if print_i:
        print(df.head(print_i), "\n")
    return df


def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Select the relevant features and separate out the "Headlines" instances.

    Args:
        df: original dataframe.

    Returns:
        df: new dataframe with the selected features.
        headlines_df: dataframe containing "Headlines" category instances.
    """
    logger.info("Preparing Data.")
    try:
        df = df[["Title", "Category"]].rename(columns={"Title": "Text"})
        headlines_df = df[df["Category"] == "Headlines"].reset_index(drop=True)
        df = df[df["Category"] != "Headlines"].reset_index(drop=True)
    except Exception as e:
        logger.error(e)
        raise

    return df, headlines_df


def clean_text(text: str) -> str:
    """Clean text (lower, puntuations removal, blank space removal)."""
    # lower case the text
    logger.info("Cleaning input text.")
    text = text.lower()  # necessary to do before as stopwords are in lower case

    # remove stopwords
    stp_pattern = re.compile(r"\b(" + r"|".join(Cfg.STOPWORDS) + r")\b\s*")
    text = stp_pattern.sub("", text)

    # custom cleaning
    text = text.strip()  # remove space at start or end if any
    text = re.sub(" +", " ", text)  # remove extra spaces
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove characters that are not alphanumeric

    return text
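
# Illustrative example (assumes "the" is among Cfg.STOPWORDS; the exact output
# depends on the configured stopword list):
#   clean_text("The Markets Rally!")  ->  "markets rally "
#   (a trailing space remains because punctuation is replaced with a space last)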


def preprocess(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Dict, Dict]:
    """Preprocess the data.

    Args:
        df: Dataframe on which the preprocessing steps need to be performed.

    Returns:
        df: preprocessed data.
        headlines_df: dataframe containing "Headlines" category instances.
        class_to_index: class labels to indices mapping.
        index_to_class: indices to class labels mapping.
    """
    df, headlines_df = prepare_data(df)

    cats = df["Category"].unique().tolist()
    class_to_index = {tag: i for i, tag in enumerate(cats)}
    index_to_class = {v: k for k, v in class_to_index.items()}

    df["Text"] = df["Text"].apply(clean_text)  # clean text
    df = df[["Text", "Category"]]
    try:
        df["Category"] = df["Category"].map(class_to_index)  # label encoding
    except Exception as e:
        logger.error(e)
    return df, headlines_df, class_to_index, index_to_class


def data_split(df: pd.DataFrame, split_size: float = 0.2, stratify_on_target: bool = True, save_dfs: bool = False):
    """Split data into train and test sets.

    Args:
        df (pd.DataFrame): Data to be split.
        split_size (float): fraction of the data used as the test set.
        stratify_on_target (bool): whether to stratify the split on the target column.
        save_dfs (bool): whether to save the dataset splits as artifacts.

    Returns:
        train and test DataFrames.
    """
    logger.info("Splitting Data.")
    try:
        if stratify_on_target:
            stra = df["Category"]
        else:
            stra = None

        train, test = train_test_split(df, test_size=split_size, random_state=42, stratify=stra)
        train_ds = pd.DataFrame(train, columns=df.columns)
        test_ds = pd.DataFrame(test, columns=df.columns)

        if save_dfs:
            logger.info("Saving and storing data splits.")

            os.makedirs(Cfg.preprocessed_data_path, exist_ok=True)
            train.to_csv(os.path.join(Cfg.preprocessed_data_path, "train.csv"))
            test.to_csv(os.path.join(Cfg.preprocessed_data_path, "test.csv"))
    except Exception as e:
        logger.error(e)
        raise

    return train_ds, test_ds


def prepare_input(tokenizer: RobertaTokenizer, text: str) -> Dict:
    """Tokenize and prepare the input text using the provided tokenizer.

    Args:
        tokenizer (RobertaTokenizer): The Roberta tokenizer to encode the input.
        text (str): The input text to be tokenized.

    Returns:
        inputs (dict): A dictionary containing the tokenized input with keys such as 'input_ids',
            'attention_mask', etc.
    """
    logger.info("Tokenizing input text.")
    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=Cfg.add_special_tokens,
        max_length=Cfg.max_len,
        pad_to_max_length=Cfg.pad_to_max_length,
        truncation=Cfg.truncation,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs
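
# Illustrative sketch of what prepare_input returns (the exact keys come from the
# tokenizer; values are torch.long tensors of length Cfg.max_len when padding to
# max length is enabled):
#   tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
#   prepare_input(tokenizer, "markets rally")
#   # -> {"input_ids": tensor([...]), "attention_mask": tensor([...])}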


class NewsDataset(Dataset):
    def __init__(self, ds):
        self.texts = ds["Text"].values
        self.labels = ds["Category"].values
        # Load the tokenizer once here instead of on every __getitem__ call.
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.tokenizer, self.texts[item])
        labels = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, labels


def collate(inputs: Dict) -> Dict:
    """Collate and modify the input dictionary to have the same sequence length for a particular input batch.

    Args:
        inputs (dict): A dictionary containing input tensors with varying sequence lengths.

    Returns:
        modified_inputs (dict): A modified dictionary with input tensors trimmed to have the same sequence length.
    """
    max_len = int(inputs["input_ids"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :max_len]
    return inputs
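
# Illustrative example: if a batch is padded to Cfg.max_len but its longest real
# sequence has 12 tokens (per the attention mask), collate() trims every tensor
# in the batch from shape [batch_size, Cfg.max_len] down to [batch_size, 12].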


if __name__ == "__main__":
    df = load_dataset(Cfg.dataset_loc)
    df, headlines_df, class_to_index, index_to_class = preprocess(df)
    print(df)
    print(class_to_index)
    train_ds, val_ds = data_split(df, save_dfs=True)
    dataset = NewsDataset(df)
    print(dataset[0])
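
    # Minimal sketch (not part of the original pipeline): batch the dataset with a
    # DataLoader and trim the padded batch with `collate`. The batch size of 8 is
    # an arbitrary choice for this example.
    from torch.utils.data import DataLoader

    loader = DataLoader(dataset, batch_size=8, shuffle=False)
    batch_inputs, batch_labels = next(iter(loader))
    batch_inputs = collate(batch_inputs)
    print({k: v.shape for k, v in batch_inputs.items()}, batch_labels.shape)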