# Data loading and preprocessing utilities for the news classifier.
import os
import re
from typing import Dict, Tuple
from warnings import filterwarnings
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from newsclassifier.config.config import Cfg, logger
from torch.utils.data import Dataset
from transformers import RobertaTokenizer
# Suppress all warnings globally for cleaner console output.
# NOTE(review): this also hides deprecation warnings — consider scoping it.
filterwarnings("ignore")
def load_dataset(filepath: str, print_i: int = 0) -> pd.DataFrame:
    """Read the raw dataset from a CSV file into a DataFrame.

    Args:
        filepath (str): Path of the CSV file to load.
        print_i (int): If non-zero, print the first `print_i` rows as a preview.

    Returns:
        pd.DataFrame: The loaded data.
    """
    logger.info("Loading Data.")
    data = pd.read_csv(filepath)
    if print_i:
        print(data.head(print_i), "\n")
    return data
def prepare_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Select the text/label columns and split off "Headlines" rows.

    Args:
        df (pd.DataFrame): Original dataframe with "Title" and "Category" columns.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]:
            df: All rows except category "Headlines", columns ("Text", "Category").
            headlines_df: Only the "Headlines" category rows, same columns.
    """
    logger.info("Preparing Data.")
    try:
        # Non-inplace rename on a fresh selection avoids pandas'
        # SettingWithCopyWarning (the original used inplace rename on a slice).
        df = df[["Title", "Category"]].rename(columns={"Title": "Text"})
        headlines_df = df[df["Category"] == "Headlines"].reset_index(drop=True)
        df = df[df["Category"] != "Headlines"].reset_index(drop=True)
    except Exception:
        # Bug fix: the original logged and fell through to `return`, which then
        # raised a confusing NameError because headlines_df was never bound.
        # Log with traceback and re-raise so callers see the real failure.
        logger.exception("Failed to prepare data.")
        raise
    return df, headlines_df
def clean_text(text: str) -> str:
    """Clean text: lower-case, remove stopwords, keep only alphanumerics.

    Args:
        text (str): Raw input text.

    Returns:
        str: Cleaned text with single spaces and no leading/trailing whitespace.
    """
    logger.info("Cleaning input text.")
    text = text.lower()  # necessary first: stopwords are stored lower case
    # Remove stopwords (assumes Cfg.STOPWORDS is an iterable of plain words —
    # words containing regex metacharacters would need re.escape).
    stp_pattern = re.compile(r"\b(" + r"|".join(Cfg.STOPWORDS) + r")\b\s*")
    text = stp_pattern.sub("", text)
    # Bug fix: replace non-alphanumerics BEFORE collapsing/stripping whitespace.
    # The original ran this step last, which could leave double or trailing
    # spaces in the result (e.g. "hello!" -> "hello ").
    text = re.sub("[^A-Za-z0-9]+", " ", text)
    text = re.sub(" +", " ", text)  # collapse repeated spaces
    return text.strip()  # drop any leading/trailing space left over
def preprocess(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Dict, Dict]:
    """Preprocess the data: split off headlines, clean text, label-encode categories.

    Args:
        df: Dataframe on which the preprocessing steps need to be performed.

    Returns:
        df: Preprocessed data with columns ("Text", "Category"), labels as ints.
        headlines_df: Rows of the "Headlines" category, separated out by prepare_data.
        class_to_index: class labels to indices mapping.
        index_to_class: indices to class labels mapping.
    """
    df, headlines_df = prepare_data(df)
    # Build the label vocabulary from the remaining (non-Headlines) categories.
    cats = df["Category"].unique().tolist()
    class_to_index = {tag: i for i, tag in enumerate(cats)}
    index_to_class = {v: k for k, v in class_to_index.items()}
    df["Text"] = df["Text"].apply(clean_text)  # clean text
    df = df[["Text", "Category"]]
    try:
        # NOTE: .map leaves NaN for unmapped values rather than raising;
        # the except below only catches unexpected failures.
        df["Category"] = df["Category"].map(class_to_index)  # label encoding
    except Exception as e:
        logger.error(e)
    return df, headlines_df, class_to_index, index_to_class
def data_split(df: pd.DataFrame, split_size: float = 0.2, stratify_on_target: bool = True, save_dfs: bool = False):
    """Split data into train and test sets.

    Args:
        df (pd.DataFrame): Data to be split.
        split_size (float): Fraction of rows assigned to the test split.
        stratify_on_target (bool): Whether to stratify the split on "Category".
        save_dfs (bool): Whether to save the splits under Cfg.preprocessed_data_path.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: (train_ds, test_ds) splits.

    Raises:
        Exception: re-raised after logging if splitting or saving fails.
    """
    logger.info("Splitting Data.")
    try:
        stra = df["Category"] if stratify_on_target else None
        # train_test_split on a DataFrame already returns DataFrames, so the
        # original's extra pd.DataFrame(...) re-wrapping was redundant.
        train_ds, test_ds = train_test_split(df, test_size=split_size, random_state=42, stratify=stra)
        if save_dfs:
            logger.info("Saving and storing data splits.")
            os.makedirs(Cfg.preprocessed_data_path, exist_ok=True)
            train_ds.to_csv(os.path.join(Cfg.preprocessed_data_path, "train.csv"))
            test_ds.to_csv(os.path.join(Cfg.preprocessed_data_path, "test.csv"))
    except Exception:
        # Bug fix: the original swallowed the error and then hit a NameError on
        # `return train_ds, test_ds`; re-raise so callers see the real failure.
        logger.exception("Failed to split data.")
        raise
    return train_ds, test_ds
def prepare_input(tokenizer: RobertaTokenizer, text: str) -> Dict:
    """Tokenize one text and convert the encoding to torch.long tensors.

    Args:
        tokenizer (RobertaTokenizer): The Roberta tokenizer to encode the input.
        text (str): The input text to be tokenized.

    Returns:
        inputs (dict): A dictionary containing the tokenized input with keys such as
            'input_ids', 'attention_mask', etc., each as a torch.long tensor.
    """
    logger.info("Tokenizing input text.")
    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,  # plain lists; converted to tensors below
        add_special_tokens=Cfg.add_special_tokens,
        max_length=Cfg.max_len,
        # `pad_to_max_length` is deprecated in transformers; map the boolean
        # config flag onto the replacement `padding` argument instead.
        padding="max_length" if Cfg.pad_to_max_length else False,
        truncation=Cfg.truncation,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs
class NewsDataset(Dataset):
    """Torch Dataset over the preprocessed ("Text", "Category") dataframe."""

    def __init__(self, ds):
        # Raw numpy arrays of texts and (already label-encoded) categories.
        self.texts = ds["Text"].values
        self.labels = ds["Category"].values
        # Perf fix: the original called RobertaTokenizer.from_pretrained inside
        # __getitem__, reloading the tokenizer from disk on EVERY item fetch.
        # Load it once here instead.
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.tokenizer, self.texts[item])
        # NOTE(review): float labels suggest a BCE-style loss; confirm that
        # torch.long is not required (CrossEntropyLoss expects long targets).
        labels = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, labels
def collate(inputs: Dict) -> Dict:
    """Trim a batch's tensors to the longest real (non-padded) sequence.

    Args:
        inputs (dict): Batch dict of (batch, seq_len) tensors, including an
            "attention_mask" marking real tokens with 1 and padding with 0.

    Returns:
        modified_inputs (dict): Same dict with every tensor truncated to the
            batch's maximum real sequence length.
    """
    # Bug fix: the original summed "input_ids" — token IDs, not 0/1 flags — so
    # max_len was a huge number and the truncation below was a no-op. The real
    # token count per row is the row-wise sum of the attention mask.
    max_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k in inputs:
        inputs[k] = inputs[k][:, :max_len]
    return inputs
if __name__ == "__main__":
    # Smoke-test the full preprocessing pipeline end to end.
    df = load_dataset(Cfg.dataset_loc)
    df, headlines_df, class_to_index, index_to_class = preprocess(df)
    print(df)
    print(class_to_index)
    train_ds, val_ds = data_split(df, save_dfs=True)
    dataset = NewsDataset(df)
    # Idiom fix: index the dataset instead of calling __getitem__ directly.
    print(dataset[0])