In [2]:
!pip install geopy > delete.txt
!pip install datasets > delete.txt
!pip install torch torchvision datasets > delete.txt
!pip install huggingface_hub > delete.txt
!pip install pyhocon > delete.txt
!pip install transformers > delete.txt
!rm delete.txt

'rm' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
# Log in to the Hugging Face Hub through its command-line client.
# NOTE(review): the recorded output of this cell is `^C`, i.e. the command was
# interrupted rather than completed -- confirm whether a login is actually
# required for the dataset/model repositories used below.
!huggingface-cli login

^C


In [5]:
from datasets import load_dataset

# Load the "train" split of the CISProject/FOX_NBC dataset.
dataset_train = load_dataset("CISProject/FOX_NBC", split="train")
# NOTE(review): "path/to/test" looks like an unfilled placeholder rather than a
# real dataset id or local path -- confirm the intended source of the "test"
# split before running this cell.
dataset_test = load_dataset("path/to/test", split="test")

In [12]:
import numpy as np
import torch
from transformers import BertTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

def positional_encoding(seq_len, d_model):
    """Build the sinusoidal position table used for the ``pos_inputs`` feature.

    Entry ``[pos, c]`` of the table is::

        sin(pos / 10000 ** (2 * c / d_model))   if c is even
        cos(pos / 10000 ** (2 * c / d_model))   if c is odd

    NOTE(review): this is *not* the textbook Transformer encoding, where an
    even/odd column pair shares the exponent ``c_even / d_model``. The values
    are deliberately kept exactly as this notebook has always produced them,
    because a model trained on this table expects them -- confirm before
    switching to the canonical formula.

    The table is computed with vectorised NumPy operations instead of a
    Python double loop over every (position, column) cell.

    Args:
        seq_len: Number of positions (rows) in the table.
        d_model: Width of the encoding (columns); may be odd.

    Returns:
        A ``torch.float32`` tensor of shape ``(seq_len, d_model)``.
    """
    # Allocating first keeps NumPy's own validation of the requested shape.
    pos_enc = np.zeros((seq_len, d_model))
    if pos_enc.size == 0:
        return torch.tensor(pos_enc, dtype=torch.float)

    positions = np.arange(seq_len, dtype=np.float64)[:, np.newaxis]
    columns = np.arange(d_model)
    # One angular rate per column; every column uses its own index as exponent.
    angles = positions / np.power(10000.0, (2 * columns) / d_model)

    pos_enc[:, 0::2] = np.sin(angles[:, 0::2])  # even columns
    pos_enc[:, 1::2] = np.cos(angles[:, 1::2])  # odd columns
    return torch.tensor(pos_enc, dtype=torch.float)

def preprocess_data(data, mode="train", tfidf_vectorizer=None, max_tfidf_features=4096, max_seq_length=128, num_proc=4, pos_encoding_dim=512):
    """Turn raw headline rows into the feature columns consumed downstream.

    Every batch of rows is mapped to four new columns:

    * ``freq_inputs`` -- dense TF-IDF vector of the headline.
    * ``seq_inputs``  -- BERT ``input_ids`` and ``attention_mask`` stacked to
      shape ``(2, max_seq_length)`` per example.
    * ``pos_inputs``  -- the ``(max_seq_length, pos_encoding_dim)`` table from
      ``positional_encoding`` (identical for every example).
    * ``labels``      -- ``1.0`` when the ``"news"`` field equals ``"fox"``,
      otherwise ``0.0``.

    Args:
        data: Dataset exposing ``"title"`` and ``"news"`` columns and a
            ``map`` method (assumed to be a Hugging Face ``Dataset`` --
            TODO confirm).
        mode: ``"train"`` fits the TF-IDF vectorizer on ``data["title"]``;
            any other value only applies an already fitted vectorizer.
        tfidf_vectorizer: Vectorizer to use. Optional in train mode (a new
            one is created), required otherwise.
        max_tfidf_features: ``max_features`` for a newly created vectorizer.
        max_seq_length: Tokenizer padding/truncation length.
        num_proc: Number of worker processes handed to ``data.map``.
        pos_encoding_dim: Width of the positional-encoding table.

    Returns:
        ``(processed_data, tfidf_vectorizer)``.

    Raises:
        ValueError: If ``mode`` is not ``"train"`` and no vectorizer is given.
            Raised before any work starts instead of from inside a ``map``
            worker.
    """
    if mode != "train" and tfidf_vectorizer is None:
        raise ValueError("TF-IDF vectorizer must be provided in test mode.")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    # Fit TF-IDF only in train mode, creating the vectorizer on demand.
    if mode == "train":
        if tfidf_vectorizer is None:
            tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features)
        tfidf_vectorizer.fit(data["title"])
        print("TF-IDF vectorizer fitted on training data.")

    # The table depends only on the sequence length and width, so build it
    # once here rather than once per batch.
    pos_table = positional_encoding(max_seq_length, pos_encoding_dim)

    def process_batch(batch):
        headlines = batch["title"]
        agencies = batch["news"]

        # TF-IDF transformation (batch-wise); densified for tensor conversion.
        freq_inputs = tfidf_vectorizer.transform(headlines).toarray()

        # Tokenization (batch-wise), padded/truncated to a fixed length.
        tokenized = tokenizer(
            headlines,
            padding="max_length",
            truncation=True,
            max_length=max_seq_length,
            return_tensors="pt"
        )

        # Stack ids and mask along a new axis: (batch_size, 2, seq_len).
        seq_inputs = torch.stack(
            [tokenized["input_ids"], tokenized["attention_mask"]], dim=1
        )

        # Same positional table for every example in the batch.
        pos_inputs = pos_table.unsqueeze(0).expand(len(headlines), -1, -1)

        # Binary target: 1.0 for "fox", 0.0 for everything else.
        labels = [1.0 if agency == "fox" else 0.0 for agency in agencies]

        return {
            "freq_inputs": torch.tensor(freq_inputs),
            "seq_inputs": seq_inputs,
            "pos_inputs": pos_inputs,
            "labels": torch.tensor(labels),
        }

    # Use `map` with batching and parallelism
    processed_data = data.map(
        process_batch,
        batched=True,
        batch_size=32,
        num_proc=num_proc
    )

    return processed_data, tfidf_vectorizer

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Training split: fits a fresh TF-IDF vectorizer and returns it alongside the
# processed dataset so the very same vocabulary can be reused below.
dataset_train, tfidf_vectorizer = preprocess_data(
    dataset_train, mode="train", max_tfidf_features=8192, max_seq_length=128
)

# Test split: only applies the vectorizer fitted on the training data; the
# returned vectorizer is the one passed in, so it is discarded.
dataset_test, _ = preprocess_data(
    dataset_test,
    mode="test",
    tfidf_vectorizer=tfidf_vectorizer,
    max_tfidf_features=8192,
    max_seq_length=128,
)

In [None]:
# Load the pretrained classifier weights from the Hugging Face Hub repository
# "CISProject/News-Headline-Classifier-Notebook". The resulting `model` is the
# object evaluated further down, where it is called with a dict holding
# "freq_inputs", "seq_inputs" and "pos_inputs".
# NOTE(review): whether this repository needs extra `from_pretrained`
# arguments to load cannot be told from this notebook -- confirm.
from transformers import AutoModel
model = AutoModel.from_pretrained("CISProject/News-Headline-Classifier-Notebook")

In [1]:
from torch.utils.data import DataLoader

# Define a collate function to handle the batched data
def collate_fn(batch):
    """Collate per-example feature dicts into one batch for the model.

    Args:
        batch: Non-empty sequence of examples, each a mapping with the keys
            ``"freq_inputs"``, ``"seq_inputs"``, ``"pos_inputs"`` and
            ``"labels"``. Feature values may be nested lists, NumPy arrays or
            tensors; ``"labels"`` is a single number per example.

    Returns:
        ``(inputs, labels)`` where ``inputs`` maps each feature name to a
        tensor stacked along a new leading batch axis and ``labels`` is a
        ``torch.float32`` tensor of shape ``(len(batch),)``.
    """
    def stack_field(key):
        # `as_tensor` converts lists/arrays and passes existing tensors
        # through untouched, avoiding the copy-construct warning raised by
        # `torch.tensor(<tensor>)`.
        return torch.stack([torch.as_tensor(item[key]) for item in batch])

    inputs = {key: stack_field(key) for key in ("freq_inputs", "seq_inputs", "pos_inputs")}
    # Targets are always float32: the BCE-with-logits loss used for evaluation
    # rejects integer targets.
    labels = torch.tensor([float(item["labels"]) for item in batch], dtype=torch.float32)
    return inputs, labels

# Batch iterators over the processed splits. Only the training loader shuffles;
# both take their batch size from `config.train` and assemble batches with
# `collate_fn`.
train_loader = DataLoader(
    dataset_train,
    batch_size=config.train["batch_size"],
    shuffle=True,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    dataset_test,
    batch_size=config.train["batch_size"],
    shuffle=False,
    collate_fn=collate_fn,
)

# Run on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Binary cross-entropy computed directly on the model's raw logits.
criterion = torch.nn.BCEWithLogitsLoss()

def evaluate_model(model, val_loader, criterion, device="cuda"):
    """Evaluate ``model`` on ``val_loader`` and print mean loss and accuracy.

    Args:
        model: Module called with a dict holding ``"freq_inputs"``,
            ``"seq_inputs"`` and ``"pos_inputs"``; its output is treated as
            one logit per example.
        val_loader: Iterable of ``(batch_inputs, labels)`` pairs, where
            ``labels`` has shape ``(batch,)``.
        criterion: Loss taking ``(logits, targets)``. Its ``reduction``
            attribute (``"mean"`` when absent) decides how per-batch losses
            are combined into a per-sample average.
        device: Device the batches are moved to. A CUDA device is replaced by
            the CPU when CUDA is not available, so the default also works on
            CPU-only machines.

    Returns:
        None. Results are printed as ``Test Loss`` / ``Test Accuracy``; a
        notice is printed instead when the loader yields no samples.
    """
    # Progress bar is optional: fall back to plain iteration without tqdm.
    try:
        from tqdm.auto import tqdm as progress
    except ImportError:
        def progress(iterable, **_unused):
            return iterable

    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        device = torch.device("cpu")

    # A "sum"-reduced loss is already a per-batch total; a "mean"-reduced one
    # must be re-weighted by the batch size before averaging over samples.
    loss_is_summed = getattr(criterion, "reduction", "mean") == "sum"

    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch_inputs, labels in progress(val_loader, desc="Testing", leave=False):
            freq_inputs = batch_inputs["freq_inputs"].to(device)
            seq_inputs = batch_inputs["seq_inputs"].to(device)
            pos_inputs = batch_inputs["pos_inputs"].to(device)
            # (batch,) -> (batch, 1) float targets to line up with the logits.
            labels = labels[:, None].float().to(device)

            preds = model({"freq_inputs": freq_inputs, "seq_inputs": seq_inputs, "pos_inputs": pos_inputs})
            loss = criterion(preds, labels)

            batch_size = labels.size(0)
            val_loss += loss.item() if loss_is_summed else loss.item() * batch_size
            total += batch_size
            correct += ((torch.sigmoid(preds) > 0.5).float() == labels).sum().item()

    if total == 0:
        print("No samples to evaluate.")
        return

    print(f"Test Loss: {val_loss / total:.4f}")
    print(f"Test Accuracy: {correct / total:.4f}")


# Evaluate on the same device the model was moved to above; relying on the
# function's "cuda" default breaks on machines where `device` resolved to CPU.
evaluate_model(model, test_loader, criterion, device=device)
# Save the final model in Hugging Face format



ModuleNotFoundError: No module named 'torch'