Spaces:

Anuj02003
/

Spam_Detection_Using_LLM

Sleeping

App Files Files Community

Anuj02003 committed on Nov 22, 2024

Commit

a980cfd

verified ·

1 Parent(s): d838e51

Upload 3 files

Browse files

Files changed (3) hide show

app.py +36 -0
fine_tune.py +84 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,36 @@

+import streamlit as st
+from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
+import torch
+# Set page configuration as the very first Streamlit command
+st.set_page_config(page_title="Spam Detection", page_icon="📧")
+# Load fine-tuned model and tokenizer
+model = DistilBertForSequenceClassification.from_pretrained("./fine_tuned_model")
+tokenizer = DistilBertTokenizerFast.from_pretrained("./fine_tuned_model")
+# Function to predict whether a message is spam or not
+def predict_spam(text):
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits
+        prediction = torch.argmax(logits, dim=-1).item()
+    return "Spam" if prediction == 1 else "Not Spam"
+def main():
+    st.title("Spam Detection")
+    st.write("This is a Spam Detection App using a fine-tuned DistilBERT model.")
+    # Input text box for the user
+    message = st.text_area("Enter message to classify as spam or not:")
+    if st.button("Predict"):
+        if message:
+            prediction = predict_spam(message)
+            st.write(f"The message is: {prediction}")
+        else:
+            st.write("Please enter a message to classify.")
+if __name__ == "__main__":
+    main()

fine_tune.py ADDED Viewed

	@@ -0,0 +1,84 @@

+from datasets import load_dataset
+from transformers import DistilBertForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
+import torch
+from sklearn.metrics import accuracy_score
+# Load the dataset
+dataset = load_dataset("sms_spam")
+# Print the dataset structure and inspect the columns
+print(dataset)
+print(dataset['train'][0])  # Print the first row of the 'train' split
+# Initialize the tokenizer
+tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+# Initialize the model
+model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
+# Tokenize the dataset using the correct column
+def tokenize_function(examples):
+    return tokenizer(examples["sms"], padding="max_length", truncation=True)
+# Apply the tokenization to the dataset
+tokenized_datasets = dataset.map(tokenize_function, batched=True)
+# Check if 'test' split exists, else use 'validation' or create your own split
+train_dataset = tokenized_datasets["train"]
+# If there is no 'test' split, you can use 'validation' or manually split the dataset
+eval_dataset = tokenized_datasets.get("test", tokenized_datasets.get("validation"))
+# If neither 'test' nor 'validation' exists, manually split the dataset
+if eval_dataset is None:
+    eval_dataset = train_dataset.shuffle(seed=42).select([i for i in range(len(train_dataset)//10)])  # Take 10% as eval dataset
+    train_dataset = train_dataset.select([i for i in range(len(train_dataset)//10, len(train_dataset))])  # Take the remaining 90% as train dataset
+# Set up training arguments
+# Set up training arguments
+training_args = TrainingArguments(
+    output_dir="./results",
+    evaluation_strategy="steps",  # Evaluate every 'eval_steps'
+    save_strategy="steps",  # Save every 'save_steps'
+    eval_steps=500,  # Evaluate every 500 steps
+    save_steps=500,  # Save every 500 steps
+    learning_rate=2e-5,
+    per_device_train_batch_size=16,
+    per_device_eval_batch_size=64,
+    num_train_epochs=3,
+    weight_decay=0.01,
+    logging_dir="./logs",
+    logging_steps=10,
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy",
+)
+# Define compute_metrics function (optional, if you want to track metrics)
+def compute_metrics(p):
+    predictions, labels = p
+    preds = predictions.argmax(axis=1)
+    return {"accuracy": accuracy_score(labels, preds)}
+# Initialize the Trainer
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    compute_metrics=compute_metrics,  # Optional: to compute accuracy
+)
+# Train the model
+trainer.train()
+# Save the model after training
+model.save_pretrained("./fine_tuned_model")
+tokenizer.save_pretrained("./fine_tuned_model")
+# Optionally, push the model to Hugging Face Hub
+# from huggingface_hub import HfApi, HfFolder
+# model.push_to_hub("Anuj02003/Spam-classification-using-LLM")
+# tokenizer.push_to_hub("Anuj02003/Spam-classification-using-LLM")

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+torch==2.5.1
+torchvision
+torchaudio
+transformers==4.33.0
+datasets==2.14.0
+streamlit==1.24.0
+huggingface_hub
+# fine_tune.py imports sklearn.metrics.accuracy_score
+scikit-learn
+# fine_tune.py uses transformers' Trainer, which needs accelerate for PyTorch training.
+# NOTE(review): the upper bound is conservative for transformers 4.33 - confirm before relaxing.
+accelerate>=0.20.1,<1.0