awngsz
/

baseline_model

Text Classification

Transformers

Joblib

bert

Model card Files Files and versions Community

nataliegilbert committed on Dec 14, 2024

Commit

4be6b95

verified ·

1 Parent(s): acc26a7

Update README.md

Browse files

Files changed (1) hide show

README.md +150 -0

README.md CHANGED Viewed

@@ -28,6 +28,16 @@ from torchvision.io import read_image
 from torch.utils.data import Dataset, DataLoader
 from sklearn.metrics import accuracy_score
 import numpy as np
 from transformers import DistilBertTokenizer, DistilBertModel</pre>
@@ -46,6 +56,146 @@ test_df = pd.read_csv(file_path)
 X_test = test_df['title']
 y_test = test_df['labels']  </pre>
 # Load the embedding model from Huggingface. Transformer: DistilBERT

 from torch.utils.data import Dataset, DataLoader
 from sklearn.metrics import accuracy_score
 import numpy as np
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+import nltk
+from nltk.corpus import stopwords
+nltk.download('stopwords')
+nltk.download('wordnet')
+import re
 from transformers import DistilBertTokenizer, DistilBertModel</pre>
 X_test = test_df['title']
 y_test = test_df['labels']  </pre>
+# Clean the data
+<pre>
+def clean_headlines(df, column_name):
+    """
+    Cleans a specified column in a DataFrame by:
+    - Removing HTML tags
+    - Removing <script> elements
+    - Removing extra spaces, trailing/leading whitespaces
+    - Removing special characters
+    - Removing repeating special characters
+    - Removing tabs
+    - Removing newline characters
+    - Removing specific punctuation: periods, commas, and parentheses
+    - Normalizing double quotes ("") to single quotes ('')
+    Args:
+        df (pd.DataFrame): The DataFrame containing the column to clean
+        column_name (str): The name of the column to clean
+    Returns:
+        pd.DataFrame: A DataFrame with the cleaned column
+    """
+    # Remove HTML tags
+    df[column_name] = df[column_name].str.replace(r'<[^<]+?>', '', regex=True)
+    # Remove scripts
+    df[column_name] = df[column_name].str.replace(r'<script.*?</script>', '', regex=True)
+    # Remove extra spaces including leading/trailing whitespaces
+    df[column_name] = df[column_name].str.strip().str.replace(r'\s+', ' ', regex=True)
+    # Remove special characters
+    df[column_name] = df[column_name].str.strip().str.replace(r'[&*|~`^=_+{}[\]<>\\]', ' ', regex=True)
+    # Remove repeating special characters
+    df[column_name] = df[column_name].str.strip().str.replace(r'([?!])\1+', r'\1', regex=True)
+    # Remove tabs
+    df[column_name] = df[column_name].str.replace(r'\t', ' ', regex=True)
+    # Remove newline characters
+    df[column_name] = df[column_name].str.replace(r'\n', ' ', regex=True)
+    # Normalize double quotes to single quotes
+    # df[column_name] = df[column_name].str.replace(r'"', "'", regex=True)
+    # Punctuation
+    # df[column_name] = df[column_name].str.replace(r'[.,()]', '', regex=True)
+    return df </pre>
+<pre>
+def normalize_headlines(df, column_name):
+  """
+    Normalizes a given headline by:
+    - converting it to lowercase
+    - removing stopwords
+    - applying stemming or lemmatization to reduce words to their base forms
+    Args:
+        df (pd.DataFrame): The DataFrame containing the column to clean
+        column_name (str): The name of the column to clean
+    Returns:
+        pd.DataFrame: A DataFrame with the cleaned column
+  """
+  # Convert headlines to lowercase
+  df[column_name] = df[column_name].str.lower()
+  # Remove stopwords from headline
+  stop_words = set(stopwords.words('english'))
+  df[column_name] = df[column_name].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
+  # Lemmatize words to base form
+  lemmatizer = nltk.stem.WordNetLemmatizer()
+  df[column_name] = df[column_name].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
+  return df </pre>
+<pre>
+def handle_missing_data(df, column_name):
+    """
+    Handles missing or incomplete data in a given column of a DataFrame, including:
+    - Replacing NULL values with "Unknown Headline"
+    - Augmenting the data by creating headlines with synonyms of words in other headlines
+    Args:
+        df (pd.DataFrame): The DataFrame containing the column to clean
+        column_name (str): The name of the column to clean
+    Returns:
+        pd.DataFrame: A DataFrame with the cleaned column
+    """
+    # Remove NULL headlines
+    df = df.dropna(subset=[column_name])
+    # Set a minimum word count threshold
+    min_word_count = 3
+    # Filter out titles with fewer words
+    df = df[df[column_name].str.split().apply(len) >= min_word_count].reset_index(drop=True)
+    return df </pre>
+<pre>
+def consistency_checks(df, column_name):
+  """
+    Ensures all headlines follow a consistent format by:
+      - Removing duplicate headlines
+    Args:
+        df (pd.DataFrame): The DataFrame containing the column to clean
+        column_name (str): The name of the column to clean
+    Returns:
+        pd.DataFrame: A DataFrame with the cleaned column
+  """
+  # Remove duplicate headlines
+  df = df.drop_duplicates(subset=[column_name])
+  # Filter headlines with too few or too many words
+  #df = df[df['title'].str.split().apply(len).between(3, 20)]
+  return df </pre>
+<pre>
+X_test = clean_headlines(X_test, 'title')
+X_test = normalize_headlines(X_test, 'title')
+X_test = X_test.dropna(subset = ['title'])
+X_test = handle_missing_data(X_test, 'title')
+X_test = consistency_checks(X_test, 'title') </pre>
 # Load the embedding model from Huggingface. Transformer: DistilBERT