nataliegilbert committed on
Commit
4be6b95
·
verified ·
1 Parent(s): acc26a7

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +150 -0
README.md CHANGED
@@ -28,6 +28,16 @@ from torchvision.io import read_image
28
  from torch.utils.data import Dataset, DataLoader
29
  from sklearn.metrics import accuracy_score
30
  import numpy as np
 
 
 
 
 
 
 
 
 
 
31
  from transformers import DistilBertTokenizer, DistilBertModel</pre>
32
 
33
 
@@ -46,6 +56,146 @@ test_df = pd.read_csv(file_path)
46
  X_test = test_df['title']
47
  y_test = test_df['labels'] </pre>
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # Load the embedding model from Huggingface. Transformer: DistilBERT
50
 
51
 
 
28
  from torch.utils.data import Dataset, DataLoader
29
  from sklearn.metrics import accuracy_score
30
  import numpy as np
31
+ import pandas as pd
32
+ import numpy as np
33
+ import matplotlib.pyplot as plt
34
+ import seaborn as sns
35
+ import nltk
36
+ from nltk.corpus import stopwords
37
+ nltk.download('stopwords')
38
+ nltk.download('wordnet')
39
+
40
+ import re
41
  from transformers import DistilBertTokenizer, DistilBertModel</pre>
42
 
43
 
 
56
  X_test = test_df['title']
57
  y_test = test_df['labels'] </pre>
58
 
59
+ # Clean the data
60
+
61
+ <pre>
62
def clean_headlines(df, column_name):
    """
    Cleans the text of a specified column in a DataFrame.

    The steps are applied in this order:
    1. Remove <script>...</script> elements together with their content
    2. Remove any remaining HTML tags
    3. Replace the special characters & * | ~ ` ^ = _ + { } [ ] < > \\
       with a space
    4. Collapse runs of repeated '?' or '!' into a single character
    5. Collapse every run of whitespace (spaces, tabs, newlines) into one
       space and strip leading/trailing whitespace

    Missing values are left as they are. The column is overwritten on the
    DataFrame that was passed in, and that same DataFrame is returned.

    Args:
    df (pd.DataFrame): The DataFrame containing the column to clean
    column_name (str): The name of the column to clean

    Returns:
    pd.DataFrame: A DataFrame with the cleaned column
    """
    titles = df[column_name]

    # Scripts have to go before the generic tag rule: once the tags are
    # stripped there is no <script> left to match and the script body would
    # stay in the headline. (?is) = ignore case, let '.' span newlines.
    titles = titles.str.replace(r'(?is)<script.*?</script>', '', regex=True)

    # Remove HTML tags
    titles = titles.str.replace(r'<[^<]+?>', '', regex=True)

    # Replace special characters with a space
    titles = titles.str.replace(r'[&*|~`^=_+{}\[\]<>\\]', ' ', regex=True)

    # Collapse repeating '?' / '!' (e.g. "!!!" -> "!")
    titles = titles.str.replace(r'\?{2,}', '?', regex=True)
    titles = titles.str.replace(r'!{2,}', '!', regex=True)

    # Whitespace is normalised last so that the spaces introduced by the
    # replacements above are collapsed too; \s+ also covers tabs and newlines.
    titles = titles.str.replace(r'\s+', ' ', regex=True).str.strip()

    df[column_name] = titles
    return df
110
+
111
+ <pre>
112
def normalize_headlines(df, column_name):
    """
    Normalizes the headlines in a given column by:
    - converting them to lowercase
    - removing English stopwords (NLTK stopword list)
    - lemmatizing each remaining word with NLTK's WordNetLemmatizer

    Values that are not strings (for example missing values) are passed
    through unchanged instead of raising, so rows with missing headlines can
    still be dropped after this step. The column is overwritten on the
    DataFrame that was passed in, and that same DataFrame is returned.

    Args:
    df (pd.DataFrame): The DataFrame containing the column to clean
    column_name (str): The name of the column to clean

    Returns:
    pd.DataFrame: A DataFrame with the normalized column
    """
    # Build the stopword set and the lemmatizer once, not once per row
    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    def _normalize(headline):
        # NaN / None have no .split(); leave them for a later dropna
        if not isinstance(headline, str):
            return headline
        kept_words = (
            word for word in headline.lower().split()
            if word not in stop_words
        )
        # lemmatize() is called without a part-of-speech argument, so the
        # lemmatizer's default is used for every word
        return ' '.join(lemmatizer.lemmatize(word) for word in kept_words)

    df[column_name] = df[column_name].apply(_normalize)

    return df
139
+
140
+ <pre>
141
def handle_missing_data(df, column_name, min_word_count=3):
    """
    Handles missing or incomplete data in a given column of a DataFrame by:

    - Dropping rows whose value in the column is NULL
    - Dropping rows whose headline has fewer than `min_word_count`
      whitespace-separated words

    The input DataFrame is not modified; a filtered DataFrame with a fresh
    0..n-1 index is returned.

    Args:
    df (pd.DataFrame): The DataFrame containing the column to clean
    column_name (str): The name of the column to clean
    min_word_count (int): Minimum number of words a headline must have to
        be kept. Defaults to 3.

    Returns:
    pd.DataFrame: The DataFrame without the missing / too-short headlines
    """
    # Remove NULL headlines
    df = df.dropna(subset=[column_name])

    # Count words per headline; .str.len() yields NaN (instead of raising)
    # for entries that are not strings, and NaN fails the comparison below,
    # so such rows are filtered out as well.
    word_counts = df[column_name].str.split().str.len()

    # Filter out titles with fewer words
    df = df[word_counts >= min_word_count].reset_index(drop=True)

    return df
167
+
168
+ <pre>
169
def consistency_checks(df, column_name):
    """
    Ensures all headlines follow a consistent format by:
    - Removing duplicate headlines (the first occurrence is kept)

    The input DataFrame is not modified. The returned DataFrame gets a fresh
    0..n-1 index, matching what handle_missing_data returns, so the result
    has no index gaps where duplicates were removed.

    Args:
    df (pd.DataFrame): The DataFrame containing the column to check
    column_name (str): The name of the column to de-duplicate on

    Returns:
    pd.DataFrame: A DataFrame without duplicate headlines
    """
    # Remove duplicate headlines
    df = df.drop_duplicates(subset=[column_name]).reset_index(drop=True)

    return df
191
+
192
+ <pre>
193
# Run the preprocessing on the whole DataFrame rather than on the 'title'
# Series: the helpers select their input by column name, and filtering the
# frame (not just the headlines) keeps the labels aligned with the rows
# that survive.
test_df = test_df.dropna(subset=['title']).copy()
test_df = clean_headlines(test_df, 'title')
test_df = normalize_headlines(test_df, 'title')
test_df = handle_missing_data(test_df, 'title')
test_df = consistency_checks(test_df, 'title')

# Re-derive features and labels from the filtered frame
X_test = test_df['title']
y_test = test_df['labels']
198
+
199
  # Load the embedding model from Huggingface. Transformer: DistilBERT
200
 
201