yitingliii
committed on
Create data_cleaning.py
Browse files- data_cleaning.py +23 -0
data_cleaning.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
```python
|
2 |
+
def clean(df):
    """Normalize the ``title`` column of *df* and drop duplicate titles.

    Each title is stripped of HTML markup, reduced to ASCII letters, digits
    and whitespace, whitespace-collapsed, lower-cased, filtered against the
    NLTK English stop-word list, and lemmatized word by word.

    Missing titles (``None``/``NaN``) are treated as empty strings, and other
    non-text values are converted with ``str``, so a single bad cell does not
    abort the whole run.

    Relies on ``re``, ``stopwords``, ``WordNetLemmatizer`` and
    ``BeautifulSoup`` being available in the module namespace.

    Args:
        df: DataFrame with a ``title`` column. It is modified in place.

    Returns:
        The same ``df`` object, with ``title`` replaced by the cleaned text
        and rows whose cleaned title repeats an earlier row removed.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Compiled once so the per-headline loop only does the substitutions.
    non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
    whitespace = re.compile(r'\s+')

    titles = df['title']
    cleaned_headlines = []
    for headline, is_missing in zip(titles, titles.isna()):
        # BeautifulSoup rejects NaN/None and other non-text markup, so
        # normalize those cells before parsing; str/bytes pass through as-is.
        if is_missing:
            headline = ''
        elif not isinstance(headline, (str, bytes)):
            headline = str(headline)

        text = BeautifulSoup(headline, 'html.parser').get_text()
        text = non_alnum.sub('', text)
        text = whitespace.sub(' ', text).strip().lower()

        # Stop words are filtered on the raw token, before lemmatization.
        words = [
            lemmatizer.lemmatize(word)
            for word in text.split()
            if word not in stop_words
        ]
        cleaned_headlines.append(' '.join(words))

    df['title'] = cleaned_headlines
    # Deduplicate on the cleaned text; the original index labels are kept.
    df.drop_duplicates(subset=['title'], inplace=True)

    return df
|
23 |
+
```
|