Tymec committed
Commit e1645d7 · Parent: d29d6fe

Add slang map

Files changed (6):
  1. .gitattributes +3 -1
  2. .gitignore +2 -2
  3. README.md +3 -0
  4. app/constants.py +3 -0
  5. app/data.py +100 -3
  6. data/slang.json +0 -0
.gitattributes CHANGED
@@ -5,6 +5,7 @@
 # Hide from GitHub's language detection
 *.yaml linguist-documentation
 *.toml linguist-documentation
+*.json linguist-documentation
 
 # Remove assets from github statistics
 *.yaml linguist-vendored
@@ -12,10 +13,11 @@
 
 # Set the language for these files to ensure GitHub doesn't show the comments as errors
 .vscode/*.json linguist-language=JSON5
+data/* binary
 
 # Do not try and merge these files
 poetry.lock -diff
-*.ipynb -diff
+*.pkl -diff
 
 # LFS
 models/** filter=lfs diff=lfs merge=lfs -text
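Note that the built-in `binary` macro attribute expands to `-diff -merge -text`, so Git treats everything under `data/` as opaque blobs; this is why the viewer reports `data/slang.json` below as a binary file rather than showing its contents.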
.gitignore CHANGED
@@ -194,6 +194,6 @@ pyrightconfig.json
 # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python
 
 # Custom
-data/
-cache/
+data/*
+!data/slang.json
 flagged/
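The switch from `data/` to `data/*` is what makes the exception work: Git cannot re-include a file with `!` if a parent directory of that file is itself excluded, so the directory must stay unignored while its contents are matched individually.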
README.md CHANGED
@@ -138,6 +138,9 @@ python -m app evaluate --help
 | imdb50k | `data/imdb50k.csv` | | [IMDB Movie Reviews](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) |
 | test | `data/test.csv` | required for `evaluate` | [Multiclass Sentiment Analysis](https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset) |
 
+#### Used for text preprocessing
+- [Slang Map](https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing)
+
 
 ### Vectorizers
 | Option | Description | When to Use |
app/constants.py CHANGED
@@ -19,6 +19,9 @@ IMDB50K_URL = "https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-5
 TEST_DATASET_PATH = DATA_DIR / "test.csv"
 TEST_DATASET_URL = "https://huggingface.co/datasets/Sp1786/multiclass-sentiment-analysis-dataset"
 
+SLANGMAP_PATH = DATA_DIR / "slang.json"
+SLANGMAP_URL = "https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing"
+
 CACHE_DIR.mkdir(exist_ok=True, parents=True)
 DATA_DIR.mkdir(exist_ok=True, parents=True)
 MODEL_DIR.mkdir(exist_ok=True, parents=True)
app/data.py CHANGED
@@ -1,8 +1,12 @@
 from __future__ import annotations
 
 import bz2
+import json
+import re
+from functools import lru_cache
 from typing import TYPE_CHECKING, Literal, Sequence
 
+import emoji
 import pandas as pd
 import spacy
 from tqdm import tqdm
 
@@ -14,11 +18,15 @@ from app.constants import (
     IMDB50K_URL,
     SENTIMENT140_PATH,
     SENTIMENT140_URL,
+    SLANGMAP_PATH,
+    SLANGMAP_URL,
     TEST_DATASET_PATH,
     TEST_DATASET_URL,
 )
 
 if TYPE_CHECKING:
+    from re import Pattern
+
     from spacy.tokens import Doc
 
 __all__ = ["load_data", "tokenize"]
 
@@ -35,6 +43,81 @@
 nlp = spacy.load("en_core_web_sm")
 
 
+@lru_cache(maxsize=1)
+def slang() -> tuple[Pattern, dict[str, str]]:
+    """Compile a regex pattern and replacement mapping for slang terms.
+
+    Returns:
+        Slang pattern and mapping
+
+    Raises:
+        FileNotFoundError: If the slang mapping file is not found
+    """
+    if not SLANGMAP_PATH.exists():
+        msg = (
+            f"Slang mapping file not found at: '{SLANGMAP_PATH}'\n"
+            "Please download the file from:\n"
+            f"{SLANGMAP_URL}"
+        )
+        raise FileNotFoundError(msg)
+
+    with SLANGMAP_PATH.open() as f:
+        mapping = json.load(f)
+
+    return re.compile(r"\b(" + "|".join(map(re.escape, mapping.keys())) + r")\b"), mapping
+
+
+def _clean(text: str) -> str:
+    """Perform basic text cleaning.
+
+    Args:
+        text: Text to clean
+
+    Returns:
+        Cleaned text
+    """
+    # Make text lowercase
+    text = text.lower()
+
+    # Remove HTML tags
+    text = re.sub(r"<[^>]*>", "", text)
+
+    # Map slang terms to their expanded forms
+    slang_pattern, slang_mapping = slang()
+    text = slang_pattern.sub(lambda x: slang_mapping[x.group()], text)
+
+    # Remove acronyms and abbreviations
+    text = re.sub(r"(?:[a-z]\.?)(?:[a-z]\.)", "", text)
+
+    # Remove honorifics
+    text = re.sub(r"\b(?:mr|mrs|ms|dr|prof|sr|jr)\.?\b", "", text)
+
+    # Remove year abbreviations (e.g. "1980s", "80s")
+    text = re.sub(r"\b(?:\d{3}0|\d0)s?\b", "", text)
+
+    # Remove hashtags
+    text = re.sub(r"#[^\s]+", "", text)
+
+    # Replace mentions with a generic tag
+    text = re.sub(r"@[^\s]+", "user", text)
+
+    # Replace "X/Y" with "X or Y"
+    text = re.sub(r"\b([a-z]+)/([a-z]+)\b", r"\1 or \2", text)
+
+    # Convert emojis to text
+    text = emoji.demojize(text, delimiters=("emoji_", ""))
+
+    # Remove special characters
+    text = re.sub(r"[^a-z0-9\s]", "", text)
+
+    # EXTRA: imdb50k-specific cleaning
+    text = re.sub(r"mst3k", "", text)  # Very common acronym for Mystery Science Theater 3000
+
+    return text.strip()
+
+
 def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
     """Lemmatize the provided text using spaCy.
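Because the alternation is built from re.escape'd keys and wrapped in \b word boundaries, the pattern compiled by slang() only rewrites whole tokens (an "imo" entry will not touch the inside of "imho"). A minimal sketch of the substitution step, using two hypothetical entries in place of the real data/slang.json:

import re

# Hypothetical stand-ins for the Kaggle slang mapping shipped in data/slang.json
mapping = {"btw": "by the way", "imo": "in my opinion"}
pattern = re.compile(r"\b(" + "|".join(map(re.escape, mapping.keys())) + r")\b")

print(pattern.sub(lambda m: mapping[m.group()], "btw this movie rocks imo"))
# -> by the way this movie rocks in my opinion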
 
 
@@ -46,12 +129,15 @@ def _lemmatize(doc: Doc, threshold: int = 2) -> Sequence[str]:
         Sequence of lemmatized tokens
     """
     return [
-        token.lemma_.lower().strip()
+        tok
         for token in doc
         if not token.is_stop  # Ignore stop words
         and not token.is_punct  # Ignore punctuation
+        and not token.like_email  # Ignore email addresses
+        and not token.like_url  # Ignore URLs
+        and not token.like_num  # Ignore numbers
         and token.is_alpha  # Keep only alphabetic tokens
-        and not (len(token.lemma_) < threshold)  # Ignore short tokens
+        and not (len(tok := token.lemma_.lower().strip()) < threshold)  # Ignore short tokens
     ]
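The rewritten comprehension leans on an assignment expression: the walrus in the final condition binds tok, short-circuit evaluation guarantees it only runs for tokens that survived the earlier filters, and the element expression then reuses the bound name. The same shape in isolation:

# The := in the filter binds `norm` before the element expression reads it;
# items rejected by the length check never produce an element.
words = ["  The ", "a", " Movie  ", "i"]
print([norm for w in words if not (len(norm := w.lower().strip()) < 2)])
# -> ['the', 'movie']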
 
 
@@ -74,14 +160,25 @@ def tokenize(
     Returns:
         Tokenized text data
     """
+    text_data = [
+        _clean(text)
+        for text in tqdm(
+            text_data,
+            desc="Cleaning",
+            unit="doc",
+            disable=not show_progress,
+        )
+    ]
+
     return pd.Series(
         [
             _lemmatize(doc, character_threshold)
             for doc in tqdm(
                 nlp.pipe(text_data, batch_size=batch_size, n_process=n_jobs, disable=["parser", "ner", "tok2vec"]),
                 total=len(text_data),
-                disable=not show_progress,
+                desc="Lemmatization",
                 unit="doc",
+                disable=not show_progress,
             )
         ],
     )
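After this change the cleaning pass runs once over the raw strings before nlp.pipe, so spaCy only ever sees lowercased, de-slanged text. A hedged usage sketch; the keyword names below are inferred from the hunk above and may not match the full signature:

from app.data import tokenize

# Requires data/slang.json to be present (see below)
docs = ["I <b>loved</b> it btw!!!", "@critic42 it was awful imo"]
tokens = tokenize(docs, batch_size=32, n_jobs=1, character_threshold=2, show_progress=False)
print(tokens.iloc[0])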
data/slang.json ADDED
Binary file (6.24 kB).
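The viewer renders data/slang.json as binary, so its contents are not shown here. Judging by how slang() consumes it (json.load straight into a flat str-to-str dict), it should be a single JSON object mapping slang terms to expansions; a hypothetical stub for local smoke-testing:

import json
from pathlib import Path

# Hypothetical entries; the real file comes from the Kaggle notebook linked in README.md.
stub = {"afaik": "as far as i know", "gr8": "great"}
Path("data").mkdir(exist_ok=True)
(Path("data") / "slang.json").write_text(json.dumps(stub, indent=2))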