Spaces:
Sleeping
Sleeping
edit preprocess
Browse files
app.py
CHANGED
@@ -29,6 +29,8 @@ nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner', 'tagger'])
|
|
29 |
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
|
30 |
|
31 |
OOV_INDEX = 0
|
|
|
|
|
32 |
|
33 |
def preprocess_text(text):
|
34 |
"""Preprocess the input text using SpaCy and return word indices."""
|
@@ -37,9 +39,8 @@ def preprocess_text(text):
|
|
37 |
for doc in docs:
|
38 |
for token in doc:
|
39 |
if token.pos_ != "PUNCT":
|
40 |
-
|
41 |
-
|
42 |
-
word_seq.append(index)
|
43 |
return word_seq
|
44 |
|
45 |
def classify_question(text):
|
|
|
29 |
nlp.vocab.add_flag(lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS, spacy.attrs.IS_STOP)
|
30 |
|
31 |
OOV_INDEX = 0
|
32 |
+
word_dict = {"<OOV>": OOV_INDEX} # OOV token at index 0.
|
33 |
+
word_index = 1
|
34 |
|
35 |
def preprocess_text(text):
|
36 |
"""Preprocess the input text using SpaCy and return word indices."""
|
|
|
39 |
for doc in docs:
|
40 |
for token in doc:
|
41 |
if token.pos_ != "PUNCT":
|
42 |
+
index = word_dict.get(token.text, OOV_INDEX)
|
43 |
+
word_seq.append(index)
|
|
|
44 |
return word_seq
|
45 |
|
46 |
def classify_question(text):
|