Spaces:
Runtime error
Runtime error
Commit
·
c4283fb
1
Parent(s):
788794e
Update app.py (#7)
Browse files- Update app.py (7567b0698791efa80ae3a76135960428391fc3bf)
Co-authored-by: Bilal Sardar <[email protected]>
app.py
CHANGED
@@ -7,6 +7,7 @@ import nltk
|
|
7 |
from nltk.stem import WordNetLemmatizer
|
8 |
from nltk.tokenize import word_tokenize
|
9 |
from nltk.corpus import wordnet
|
|
|
10 |
|
11 |
nltk.download('punkt')
|
12 |
nltk.download('wordnet')
|
@@ -22,22 +23,34 @@ def get_wordnet_pos(tag):
|
|
22 |
elif tag.startswith('R'):
|
23 |
return wordnet.ADV
|
24 |
else:
|
25 |
-
return wordnet.NOUN
|
|
|
26 |
|
27 |
def get_lemma(word):
|
|
|
|
|
28 |
lemmatizer = WordNetLemmatizer()
|
29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
tokens = word_tokenize(word)
|
31 |
-
tagged_words =
|
32 |
lemmas = []
|
33 |
for tagged_word in tagged_words:
|
34 |
word = tagged_word[0]
|
35 |
pos = tagged_word[1]
|
36 |
wordnet_pos = get_wordnet_pos(pos)
|
37 |
-
|
|
|
|
|
|
|
38 |
lemmas.append(lemma)
|
39 |
return ' '.join(lemmas)
|
40 |
-
|
41 |
def apply_lemma_to_string(sentence):
|
42 |
words = word_tokenize(sentence)
|
43 |
lemmas = [get_lemma(word) for word in words]
|
|
|
7 |
from nltk.stem import WordNetLemmatizer
|
8 |
from nltk.tokenize import word_tokenize
|
9 |
from nltk.corpus import wordnet
|
10 |
+
from nltk.tag import pos_tag
|
11 |
|
12 |
nltk.download('punkt')
|
13 |
nltk.download('wordnet')
|
|
|
23 |
elif tag.startswith('R'):
|
24 |
return wordnet.ADV
|
25 |
else:
|
26 |
+
return wordnet.NOUN
|
27 |
+
|
28 |
|
29 |
def get_lemma(word):
    """Lemmatize each token of *word* and return them space-joined.

    The input string is tokenized, POS-tagged, and each token is
    lemmatized with the WordNet part of speech derived from its tag
    (via ``get_wordnet_pos``). A small exception table preserves common
    auxiliary verbs ('are', 'have', 'do', 'am') that the lemmatizer
    would otherwise reduce to their base forms ('be', etc.).

    Parameters
    ----------
    word : str
        Text to lemmatize; may contain one or several tokens.

    Returns
    -------
    str
        The lemmas joined by single spaces.
    """
    # Fetch required NLTK data only once per process. The original called
    # nltk.download() on every invocation, which re-checks the data path
    # (and may hit the network) each time — pure overhead in a hot path.
    if not getattr(get_lemma, "_nltk_data_ready", False):
        nltk.download('averaged_perceptron_tagger')
        nltk.download('wordnet')
        get_lemma._nltk_data_ready = True

    lemmatizer = WordNetLemmatizer()

    # Tokens returned unchanged instead of being lemmatized.
    exceptions = {
        'are': 'are',   # Preserve 'are' as-is
        'have': 'have', # Preserve 'have' as-is
        'do': 'do',     # Preserve 'do' as-is
        'am': 'am',
    }

    tokens = word_tokenize(word)
    tagged_words = pos_tag(tokens)

    lemmas = []
    # Unpack (token, tag) directly; the original indexed tagged_word[0]/[1]
    # and rebound the name `word`, shadowing the function parameter.
    for token, tag in tagged_words:
        if token in exceptions:
            lemma = exceptions[token]
        else:
            lemma = lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        lemmas.append(lemma)
    return ' '.join(lemmas)
|
|
|
54 |
def apply_lemma_to_string(sentence):
|
55 |
words = word_tokenize(sentence)
|
56 |
lemmas = [get_lemma(word) for word in words]
|