|
|
|
""" |
|
Created on Fri Jun 12 15:26:44 2020 |
|
|
|
@author: luol2 |
|
""" |
|
|
|
import nltk |
|
from nltk.stem import WordNetLemmatizer |
|
from nltk.corpus import wordnet |
|
from nltk.stem.porter import PorterStemmer |
|
lemmatizer = WordNetLemmatizer() |
|
stemmer = PorterStemmer() |
|
import io |
|
|
|
def get_wordnet_pos(treebank_tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant.

    Tags starting with 'J' map to adjectives, 'V' to verbs, 'N' to nouns,
    and 'R' (plus the exact tag 'IN') to adverbs.  Any other tag falls back
    to the WordNet noun constant.
    """
    # 'IN' begins with 'I', so it needs its own check before the prefix table.
    if treebank_tag == 'IN':
        return wordnet.ADV
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # An empty tag yields '' here, which misses the table and returns the
    # noun fallback — same as the original elif chain.
    return prefix_to_pos.get(treebank_tag[:1], wordnet.NOUN)
|
|
|
def ssplit_token_pos_lemma(in_text):
    """Sentence-split, tokenize, POS-tag, lemmatize and stem *in_text*.

    Hyphens and slashes are padded with spaces first so they become
    standalone tokens.  The result is one line per token in the form
    ``token<TAB>lemma<TAB>stem<TAB>pos``, with a blank line terminating
    each sentence, returned as a single string.
    """
    padded = in_text.strip().replace('-', ' - ').replace('/', ' / ')

    out_lines = []
    for raw_sentence in nltk.sent_tokenize(padded):
        tagged_tokens = nltk.pos_tag(nltk.word_tokenize(raw_sentence))
        for word, tag in tagged_tokens:
            lowered = word.lower()
            # Lemmatize with the WordNet POS derived from the Treebank tag;
            # the Porter stem is computed from the same lowercased form.
            lemma = lemmatizer.lemmatize(lowered, get_wordnet_pos(tag))
            stem = stemmer.stem(lowered)
            out_lines.append(word + '\t' + lemma + '\t' + stem + '\t' + tag + '\n')
        # Blank line marks the sentence boundary in the output.
        out_lines.append('\n')

    return ''.join(out_lines)