|
|
|
|
|
""" |
|
Created on Sun Jul 17 06:45:41 PM EDT 2022 |
|
author: Ryan Hildebrandt, github.com/ryancahildebrandt |
|
""" |
|
|
|
import nltk |
|
import pandas as pd |
|
import random |
|
import re |
|
import streamlit as st |
|
import string |
|
|
|
from nltk.corpus import stopwords |
|
from nltk.stem import PorterStemmer |
|
from nltk.stem import WordNetLemmatizer |
|
from nltk.tokenize import word_tokenize |
|
from spellchecker import SpellChecker |
|
|
|
# Seed the standard-library RNG with a fixed value so that anything drawing
# from `random` behaves the same way on every run.
random.seed(42)

# Module-wide spell checker, constructed once at import time and reused by
# prep_spell for every token it corrects.
spell = SpellChecker()
|
|
|
@st.cache(persist=True)
def prep_load():
    """Download the NLTK resources used by the preprocessing helpers.

    Fetches, in order: the 'punkt' tokenizer models, the 'stopwords' corpus,
    the 'wordnet' corpus and the 'omw-1.4' corpus. The call is wrapped in
    Streamlit's persistent cache; returns None.

    NOTE(review): st.cache is deprecated in newer Streamlit releases --
    confirm against the pinned Streamlit version before upgrading.
    """
    for resource in ("punkt", "stopwords", "wordnet", "omw-1.4"):
        nltk.download(resource)
|
|
|
|
|
def prep_lower(in_text):
    """Lowercase every string in a collection.

    Args:
        in_text: Iterable of strings.

    Returns:
        A new list with the lowercased form of each item, in input order.
    """
    lowered = []
    for text in in_text:
        lowered.append(text.lower())
    return lowered
|
|
|
def prep_punct(in_text):
    """Remove ASCII punctuation from every string in a collection.

    Args:
        in_text: Iterable of strings.

    Returns:
        A new list where each string has every character listed in
        string.punctuation deleted; all other characters (including
        whitespace) are left untouched.
    """
    # The deletion table is identical for every string, so build it once
    # instead of once per element.
    strip_table = str.maketrans("", "", string.punctuation)
    return [text.translate(strip_table) for text in in_text]
|
|
|
def prep_stop(in_text):
    """Remove stopwords from every string in a collection.

    Each string is tokenized with word_tokenize, tokens present in NLTK's
    stopword corpus are dropped, and the remaining tokens are re-joined with
    single spaces. No language is passed to stopwords.words(), so the full
    list NLTK returns for that call is used. Matching is an exact,
    case-sensitive comparison.

    Args:
        in_text: Iterable of strings.

    Returns:
        A new list of stopword-filtered strings, in input order.
    """
    # Load the stopword list once per call and keep it in a set: the original
    # re-read the corpus and scanned it linearly for every single token.
    stop_set = set(stopwords.words())
    return [
        " ".join(token for token in word_tokenize(text) if token not in stop_set)
        for text in in_text
    ]
|
|
|
def prep_lemma(in_text):
    """Lemmatize every token of every string in a collection.

    Each string is tokenized with word_tokenize, each token is passed through
    WordNetLemmatizer.lemmatize with its default arguments, and the results
    are re-joined with single spaces.

    Args:
        in_text: Iterable of strings.

    Returns:
        A new list of lemmatized strings, in input order.
    """
    # One lemmatizer serves the whole call; constructing a new instance for
    # every token was redundant work.
    lemmatizer = WordNetLemmatizer()
    return [
        " ".join(lemmatizer.lemmatize(token) for token in word_tokenize(text))
        for text in in_text
    ]
|
|
|
def prep_stem(in_text):
    """Stem every token of every string in a collection.

    Each string is tokenized with word_tokenize, each token is reduced with
    PorterStemmer.stem, and the stems are re-joined with single spaces.

    Args:
        in_text: Iterable of strings.

    Returns:
        A new list of stemmed strings, in input order.
    """
    # One stemmer serves the whole call; constructing a new instance for
    # every token was redundant work.
    stemmer = PorterStemmer()
    return [
        " ".join(stemmer.stem(token) for token in word_tokenize(text))
        for text in in_text
    ]
|
|
|
def prep_spell(in_text):
    """Spell-correct every token of every string in a collection.

    Each string is tokenized with word_tokenize. Tokens that occur inside
    string.punctuation are kept verbatim; every other token is replaced by the
    module-level spell checker's suggested correction. Tokens are re-joined
    with single spaces.

    Args:
        in_text: Iterable of strings.

    Returns:
        A new list of corrected strings, in input order.
    """
    corrected_texts = []
    for text in in_text:
        corrected_tokens = []
        for token in word_tokenize(text):
            if token in string.punctuation:
                corrected_tokens.append(token)
            else:
                # correction() can return None when the checker has no
                # candidate for a token; fall back to the original token so
                # the join below never receives a non-string.
                corrected_tokens.append(spell.correction(token) or token)
        corrected_texts.append(" ".join(corrected_tokens))
    return corrected_texts
|
|
|
# Character class of single characters treated as clause boundaries:
# . ! \ / | , ? ; : _ - = +
# Written as a raw string (same runtime value as before) so the regex escapes
# are not interpreted as invalid string escapes.
clause_reg = r"[\.\!\\/\|,\?\;\:_\-=+]"

# Connective words that also mark a clause boundary.
clause_words = ["and", "about", "but", "so", "because", "since", "though", "although", "unless", "however", "until"]

# Full separator pattern: the punctuation class OR any connective word with a
# single space on each side. Every alternative is joined with "|" explicitly;
# previously the class was glued directly onto "and", so punctuation only
# split when immediately followed by "and ", and "until" lacked its trailing
# space delimiter.
clause_sep = "|".join([clause_reg, *(f" {word} " for word in clause_words)])
|
|
|
def prep_clause(in_text):
    """Split every string in a collection into clauses.

    Each string is split on the module-level clause_sep pattern
    (case-insensitively); the resulting fragments are stripped of surrounding
    whitespace and collected into one flat list.

    Args:
        in_text: Iterable of strings.

    Returns:
        A flat list of non-empty, stripped clause strings. Its length
        generally differs from the number of input strings.
    """
    clauses = []
    for text in in_text:
        for fragment in re.split(clause_sep, text, flags=re.IGNORECASE):
            # Strip before testing: a whitespace-only fragment (e.g. between
            # two adjacent separators) previously passed the emptiness check
            # and was emitted as an empty string.
            fragment = fragment.strip()
            if fragment:
                clauses.append(fragment)
    return clauses
|
|
|
def prep_ex(in_text, func):
    """Build a before/after table for one preprocessing function.

    Args:
        in_text: Collection of input strings.
        func: Callable applied once to the whole of in_text; its result
            becomes the "After" column.

    Returns:
        A pandas DataFrame with a "Before" column holding in_text and an
        "After" column holding func(in_text).
    """
    columns = {"Before": in_text, "After": func(in_text)}
    return pd.DataFrame.from_dict(columns)