#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 17 06:45:41 PM EDT 2022 
author: Ryan Hildebrandt, github.com/ryancahildebrandt
"""
# imports
import nltk
import pandas as pd
import random
import re
import streamlit as st
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

random.seed(42)

spell = SpellChecker()

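# fetch the NLTK data used below (tokenizer models, stopword lists, WordNet);
# st.cache keeps Streamlit reruns from repeating the downloads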
@st.cache(persist = True)
def prep_load():
  nltk.download('punkt')
  nltk.download('stopwords')
  nltk.download('wordnet')
  nltk.download('omw-1.4')


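# each prep_* helper takes a list of documents (strings) and returns a list of processed strings,
# so individual steps can be compared side by side via prep_ex below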
def prep_lower(in_text):
  return [i.lower() for i in in_text]

def prep_punct(in_text):
  return [i.translate(str.maketrans('', '', string.punctuation)) for i in in_text]

def prep_stop(in_text):
  # build the stopword set once per call; assumes English input, in line with the WordNet/Porter steps below
  stop_words = set(stopwords.words("english"))
  t = []
  for i in in_text:
    t.append(" ".join([j for j in word_tokenize(i) if j not in stop_words]))
  return t

def prep_lemma(in_text):
  lemmatizer = WordNetLemmatizer()  # instantiate once instead of once per token
  t = []
  for i in in_text:
    t.append(" ".join([lemmatizer.lemmatize(k) for k in word_tokenize(i)]))
  return t

def prep_stem(in_text):
  stemmer = PorterStemmer()  # instantiate once instead of once per token
  t = []
  for i in in_text:
    t.append(" ".join([stemmer.stem(k) for k in word_tokenize(i)]))
  return t

def prep_spell(in_text):
  t = []
  for i in in_text:
    # keep punctuation as-is; fall back to the original token if the spell checker has no suggestion
    t.append(" ".join([j if j in string.punctuation else (spell.correction(j) or j) for j in word_tokenize(i)]))
  return t

# clause boundaries: punctuation-like separators plus conjunction-style linking words
clause_reg = r"[\.\!\\\/\|,\?\;\:_\-=+]"
clause_words = ["and", "about", "but", "so", "because", "since", "though", "although", "unless", "however", "until"]
# split on any boundary character, or on a boundary word surrounded by spaces
clause_sep = f"{clause_reg}| {' | '.join(clause_words)} "

def prep_clause(in_text):
  t = []
  for i in in_text:
    for j in re.split(clause_sep, i, flags = re.IGNORECASE):
      j = j.strip()
      if j != "":
        t.append(j)
  return t

def prep_ex(in_text, func):
  out = pd.DataFrame.from_dict({"Before" : in_text, "After" : func(in_text)})
  return out
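
# illustrative only: a minimal sketch of how these helpers might be wired up in the Streamlit demo;
# the sample sentences and widget layout here are assumptions, not part of the original app
if __name__ == "__main__":
  prep_load()
  sample = [
    "The quick brown fox jumped over the lazy dog, because it was bored.",
    "Spellling mistakes and PUNCTUATION!! get cleaned up by the prep functions.",
    ]
  step = st.selectbox("Preprocessing step", ["lower", "punct", "stop", "lemma", "stem", "spell"])
  func = {
    "lower": prep_lower,
    "punct": prep_punct,
    "stop": prep_stop,
    "lemma": prep_lemma,
    "stem": prep_stem,
    "spell": prep_spell,
    }[step]
  st.dataframe(prep_ex(sample, func))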