import logging
import pandas as pd
import numpy as np
import string
import nltk
import spacy
import en_core_web_sm
import re
import streamlit as st
from haystack.nodes import PreProcessor
'''basic cleaning - suitable for transformer models'''
def basic(s, SDG=False):
    """
    :param s: string to be processed
    :param SDG: if True, apply heavier cleaning (lowercasing, punctuation,
        newline, quote, digit and non-word removal), used by preprocessingForSDG
    :return: processed string; see comments in the source code for more info
    """
    # Text lowercase
    #s = s.lower()
    # Remove punctuation
    #translator = str.maketrans(' ', ' ', string.punctuation)
    #s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    if SDG:
        # heavier cleaning: lowercase, strip punctuation, newlines,
        # single quotes, digits and remaining non-word characters
        s = s.lower()
        translator = str.maketrans(' ', ' ', string.punctuation)
        s = s.translate(translator)
        s = re.sub('\n', ' ', s)
        s = re.sub("\'", " ", s)
        s = re.sub(r'\d+', ' ', s)
        s = re.sub(r'\W+', ' ', s)
    # Remove new line characters
    #s = re.sub('\n', ' ', s)
    # Remove distracting single quotes
    #s = re.sub("\'", " ", s)
    # Remove all remaining numbers and non-alphanumeric characters
    #s = re.sub(r'\d+', ' ', s)
    #s = re.sub(r'\W+', ' ', s)
    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    return s.strip()
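
# Illustrative usage of basic(); the sample text below is hypothetical and not
# part of the app's data:
#
#   raw = "Visit https://example.org for the 2030 Agenda.\nIt's ambitious."
#   basic(raw)            # URL stripped, punctuation and case kept
#   basic(raw, SDG=True)  # roughly "visit for the agenda its ambitious"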
def preprocessingForSDG(document):
    """
    Takes a list of haystack Document objects, splits each into paragraphs of
    roughly 120 words and applies simple cleaning (basic with SDG=True).
    Returns the cleaned list of haystack Document objects, one paragraph per
    object, plus a pandas DataFrame, all text joined into a single string and
    a list of the paragraphs.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=False,
        #split_overlap=1
    )
    # collect the paragraphs of every input document, not just the last one
    docs_processed = []
    for i in document:
        docs_processed += preprocessor.process([i])
    for item in docs_processed:
        item.content = basic(item.content, SDG=True)

    with st.spinner("document is being split into paragraphs"):
        logging.info("document has been split into {} paragraphs".format(len(docs_processed)))
        # create dataframe of text and list of all text
        df = pd.DataFrame(docs_processed)
        all_text = " ".join(df.content.to_list())
        par_list = df.content.to_list()

    return docs_processed, df, all_text, par_list
def preprocessing(document):
    """
    Takes a list of haystack Document objects, splits each into passages of
    three sentences (with an overlap of one sentence) and applies simple
    cleaning. Returns the cleaned list of haystack Document objects, one
    passage per object, plus a pandas DataFrame, all text joined into a
    single string and a list of the passages.
    """
    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="sentence",
        split_length=3,
        split_respect_sentence_boundary=False,
        split_overlap=1
    )
    # collect the passages of every input document, not just the last one
    docs_processed = []
    for i in document:
        docs_processed += preprocessor.process([i])
    for item in docs_processed:
        item.content = basic(item.content)

    with st.spinner("document is being split into paragraphs"):
        logging.info("document has been split into {} paragraphs".format(len(docs_processed)))
        # create dataframe of text and list of all text
        df = pd.DataFrame(docs_processed)
        all_text = " ".join(df.content.to_list())
        par_list = df.content.to_list()

    return docs_processed, df, all_text, par_list
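
# Illustrative call sketch for the two splitters; the file name and the way the
# Document list is built here are hypothetical (in the app the documents come
# from an upstream converter):
#
#   from haystack import Document
#   docs = [Document(content=open("report.txt").read())]
#   paragraphs, df, all_text, par_list = preprocessing(docs)
#   sdg_paragraphs, sdg_df, sdg_text, sdg_pars = preprocessingForSDG(docs)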
'''processing with spacy - suitable for models such as tf-idf, word2vec'''
def spacy_clean(alpha: str, use_nlp: bool = True) -> str:
    """
    Clean and tokenise a string using spaCy. Keeps only alphabetic tokens,
    removes stopwords and filters out everything except proper nouns, nouns,
    verbs and adjectives.

    Parameters
    ----------
    alpha : str
        The input string.
    use_nlp : bool, default True
        Indicates whether spaCy needs to run the NLP pipeline on the input.
        Enable this when using the function on its own; set it to False when
        the input is already a spaCy Doc, e.g. when batching with nlp.pipe.

    Returns
    -------
    str
        The lemmatised tokens joined into a single lowercased string.

    Notes
    -----
    Fails if alpha is an NA value. Performance decreases as len(alpha) gets
    large. Use together with nlp.pipe for batch processing.
    """
    # note: the model is (re)loaded on every call; parser, NER and textcat are
    # disabled since only tagging and lemmatisation are needed here
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
    if use_nlp:
        alpha = nlp(alpha)
    beta = []
    for tok in alpha:
        # keep alphabetic, non-stopword tokens of the selected parts of speech
        if all([tok.is_alpha, not tok.is_stop, tok.pos_ in ['PROPN', 'NOUN', 'VERB', 'ADJ']]):
            beta.append(tok.lemma_)
    text = ' '.join(beta)
    text = text.lower()
    return text
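
# Illustrative batch-processing sketch for tf-idf style models; the paragraph
# list is hypothetical and scikit-learn is assumed to be available:
#
#   nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
#   paragraphs = ["Climate change mitigation requires sustained investment.",
#                 "The programme strengthened stakeholder engagement."]
#   cleaned = [spacy_clean(doc, use_nlp=False) for doc in nlp.pipe(paragraphs)]
#
#   from sklearn.feature_extraction.text import TfidfVectorizer
#   tfidf_matrix = TfidfVectorizer().fit_transform(cleaned)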