import re
import os
import glob
from collections import Counter

import pandas as pd
import spacy
import spacy_streamlit
from spacy.tokens import Doc
from spacy.cli._util import import_code
from skbio import diversity as dv

from utils.visualize import visualize_spans
from utils.utility import preprocess, delete_overlapping_span, cleanup_justify
from resources.text_list import TEXT_LIST
from resources.template_list import TPL_SPAN, TPL_SPAN_SLICE, TPL_SPAN_START
from resources.colors import COLORS_1
from pipeline.post_processors import simple_table, const_table, ngrammar, diversity_values
# from pipeline.custom_functions import custom_functions

SPAN_ATTRS = ["text", "label_", "start", "end"]
CATEGORIES = ["ATTRIBUTION", "CITATION", "COUNTER", "DENY", "ENDOPHORIC", "ENTERTAIN",
              "JUSTIFYING", "MONOGLOSS", "PROCLAIM", "SOURCES"]

# spacy.prefer_gpu()
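
# Batch analysis script: runs the engagement span model over a folder of .txt
# files and exports a span-level table per document plus document-level
# category counts and diversity values as CSV files.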
def load_model(spacy_model):
    # source = spacy.blank("en")
    nlp = spacy.load(spacy_model)  # , vocab=nlp_to_copy.vocab
    nlp.add_pipe('sentencizer')
    return nlp
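
# (load_model is kept for reference; below, the pipeline is loaded directly
# with spacy.load after registering the custom components.)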
# source = spacy.blank("en")
modelname = "en_engagement_LSTM_f3"
# modelname = "en_engagement_LSTM_f5"
# modelname = "en_engagement_Dual_RoBERTa_acad3_f4"

# Local results directory (machine-specific absolute path); exist_ok avoids
# FileExistsError when the script is re-run.
os.makedirs(os.path.join("/Users/masakieguchi/Dropbox/0_Projects/0_basenlp/SFLAnalyzer/engagement-analyzer-demo/results", modelname), exist_ok=True)

# Register custom pipeline components before loading the trained model.
import_code("pipeline/custom_functions.py")

# nlp = spacy.load("en_engagement_three_RoBERTa_base_LSTM384")
nlp = spacy.load(modelname)
# Single-document test run (kept for reference):
# doc = nlp(preprocess(TEXT_LIST[0]))
# cleanup_justify(doc, doc.spans["sc"])
# delete_overlapping_span(doc.spans['sc'])
# data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
# seq = [s for s in doc.spans["sc"]]
# span_ngrams = ngrammar(seq=seq, n=3)
# df = pd.DataFrame(data, columns=cols)
# constant_value = 42
# new_col = pd.Series([constant_value] * df.shape[0], name='new_col')
# doclen = len(doc)
# doc_len = pd.Series([doclen] * df.shape[0], name='nwords')
# df.insert(0, "new", new_col, True)
# df.insert(1, "nwords", doc_len, True)
# df.to_csv("results/test.csv")

# inputfiles = glob.glob("ECCE_texts/preprocessed/*.txt")
inputfiles = glob.glob("ICNALE_texts/*/*.txt")

savedir = "ICNALE_analysis"
storeall = True
storage = []
doc_level_storage = []

os.makedirs(os.path.join(savedir, modelname), exist_ok=True)
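
# Process each input text: parse, clean up the engagement spans, and write
# per-document span-level and document-level CSVs.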
for file in inputfiles:
    filename = os.path.split(file)[-1]
    with open(file, "r") as f:
        text = f.read()
    text = preprocess(text)

    doc = nlp(text)
    cleanup_justify(doc, doc.spans["sc"])
    delete_overlapping_span(doc.spans['sc'])

    data, cols = const_table(doc, spans_key='sc', attrs=SPAN_ATTRS)
    seq = [s for s in doc.spans["sc"]]
    span_ngrams = ngrammar(seq=seq, n=3)

    ### Make it a dataset
    df = pd.DataFrame(data, columns=cols)
    df = df.astype({"start": int, "end": int})  # convert column types
    df = df.sort_values(by=['start'])  # and sort by span start

    # constant_value = 42
    new_col = pd.Series([filename] * df.shape[0], name='filename')
    doclen = len(doc)
    doc_len = pd.Series([doclen] * df.shape[0], name='nwords')
    df.insert(0, "filename", new_col, True)
    df.insert(1, "nwords", doc_len, True)

    df.to_csv(f"{savedir}/{modelname}/{filename}.csv")

    sequences = list(df['label_'])
    # Engagement ngrams (bigrams over the predicted spans)
    span_bigrams = ngrammar(seq=seq, n=2)
    bidf = pd.DataFrame(span_bigrams)

    # constant_value = 42
    new_col = pd.Series([filename] * bidf.shape[0], name='filename')
    bidf.insert(0, "filename", new_col, True)  # insert() works in place; do not reassign
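    # Note: bidf is assembled here but never written out in this script; if
    # needed, it could be exported like df, e.g. (hypothetical output path):
    # bidf.to_csv(f"{savedir}/{modelname}/bigrams_{filename}.csv")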
    ## Document level
    doc_level = {}
    counts = df['label_'].value_counts().reindex(CATEGORIES, fill_value=0)
    div = diversity_values(list(counts))
    div_data = pd.DataFrame.from_dict(div, orient='index')

    doc_data = pd.concat([counts, div_data], axis=0).T
    doc_data.insert(0, "filename", filename, True)
    doc_data.insert(1, "nwords", doclen, True)
    doc_data.to_csv(f"{savedir}/{modelname}/ddata_{filename}.csv")

    if storeall:
        storage.append(df)
        doc_level_storage.append(doc_data)
alldf = pd.concat(storage)
alldf.to_csv(f"{savedir}/0_{modelname}_20230726.csv")

# alldoc = pd.concat(doc_level_storage)
# alldoc.to_csv(f"{savedir}/1_{modelname}_doc_20230426.csv")