import numpy as np
import pandas as pd
import textstat
import spacy
from collections import Counter
from itertools import groupby
nlp = spacy.load("en_core_web_sm")
PRESENT_TENSE_VERB_LIST = ["VB", "VBP", "VBZ", "VBG"]
VERB_LIST = ["VB", "VBP", "VBZ", "VBG", "VBN", "VBD"]
NOUN_LIST = ["NNP", "NNPS"]
SECTIONS_MAPS = {
    "Authors": "Authors",
    "AUTHORS": "Authors",
    "Abstract": "Abstract",
    "ABSTRACT": "Abstract",
    "Date": "Date",
    "DATE": "Date",
    "INTRODUCTION": "Introduction",
    "MATERIALS AND METHODS": "Methods",
    "Materials and methods": "Methods",
    "METHODS": "Methods",
    "RESULTS": "Results",
    "CONCLUSIONS": "Conclusions",
    "CONCLUSIONS AND FUTURE APPLICATIONS": "Conclusions",
    "DISCUSSION": "Discussion",
    "ACKNOWLEDGMENTS": "Acknowledgements",
    "TABLES": "Tables",
    "Tables": "Tables",
    "DISCLOSURE": "Disclosure",
    "CONFLICT OF INTEREST": "Disclosure",
    "Acknowledgement": "Acknowledgements",
}


def compute_readability_stats(text):
    """
    Compute readability statistics of the given text
    Reference: https://github.com/shivam5992/textstat

    Parameters
    ==========
    text: str, input section or abstract text
    """
    try:
        readability_dict = {
            "flesch_reading_ease": textstat.flesch_reading_ease(text),
            "smog": textstat.smog_index(text),
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
            "coleman_liau_index": textstat.coleman_liau_index(text),
            "automated_readability_index": textstat.automated_readability_index(text),
            "dale_chall": textstat.dale_chall_readability_score(text),
            "difficult_words": textstat.difficult_words(text),
            "linsear_write": textstat.linsear_write_formula(text),
            "gunning_fog": textstat.gunning_fog(text),
            "text_standard": textstat.text_standard(text),
            "n_syllable": textstat.syllable_count(text),
            "avg_letter_per_word": textstat.avg_letter_per_word(text),
            "avg_sentence_length": textstat.avg_sentence_length(text),
        }
    except Exception:
        readability_dict = {
            "flesch_reading_ease": None,
            "smog": None,
            "flesch_kincaid_grade": None,
            "coleman_liau_index": None,
            "automated_readability_index": None,
            "dale_chall": None,
            "difficult_words": None,
            "linsear_write": None,
            "gunning_fog": None,
            "text_standard": None,
            "n_syllable": None,
            "avg_letter_per_word": None,
            "avg_sentence_length": None,
        }
    return readability_dict
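
# A minimal usage sketch (the function takes any section or abstract string;
# ``section_text`` below is a made-up example):
#
#     section_text = "We propose a method for parsing scholarly PDF documents."
#     readability = compute_readability_stats(section_text)
#     readability["flesch_kincaid_grade"]  # grade-level estimate, or None if textstat fails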


def compute_text_stats(text):
    """
    Compute part-of-speech features from a given spaCy ``Doc`` of the text

    Parameters
    ==========
    text: spacy.tokens.doc.Doc, spaCy wrapper of the section or abstract text

    Output
    ======
    text_stats_dict: dict, part-of-speech and text features extracted from the given text
    """
    try:
        pos = dict(Counter([token.pos_ for token in text]))
        pos_tag = dict(
            Counter([token.tag_ for token in text])
        )  # detailed part-of-speech
        n_present_verb = sum(
            [v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST]
        )
        n_verb = sum([v for k, v in pos_tag.items() if k in VERB_LIST])
        word_shape = dict(Counter([token.shape_ for token in text]))  # word shape
        n_word_per_sents = [len([token for token in sent]) for sent in text.sents]
        n_digits = sum([token.is_digit or token.like_num for token in text])
        n_word = sum(n_word_per_sents)
        n_sents = len(n_word_per_sents)
        text_stats_dict = {
            "pos": pos,
            "pos_tag": pos_tag,
            "word_shape": word_shape,
            "n_word": n_word,
            "n_sents": n_sents,
            "n_present_verb": n_present_verb,
            "n_verb": n_verb,
            "n_digits": n_digits,
            "percent_digits": n_digits / n_word,
            "n_word_per_sents": n_word_per_sents,
            "avg_word_per_sents": np.mean(n_word_per_sents),
        }
    except Exception:
        text_stats_dict = {
            "pos": None,
            "pos_tag": None,
            "word_shape": None,
            "n_word": None,
            "n_sents": None,
            "n_present_verb": None,
            "n_verb": None,
            "n_digits": None,
            "percent_digits": None,
            "n_word_per_sents": None,
            "avg_word_per_sents": None,
        }
    return text_stats_dict
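
# A minimal usage sketch (the function expects a spaCy ``Doc``, so a raw string
# is wrapped with the module-level ``nlp`` pipeline first; the sentence is made up):
#
#     doc = nlp("We propose a parsing method. It processes 120 documents per minute.")
#     text_stats = compute_text_stats(doc)
#     text_stats["n_sents"], text_stats["n_verb"], text_stats["percent_digits"]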


def compute_journal_features(article):
    """
    Parse features about journal references from a given dictionary of a parsed article, e.g.
    number of references made, number of unique journals referred to, minimum year of references,
    maximum year of references, ...

    Parameters
    ==========
    article: dict, article dictionary parsed from GROBID output;
        see ``pdf/parse_pdf.py`` for the details of the output dictionary

    Output
    ======
    journal_features_dict: dict, dictionary of reference counts and reference-year statistics
    """
    try:
        n_reference = len(article["references"])
        n_unique_journals = len(
            pd.unique([a["journal"] for a in article["references"]])
        )
        reference_years = []
        for reference in article["references"]:
            year = reference["year"]
            if year.isdigit():
                # filter outliers
                if int(year) in range(1800, 2100):
                    reference_years.append(int(year))
        avg_ref_year = np.mean(reference_years)
        median_ref_year = np.median(reference_years)
        min_ref_year = np.min(reference_years)
        max_ref_year = np.max(reference_years)
        journal_features_dict = {
            "n_reference": n_reference,
            "n_unique_journals": n_unique_journals,
            "avg_ref_year": avg_ref_year,
            "median_ref_year": median_ref_year,
            "min_ref_year": min_ref_year,
            "max_ref_year": max_ref_year,
        }
    except Exception:
        journal_features_dict = {
            "n_reference": None,
            "n_unique_journals": None,
            "avg_ref_year": None,
            "median_ref_year": None,
            "min_ref_year": None,
            "max_ref_year": None,
        }
    return journal_features_dict
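
# A minimal usage sketch with a hypothetical article dictionary shaped like the
# GROBID output of ``pdf/parse_pdf.py`` (only the ``references`` key is used here):
#
#     article = {
#         "references": [
#             {"journal": "Nature", "year": "2019"},
#             {"journal": "Science", "year": "2021"},
#         ]
#     }
#     compute_journal_features(article)
#     # -> {'n_reference': 2, 'n_unique_journals': 2, 'avg_ref_year': 2020.0, ...}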


def merge_section_list(section_list, section_maps=SECTIONS_MAPS, section_start=""):
    """
    Merge a list of section headings into a normalized list of section names.
    You can get the list of sections from the parsed article JSON produced by
    ``parse_pdf.py``, e.g.

    >> section_list = [s['heading'] for s in article_json['sections']]
    >> section_list_merged = merge_section_list(section_list)

    Parameters
    ==========
    section_list: list, list of section headings
    section_maps: dict, mapping from raw section headings to normalized section names
    section_start: str, normalized name used before the first matching heading, e.g. ``Introduction``

    Output
    ======
    section_list_merged: list, list of normalized section names, one per input heading
    """
    sect_map = section_start  # text for the starting section, e.g. ``Introduction``
    section_list_merged = []
    for section in section_list:
        if any([(s.lower() in section.lower()) for s in section_maps.keys()]):
            sect = [s for s in section_maps.keys() if s.lower() in section.lower()][0]
            sect_map = section_maps.get(sect, "")
            section_list_merged.append(sect_map)
        else:
            section_list_merged.append(sect_map)
    return section_list_merged
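
# A minimal usage sketch with made-up section headings; headings that match no
# entry in SECTIONS_MAPS inherit the previous normalized section name:
#
#     section_list = ["1. INTRODUCTION", "Experimental setup", "RESULTS", "Analysis"]
#     merge_section_list(section_list)
#     # -> ['Introduction', 'Introduction', 'Results', 'Results']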