Spaces:

lihuigu
/

SciPIP

Running

File size: 7,417 Bytes

cee6a24

import numpy as np
import pandas as pd
import textstat
import spacy
from collections import Counter
from itertools import groupby


# spaCy pipeline "en_core_web_sm", loaded once when this module is imported.
nlp = spacy.load("en_core_web_sm")

# Fine-grained tags that ``compute_text_stats`` counts as present-tense verbs
# (compared against ``token.tag_``).
PRESENT_TENSE_VERB_LIST = ["VB", "VBP", "VBZ", "VBG"]
# Fine-grained tags that ``compute_text_stats`` counts as verbs of any tense;
# a superset of PRESENT_TENSE_VERB_LIST.
VERB_LIST = ["VB", "VBP", "VBZ", "VBG", "VBN", "VBD"]
# NOTE(review): presumably proper-noun tags; not referenced in the visible part
# of this file -- confirm whether it is still used elsewhere.
NOUN_LIST = ["NNP", "NNPS"]


# Heading keyword -> normalized section name; default mapping used by
# ``merge_section_list``.
# There, keys are matched case-insensitively as substrings of a heading and the
# first matching key in insertion order wins, so among keys that differ only by
# case (e.g. "Authors"/"AUTHORS") only the first one is ever selected.
# NOTE(review): several values look inconsistent -- "AUTHORS" and "DATE" map to
# upper-case names unlike their title-case twins, "ACKNOWLEDGMENTS" maps to
# "Acknowledgement" while "Acknowledgement" maps to "Acknowledgements", and
# "Tabnles" may be a typo of "Tables" (or a deliberate catch for a misspelled
# heading) -- confirm with downstream consumers before normalizing.
SECTIONS_MAPS = {
    "Authors": "Authors",
    "AUTHORS": "AUTHORS",
    "Abstract": "Abstract",
    "ABSTRACT": "Abstract",
    "Date": "Date",
    "DATE": "DATE",
    "INTRODUCTION": "Introduction",
    "MATERIALS AND METHODS": "Methods",
    "Materials and methods": "Methods",
    "METHODS": "Methods",
    "RESULTS": "Results",
    "CONCLUSIONS": "Conclusions",
    "CONCLUSIONS AND FUTURE APPLICATIONS": "Conclusions",
    "DISCUSSION": "Discussion",
    "ACKNOWLEDGMENTS": "Acknowledgement",
    "TABLES": "Tables",
    "Tabnles": "Tables",
    "DISCLOSURE": "Disclosure",
    "CONFLICT OF INTEREST": "Disclosure",
    "Acknowledgement": "Acknowledgements",
}


# (output key, name of the ``textstat`` function that produces it), in output order.
_READABILITY_METRICS = (
    ("flesch_reading_ease", "flesch_reading_ease"),
    ("smog", "smog_index"),
    ("flesch_kincaid_grade", "flesch_kincaid_grade"),
    ("coleman_liau_index", "coleman_liau_index"),
    ("automated_readability_index", "automated_readability_index"),
    ("dale_chall", "dale_chall_readability_score"),
    ("difficult_words", "difficult_words"),
    ("linsear_write", "linsear_write_formula"),
    ("gunning_fog", "gunning_fog"),
    ("text_standard", "text_standard"),
    ("n_syllable", "syllable_count"),
    ("avg_letter_per_word", "avg_letter_per_word"),
    ("avg_sentence_length", "avg_sentence_length"),
)


def compute_readability_stats(text):
    """
    Compute reading statistics of the given text
    Reference: https://github.com/shivam5992/textstat

    The computation is all-or-nothing: if any single metric raises an
    ordinary exception, every value of the returned dictionary is ``None``.
    ``KeyboardInterrupt`` and ``SystemExit`` are not swallowed.

    Parameters
    ==========
    text: str, input section or abstract text

    Output
    ======
    readability_dict: dict, one entry per readability metric (always the same
        keys, in the same order), values are ``None`` when computation failed
    """
    try:
        readability_dict = {
            key: getattr(textstat, func_name)(text)
            for key, func_name in _READABILITY_METRICS
        }
    except Exception:
        # Best-effort feature extraction: keep the schema, blank the values.
        readability_dict = {key: None for key, _ in _READABILITY_METRICS}
    return readability_dict


# Keys of the dictionary returned by ``compute_text_stats``, in output order.
_TEXT_STATS_KEYS = (
    "pos",
    "pos_tag",
    "word_shape",
    "n_word",
    "n_sents",
    "n_present_verb",
    "n_verb",
    "n_digits",
    "percent_digits",
    "n_word_per_sents",
    "avg_word_per_sents",
)


def compute_text_stats(text):
    """
    Compute part of speech features from a given spacy wrapper of text

    The computation is all-or-nothing: if anything raises an ordinary
    exception, every value of the returned dictionary is ``None``. This
    includes a document with zero tokens, because ``percent_digits`` divides
    by the token count. ``KeyboardInterrupt`` and ``SystemExit`` are not
    swallowed.

    Parameters
    ==========
    text: spacy.tokens.doc.Doc, spacy wrapper of the section or abstract text

    Output
    ======
    text_stat: dict, part of speech and text features extracted from the given text
    """
    try:
        pos = dict(Counter(token.pos_ for token in text))
        # detailed (fine-grained) part-of-speech tags
        pos_tag = dict(Counter(token.tag_ for token in text))

        n_present_verb = sum(
            count for tag, count in pos_tag.items() if tag in PRESENT_TENSE_VERB_LIST
        )
        n_verb = sum(count for tag, count in pos_tag.items() if tag in VERB_LIST)

        word_shape = dict(Counter(token.shape_ for token in text))  # word shape
        n_word_per_sents = [sum(1 for _ in sent) for sent in text.sents]
        # tokens that are pure digits or merely look like a number
        n_digits = sum(token.is_digit or token.like_num for token in text)
        n_word = sum(n_word_per_sents)
        n_sents = len(n_word_per_sents)
        text_stats_dict = {
            "pos": pos,
            "pos_tag": pos_tag,
            "word_shape": word_shape,
            "n_word": n_word,
            "n_sents": n_sents,
            "n_present_verb": n_present_verb,
            "n_verb": n_verb,
            "n_digits": n_digits,
            "percent_digits": n_digits / n_word,
            "n_word_per_sents": n_word_per_sents,
            "avg_word_per_sents": np.mean(n_word_per_sents),
        }
    except Exception:
        # Best-effort feature extraction: keep the schema, blank the values.
        text_stats_dict = dict.fromkeys(_TEXT_STATS_KEYS)
    return text_stats_dict


def compute_journal_features(article):
    """
    Parse features about journal references from a given dictionary of parsed article e.g.
    number of reference made, number of unique journal refered, minimum year of references,
    maximum year of references, ...

    Only years that are all-digit strings in the range 1800-2099 are used for
    the year statistics; references whose year is missing, not a string, or
    outside that range are skipped. When no reference has a usable year, the
    four year statistics are ``None`` while the reference and journal counts
    are still reported.

    If the article itself is malformed (e.g. no ``"references"`` entry or a
    reference without ``"journal"``), every value of the returned dictionary
    is ``None``. ``KeyboardInterrupt`` and ``SystemExit`` are not swallowed.

    Parameters
    ==========
    article: dict, article dictionary parsed from GROBID and converted to dictionary
        see ``pdf/parse_pdf.py`` for the detail of the output dictionary

    Output
    ======
    reference_dict: dict, dictionary with keys ``n_reference``,
        ``n_unique_journals``, ``avg_ref_year``, ``median_ref_year``,
        ``min_ref_year`` and ``max_ref_year``
    """
    try:
        references = article["references"]
        n_reference = len(references)
        # pd.unique expects an array-like such as a Series; passing a bare
        # list is deprecated in recent pandas releases.
        journals = pd.Series([ref["journal"] for ref in references], dtype=object)
        n_unique_journals = len(pd.unique(journals))

        reference_years = []
        for ref in references:
            year = ref.get("year")
            if isinstance(year, str) and year.isdigit():
                year = int(year)
                # filter outliers
                if 1800 <= year < 2100:
                    reference_years.append(year)

        journal_features_dict = {
            "n_reference": n_reference,
            "n_unique_journals": n_unique_journals,
            "avg_ref_year": None,
            "median_ref_year": None,
            "min_ref_year": None,
            "max_ref_year": None,
        }
        # numpy reductions fail (or return nan) on an empty list, so year
        # statistics are only filled in when at least one year survived.
        if reference_years:
            journal_features_dict["avg_ref_year"] = np.mean(reference_years)
            journal_features_dict["median_ref_year"] = np.median(reference_years)
            journal_features_dict["min_ref_year"] = np.min(reference_years)
            journal_features_dict["max_ref_year"] = np.max(reference_years)
    except Exception:
        # Best-effort feature extraction: keep the schema, blank the values.
        journal_features_dict = {
            "n_reference": None,
            "n_unique_journals": None,
            "avg_ref_year": None,
            "median_ref_year": None,
            "min_ref_year": None,
            "max_ref_year": None,
        }
    return journal_features_dict


def merge_section_list(section_list, section_maps=SECTIONS_MAPS, section_start=""):
    """
    Normalize a list of section headings into a list of merged section names,
    the headings can be taken from parsed article JSON in ``parse_pdf.py`` e.g.

    >> section_list = [s['heading'] for s in article_json['sections']]
    >> section_list_merged = merge_section_list(section_list)

    A heading matches a key of ``section_maps`` when the lower-cased key is a
    substring of the lower-cased heading; the first matching key (in mapping
    order) decides the normalized name. A heading without any match inherits
    the name of the most recent matched heading, or ``section_start`` when
    nothing has matched yet.

    Parameters
    ==========
    section_list: list, list of section headings
    section_maps: dict, heading keyword -> normalized section name
    section_start: str, name used for headings seen before the first match

    Output
    ======
    section_list_merged: list, normalized section name for every input heading
    """
    current_name = section_start  # name carried forward to unmatched headings
    merged = []
    for heading in section_list:
        heading_lower = heading.lower()
        matched_key = next(
            (key for key in section_maps.keys() if key.lower() in heading_lower),
            None,
        )
        if matched_key is not None:
            current_name = section_maps.get(matched_key, "")
        merged.append(current_name)
    return merged