import numpy as np
import pandas as pd
import textstat
import spacy
from collections import Counter


nlp = spacy.load("en_core_web_sm")  # requires: python -m spacy download en_core_web_sm

# Penn Treebank tags counted as present-tense verb forms (base VB, present VBP/VBZ, gerund VBG)
PRESENT_TENSE_VERB_LIST = ["VB", "VBP", "VBZ", "VBG"]
# Penn Treebank tags for all verb forms
VERB_LIST = ["VB", "VBP", "VBZ", "VBG", "VBN", "VBD"]
# Penn Treebank tags for proper nouns (singular and plural)
NOUN_LIST = ["NNP", "NNPS"]


# Map raw section headings (matched case-insensitively as substrings in
# ``merge_section_list``) to normalized section names
SECTIONS_MAPS = {
    "Authors": "Authors",
    "AUTHORS": "Authors",
    "Abstract": "Abstract",
    "ABSTRACT": "Abstract",
    "Date": "Date",
    "DATE": "Date",
    "INTRODUCTION": "Introduction",
    "MATERIALS AND METHODS": "Methods",
    "Materials and methods": "Methods",
    "METHODS": "Methods",
    "RESULTS": "Results",
    "CONCLUSIONS": "Conclusions",
    "CONCLUSIONS AND FUTURE APPLICATIONS": "Conclusions",
    "DISCUSSION": "Discussion",
    "ACKNOWLEDGMENTS": "Acknowledgements",
    "TABLES": "Tables",
    "Tabnles": "Tables",  # catches a common extraction misspelling of "Tables"
    "DISCLOSURE": "Disclosure",
    "CONFLICT OF INTEREST": "Disclosure",
    "Acknowledgement": "Acknowledgements",
}


def compute_readability_stats(text):
    """
    Compute readability statistics of the given text.
    Reference: https://github.com/shivam5992/textstat

    Parameters
    ==========
    text: str, input section or abstract text
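
    Example
    =======
    A minimal sketch with an illustrative sentence (not from any real paper):

    >>> readability = compute_readability_stats("We propose a simple parsing method.")
    >>> readability["flesch_reading_ease"]  # doctest: +SKIP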
    """
    try:
        readability_dict = {
            "flesch_reading_ease": textstat.flesch_reading_ease(text),
            "smog": textstat.smog_index(text),
            "flesch_kincaid_grade": textstat.flesch_kincaid_grade(text),
            "coleman_liau_index": textstat.coleman_liau_index(text),
            "automated_readability_index": textstat.automated_readability_index(text),
            "dale_chall": textstat.dale_chall_readability_score(text),
            "difficult_words": textstat.difficult_words(text),
            "linsear_write": textstat.linsear_write_formula(text),
            "gunning_fog": textstat.gunning_fog(text),
            "text_standard": textstat.text_standard(text),
            "n_syllable": textstat.syllable_count(text),
            "avg_letter_per_word": textstat.avg_letter_per_word(text),
            "avg_sentence_length": textstat.avg_sentence_length(text),
        }
    except Exception:  # fall back to None-valued features if textstat fails (e.g. empty text)
        readability_dict = {
            "flesch_reading_ease": None,
            "smog": None,
            "flesch_kincaid_grade": None,
            "coleman_liau_index": None,
            "automated_readability_index": None,
            "dale_chall": None,
            "difficult_words": None,
            "linsear_write": None,
            "gunning_fog": None,
            "text_standard": None,
            "n_syllable": None,
            "avg_letter_per_word": None,
            "avg_sentence_length": None,
        }
    return readability_dict


def compute_text_stats(text):
    """
    Compute part-of-speech and other text features from a given spacy-parsed text

    Parameters
    ==========
    text: spacy.tokens.doc.Doc, spacy wrapper of the section or abstract text

    Output
    ======
    text_stats_dict: dict, part-of-speech and text features extracted from the given text
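
    Example
    =======
    The text must already be wrapped by the module-level ``nlp`` pipeline, e.g.

    >>> doc = nlp("We trained the model. It performs well on held-out data.")
    >>> stats = compute_text_stats(doc)
    >>> stats["n_sents"], stats["n_verb"]  # doctest: +SKIP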
    """
    try:
        pos = dict(Counter([token.pos_ for token in text]))
        pos_tag = dict(
            Counter([token.tag_ for token in text])
        )  # detailed part-of-speech

        n_present_verb = sum(
            v for k, v in pos_tag.items() if k in PRESENT_TENSE_VERB_LIST
        )
        n_verb = sum(v for k, v in pos_tag.items() if k in VERB_LIST)

        word_shape = dict(Counter([token.shape_ for token in text]))  # word shape
        n_word_per_sents = [len(sent) for sent in text.sents]  # tokens per sentence
        n_digits = sum(token.is_digit or token.like_num for token in text)
        n_word = sum(n_word_per_sents)
        n_sents = len(n_word_per_sents)
        text_stats_dict = {
            "pos": pos,
            "pos_tag": pos_tag,
            "word_shape": word_shape,
            "n_word": n_word,
            "n_sents": n_sents,
            "n_present_verb": n_present_verb,
            "n_verb": n_verb,
            "n_digits": n_digits,
            "percent_digits": n_digits / n_word,
            "n_word_per_sents": n_word_per_sents,
            "avg_word_per_sents": np.mean(n_word_per_sents),
        }
    except Exception:  # fall back to None-valued features if parsing fails
        text_stats_dict = {
            "pos": None,
            "pos_tag": None,
            "word_shape": None,
            "n_word": None,
            "n_sents": None,
            "n_present_verb": None,
            "n_verb": None,
            "n_digits": None,
            "percent_digits": None,
            "n_word_per_sents": None,
            "avg_word_per_sents": None,
        }
    return text_stats_dict


def compute_journal_features(article):
    """
    Parse features about journal references from a given parsed-article dictionary,
    e.g. number of references made, number of unique journals referred to,
    minimum and maximum year of references, ...

    Parameters
    ==========
    article: dict, article dictionary parsed from GROBID and converted to dictionary
        see ``pdf/parse_pdf.py`` for the detail of the output dictionary

    Output
    ======
    journal_features_dict: dict, dictionary of reference counts and reference-year statistics
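
    Example
    =======
    A sketch with a hand-built article dictionary; the ``references`` entries
    below mirror only the keys this function reads, not a full GROBID output:

    >>> article = {"references": [
    ...     {"journal": "Nature", "year": "2019"},
    ...     {"journal": "Science", "year": "2021"},
    ... ]}
    >>> compute_journal_features(article)["n_reference"]
    2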
    """
    try:
        n_reference = len(article["references"])
        n_unique_journals = len(
            pd.unique([a["journal"] for a in article["references"]])
        )
        reference_years = []
        for reference in article["references"]:
            year = reference["year"]
            if year.isdigit():
                # filter outliers
                if int(year) in range(1800, 2100):
                    reference_years.append(int(year))
        avg_ref_year = np.mean(reference_years)
        median_ref_year = np.median(reference_years)
        min_ref_year = np.min(reference_years)
        max_ref_year = np.max(reference_years)
        journal_features_dict = {
            "n_reference": n_reference,
            "n_unique_journals": n_unique_journals,
            "avg_ref_year": avg_ref_year,
            "median_ref_year": median_ref_year,
            "min_ref_year": min_ref_year,
            "max_ref_year": max_ref_year,
        }
    except Exception:  # missing or malformed reference fields
        journal_features_dict = {
            "n_reference": None,
            "n_unique_journals": None,
            "avg_ref_year": None,
            "median_ref_year": None,
            "min_ref_year": None,
            "max_ref_year": None,
        }
    return journal_features_dict


def merge_section_list(section_list, section_maps=SECTIONS_MAPS, section_start=""):
    """
    Merge a list of sections into a normalized list of sections,
    you can get the list of sections from parsed article JSON in ``parse_pdf.py`` e.g.

    >> section_list = [s['heading'] for s in article_json['sections']]
    >> section_list_merged = merge_section_list(section_list)

    Parameters
    ==========
    section_list: list, list of sections

    Output
    ======
    section_list_merged: list,  sections
    """
    sect_map = section_start  # normalized name of the current section, e.g. ``Introduction``
    section_list_merged = []
    for section in section_list:
        # case-insensitive substring match against the known raw headings
        matches = [s for s in section_maps if s.lower() in section.lower()]
        if matches:
            sect_map = section_maps.get(matches[0], "")
        # unmatched headings inherit the most recent normalized section name
        section_list_merged.append(sect_map)
    return section_list_merged
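

if __name__ == "__main__":
    # Smoke-test sketch with made-up inputs; for real articles, feed the
    # dictionaries produced by ``pdf/parse_pdf.py`` instead.
    abstract = "We describe a feature extraction pipeline for parsed articles."
    print(compute_readability_stats(abstract))
    print(compute_text_stats(nlp(abstract)))

    article = {
        "references": [
            {"journal": "Nature", "year": "2019"},
            {"journal": "Science", "year": "2021"},
        ]
    }
    print(compute_journal_features(article))

    # headings before the first recognized one inherit ``section_start``
    headings = ["INTRODUCTION", "Study design", "RESULTS", "DISCUSSION"]
    print(merge_section_list(headings))  # ['Introduction', 'Introduction', 'Results', 'Discussion']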