File size: 1,644 Bytes
22738ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
# -*- coding: utf-8 -*-

import re
import nltk
import json

from App.bin import constants


class FiguresCleaner(object):
    """Remove bare numerals from a text while keeping numbers used in context.

    The text is tokenized and part-of-speech tagged with NLTK.  A token is
    dropped only when all of the following hold:

    * it is tagged ``'CD'`` (cardinal number in NLTK's default tagset),
    * it contains at least one digit,
    * the following token does not match any pattern listed in the
      ``wordAfterNumber`` asset file, and
    * the preceding token does not match any pattern listed in the
      ``wordBeforeNumber`` asset file.

    NOTE(review): the asset files are not visible from this module; they are
    presumably lists of words (one regex per line) that make a neighbouring
    number meaningful -- confirm against the assets directory.
    """

    def __init__(self, sections):
        """Store the text to clean.

        :param sections: value handed unchanged to ``nltk.word_tokenize`` by
            :meth:`clean_figures` (so presumably a single string).
        """
        self.sections = sections

    @staticmethod
    def _load_pattern(file_name):
        """Compile the patterns listed one per line in an asset file.

        :param file_name: name of the file, appended to ``constants.ASSETS``.
        :returns: a compiled regex that is the alternation of every non-blank
            line of the file.
        """
        with open(constants.ASSETS + file_name, 'r') as handle:
            # A blank line would add an empty alternative that matches every
            # word, which would silently stop any number from being removed.
            entries = [line for line in handle.read().splitlines()
                       if line.strip()]
        if not entries:
            # No patterns at all: use a regex that can never match.
            return re.compile(r'(?!)')
        return re.compile('|'.join(entries))

    @staticmethod
    def _filter_tokens(tagged_words, after_pattern, before_pattern):
        """Drop bare numerals from a POS-tagged token list.

        :param tagged_words: list of ``(word, tag)`` pairs.
        :param after_pattern: compiled regex; a number is kept when the token
            that follows it matches (``match``, i.e. anchored at the start).
        :param before_pattern: compiled regex; a number is kept when the
            token that precedes it matches.
        :returns: list with every kept word followed by a space, terminated
            by ``"\\n"``; an empty list when ``tagged_words`` is empty.
        """
        kept = []
        last_index = len(tagged_words) - 1
        for index, (word, tag) in enumerate(tagged_words):
            # Neighbours are looked up only when they exist: the first token
            # has no predecessor (index -1 would wrap around to the end of
            # the text) and the last token has no successor.
            kept_by_next = (
                index < last_index
                and after_pattern.match(tagged_words[index + 1][0]) is not None
            )
            kept_by_previous = (
                index > 0
                and before_pattern.match(tagged_words[index - 1][0]) is not None
            )
            is_bare_figure = (
                tag == 'CD'
                # Spelled-out numbers ("two") carry no digit and are kept.
                and re.search(r'\d', word) is not None
                and not kept_by_next
                and not kept_by_previous
            )
            if not is_bare_figure:
                kept.append(word + " ")
        if tagged_words:
            kept.append("\n")
        return kept

    def clean_figures(self):
        """Return the stored text as a token list with bare numerals removed.

        Reads the ``wordAfterNumber`` and ``wordBeforeNumber`` asset files,
        tokenizes and POS-tags ``self.sections`` with NLTK, then filters the
        tokens.

        :returns: list of strings -- each kept token followed by a space,
            then a final ``"\\n"`` (empty list if there are no tokens).
        """
        after_pattern = self._load_pattern("wordAfterNumber")
        before_pattern = self._load_pattern("wordBeforeNumber")

        tagged_words = nltk.pos_tag(nltk.word_tokenize(self.sections))
        return self._filter_tokens(tagged_words, after_pattern, before_pattern)