# NOTE: page-scrape residue ("Spaces: Build error") preceded this module; it is not part of the code.
# -*- coding: utf-8 -*- | |
import re | |
import nltk | |
import json | |
from App.bin import constants | |
class FiguresCleaner(object):
    """Drop stand-alone numeric tokens from a piece of text.

    The text is tokenized and POS-tagged with NLTK.  A token tagged ``CD``
    that contains a digit is removed unless the word after it matches the
    ``wordAfterNumber`` asset or the word before it matches the
    ``wordBeforeNumber`` asset.  Presumably this strips figure reference
    numerals (the class name suggests so) -- confirm with the callers.

    NOTE(review): the block structure was reconstructed from a copy whose
    indentation had been stripped; confirm it against the canonical file.
    """

    def __init__(self, sections):
        """Store the text to clean.

        Args:
            sections: the raw text; it is handed to ``nltk.word_tokenize``
                as-is, so it is expected to be a single string.
        """
        self.sections = sections

    @staticmethod
    def _load_pattern(asset_name):
        """Compile the lines of one asset file into a single alternation regex."""
        with open(constants.ASSETS + asset_name, 'r') as handle:
            return re.compile('|'.join(handle.read().splitlines()))

    @staticmethod
    def _filter_tokens(tagged_words, after_words_patterns, before_words_patterns):
        """Return the kept tokens, each followed by a space, then a final newline.

        Args:
            tagged_words: sequence of ``(word, pos_tag)`` pairs.
            after_words_patterns: compiled regex; a number is kept when the
                following word matches it at its start.
            before_words_patterns: compiled regex; a number is kept when the
                preceding word matches it at its start.

        Returns:
            A list of strings: ``word + " "`` for every kept token and a
            closing ``"\n"``.  An empty input yields an empty list.
        """
        clean_content = []
        last_index = len(tagged_words) - 1
        for index, (current_word, current_tag) in enumerate(tagged_words):
            # Only digit-bearing cardinals are candidates for removal, so
            # spelled-out numbers such as "one" stay in the text.
            if current_tag == 'CD' and re.search(r'\d', current_word) is not None:
                # The last token has no successor: treat it as "no context".
                followed_by_context = (
                    index < last_index
                    and after_words_patterns.match(tagged_words[index + 1][0]) is not None
                )
                # Guard index 0 explicitly: ``index - 1`` would otherwise wrap
                # around and inspect the last token of the text.
                preceded_by_context = (
                    index > 0
                    and before_words_patterns.match(tagged_words[index - 1][0]) is not None
                )
                if not (followed_by_context or preceded_by_context):
                    continue
            clean_content.append(current_word + " ")
        if tagged_words:
            # Terminate the cleaned text with a newline.
            clean_content.append("\n")
        return clean_content

    def clean_figures(self):
        """Tokenize ``self.sections`` and remove context-free numeric tokens.

        Reads the ``wordAfterNumber`` and ``wordBeforeNumber`` files under
        ``constants.ASSETS`` on every call.

        Returns:
            A list of strings as described in ``_filter_tokens``.
        """
        after_words_patterns = self._load_pattern("wordAfterNumber")
        before_words_patterns = self._load_pattern("wordBeforeNumber")
        tagged_words = nltk.pos_tag(nltk.word_tokenize(self.sections))
        return self._filter_tokens(
            tagged_words, after_words_patterns, before_words_patterns
        )