Spaces:

xin
/

PatentSolver

Build error

PatentSolver / App /bin /FiguresCleaner.py

initial commit

22738ca over 3 years ago

1.64 kB

	# -*- coding: utf-8 -*-

	import re
	import nltk
	import json

	from App.bin import constants


	class FiguresCleaner(object):
	    """Drop stand-alone numeric tokens from a block of text.

	    A token is dropped when NLTK tags it as a cardinal number (``CD``), it
	    contains at least one digit, and neither neighbour marks it as a
	    meaningful quantity: the following token does not match an entry of the
	    ``wordAfterNumber`` asset and the preceding token does not match an entry
	    of the ``wordBeforeNumber`` asset.

	    NOTE(review): presumably this targets figure / reference numerals in
	    patent text -- confirm against the callers.
	    """

	    def __init__(self, sections):
	        """Store the text to clean.

	        :param sections: text handed to ``nltk.word_tokenize`` by
	            ``clean_figures``.
	        """
	        self.sections = sections

	    @staticmethod
	    def _compile_alternation(words):
	        """Compile *words* into a single ``a|b|c`` alternation pattern.

	        Entries are used verbatim as regex fragments.  Blank entries are
	        skipped because an empty alternative matches every token, and an
	        empty list yields a pattern that never matches.
	        """
	        alternatives = [word for word in words if word.strip()]
	        if not alternatives:
	            # Empty negative look-ahead: can never match anything.
	            return re.compile(r'(?!)')
	        return re.compile('|'.join(alternatives))

	    @staticmethod
	    def _load_patterns(asset_name):
	        """Compile the lines of the file ``constants.ASSETS + asset_name``."""
	        with open(constants.ASSETS + asset_name, 'r') as asset_file:
	            lines = asset_file.read().splitlines()
	        return FiguresCleaner._compile_alternation(lines)

	    @staticmethod
	    def _filter_tagged_words(tagged_words, after_words_patterns,
	                             before_words_patterns):
	        """Return the kept tokens of *tagged_words* as a list of strings.

	        :param tagged_words: sequence of ``(token, pos_tag)`` pairs.
	        :param after_words_patterns: compiled pattern; a number followed by
	            a token matching it is kept.
	        :param before_words_patterns: compiled pattern; a number preceded by
	            a token matching it is kept.
	        :return: each kept token with a trailing space; the final token of
	            the input is always replaced by ``"\\n"``.
	        """
	        clean_content = []
	        last_index = len(tagged_words) - 1
	        for index, (current_word, current_tag) in enumerate(tagged_words):
	            if index == last_index:
	                # The last token has no successor to inspect; as before, it
	                # is replaced by a line terminator rather than emitted.
	                clean_content.append("\n")
	                continue

	            next_word = tagged_words[index + 1][0]
	            followed_by_marker = (
	                after_words_patterns.match(next_word) is not None)
	            # The first token has no predecessor.  Do not let index -1 wrap
	            # around to the last token of the text.
	            preceded_by_marker = (
	                index > 0
	                and before_words_patterns.match(
	                    tagged_words[index - 1][0]) is not None)

	            is_bare_numeral = (
	                current_tag == 'CD'
	                and not followed_by_marker
	                and not preceded_by_marker
	                # Spelled-out cardinals ("one", "two") carry no digit and
	                # are kept.
	                and re.search(r'\d', current_word) is not None)
	            if not is_bare_numeral:
	                clean_content.append(current_word + " ")

	        return clean_content

	    def clean_figures(self):
	        """Tokenize, POS-tag and filter ``self.sections``.

	        Reads the ``wordAfterNumber`` and ``wordBeforeNumber`` files from
	        ``constants.ASSETS`` on every call.

	        :return: list of strings, one per kept token (token plus a trailing
	            space) with ``"\\n"`` in place of the final token; empty when
	            the text yields no tokens.
	        """
	        after_words_patterns = self._load_patterns("wordAfterNumber")
	        before_words_patterns = self._load_patterns("wordBeforeNumber")

	        words = nltk.word_tokenize(self.sections)
	        tagged_words = nltk.pos_tag(words)
	        return self._filter_tagged_words(
	            tagged_words, after_words_patterns, before_words_patterns)