# PatentSolver/App/bin/PatentHandler.py
# -*- coding: utf-8 -*-
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
import glob
import nltk
import os
import re
import codecs
import chardet
import shutil
import json
from io import StringIO
from App.bin import constants
from App.bin.FiguresCleaner import FiguresCleaner
from collections import OrderedDict
class PatentHandler(object):
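    """Cleans raw patent documents, either plain-text files (pretreat_data) or
    pre-parsed JSON records (pretreat_json), into normalized abstract, claims
    and description sections."""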
def __init__(self, patents):
self.patents = patents
def custom_cleaner(self, line):
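        """Normalize a raw patent section: strip URLs and PatentInspiration
        artifacts, swap braces for parentheses, decode leftover HTML entities,
        expand 'fig.'/'FIG.' abbreviations and remove the Abstract, Claims and
        Description headings."""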
line = str(line)
#line = line.lower()
line = re.sub(r'PatentInspiration Url', '', line)
line = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', line)
line = re.sub(r'{', '(', line)
line = re.sub(r'"', '\'', line)
line = re.sub(r'}', ')', line)
line = re.sub(r'\t.*patentinspiration.*\n', '', line)
        line = re.sub(r'(?:^|\n{2,})\bAbstract\b\n?', '', line)
        line = re.sub(r'(?:^|\n{2,})\bClaims\b\n?', '', line)
        line = re.sub(r'(?:^|\n{2,})\bDescription\b\n?', '', line)
line = re.sub(r'fig\.', 'figure', line)
line = re.sub(r'Fig\.', 'Figure', line)
line = re.sub(r'FIG\.', 'Figure', line)
line = re.sub(r'figs\.', 'figures', line)
line = re.sub(r'FIGS\.', 'Figures', line)
line = re.sub(r'(\w+\.)', r'\1 ', line)
        # decode HTML entities left over from the source markup
        line = re.sub(r'&#39;', '\'', line)
        line = re.sub(r'&gt;', '>', line)
        line = re.sub(r'&lt;', '<', line)
        line = re.sub(r'&#176;', ' deg.', line)
        line = re.sub(r'&nbsp;|\xa0', ' ', line)
line = line.strip()
return line
    def dataCleaner(self, line):
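        """Remove the boilerplate phrases listed in the ASSETS 'dropPart' file,
        break run-on uppercase headings onto new lines and drop stray '. N'
        numbering fragments."""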
with open(constants.ASSETS + "dropPart") as l:
# next(l)
drop_part = l.read().splitlines()
drop_part_pattern = re.compile('|'.join(drop_part))
line = str(line)
#line = line.lower()
line = re.sub(r'^([A-Z-/]+\s)+([A-Z])', r'\n\2', line)
line = re.sub(drop_part_pattern, r'\n', line)
line = re.sub(r'\s+\.\s?\d+\s+', ' ', line)
line = line.strip()
return line
    def smooth_data_cleaner(self, line):
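        """Tidy punctuation and whitespace: drop empty parentheses and figure
        reference ranges, remove bracketed paragraph numbers, rejoin split
        decimal numbers, normalize sentence breaks and collapse repeated
        spaces."""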
line = str(line)
# line = line.lower()
line = re.sub(r'\s+,', ',', line)
line = re.sub(r'\d\w-\d\w (and? \d\w-\d\w)?', '', line)
line = re.sub(r'\d\w-\d\w', '', line)
line = re.sub(r'\(\s?(,\s?|;\s?)+\s?\)', '', line)
line = re.sub(r'\s+\.\s\.', '.\n', line)
line = re.sub(r'\s+\.\s+([a-z]+)', r' \1', line)
line = re.sub(r'\s+(\.)\s+\[\s?\d+\s?]\s+', r'.\n', line)
line = re.sub(r'\s?\[\s?\d+\s?]\s+', r'\n', line)
line = re.sub(r'\s+(\.)\s+([A-Z]+)', r'.\n\2', line)
line = re.sub(r'\s+;\s+', '; ', line)
line = re.sub(r'\(\s+\'\s+\)', '', line)
line = re.sub(r'\(\s+\)', '', line)
line = re.sub(r'\(\s?\.\s?\)', '', line)
line = re.sub(r'\(\s/\s?\)', '', line)
line = re.sub(r'\s{2,}', ' ', line)
line = re.sub(r'(\d+)\s+(\.)\s+(\d+)', r'\1.\3', line)
line = line.strip()
return line
def get_project_folder(self):
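        """Return the name of the directory containing the first patent file;
        it is used as the project folder name for the corpus, temp and graph
        output folders."""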
patents = self.patents
if patents:
file = patents[0]
project_folder = os.path.basename(os.path.dirname(file))
return project_folder
def convert_to_uf8(self, input_file_name,output_file_name, file_encoding):
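        """Re-encode input_file_name from file_encoding to UTF-8 and write the
        result to output_file_name, reading in 1 MiB blocks."""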
BLOCKSIZE = 1048576
with codecs.open(input_file_name, "r", file_encoding) as input_file:
with codecs.open(output_file_name, "w", "utf-8") as output_file:
while True:
file_contents = input_file.read(BLOCKSIZE)
if not file_contents:
break
output_file.write(file_contents)
def sectionFinder(self, file_name, start_delimiter, end_delimiter):
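        """Return the text between the line equal to start_delimiter and the
        line equal to end_delimiter (the end-delimiter line itself is kept)."""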
        section = ""
        found = False
        with open(file_name, encoding='utf-8') as patent_file:
            for line in patent_file:
                if found:
                    section += line
                    if line.strip() == end_delimiter:
                        break
                elif line.strip() == start_delimiter:
                    found = True
        return section
def pretreat_data(self):
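        """Convert each patent file to UTF-8 in the corpus folder, extract and
        clean its Abstract, Claims and Description sections, and return a list
        of JSON strings, one per patent."""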
clean_patent_data= []
patents = self.patents
project_folder = self.get_project_folder()
# original code
# corpus_folder = constants.CORPUS + project_folder + "/"
corpus_folder = str(constants.CORPUS)+str(project_folder)+"/"
temp_folder = str(constants.TEMP)+str(project_folder)+"/"
graph_folder = str(constants.GRAPH_FOLDER)+str(project_folder)+"/"
folders = [corpus_folder, temp_folder, graph_folder]
for folder in folders:
if not os.path.exists(folder):
os.makedirs(folder)
else:
shutil.rmtree(folder)
os.makedirs(folder)
for patent in patents:
patent_name_with_extension = os.path.basename(patent)
            patent_name, extension = os.path.splitext(patent_name_with_extension)
corpus_patent_path = corpus_folder + patent_name_with_extension
#temp_patent_path = temp_folder + patent_name+'.json'
            with open(patent, 'rb') as raw_patent:
                patent_binary = raw_patent.read()
            file_encoding = chardet.detect(patent_binary)['encoding']
self.convert_to_uf8(patent,corpus_patent_path, file_encoding)
temp_file = StringIO()
#print(temp_patent_path)
a_abstract = self.sectionFinder(corpus_patent_path,"Abstract", "Claims")
a_abstract= self.custom_cleaner(a_abstract)
abstract_cleaner = FiguresCleaner(a_abstract)
a_abstract = ''.join(abstract_cleaner.clean_figures())
a_abstract = self.smooth_data_cleaner(a_abstract)
a_abstract = self.dataCleaner(a_abstract)
c_claims = self.sectionFinder(corpus_patent_path, "Claims", "")
c_claims = self.custom_cleaner(c_claims)
claims_cleaner = FiguresCleaner(c_claims)
c_claims = ''.join(claims_cleaner.clean_figures())
c_claims = self.smooth_data_cleaner(c_claims)
            c_claims = self.dataCleaner(c_claims)
d_description = self.sectionFinder(corpus_patent_path,"Description", "Claims")
d_description = self.custom_cleaner(d_description)
description_cleaner = FiguresCleaner(d_description)
d_description = ''.join(description_cleaner.clean_figures())
d_description = self.smooth_data_cleaner(d_description)
d_description = self.dataCleaner(d_description)
            # TODO: manipulate data in system memory.
data = {
'number': patent_name,
'abstract': a_abstract,
'claims': c_claims,
'description': d_description
}
json.dump(data, temp_file)
clean_patent_data.append(temp_file.getvalue())
return clean_patent_data
def pretreat_json(self):
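        """Clean patent records supplied as dicts (with filename, number,
        abstract, claims and description keys) and return a list of cleaned
        dicts."""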
clean_patent_data= []
patents = self.patents
temp_file = StringIO()
for patent in patents:
patent = json.dumps(patent)
read_patent_t = StringIO(patent)
patent_section = json.load(read_patent_t)
filename = patent_section['filename']
number = patent_section['number']
a_abstract = patent_section['abstract']
a_abstract= self.custom_cleaner(a_abstract)
abstract_cleaner = FiguresCleaner(a_abstract)
a_abstract = ''.join(abstract_cleaner.clean_figures())
a_abstract = self.smooth_data_cleaner(a_abstract)
a_abstract = self.dataCleaner(a_abstract)
c_claims = patent_section['claims']
c_claims = self.custom_cleaner(c_claims)
claims_cleaner = FiguresCleaner(c_claims)
c_claims = ''.join(claims_cleaner.clean_figures())
c_claims = self.smooth_data_cleaner(c_claims)
            c_claims = self.dataCleaner(c_claims)
d_description = patent_section['description']
d_description = self.custom_cleaner(d_description)
description_cleaner = FiguresCleaner(d_description)
d_description = ''.join(description_cleaner.clean_figures())
d_description = self.smooth_data_cleaner(d_description)
d_description = self.dataCleaner(d_description)
            # TODO: manipulate data in system memory.
data = {
'filename': filename,
'number': number,
'abstract': a_abstract,
'claims': c_claims,
'description': d_description
}
clean_patent_data.append(data)
#json.dumps(clean_patent_data, temp_file)
#print(json.dumps(clean_patent_data))
return clean_patent_data
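
# Example usage (a minimal sketch; the glob pattern and folder layout below are
# assumptions, not part of this module):
#
#     patent_files = glob.glob("Data/MyProject/*.txt")  # hypothetical input folder
#     handler = PatentHandler(patent_files)
#     cleaned_records = handler.pretreat_data()
#     for record in cleaned_records:
#         print(json.loads(record)["number"])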