# PatentSolver/App/bin/PatentHandler.py
# -*- coding: utf-8 -*-
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
import glob
import nltk
import os
import re
import codecs
import chardet
import shutil
import json
from io import StringIO
from App.bin import constants
from App.bin.FiguresCleaner import FiguresCleaner
from collections import OrderedDict
class PatentHandler(object):
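    """Cleans raw patent documents, either plain-text files (pretreat_data) or
    pre-parsed JSON records (pretreat_json), into normalized abstract, claims
    and description sections."""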
def __init__(self, patents):
self.patents = patents
def custom_cleaner(self, line):
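        """Normalize a raw patent section: strip URLs and PatentInspiration
        artifacts, swap braces for parentheses, decode leftover HTML entities,
        expand 'fig.'/'FIG.' abbreviations and remove the Abstract, Claims and
        Description headings."""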
line = str(line)
#line = line.lower()
line = re.sub(r'PatentInspiration Url', '', line)
line = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', line)
line = re.sub(r'{', '(', line)
line = re.sub(r'"', '\'', line)
line = re.sub(r'}', ')', line)
line = re.sub(r'\t.*patentinspiration.*\n', '', line)
        line = re.sub(r'(?:^|\n{2,})\bAbstract\b\n?', '', line)
        line = re.sub(r'(?:^|\n{2,})\bClaims\b\n?', '', line)
        line = re.sub(r'(?:^|\n{2,})\bDescription\b\n?', '', line)
line = re.sub(r'fig\.', 'figure', line)
line = re.sub(r'Fig\.', 'Figure', line)
line = re.sub(r'FIG\.', 'Figure', line)
line = re.sub(r'figs\.', 'figures', line)
line = re.sub(r'FIGS\.', 'Figures', line)
line = re.sub(r'(\w+\.)', r'\1 ', line)
        # decode HTML entities left over from the source markup
        line = re.sub(r'&#39;', '\'', line)
        line = re.sub(r'&gt;', '>', line)
        line = re.sub(r'&lt;', '<', line)
        line = re.sub(r'&#176;', ' deg.', line)
        line = re.sub(r'&nbsp;|\xa0', ' ', line)
line = line.strip()
return line
    def dataCleaner(self, line):
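        """Remove the boilerplate phrases listed in the ASSETS 'dropPart' file,
        break run-on uppercase headings onto new lines and drop stray '. N'
        numbering fragments."""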
with open(constants.ASSETS + "dropPart") as l:
# next(l)
drop_part = l.read().splitlines()
drop_part_pattern = re.compile('|'.join(drop_part))
line = str(line)
#line = line.lower()
line = re.sub(r'^([A-Z-/]+\s)+([A-Z])', r'\n\2', line)
line = re.sub(drop_part_pattern, r'\n', line)
line = re.sub(r'\s+\.\s?\d+\s+', ' ', line)
line = line.strip()
return line
    def smooth_data_cleaner(self, line):
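        """Tidy punctuation and whitespace: drop empty parentheses and figure
        reference ranges, remove bracketed paragraph numbers, rejoin split
        decimal numbers, normalize sentence breaks and collapse repeated
        spaces."""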
line = str(line)
# line = line.lower()
line = re.sub(r'\s+,', ',', line)
line = re.sub(r'\d\w-\d\w (and? \d\w-\d\w)?', '', line)
line = re.sub(r'\d\w-\d\w', '', line)
line = re.sub(r'\(\s?(,\s?|;\s?)+\s?\)', '', line)
line = re.sub(r'\s+\.\s\.', '.\n', line)
line = re.sub(r'\s+\.\s+([a-z]+)', r' \1', line)
line = re.sub(r'\s+(\.)\s+\[\s?\d+\s?]\s+', r'.\n', line)
line = re.sub(r'\s?\[\s?\d+\s?]\s+', r'\n', line)
line = re.sub(r'\s+(\.)\s+([A-Z]+)', r'.\n\2', line)
line = re.sub(r'\s+;\s+', '; ', line)
line = re.sub(r'\(\s+\'\s+\)', '', line)
line = re.sub(r'\(\s+\)', '', line)
line = re.sub(r'\(\s?\.\s?\)', '', line)
line = re.sub(r'\(\s/\s?\)', '', line)
line = re.sub(r'\s{2,}', ' ', line)
line = re.sub(r'(\d+)\s+(\.)\s+(\d+)', r'\1.\3', line)
line = line.strip()
return line
def get_project_folder(self):
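        """Return the name of the directory containing the first patent file;
        it is used as the project folder name for the corpus, temp and graph
        output folders."""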
patents = self.patents
if patents:
file = patents[0]
project_folder = os.path.basename(os.path.dirname(file))
return project_folder
def convert_to_uf8(self, input_file_name,output_file_name, file_encoding):
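        """Re-encode input_file_name from file_encoding to UTF-8 and write the
        result to output_file_name, reading in 1 MiB blocks."""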
BLOCKSIZE = 1048576
with codecs.open(input_file_name, "r", file_encoding) as input_file:
with codecs.open(output_file_name, "w", "utf-8") as output_file:
while True:
file_contents = input_file.read(BLOCKSIZE)
if not file_contents:
break
output_file.write(file_contents)
def sectionFinder(self, file_name, start_delimiter, end_delimiter):
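        """Return the text between the line equal to start_delimiter and the
        line equal to end_delimiter (the end-delimiter line itself is kept)."""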
        section = ""
        found = False
        with open(file_name, encoding='utf-8') as patent_file:
            for line in patent_file:
                if found:
                    section += line
                    if line.strip() == end_delimiter:
                        break
                elif line.strip() == start_delimiter:
                    found = True
        return section
def pretreat_data(self):
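        """Convert each patent file to UTF-8 in the corpus folder, extract and
        clean its Abstract, Claims and Description sections, and return a list
        of JSON strings, one per patent."""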
clean_patent_data= []
patents = self.patents
project_folder = self.get_project_folder()
# original code
# corpus_folder = constants.CORPUS + project_folder + "/"
corpus_folder = str(constants.CORPUS)+str(project_folder)+"/"
temp_folder = str(constants.TEMP)+str(project_folder)+"/"
graph_folder = str(constants.GRAPH_FOLDER)+str(project_folder)+"/"
folders = [corpus_folder, temp_folder, graph_folder]
for folder in folders:
if not os.path.exists(folder):
os.makedirs(folder)
else:
shutil.rmtree(folder)
os.makedirs(folder)
for patent in patents:
patent_name_with_extension = os.path.basename(patent)
            patent_name, extension = os.path.splitext(patent_name_with_extension)
corpus_patent_path = corpus_folder + patent_name_with_extension
#temp_patent_path = temp_folder + patent_name+'.json'
            with open(patent, 'rb') as raw_patent:
                patent_binary = raw_patent.read()
            file_encoding = chardet.detect(patent_binary)['encoding']
self.convert_to_uf8(patent,corpus_patent_path, file_encoding)
temp_file = StringIO()
#print(temp_patent_path)
a_abstract = self.sectionFinder(corpus_patent_path,"Abstract", "Claims")
a_abstract= self.custom_cleaner(a_abstract)
abstract_cleaner = FiguresCleaner(a_abstract)
a_abstract = ''.join(abstract_cleaner.clean_figures())
a_abstract = self.smooth_data_cleaner(a_abstract)
a_abstract = self.dataCleaner(a_abstract)
c_claims = self.sectionFinder(corpus_patent_path, "Claims", "")
c_claims = self.custom_cleaner(c_claims)
claims_cleaner = FiguresCleaner(c_claims)
c_claims = ''.join(claims_cleaner.clean_figures())
c_claims = self.smooth_data_cleaner(c_claims)
            c_claims = self.dataCleaner(c_claims)
d_description = self.sectionFinder(corpus_patent_path,"Description", "Claims")
d_description = self.custom_cleaner(d_description)
description_cleaner = FiguresCleaner(d_description)
d_description = ''.join(description_cleaner.clean_figures())
d_description = self.smooth_data_cleaner(d_description)
d_description = self.dataCleaner(d_description)
            # TODO: manipulate data in system memory.
data = {
'number': patent_name,
'abstract': a_abstract,
'claims': c_claims,
'description': d_description
}
json.dump(data, temp_file)
clean_patent_data.append(temp_file.getvalue())
return clean_patent_data
def pretreat_json(self):
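        """Clean patent records supplied as dicts (with filename, number,
        abstract, claims and description keys) and return a list of cleaned
        dicts."""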
clean_patent_data= []
patents = self.patents
temp_file = StringIO()
for patent in patents:
patent = json.dumps(patent)
read_patent_t = StringIO(patent)
patent_section = json.load(read_patent_t)
filename = patent_section['filename']
number = patent_section['number']
a_abstract = patent_section['abstract']
a_abstract= self.custom_cleaner(a_abstract)
abstract_cleaner = FiguresCleaner(a_abstract)
a_abstract = ''.join(abstract_cleaner.clean_figures())
a_abstract = self.smooth_data_cleaner(a_abstract)
a_abstract = self.dataCleaner(a_abstract)
c_claims = patent_section['claims']
c_claims = self.custom_cleaner(c_claims)
claims_cleaner = FiguresCleaner(c_claims)
c_claims = ''.join(claims_cleaner.clean_figures())
c_claims = self.smooth_data_cleaner(c_claims)
            c_claims = self.dataCleaner(c_claims)
d_description = patent_section['description']
d_description = self.custom_cleaner(d_description)
description_cleaner = FiguresCleaner(d_description)
d_description = ''.join(description_cleaner.clean_figures())
d_description = self.smooth_data_cleaner(d_description)
d_description = self.dataCleaner(d_description)
            # TODO: manipulate data in system memory.
data = {
'filename': filename,
'number': number,
'abstract': a_abstract,
'claims': c_claims,
'description': d_description
}
clean_patent_data.append(data)
#json.dumps(clean_patent_data, temp_file)
#print(json.dumps(clean_patent_data))
return clean_patent_data
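
# Example usage (a minimal sketch; the glob pattern and folder layout below are
# assumptions, not part of this module):
#
#     patent_files = glob.glob("Data/MyProject/*.txt")  # hypothetical input folder
#     handler = PatentHandler(patent_files)
#     cleaned_records = handler.pretreat_data()
#     for record in cleaned_records:
#         print(json.loads(record)["number"])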