# -*- coding: utf-8 -*- #java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080 import glob import nltk import os import re import codecs import chardet import shutil import json from io import StringIO from App.bin import constants from App.bin.FiguresCleaner import FiguresCleaner from collections import OrderedDict class PatentHandler(object): def __init__(self, patents): self.patents = patents def custom_cleaner(self, line): line = str(line) #line = line.lower() line = re.sub(r'PatentInspiration Url', '', line) line = re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', line) line = re.sub(r'{', '(', line) line = re.sub(r'"', '\'', line) line = re.sub(r'}', ')', line) line = re.sub(r'\t.*patentinspiration.*\n', '', line) line = re.sub(r'^|\n{2,}\bAbstract\b\n?', '', line) line = re.sub(r'^|\n{2,}\bClaims\b\n?', '', line) line = re.sub(r'^|\n{2,}\bDescription\b\n?', '', line) line = re.sub(r'fig\.', 'figure', line) line = re.sub(r'Fig\.', 'Figure', line) line = re.sub(r'FIG\.', 'Figure', line) line = re.sub(r'figs\.', 'figures', line) line = re.sub(r'FIGS\.', 'Figures', line) line = re.sub(r'(\w+\.)', r'\1 ', line) line = re.sub(r''', '\'', line) line = re.sub(r'>', '>', line) line = re.sub(r'<', '<', line) line = re.sub(r'°', ' deg.', line) line = re.sub(r' ', ' ', line) line = line.strip() return line def dataCleaner(self,line): with open(constants.ASSETS + "dropPart") as l: # next(l) drop_part = l.read().splitlines() drop_part_pattern = re.compile('|'.join(drop_part)) line = str(line) #line = line.lower() line = re.sub(r'^([A-Z-/]+\s)+([A-Z])', r'\n\2', line) line = re.sub(drop_part_pattern, r'\n', line) line = re.sub(r'\s+\.\s?\d+\s+', ' ', line) line = line.strip() return line def smooth_data_cleaner(self,line): line = str(line) # line = line.lower() line = re.sub(r'\s+,', ',', line) line = re.sub(r'\d\w-\d\w (and? \d\w-\d\w)?', '', line) line = re.sub(r'\d\w-\d\w', '', line) line = re.sub(r'\(\s?(,\s?|;\s?)+\s?\)', '', line) line = re.sub(r'\s+\.\s\.', '.\n', line) line = re.sub(r'\s+\.\s+([a-z]+)', r' \1', line) line = re.sub(r'\s+(\.)\s+\[\s?\d+\s?]\s+', r'.\n', line) line = re.sub(r'\s?\[\s?\d+\s?]\s+', r'\n', line) line = re.sub(r'\s+(\.)\s+([A-Z]+)', r'.\n\2', line) line = re.sub(r'\s+;\s+', '; ', line) line = re.sub(r'\(\s+\'\s+\)', '', line) line = re.sub(r'\(\s+\)', '', line) line = re.sub(r'\(\s?\.\s?\)', '', line) line = re.sub(r'\(\s/\s?\)', '', line) line = re.sub(r'\s{2,}', ' ', line) line = re.sub(r'(\d+)\s+(\.)\s+(\d+)', r'\1.\3', line) line = line.strip() return line def get_project_folder(self): patents = self.patents if patents: file = patents[0] project_folder = os.path.basename(os.path.dirname(file)) return project_folder def convert_to_uf8(self, input_file_name,output_file_name, file_encoding): BLOCKSIZE = 1048576 with codecs.open(input_file_name, "r", file_encoding) as input_file: with codecs.open(output_file_name, "w", "utf-8") as output_file: while True: file_contents = input_file.read(BLOCKSIZE) if not file_contents: break output_file.write(file_contents) def sectionFinder(self, file_name, start_delimiter, end_delimiter): patent_file = open(file_name, encoding='utf-8') section = "" found = False for line in patent_file: if found : section += line if line.strip() == end_delimiter: break else: if line.strip() == start_delimiter: found = True # abstract = "Abstract\n" return section def pretreat_data(self): clean_patent_data= [] patents = self.patents project_folder = self.get_project_folder() # original code # corpus_folder = constants.CORPUS + project_folder + "/" corpus_folder = str(constants.CORPUS)+str(project_folder)+"/" temp_folder = str(constants.TEMP)+str(project_folder)+"/" graph_folder = str(constants.GRAPH_FOLDER)+str(project_folder)+"/" folders = [corpus_folder, temp_folder, graph_folder] for folder in folders: if not os.path.exists(folder): os.makedirs(folder) else: shutil.rmtree(folder) os.makedirs(folder) for patent in patents: patent_name_with_extension = os.path.basename(patent) patent_name, extension= patent_name_with_extension.split('.') corpus_patent_path = corpus_folder + patent_name_with_extension #temp_patent_path = temp_folder + patent_name+'.json' patent_binary = open(patent, 'rb').read() file_encoding = chardet.detect(patent_binary) file_encoding = file_encoding['encoding'] self.convert_to_uf8(patent,corpus_patent_path, file_encoding) temp_file = StringIO() #print(temp_patent_path) a_abstract = self.sectionFinder(corpus_patent_path,"Abstract", "Claims") a_abstract= self.custom_cleaner(a_abstract) abstract_cleaner = FiguresCleaner(a_abstract) a_abstract = ''.join(abstract_cleaner.clean_figures()) a_abstract = self.smooth_data_cleaner(a_abstract) a_abstract = self.dataCleaner(a_abstract) c_claims = self.sectionFinder(corpus_patent_path, "Claims", "") c_claims = self.custom_cleaner(c_claims) claims_cleaner = FiguresCleaner(c_claims) c_claims = ''.join(claims_cleaner.clean_figures()) c_claims = self.smooth_data_cleaner(c_claims) c_claims = self.smooth_data_cleaner(c_claims) d_description = self.sectionFinder(corpus_patent_path,"Description", "Claims") d_description = self.custom_cleaner(d_description) description_cleaner = FiguresCleaner(d_description) d_description = ''.join(description_cleaner.clean_figures()) d_description = self.smooth_data_cleaner(d_description) d_description = self.dataCleaner(d_description) #TODO Manipulate data on system memory. data = { 'number': patent_name, 'abstract': a_abstract, 'claims': c_claims, 'description': d_description } json.dump(data, temp_file) clean_patent_data.append(temp_file.getvalue()) return clean_patent_data def pretreat_json(self): clean_patent_data= [] patents = self.patents temp_file = StringIO() for patent in patents: patent = json.dumps(patent) read_patent_t = StringIO(patent) patent_section = json.load(read_patent_t) filename = patent_section['filename'] number = patent_section['number'] a_abstract = patent_section['abstract'] a_abstract= self.custom_cleaner(a_abstract) abstract_cleaner = FiguresCleaner(a_abstract) a_abstract = ''.join(abstract_cleaner.clean_figures()) a_abstract = self.smooth_data_cleaner(a_abstract) a_abstract = self.dataCleaner(a_abstract) c_claims = patent_section['claims'] c_claims = self.custom_cleaner(c_claims) claims_cleaner = FiguresCleaner(c_claims) c_claims = ''.join(claims_cleaner.clean_figures()) c_claims = self.smooth_data_cleaner(c_claims) c_claims = self.smooth_data_cleaner(c_claims) d_description = patent_section['description'] d_description = self.custom_cleaner(d_description) description_cleaner = FiguresCleaner(d_description) d_description = ''.join(description_cleaner.clean_figures()) d_description = self.smooth_data_cleaner(d_description) d_description = self.dataCleaner(d_description) #TODO Manipulate data on system memory. data = { 'filename': filename, 'number': number, 'abstract': a_abstract, 'claims': c_claims, 'description': d_description } clean_patent_data.append(data) #json.dumps(clean_patent_data, temp_file) #print(json.dumps(clean_patent_data)) return clean_patent_data