PatentSolver/App/bin/InformationExtractor_Claims.py
import hashlib
import json
import os
import re
import uuid
from collections import OrderedDict

import Levenshtein
import nltk

from App.bin import constants
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.FiguresCleaner import FiguresCleaner
from App.bin.ParameterExtractor import ParameterExtractor
from App.bin.SharpClassifier import SharpClassifier


class InformationExtractorClaims(object):
    """Extracts concepts and their parameters from the claims section of a patent document."""

    def __init__(self, section, input_folder, file_extension, file_name):
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name

        # Extend the Punkt sentence tokenizer with patent-specific abbreviations
        # so that claim sentences are not split on abbreviation periods.
        with open(constants.ASSETS + "abbreviation_sentence_splitter") as f:
            patent_abbreviations = f.read().split()
        sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
        sentence_finder._params.abbrev_types.update(patent_abbreviations)
        self.sentence_finder = sentence_finder

    def clean_data(self, sentence):
        """Lower-case a sentence and strip empty parentheses, stray commas and leading claim numbers."""
        sentence = str(sentence.lower())
        sentence = re.sub(r'\(\s,?\s?\)', '', sentence)
        sentence = re.sub(r'\s+,', ',', sentence)
        sentence = re.sub(r'^\d+', '', sentence)
        sentence = re.sub(r'\s+', ' ', sentence)
        return sentence

    def truncate_data(self, sentence):
        """Strip claim connector phrases ("wherein", "characterized in that", ...) from a sentence."""
        sentence = str(sentence.lower())
        # More specific phrases are removed before their shorter prefixes.
        sentence = re.sub(r'wherein said\s*', '', sentence)
        sentence = re.sub(r'wherein\s*', '', sentence)
        sentence = re.sub(r'characterized in that said\s*|characterised in that said\s*', '', sentence)
        sentence = re.sub(r'characterized in that\s*|characterised in that\s*', '', sentence)
        sentence = re.sub(r'characterized\s*|characterised\s*', '', sentence)
        sentence = re.sub(r'where said\s*', '', sentence)
        sentence = re.sub(r'where\s*', '', sentence)
        sentence = re.sub(r'further comprising', 'the system or method comprises', sentence)
        sentence = re.sub(r'.*thereof\s*,?', '', sentence)
        sentence = re.sub(r'^\s+', '', sentence)
        sentence = re.sub(r'\s+\.$', '', sentence)
        return sentence

    def selectLines(self, line, lexic):
        """Return the fragment captured by the first matching pattern of the given lexicon file.

        The patterns listed in the lexicon are expected to contain one capturing group;
        lines that match no pattern yield None.
        """
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        claims_words = re.compile('|'.join(inclusion_list))
        m = re.search(claims_words, line)
        if m is not None:
            return m.group(1)

    def main(self):
        output_result = []
        compt_Id = 50
        count_concept = 3
        clean_content_list = []
        concept_list = []
        output_content = []
        uniq_output_linked_content = []
        parameters_list = []
        total_sentences_number = 0

        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension

        projectFolder = os.path.basename(os.path.normpath(input_folder))
        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")

        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

        if file_name is not None:
            # Split the publication number into country code (CC), number (NR)
            # and kind code (KC), e.g. "EP1234567A1" -> "EP", "1234567A1", "A1".
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            CC = match.group(1)
            NR = re.sub(r'\s', '', match.group(2))
            KC = match.group(4)
            urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
            urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'

        # The MD5 of the file name is computed but immediately overridden by a
        # random UUID, so every run gets a fresh graph item identifier.
        graphItemId = hashlib.md5(file_name.encode())
        graphItemIdValue = graphItemId.hexdigest()
        graphItemIdValue = str(uuid.uuid4())

        # Split the claims section into sentences and normalise each one.
        sentence_finder = self.sentence_finder
        sentences = sentence_finder.tokenize(section.strip())
        for sentence in sentences:
            sentence = self.clean_data(sentence)
            if sentence != '':
                clean_content_list.append(sentence)

        # Keep only lines that match the claims lexicon, then strip connector
        # phrases so that only the technical content of each claim remains.
        for line in clean_content_list:
            if not re.match(r'^\s*$', line):
                line = self.selectLines(line, 'claims_indices')
                if line is not None and count_concept > 0:
                    line = self.truncate_data(line)
                    line = re.sub(r'in that', '', line)
                    concept_list.append(line)
                    count_concept -= 1
            count_concept = 3

        if concept_list:
            total_sentences_number = len(concept_list)

        for concept in concept_list:
            if concept is not None and not re.match(r'^\s,', concept) and len(concept.split()) < 50:
                # Classify the concept to obtain its type ("polarite") and
                # extract the parameters it mentions.
                classifyT = ClassifyWithIncr_it()
                polarite = classifyT.main(concept)

                get_parameters = ParameterExtractor(concept)
                parameters = get_parameters.extract_parameters()
                # parameters_list is cumulative: each concept also carries the
                # parameters of the concepts processed before it.
                parameters_list.extend(parameters)

                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)

        output_result = list(set(output_result))
        output_json = ",".join(output_result)

        return output_json, total_sentences_number
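

# A minimal usage sketch, not part of the original module: the claims text, the
# input folder path and the file name "EP1234567A1" below are illustrative
# placeholders, assumed only for demonstration.
if __name__ == "__main__":
    sample_claims = (
        "1. A heat exchanger comprising a plurality of tubes, "
        "wherein said tubes are arranged in parallel rows."
    )
    extractor = InformationExtractorClaims(
        section=sample_claims,
        input_folder="Data/input/SamplePatents",
        file_extension="*.txt",
        file_name="EP1234567A1",
    )
    concepts_json, sentence_count = extractor.main()
    print(sentence_count)
    print(concepts_json)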