import hashlib
import json
import os
import re
import uuid
from collections import OrderedDict

import Levenshtein
import nltk

from App.bin import constants
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.FiguresCleaner import FiguresCleaner
from App.bin.ParameterExtractor import ParameterExtractor
from App.bin.SharpClassifier import SharpClassifier


class InformationExtractorClaims(object):

    def __init__(self, section, input_folder, file_extension, file_name):
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name
        # Teach the Punkt sentence splitter the patent-specific abbreviations
        # so claim sentences are not split in the middle of an abbreviation.
        patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
        sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
        sentence_finder._params.abbrev_types.update(patent_abbreviations)
        self.sentence_finder = sentence_finder

    def clean_data(self, sentence):
        """Lower-case a sentence and strip numbering and spacing artefacts."""
        sentence = str(sentence.lower())
        sentence = re.sub(r'\(\s,?\s?\)', '', sentence)  # empty parentheses such as "( , )"
        sentence = re.sub(r'\s+,', ',', sentence)        # space before a comma
        sentence = re.sub(r'^\d+', '', sentence)         # leading claim number
        sentence = re.sub(r'\s+', ' ', sentence)         # collapse repeated whitespace
        return sentence

    def truncate_data(self, sentence):
        """Remove claim boilerplate so only the technical statement remains."""
        sentence = str(sentence.lower())
        # Apply the longer, more specific patterns before the shorter ones,
        # otherwise "wherein"/"characterized" would eat part of the longer phrases.
        sentence = re.sub(r'wherein said\s*', '', sentence)
        sentence = re.sub(r'wherein\s*', '', sentence)
        sentence = re.sub(r'characterized in that said\s*|characterised in that said\s*', '', sentence)
        sentence = re.sub(r'characterized in that\s*|characterised in that\s*', '', sentence)
        sentence = re.sub(r'characterized\s*|characterised\s*', '', sentence)
        sentence = re.sub(r'where said\s*', '', sentence)
        sentence = re.sub(r'where\s*', '', sentence)
        sentence = re.sub(r'further comprising', 'the system or method comprises', sentence)
        sentence = re.sub(r'.*thereof\s*,?', '', sentence)  # drop everything up to "thereof"
        sentence = re.sub(r'^\s+', '', sentence)
        sentence = re.sub(r'\s+\.$', '', sentence)
        return sentence

    def selectLines(self, line, lexic):
        """Return the captured part of a line matching the claims lexicon, else None."""
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        claims_words = re.compile('|'.join(inclusion_list))
        m = re.search(claims_words, line)
        if m is not None:
            # Each lexicon entry is expected to contain one capture group.
            return m.group(1)

    def main(self):
        output_result = []
        compt_Id = 50
        count_concept = 3
        clean_content_list = []
        concept_list = []
        output_content = []
        uniq_output_linked_content = []
        parameters_list = []
        total_sentences_number = 0
        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension
        projectFolder = os.path.basename(os.path.normpath(input_folder))
        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")
        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

        if file_name is not None:
            # Split the publication number into country code (CC), number (NR)
            # and kind code (KC), e.g. "EP1234567A1" -> "EP", "1234567A1", "A1".
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            CC = match.group(1)
            NR = match.group(2)
            NR = re.sub(r'\s', '', NR)
            KC = match.group(4)
            urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
            urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
            # Identifier shared by the items extracted from this document; the md5
            # of the file name is computed but immediately replaced by a random UUID.
            graphItemId = hashlib.md5(file_name.encode())
            graphItemIdValue = graphItemId.hexdigest()
            graphItemIdValue = str(uuid.uuid4())

        sentence_finder = self.sentence_finder
        sentences = sentence_finder.tokenize(section.strip())

        # Split the claims section into sentences and clean each one.
        for sentence in sentences:
            sentence = self.clean_data(sentence)
            if sentence != '':
                clean_content_list.append(sentence)

        # Keep at most `count_concept` sentences that match the claims lexicon.
        for line in clean_content_list:
            if not re.match(r'^\s*$', line):
                line = self.selectLines(line, 'claims_indices')
                if line is not None and count_concept > 0:
                    line = self.truncate_data(line)
                    line = re.sub(r'in that', '', line)
                    concept_list.append(line)
                    count_concept -= 1
        count_concept = 3

        total_sentences_number = len(concept_list)

        for concept in concept_list:
            if concept is not None and not re.match(r'^\s,', concept) and len(concept.split()) < 50:
                # Classify the claim ("polarity") and extract its parameters;
                # parameters_list accumulates over all concepts of the document.
                classifyT = ClassifyWithIncr_it()
                polarite = classifyT.main(concept)
                get_parameters = ParameterExtractor(concept)
                parameters = get_parameters.extract_parameters()
                parameters_list.extend(parameters)
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                # sort_keys expects a boolean; keep the insertion order of the OrderedDict.
                json_string = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))
                output_result.append(json_string)

        # Deduplicate and return the concepts as a comma-separated JSON fragment.
        output_result = list(set(output_result))
        output_json = ",".join(output_result)
        return output_json, total_sentences_number
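

# A minimal usage sketch, not part of the original module. The paths, the file
# name "EP1000000A1" and the "*.txt" extension below are hypothetical
# placeholders; the constructor signature and main() come from the class above.
if __name__ == '__main__':
    claims_text = open('/data/patents/EP1000000A1.txt').read()
    extractor = InformationExtractorClaims(claims_text, '/data/patents', '*.txt', 'EP1000000A1')
    output_json, n_sentences = extractor.main()
    print(n_sentences)
    print(output_json)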