# -*- coding: utf-8 -*-
# Requires a running CoreNLP server for the downstream classifiers, e.g.:
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080

import nltk
nltk.download('all')

import os
import re
import json
import hashlib
import Levenshtein
import uuid

from collections import OrderedDict
from nltk import word_tokenize

from App.bin import constants
from App.bin.SharpClassifier import SharpClassifier
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.SentenceClassifier import SentenceClassifier
from App.bin.ParameterExtractor import ParameterExtractor


class InformationExtractor(object):

    # Extend the Punkt sentence splitter so common patent abbreviations do not
    # end a sentence.
    patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
    sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
    sentence_finder._params.abbrev_types.update(patent_abbreviations)

    def __init__(self, section, input_folder, file_extension, file_name):
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name
        print("Extracting problem graph")

    #@staticmethod
    def discardLines(self, line, lexic):
        """Return the line unless it contains a word from the exclusion lexicon."""
        with open(constants.ASSETS + lexic) as m:
            exclusion_list = m.read().splitlines()
        if not any(word in line for word in exclusion_list):
            return line

    def selectLines(self, line, lexic):
        """Return the line only if it contains a word from the inclusion lexicon."""
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        if any(word in line for word in inclusion_list):
            return line

    def last_cleansing(self, concept):
        """Strip leading discourse markers and trailing punctuation from a concept string."""
        concept = str(concept).lower()
        if concept.endswith("."):
            concept = concept.strip(".")
        concept = re.sub(r'^consequently ', '', concept)
        concept = re.sub(r'^such ', '', concept)
        concept = re.sub(r'^said ', '', concept)
        concept = re.sub(r'^\s+', '', concept)
        concept = re.sub(r'^it is worth noting that ', '', concept)
        concept = re.sub(r'^example of ', '', concept)
        concept = re.sub(r'^since ', '', concept)
        concept = re.sub(r'^\( |\)$ ', '', concept)
        return concept

    # def get_from_claims(self):
    #     section = self.section
    #     content = []
    #     sentence_finder = InformationExtractor.sentence_finder
    #     sentences = sentence_finder.tokenize(section.strip())
    #     with open(constants.ASSETS + "getFromClaims") as concept:
    #         # next(concept)
    #         included_words = concept.read().splitlines()
    #     include_link_pattern = re.compile('|'.join(included_words))

    def get_from_description(self):
        previous_polarity = ''
        noise_trash = []
        content = []
        include_links = []
        output_content = []
        ex_output_content = []
        output_result = []
        output_linked_content = []
        output_inter_content = []
        uniq_output_linked_content = []
        ex_output_content_linked = []

        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension
        projectFolder = os.path.basename(os.path.normpath(input_folder))
        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")

        # The MD5 of the file name is computed but immediately superseded by a random
        # UUID, which is what actually prefixes every graph item id below.
        graphItemId = hashlib.md5(file_name.encode())
        graphItemIdValue = graphItemId.hexdigest()
        graphItemIdValue = str(uuid.uuid4())

        t_sline = ""
        t_sline_ex = []
        compt_Id = 30
        compt_Id_ex = 40
        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
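        # Illustration (an assumption added for clarity, not in the original code):
        # file_name is expected to look like an Espacenet publication number, e.g.
        # "EP1365274A1", which the regex below would split into CC="EP" (country code),
        # NR="1365274" (number) and KC="A1" (kind code). The example value is purely
        # hypothetical.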
        # Fallbacks so the URLs are always defined, even when no file name is given
        # or it does not match the expected pattern.
        urlImg = ''
        urlPDF = ''
        if file_name is not None:
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            if match:
                # CC for country code
                CC = match.group(1)
                # NR for number
                NR = match.group(2)
                NR = re.sub(r'\s', '', NR)
                # KC for kind code
                KC = match.group(4)
                urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
                urlPDF = (root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC
                          + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#')

        sentence_finder = InformationExtractor.sentence_finder
        #section = self.dataCleaner(section)
        #print(section)
        sentences = sentence_finder.tokenize(section.strip())

        # Sentence-initial linking words, loaded from the assets lexicon.
        with open(constants.ASSETS + "includeLinks") as concept:
            # next(concept)
            included_words = concept.read().splitlines()
        include_link_pattern = re.compile('|'.join(included_words))

        # Exemplification clue words, loaded from the assets lexicon.
        with open(constants.ASSETS + "examplificationclues") as examplif:
            # next(concept)
            exam_words = examplif.read().splitlines()
        examplif_word_pattern = re.compile('|'.join(exam_words))

        description_sentences_number = len(sentences)
        number_of_words = 0
        for sentence in sentences:
            # with open(constants.DATA + 'sentences.txt', 'a', encoding='utf8') as file_handler:
            #     for item in sentences:
            #         file_handler.write("{}\n".format(item))
            number_of_word = len(nltk.word_tokenize(sentence))
            number_of_words += number_of_word
            sentenced = self.discardLines(sentence, "exclusionList")
            if sentenced is not None:
                content.append(sentenced)
                #print("origine=> " + sentence)
        total_sentences_number = len(sentences)
        # mean_sentence_length = int(round(number_of_words / total_sentences_number))
        # print(mean_sentence_length)

        # Route each retained sentence: linking sentences, comma-free sentences and
        # multi-comma sentences are handled separately. Iterate over a copy because
        # multi-comma lines are removed from `content` while looping.
        for line in list(content):
            line = self.selectLines(line, "inclusionList")
            if line is not None:
                if re.match(include_link_pattern, line):
                    include_links.append(line)
                    #print(line)
                if line.count(',') == 0:
                    output_content.append(line)
                    # content.remove(line)
                if line.count(',') > 0:
                    output_inter_content.append(line)
                    content.remove(line)

        for s in content:
            # print(s, file_name)
            sentence = self.discardLines(s, "FilterS")
            if sentence is not None:
                if s.count(',') <= 2 and re.match(examplif_word_pattern, s.lower()):
                    cs = str(s).lower()
                    cs = re.sub(examplif_word_pattern, '', cs)
                    cs = re.sub('which', 'this/these', cs)
                    cs = re.sub(r'\.$', '', cs)
                    #print(s)
                    if cs.count(',') == 1 and cs.count('such as') == 0:
                        ex_output_content_linked.append(cs)
                    else:
                        ex_output_content.append(cs)
                elif s.count(',') == 1:
                    s = str(s).lower()
                    s = self.selectLines(s, "OneCommaDiscriminator")
                    if s is not None:
                        #s = re.sub('which', 'this/these', s)
                        #print(s)
                        s = re.sub(r'^thus, ', '', s)
                        s = re.sub(r'^preferably, ', '', s)
                        s = re.sub(r'^conventional ', '', s)
                        s = re.sub(r'^in particular, ', '', s)
                        s = re.sub(r'^specifically, ', '', s)
                        s = re.sub(r'^as necessary, ', '', s)
                        s = re.sub(', which', ',this/these', s)
                        s = re.sub(r'\.$', '', s)
                        if s.count(',') == 1:
                            ex_output_content_linked.append(s)
                        else:
                            ex_output_content.append(s)
                else:
                    pass

        print(len(ex_output_content_linked))
        ex_output_content_linked = list(set(ex_output_content_linked))

        # Exemplification sentences with one comma: split on the comma and chain the
        # parts as parent/child concepts.
        for line in ex_output_content_linked:
            line = line.lower()
            if 'figure' not in line:
                #if line.count(',') <= 1:
                t_sline_ex = line.strip().split(',')
                #print("outpib" + str(t_sline_ex))
                for index_ex, concept in enumerate(t_sline_ex):
                    #print("outpib" + str(concept))
                    parameters_list = []
                    compteur = 0
                    compt_Id_ex += 1
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged
                            if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
                    if len(tags) < 1:
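                        # Added note (presumed rationale): clauses with no verb-like tag
                        # cannot express a problem or partial solution, so they are
                        # skipped before classification.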
                        continue
                    # classifyT = SentenceClassifier(concept)
                    # polarite = classifyT.classifySentence()
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)
                    # if polarite == 'neutre':
                    #     classify = SentenceClassifier(concept)
                    #     polarite = classify.classifySentence()
                    # print(concept)
                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()
                    parameters_list.extend(parameters)
                    # parameters_list = ", ".join(parameters_list)
                    #print("Index is: ")
                    #print(index_ex)
                    #print(concept)
                    clean_concept = self.last_cleansing(concept)
                    # if polarite == 'neutre':
                    #     words = word_tokenize(clean_concept)
                    #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
                    #     noise_trash.append(hit)
                    validity = self.discardLines(concept, 'referencing_indices')
                    if index_ex == 0 and validity is not None:
                        # First clause of the sentence: record it as a parent pointing to
                        # the next id ("enfants" = children).
                        previous_polarity = polarite
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "enfants": graphItemIdValue + str(compt_Id_ex + 1),
                                "id": graphItemIdValue + str(compt_Id_ex),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    else:
                        print("Previous polarity is : " + str(previous_polarity))
                        if previous_polarity == 'partialSolution' or validity is None:
                            continue
                        compteur += 1
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "parents": graphItemIdValue + str(compt_Id_ex - 1),
                                "id": graphItemIdValue + str(compt_Id_ex),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    json_string_linkes = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                    output_result.append(json_string_linkes)

        #for line in output_content:
        #    print("include=> " + line)

        # Stand-alone exemplification sentences (no parent/child link).
        # TODO (original note): make a function of that.
        ex_output_content = list(set(ex_output_content))
        for concept in ex_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged
                    if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id_ex += 1
            # classify = SentenceClassifier(sline)
            # polarite = classify.classifySentence()
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)
            # if polarite == 'neutre':
            #     classify = SentenceClassifier(concept)
            #     polarite = classify.classifySentence()
            #if polarite == 'partialSolution':
            #    print(sline)
            #Insert a classifier here
            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()
            clean_concept = self.last_cleansing(concept)
            parameters_list.extend(parameters)
            # if polarite == 'neutre':
            #     words = word_tokenize(clean_concept)
            #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
            #     noise_trash.append(hit)
            # parameters_list = ", ".join(parameters_list)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id_ex),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)

        # Linking sentences: strip the connective ("however", "if", "when", "since")
        # before further processing. Lower-case to improve matching.
        for line in include_links:
            #print(line)
            line = line.lower()
            if re.match(r'however', line) and line.count(',') <= 1:
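                # Added note: a short "however" sentence is kept whole; only the
                # connective and the comma are stripped so it reads as a stand-alone clause.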
                line = str(line)
                sline = re.sub(r'however|,', '', line)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'however', line) and line.count(',') > 1:
                # Successively strip "however, ..." introductions of increasing complexity.
                sline = re.sub(r'^however,?(\s\w+)\s*, that ', '', line)
                # sline = re.sub(r'however,.+, that ', '', sline)
                sline = re.sub(r'^however,?(\s\w+)+\s(above), ', '', sline)
                sline = re.sub(r'^however,?\s\w+ed(\s\w+)+,\s*', '', sline)
                sline = re.sub(r'^however,?\sif\s(desired|said)\s*,\s', '', sline)
                sline = re.sub(r'^however,?\s(it)\s(will be appreciated)\s*,\s(that)+\s*', '', sline)
                sline = re.sub(r'^however,?\s(as|if|because|when|since)\s*(?!is)', '', sline)
                sline = re.sub(r'^however,?\s*', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'if', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'^if\s?(and when|not|desired|necessary)\s?,?\s*', '', line)
                sline = re.sub(r'^if,?\s*', '', sline)
                sline = re.sub(r'^if ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
                # print(sline)
            if re.match(r'when', line):
                line = str(line).lower()
                sline = re.sub(r'^when\s*', '', line)
                sline = re.sub(r'^when,?\s*', '', sline)
                sline = re.sub(r'^when ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'(^since)|(^\w+\s?,\s?since\s?)', line):
                sline = re.sub(r'^since', '', line)
                sline = re.sub(r'^\w+\s?,\s?since\s?', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)

        # Also strip a leading "if " from the comma-free sentences. Iterate over a copy
        # because new variants are appended to output_content inside the loop.
        for line in list(output_content):
            line = line.lower()
            if re.match(r'if', line):
                line = str(line)
                sline = re.sub(r'^if ', '', line)
                if sline not in output_linked_content:
                    output_content.append(sline)
                    #output_content.remove(line)

        uniq_output_linked_content = list(set(output_linked_content))
        # Linked sentences: split on the comma and chain the parts as parent/child
        # concepts. Iterate over a copy because sentences mentioning figures are
        # removed on the fly.
        for line in list(uniq_output_linked_content):
            #print("long sentences = > " + line)
            #print(line)
            line = line.lower()
            if 'figure' in line:
                uniq_output_linked_content.remove(line)
                continue
            sline = re.sub(r'^\s+', '', line)
            sline = re.sub(r'^\d+\.+$', '', sline)
            if sline.count(',') <= 1:
                t_sline = tuple(sline.strip().split(', '))
                #print("outpib" + str(t_sline))
                for index_l, concept in enumerate(t_sline):
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged
                            if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
                    if len(tags) < 1:
                        continue
                    parameters_list = []
                    compteur = 0
                    compt_Id += 1
                    # classifyT = SentenceClassifier(concept)
                    # polarite = classifyT.classifySentence()
                    # Second, broader verb check (any verb tag or comparative adjective).
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged if pos.startswith('V') or pos == 'JJR']
                    if len(tags) < 1:
                        continue
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)
                    # if polarite == 'neutre':
                    #     classify = SentenceClassifier(concept)
                    #     polarite = classify.classifySentence()
                    # print(concept)
                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()
                    parameters_list.extend(parameters)
                    # parameters_list = ", ".join(parameters_list)
                    clean_concept = self.last_cleansing(concept)
                    validity = self.discardLines(concept, 'referencing_indices')
                    # if polarite == 'neutre':
                    #     words = word_tokenize(clean_concept)
                    #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
                    #     noise_trash.append(hit)
                    if index_l == 0 and validity is not None:
                        previous_polarity = polarite
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "enfants": graphItemIdValue + str(compt_Id + 1),
                                "id": graphItemIdValue + str(compt_Id),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    else:
                        print("Previous polarity is : " + str(previous_polarity))
                        if previous_polarity == 'partialSolution' or validity is None:
                            continue
                        compteur += 1
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "parents": graphItemIdValue + str(compt_Id - 1),
                                "id": graphItemIdValue + str(compt_Id),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    json_string_linked = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                    output_result.append(json_string_linked)

        # Deduplicate near-identical sentences, keeping the longer of any two whose
        # Levenshtein similarity exceeds 0.7. Iterate over copies and re-check
        # membership because items are removed from the working list.
        uniq_output_content = list(set(output_content))
        for s in list(uniq_output_content):
            for y in list(uniq_output_content):
                if s != y and s in uniq_output_content and y in uniq_output_content:
                    result = Levenshtein.ratio(s, y)
                    if result > .7:
                        # print(s + " :IS SIMILAR TO: " + y)
                        if len(s) > len(y):
                            uniq_output_content.remove(y)
                        elif len(y) > len(s):
                            uniq_output_content.remove(s)

        # Remaining comma-free sentences become stand-alone concepts.
        for concept in uniq_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged
                    if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id += 1
            sline = re.sub(r'^if ', '', concept)
            sline = re.sub(r'^(if|preferably) ', '', sline)
            sline = re.sub(r'^\s+?said ', '', sline)
            # classify = SentenceClassifier(sline)
            # polarite = classify.classifySentence()
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)
            # if polarite == 'neutre':
            #     classify = SentenceClassifier(sline)
            #     polarite = classify.classifySentence()
            #if polarite == 'partialSolution':
            #    print(sline)
            #Insert a classifier here
            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()
            parameters_list.extend(parameters)
            # parameters_list = ", ".join(parameters_list)
            clean_concept = self.last_cleansing(sline)
            # if polarite == 'neutre':
            #     words = word_tokenize(clean_concept)
            #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
            #     noise_trash.append(hit)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)

        output_result = list(set(output_result))
        output_json = ",".join(output_result)
        return output_json, total_sentences_number
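

# Usage sketch (an assumption added for illustration, not part of the original module).
# It shows how the class above is meant to be driven: instantiate it with a patent
# description section plus the file metadata used to build the Espacenet URLs, then
# call get_from_description(). The folder, file name and extension below are
# hypothetical placeholders.
if __name__ == '__main__':
    sample_folder = '/data/patents'      # hypothetical input folder
    sample_name = 'EP1365274A1'          # hypothetical publication number
    sample_extension = '*.txt'           # extension in the form the project expects

    with open(sample_folder + '/' + sample_name + '.txt', encoding='utf8') as handle:
        description_section = handle.read()

    extractor = InformationExtractor(description_section, sample_folder,
                                     sample_extension, sample_name)
    concepts_json, sentence_count = extractor.get_from_description()

    # get_from_description() returns comma-joined JSON objects; wrap them in
    # brackets to obtain a valid JSON array.
    print(sentence_count)
    print('[' + concepts_json + ']')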