# -*- coding: utf-8 -*-
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
import nltk
nltk.download('all')
import os
import re
import json
import hashlib
import Levenshtein
import uuid
from App.bin import constants
from collections import OrderedDict
from nltk import word_tokenize
from App.bin.SharpClassifier import SharpClassifier
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.SentenceClassifier import SentenceClassifier
from App.bin.ParameterExtractor import ParameterExtractor


class InformationExtractor(object):
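    """Extracts problem/solution concepts from a patent description section and
    serialises them as JSON graph items (the "problem graph")."""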
    patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
    sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
    sentence_finder._params.abbrev_types.update(patent_abbreviations)

    def __init__(self, section, input_folder, file_extension, file_name):
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name
        print("Extracting problem graph")
    # @staticmethod
    def discardLines(self, line, lexic):
        with open(constants.ASSETS + lexic) as m:
            exclusion_list = m.read().splitlines()
        if any(word in line for word in exclusion_list):
            pass
        else:
            return line

    def selectLines(self, line, lexic):
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        if any(word in line for word in inclusion_list):
            return line
    def last_cleansing(self, concept):
        concept = str(concept)
        concept = concept.lower()
        if concept.endswith("."):
            concept = concept.rstrip(".")
        concept = re.sub(r'^consequently ', '', concept)
        concept = re.sub(r'^such ', '', concept)
        concept = re.sub(r'^said ', '', concept)
        concept = re.sub(r'^\s+', '', concept)
        concept = re.sub(r'^it is worth noting that ', '', concept)
        concept = re.sub(r'^example of ', '', concept)
        concept = re.sub(r'^since ', '', concept)
        concept = re.sub(r'^\( |\)$', '', concept)
        return concept
    # def get_from_claims(self):
    #
    #     section = self.section
    #     content = []
    #     sentence_finder = InformationExtractor.sentence_finder
    #     sentences = sentence_finder.tokenize(section.strip())
    #     with open(constants.ASSETS + "getFromClaims") as concept:
    #         # next(concept)
    #         included_words = concept.read().splitlines()
    #         include_link_pattern = re.compile('|'.join(included_words))
    def get_from_description(self):
        previous_polarity = ''
        noise_trash = []
        content = []
        include_links = []
        output_content = []
        ex_output_content = []
        output_result = []
        output_linked_content = []
        output_inter_content = []
        uniq_output_linked_content = []
        ex_output_content_linked = []

        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension
        projectFolder = os.path.basename(os.path.normpath(input_folder))
        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")

        graphItemId = hashlib.md5(file_name.encode())
        graphItemIdValue = graphItemId.hexdigest()
        graphItemIdValue = str(uuid.uuid4())

        t_sline = ""
        t_sline_ex = []
        compt_Id = 30
        compt_Id_ex = 40

        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
        if file_name is not None:
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            # CC for country code
            CC = match.group(1)
            # NR for Number
            NR = match.group(2)
            NR = re.sub(r'\s', '', NR)
            # KC for Kind code
            KC = match.group(4)
            urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
            urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
        sentence_finder = InformationExtractor.sentence_finder
        # section = self.dataCleaner(section)
        # print(section)
        sentences = sentence_finder.tokenize(section.strip())

        with open(constants.ASSETS + "includeLinks") as concept:
            # next(concept)
            included_words = concept.read().splitlines()
        include_link_pattern = re.compile('|'.join(included_words))

        # open examplification word file
        with open(constants.ASSETS + "examplificationclues") as examplif:
            # next(concept)
            exam_words = examplif.read().splitlines()
        examplif_word_pattern = re.compile('|'.join(exam_words))

        description_sentences_number = len(sentences)
        number_of_words = 0
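        # First pass: count words and keep only sentences that do not match the exclusion list.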
        for sentence in sentences:
            # with open(constants.DATA + 'sentences.txt', 'a', encoding='utf8') as file_handler:
            #     for item in sentences:
            #         file_handler.write("{}\n".format(item))
            number_of_word = len(nltk.word_tokenize(sentence))
            number_of_words += number_of_word
            sentenced = self.discardLines(sentence, "exclusionList")
            if sentenced is not None:
                content.append(sentenced)
                # print("origine=> "+sentence)

        total_sentences_number = len(sentences)
        # mean_sentence_length = int(round(number_of_words/total_sentences_number))
        # print(mean_sentence_length)
        for line in content:
            line = self.selectLines(line, "inclusionList")
            if line is not None:
                if re.match(include_link_pattern, line):
                    include_links.append(line)
                    # print(line)
                if line.count(',') == 0:
                    output_content.append(line)
                    # content.remove(line)
                if line.count(',') > 0:
                    output_inter_content.append(line)
                    content.remove(line)
        for s in content:
            # print(s, file_name)
            sentence = self.discardLines(s, "FilterS")
            if sentence is not None:
                if s.count(',') <= 2 and re.match(examplif_word_pattern, s.lower()):
                    s = str(s)
                    cs = s.lower()
                    cs = re.sub(examplif_word_pattern, '', cs)
                    cs = re.sub('which', 'this/these', cs)
                    cs = re.sub(r'\.$', '', cs)
                    # print(s)
                    if cs.count(',') == 1 and cs.count('such as') == 0:
                        ex_output_content_linked.append(cs)
                    else:
                        ex_output_content.append(cs)
                elif s.count(',') == 1:
                    s = str(s)
                    s = s.lower()
                    s = self.selectLines(s, "OneCommaDiscriminator")
                    if s is not None:
                        # s = re.sub('which', 'this/these', s)
                        # print(s)
                        s = re.sub(r'^thus, ', '', s)
                        s = re.sub(r'^preferably, ', '', s)
                        s = re.sub(r'^conventional ', '', s)
                        s = re.sub(r'^in particular, ', '', s)
                        s = re.sub(r'^specifically, ', '', s)
                        s = re.sub(r'^as necessary, ', '', s)
                        s = re.sub(', which', ',this/these', s)
                        s = re.sub(r'\.$', '', s)
                        if s.count(',') == 1:
                            ex_output_content_linked.append(s)
                        else:
                            ex_output_content.append(s)
                else:
                    pass

        print(len(ex_output_content_linked))
        ex_output_content_linked = list(set(ex_output_content_linked))
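        # Exemplification sentences are split on the comma and emitted as linked concepts:
        # the first fragment carries an "enfants" link, the following ones a "parents" link.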
        for line in ex_output_content_linked:
            line = line.lower()
            if 'figure' not in line:
                # if line.count(',') <= 1:
                t_sline_ex = line.strip().split(',')
                # print("outpib"+str(t_sline_ex))
                for concept in t_sline_ex:
                    # print("outpib" + str(concept))
                    words = nltk.word_tokenize(concept)
                    tagged = nltk.pos_tag(words)
                    # print(tagged)
                    parameters_list = []
                    compteur = 0
                    compt_Id_ex += 1
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged
                            if pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
                    if len(tags) < 1:
                        continue
                    # classifyT = SentenceClassifier(concept)
                    # polarite = classifyT.classifySentence()
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)
                    # if polarite == 'neutre':
                    #     classify = SentenceClassifier(concept)
                    #     polarite = classify.classifySentence()
                    # print(concept)
                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()
                    parameters_list.extend(parameters)
                    # parameters_list = ", ".join(parameters_list)
                    # parameters_list = parameters_list
                    # print("Index is: ")
                    # print(t_sline_ex.index(concept))
                    # print(concept)
                    clean_concept = self.last_cleansing(concept)
                    # if polarite == 'neutre':
                    #     words = word_tokenize(clean_concept)
                    #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
                    #     noise_trash.append(hit)
                    validity = self.discardLines(concept, 'referencing_indices')
                    if t_sline_ex.index(concept) == 0 and validity is not None:
                        previous_polarity = polarite
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "enfants": graphItemIdValue + str(compt_Id_ex + 1),
                                "id": graphItemIdValue + str(compt_Id_ex),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    else:
                        print("Previous polarity is : " + str(previous_polarity))
                        if previous_polarity == 'partialSolution' or validity is None:
                            continue
                        else:
                            compteur += 1
                            values = OrderedDict({
                                "concept": {
                                    "type": polarite,
                                    "parents": graphItemIdValue + str(compt_Id_ex - 1),
                                    "id": graphItemIdValue + str(compt_Id_ex),
                                    "sentence": clean_concept,
                                    "source": output_file_name,
                                    "parameters": parameters_list,
                                    "image": urlImg,
                                    "pdf": urlPDF
                                }
                            })
                    json_string_linkes = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
                    output_result.append(json_string_linkes)

        # for line in output_content:
        #     print("include=> " + line)

        # just examplification sentences
        # make a function of that
        ex_output_content = list(set(ex_output_content))
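        # Standalone exemplification sentences become unlinked concept nodes.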
        for concept in ex_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged
                    if pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id_ex += 1
            # classify = SentenceClassifier(sline)
            # polarite = classify.classifySentence()
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)
            # if polarite == 'neutre':
            #     classify = SentenceClassifier(concept)
            #     polarite = classify.classifySentence()
            # print(sline)
            # if polarite == 'partialSolution':
            #     print(sline)
            # Insert a classifier here
            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()
            clean_concept = self.last_cleansing(concept)
            parameters_list.extend(parameters)
            # if polarite == 'neutre':
            #     words = word_tokenize(clean_concept)
            #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
            #     noise_trash.append(hit)
            # parameters_list = ", ".join(parameters_list)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id_ex),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
                output_result.append(json_string)
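        # Sentences opening with a connective (however / if / when / since) have the
        # connective clause stripped before being added to the linked-content pool.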
        for line in include_links:
            # print(line)
            # Put in lower case to improve matching
            line = line.lower()
            if re.match(r'however', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'however|,', '', line)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'however', line) and line.count(',') > 1:
                sline = re.sub(r'^however,?(\s\w+)\s*, that ', '', line)
                # sline = re.sub(r'however,.+, that ', '', sline)
                sline = re.sub(r'^however,?(\s\w+)+\s(above), ', '', sline)
                sline = re.sub(r'^however,?\s\w+ed(\s\w+)+,\s*', '', sline)
                sline = re.sub(r'^however,?\sif\s(desired|said)\s*,\s', '', sline)
                sline = re.sub(r'^however,?\s(it)\s(will be appreciated)\s*,\s(that)+\s*', '', sline)
                sline = re.sub(r'^however,?\s(as|if|because|when|since)\s*(?!is)', '', sline)
                sline = re.sub(r'^however,?\s*', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'if', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'^if\s?(and when|not|desired|necessary)\s?,?\s*', '', line)
                sline = re.sub(r'^if,?\s*', '', sline)
                sline = re.sub(r'^if ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
                    # print(sline)
            if re.match(r'when', line):
                line = str(line)
                line = line.lower()
                sline = re.sub(r'^when\s*', '', line)
                sline = re.sub(r'^when,?\s*', '', sline)
                sline = re.sub(r'^when ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'(^since)|(^\w+\s?,\s?since\s?)', line):
                sline = re.sub(r'^since', '', line)
                sline = re.sub(r'^\w+\s?,\s?since\s?', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)

        for line in output_content:
            line = line.lower()
            if re.match(r'if', line):
                line = str(line)
                sline = re.sub(r'^if ', '', line)
                if sline not in output_linked_content:
                    output_content.append(sline)
                    # output_content.remove(line)

        uniq_output_linked_content = list(set(output_linked_content))
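        # Linked sentences with at most one comma are split on ', ' and emitted as chained
        # concepts: the first fragment points to its child, later fragments to their parent.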
        for line in uniq_output_linked_content:
            # print("long sentences = > " + line)
            # line = str(i)
            # print(line)
            line = line.lower()
            if 'figure' in line:
                uniq_output_linked_content.remove(line)
            sline = re.sub(r'^\s+', '', line)
            sline = re.sub(r'^\d+\.+$', '', sline)
            if sline.count(',') <= 1:
                t_sline = tuple(sline.strip().split(', '))
                # print("outpib"+str(t_sline))
                for concept in t_sline:
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged
                            if pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
                    if len(tags) < 1:
                        continue
                    else:
                        parameters_list = []
                        compteur = 0
                        compt_Id += 1
                        # classifyT = SentenceClassifier(concept)
                        # polarite = classifyT.classifySentence()
                        tagged = nltk.pos_tag(word_tokenize(concept))
                        tags = [word for word, pos in tagged if pos.startswith('V') or pos == 'JJR']
                        if len(tags) < 1:
                            continue
                        classifyT = ClassifyWithIncr_it()
                        polarite = classifyT.main(concept)
                        # if polarite == 'neutre':
                        #     classify = SentenceClassifier(concept)
                        #     polarite = classify.classifySentence()
                        # print(concept)
                        get_parameters = ParameterExtractor(concept)
                        parameters = get_parameters.extract_parameters()
                        parameters_list.extend(parameters)
                        # parameters_list = ", ".join(parameters_list)
                        # parameters_list = parameters_list
                        clean_concept = self.last_cleansing(concept)
                        validity = self.discardLines(concept, 'referencing_indices')
                        # if polarite == 'neutre':
                        #     words = word_tokenize(clean_concept)
                        #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
                        #     noise_trash.append(hit)
                        if t_sline.index(concept) == 0 and validity is not None:
                            previous_polarity = polarite
                            values = OrderedDict({
                                "concept": {
                                    "type": polarite,
                                    "enfants": graphItemIdValue + str(compt_Id + 1),
                                    "id": graphItemIdValue + str(compt_Id),
                                    "sentence": clean_concept,
                                    "source": output_file_name,
                                    "parameters": parameters_list,
                                    "image": urlImg,
                                    "pdf": urlPDF
                                }
                            })
                        else:
                            print("Previous polarity is : " + str(previous_polarity))
                            if previous_polarity == 'partialSolution' or validity is None:
                                continue
                            else:
                                compteur += 1
                                values = OrderedDict({
                                    "concept": {
                                        "type": polarite,
                                        "parents": graphItemIdValue + str(compt_Id - 1),
                                        "id": graphItemIdValue + str(compt_Id),
                                        "sentence": clean_concept,
                                        "source": output_file_name,
                                        "parameters": parameters_list,
                                        "image": urlImg,
                                        "pdf": urlPDF
                                    }
                                })
                        json_string_linked = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
                        output_result.append(json_string_linked)
        uniq_output_content = list(set(output_content))
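        # Drop near-duplicate sentences (Levenshtein similarity above 0.7), keeping the longer one.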
        for s in uniq_output_content:
            for y in uniq_output_content:
                if s != y:
                    result = Levenshtein.ratio(s, y)
                    if result > .7:
                        # print(s + " :IS SIMILAR TO: " + y)
                        if len(s) > len(y):
                            uniq_output_content.remove(y)
                        elif len(y) > len(s):
                            uniq_output_content.remove(s)
        for concept in uniq_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged
                    if pos == 'VBZ' or pos == 'VBP' or pos == 'VBG' or pos == 'MD' or pos == 'JJR']
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id += 1
            sline = re.sub(r'^if ', '', concept)
            sline = re.sub(r'^(if|preferably) ', '', sline)
            sline = re.sub(r'^\s+?said ', '', sline)
            # classify = SentenceClassifier(sline)
            # polarite = classify.classifySentence()
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)
            # if polarite == 'neutre':
            #     classify = SentenceClassifier(sline)
            #     polarite = classify.classifySentence()
            # print(sline)
            # if polarite == 'partialSolution':
            #     print(sline)
            # Insert a classifier here
            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()
            parameters_list.extend(parameters)
            # parameters_list = ", ".join(parameters_list)
            clean_concept = self.last_cleansing(sline)
            # if polarite == 'neutre':
            #     words = word_tokenize(clean_concept)
            #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
            #     noise_trash.append(hit)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
                output_result.append(json_string)
        output_result = list(set(output_result))
        output_json = ",".join(output_result)

        return output_json, total_sentences_number
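

if __name__ == '__main__':
    # Minimal usage sketch, not part of the original module: the folder and file name below
    # are hypothetical, and the section text would normally come from a parsed patent document.
    # The file name must look like "<country code><number><kind code>", e.g. "EP1234567A1",
    # so the regular expression above can build the Espacenet image and PDF URLs.
    with open("input/EP1234567A1.txt", encoding="utf8") as f:
        description_section = f.read()
    extractor = InformationExtractor(description_section, "input", "*.txt", "EP1234567A1")
    graph_json, sentence_count = extractor.get_from_description()
    print(sentence_count)
    print("[" + graph_json + "]")  # the joined concept objects form a JSON array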