# PatentSolver/App/bin/InformationExtractor.py
# -*- coding: utf-8 -*-
#java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
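"""Extract problem/partial-solution concepts from a patent description section and
serialise them as JSON "concept" records with links to Espacenet image and PDF URLs."""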
import nltk
# Note: 'all' downloads the complete NLTK data set, which is large; this module itself
# only appears to rely on the punkt tokenizer and the default POS tagger.
nltk.download('all')
import os
import re
import json
import hashlib
import Levenshtein
import uuid
from App.bin import constants
from collections import OrderedDict
from nltk import word_tokenize
from App.bin.SharpClassifier import SharpClassifier
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.SentenceClassifier import SentenceClassifier
from App.bin.ParameterExtractor import ParameterExtractor
class InformationExtractor(object):
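"""Builds the problem graph for one patent file: filters and classifies sentences,
then returns the resulting concepts as a JSON fragment."""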
patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
sentence_finder._params.abbrev_types.update(patent_abbreviations)
def __init__(self, section, input_folder, file_extension, file_name):
self.section = section
self.input_folder = input_folder
self.file_extension = file_extension
self.file_name = file_name
print("Extracting problem graph")
#@staticmethod
def discardLines(self, line, lexic):
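"""Return `line` unless it contains a word from the exclusion lexicon file `lexic`
(looked up under the assets folder), in which case return None."""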
with open (constants.ASSETS+ lexic) as m:
exclusion_list = m.read().splitlines()
if not any(word in line for word in exclusion_list):
return line
def selectLines(self, line, lexic):
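"""Return `line` if it contains a word from the inclusion lexicon file `lexic`, else None."""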
with open(constants.ASSETS + lexic) as n:
inclusion_list = n.read().splitlines()
if any(word in line for word in inclusion_list):
return line
def last_cleansing(self, concept):
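"""Lower-case a concept and strip a trailing period and leading filler words ("said", "such", ...)."""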
concept = str(concept)
concept = concept.lower()
if concept.endswith("."):
concept = concept.strip(".")
concept = re.sub(r'^consequently ','', concept)
concept = re.sub(r'^such ', '', concept)
concept = re.sub(r'^said ', '', concept)
concept = re.sub(r'^\s+', '', concept)
concept = re.sub(r'^it is worth noting that ', '', concept)
concept = re.sub(r'^example of ', '', concept)
concept = re.sub(r'^since ', '', concept)
concept = re.sub(r'^\( |\)$', '', concept)
return concept
# def get_from_claims(self):
#
# section = self.section
# content = []
# sentence_finder = InformationExtractor.sentence_finder
# sentences = sentence_finder.tokenize(section.strip())
# with open(constants.ASSETS + "getFromClaims") as concept:
# # next(concept)
# included_words = concept.read().splitlines()
# include_link_pattern = re.compile('|'.join(included_words))
def get_from_description(self):
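"""Split the description into sentences, filter and classify candidate concepts, and
return a tuple (json_fragment, total_sentences_number)."""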
previous_polarity = ''
noise_trash = []
content = []
include_links = []
output_content = []
ex_output_content = []
output_result = []
output_linked_content = []
output_inter_content = []
uniq_output_linked_content = []
ex_output_content_linked = []
section = self.section
input_folder = self.input_folder
file_name = self.file_name
file_extension = self.file_extension
projectFolder = os.path.basename(os.path.normpath(input_folder))
output_file_name = input_folder+"/"+file_name+file_extension.strip("*")
# A random UUID is used as the graph item id prefix (it supersedes an earlier md5-of-filename scheme).
graphItemIdValue = str(uuid.uuid4())
t_sline = ""
t_sline_ex = []
compt_Id = 30
compt_Id_ex = 40
root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
if file_name is not None:
match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
# CC for country code
CC = match.group(1)
# NR for Number
NR = match.group(2)
NR = re.sub(r'\s', '', NR)
# KC for Kind code
KC = match.group(4)
urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
sentence_finder = InformationExtractor.sentence_finder
#section = self.dataCleaner(section)
#print(section)
sentences = sentence_finder.tokenize(section.strip())
with open(constants.ASSETS + "includeLinks") as concept:
# next(concept)
included_words = concept.read().splitlines()
include_link_pattern = re.compile('|'.join(included_words))
#open examplification wordfile
with open(constants.ASSETS + "examplificationclues") as examplif:
# next(concept)
exam_words = examplif.read().splitlines()
examplif_word_pattern = re.compile('|'.join(exam_words))
description_sentences_number = len(sentences)
number_of_words = 0
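# First pass: count words and keep only the sentences that survive the exclusion lexicon.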
for sentence in sentences:
# with open(constants.DATA + 'sentences.txt', 'a', encoding='utf8') as file_handler:
# for item in sentences:
# file_handler.write("{}\n".format(item))
number_of_word = len(nltk.word_tokenize(sentence))
number_of_words += number_of_word
sentenced = self.discardLines(sentence, "exclusionList")
if sentenced is not None:
content.append(sentenced)
#print("origine=> "+sentence)
total_sentences_number = len(sentences)
# mean_sentence_length = int(round(number_of_words/total_sentences_number))
# print(mean_sentence_length)
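# Second pass: keep sentences matching the inclusion lexicon and route them by linking clues and comma count.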
for line in list(content):  # iterate over a copy, since multi-clause lines are removed below
line = self.selectLines(line, "inclusionList")
if line is not None:
if re.match(include_link_pattern, line):
include_links.append(line)
#print(line)
if line.count(',') == 0:
output_content.append(line)
# content.remove(line)
if line.count(',') > 0:
output_inter_content.append(line)
content.remove(line)
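# Third pass: rewrite exemplification and single-comma sentences into candidate concepts.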
for s in content:
# print(s, file_name)
sentence = self.discardLines(s, "FilterS")
if sentence is not None:
if s.count(',') <= 2 and re.match(examplif_word_pattern, s.lower()):
s = str(s)
cs = s.lower()
cs = re.sub(examplif_word_pattern, '', cs)
cs = re.sub('which', 'this/these', cs)
cs = re.sub(r'\.$', '', cs)
#print(s)
if cs.count(',') == 1 and cs.count('such as')==0:
ex_output_content_linked.append(cs)
else:
ex_output_content.append(cs)
elif s.count(',') == 1:
s = str(s)
s = s.lower()
s = self.selectLines(s, "OneCommaDiscriminator")
if s is not None:
#s = re.sub('which', 'this/these', s)
#print(s)
s = re.sub(r'^thus, ', '', s)
s = re.sub(r'^preferably, ', '', s)
s = re.sub(r'^conventional ', '', s)
s = re.sub(r'^in particular, ', '', s)
s = re.sub(r'^specifically, ', '', s)
s = re.sub(r'^as necessary, ', '', s)
s = re.sub(', which', ',this/these', s)
s = re.sub(r'\.$', '', s)
if s.count(',')==1:
ex_output_content_linked.append(s)
else:
ex_output_content.append(s)
else:
pass
print(len(ex_output_content_linked))
ex_output_content_linked = list(set(ex_output_content_linked))
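# Turn the comma-separated exemplification sentences into linked parent/child concepts.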
for line in ex_output_content_linked:
line = line.lower()
if 'figure' not in line:
#if line.count(',') <= 1:
t_sline_ex = line.strip().split(',')
#print("outpib"+str(t_sline_ex))
for concept in t_sline_ex:
#print("outpib" + str(concept))
tagged = nltk.pos_tag(word_tokenize(concept))
#print(tagged)
parameters_list = []
compteur = 0
compt_Id_ex += 1
# Keep only concepts containing a verb-like tag (VBZ/VBP/VBG/MD) or a comparative adjective (JJR).
tags = [word for word, pos in tagged if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
if len(tags) < 1:
continue
# classifyT = SentenceClassifier(concept)
# polarite = classifyT.classifySentence()
classifyT = ClassifyWithIncr_it()
polarite = classifyT.main(concept)
# if polarite == 'neutre':
# classify = SentenceClassifier(concept)
# polarite = classify.classifySentence()
# print(concept)
get_parameters = ParameterExtractor(concept)
parameters = get_parameters.extract_parameters()
parameters_list.extend(parameters)
# parameters_list=", ".join(parameters_list)
# parameters_list = parameters_list
#print("Index is: ")
#print(t_sline_ex.index(concept))
#print(concept)
clean_concept = self.last_cleansing(concept)
# if polarite == 'neutre':
# words = word_tokenize(clean_concept)
# hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
# noise_trash.append(hit)
validity = self.discardLines(concept, 'referencing_indices')
if t_sline_ex.index(concept) == 0 and validity is not None:
previous_polarity = polarite
values = OrderedDict({
"concept": {
"type": polarite,
"enfants": graphItemIdValue + str(compt_Id_ex + 1),
"id": graphItemIdValue + str(compt_Id_ex),
"sentence": clean_concept,
"source": output_file_name,
"parameters":parameters_list,
"image": urlImg,
"pdf": urlPDF
}
})
else:
print("Previous polarity is : " + str(previous_polarity))
if previous_polarity == 'partialSolution' or validity is None:
continue
else:
compteur += 1
values = OrderedDict({
"concept": {
"type": polarite,
"parents": graphItemIdValue + str(compt_Id_ex - 1),
"id": graphItemIdValue + str(compt_Id_ex),
"sentence": clean_concept,
"source": output_file_name,
"parameters": parameters_list,
"image": urlImg,
"pdf": urlPDF
}
})
# sort_keys must be a boolean; False preserves the OrderedDict insertion order.
json_string_linked = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))
output_result.append(json_string_linked)
#for line in output_content:
#print ("include=> "+line)
#just examplification sentences
#make a function of that
ex_output_content = list(set(ex_output_content))
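# Emit standalone concepts from the remaining exemplification sentences.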
for concept in ex_output_content:
tagged = nltk.pos_tag(word_tokenize(concept))
tags = [word for word, pos in tagged if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
if len(tags) < 1:
continue
parameters_list = []
concept = concept.lower()
compt_Id_ex += 1
# classify = SentenceClassifier(sline)
# polarite = classify.classifySentence()
classifyT = ClassifyWithIncr_it()
polarite = classifyT.main(concept)
# if polarite =='neutre':
# classify = SentenceClassifier(concept)
# polarite = classify.classifySentence()
# print(sline)
#if polarite == 'partialSolution':
#print(sline)
#Insert a classifier here
get_parameters = ParameterExtractor(concept)
parameters = get_parameters.extract_parameters()
clean_concept = self.last_cleansing(concept)
parameters_list.extend(parameters)
# if polarite == 'neutre':
# words = word_tokenize(clean_concept)
# hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
# noise_trash.append(hit)
# parameters_list = ", ".join(parameters_list)
validity = self.discardLines(concept, 'referencing_indices')
if polarite != 'partialSolution' and validity is not None:
values = OrderedDict({
"concept": {
"type": polarite,
"id": graphItemIdValue + str(compt_Id_ex),
"sentence": clean_concept,
"source": output_file_name,
"parameters": parameters_list,
"image": urlImg,
"pdf": urlPDF
}
})
json_string = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))
output_result.append(json_string)
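# Strip leading connectives (however / if / when / since) from the link-bearing sentences.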
for line in include_links:
#print(line)
#Put in lower case to improve matching
line = line.lower()
if re.match(r'however', line) and line.count(',') <= 1:
line = str(line)
sline = re.sub(r'however|,', '', line)
if sline not in output_linked_content:
output_linked_content.append(sline)
if re.match(r'however', line) and line.count(',') > 1:
sline = re.sub(r'^however,?(\s\w+)\s*, that ', '', line)
# sline = re.sub(r'however,.+, that ', '', sline)
sline = re.sub(r'^however,?(\s\w+)+\s(above), ', '', sline)
sline = re.sub(r'^however,?\s\w+ed(\s\w+)+,\s*', '', sline)
sline = re.sub(r'^however,?\sif\s(desired|said)\s*,\s', '', sline)
sline = re.sub(r'^however,?\s(it)\s(will be appreciated)\s*,\s(that)+\s*', '', sline)
sline = re.sub(r'^however,?\s(as|if|because|when|since)\s*(?!is)', '', sline)
sline = re.sub(r'^however,?\s*', '', sline)
if sline not in output_linked_content:
output_linked_content.append(sline)
if re.match(r'if', line) and line.count(',') <= 1:
line = str(line)
sline = re.sub(r'^if\s?(and when|not|desired|necessary)\s?,?\s*', '', line)
sline = re.sub(r'^if,?\s*', '', sline)
sline = re.sub(r'^if ', '', sline)
if sline not in output_linked_content:
output_linked_content.append(sline)
# print (sline)
if re.match(r'when', line):
line = str(line)
line = line.lower()
sline = re.sub(r'^when\s*', '', line)
sline = re.sub(r'^when,?\s*', '', sline)
sline = re.sub(r'^when ', '', sline)
if sline not in output_linked_content:
output_linked_content.append(sline)
if re.match(r'(^since)|(^\w+\s?,\s?since\s?)', line):
sline = re.sub(r'^since', '', line)
sline = re.sub(r'^\w+\s?,\s?since\s?', '', sline)
if sline not in output_linked_content:
output_linked_content.append(sline)
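# Also strip a leading "if" from the comma-free sentences.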
for line in list(output_content):  # iterate over a copy, since stripped variants are appended below
line = line.lower()
if re.match(r'if', line):
line = str(line)
sline = re.sub(r'^if ', '', line)
if sline not in output_linked_content:
output_content.append(sline)
#output_content.remove(line)
uniq_output_linked_content = list(set(output_linked_content))
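# Split each cleaned linked sentence on commas and emit parent/child concept records.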
for line in list(uniq_output_linked_content):  # iterate over a copy; figure references are dropped
#print("long sentences = > " + line)
# line = str(i)
#print(line)
line = line.lower()
if 'figure' in line:
uniq_output_linked_content.remove(line)
continue
sline = re.sub(r'^\s+', '', line)
sline = re.sub(r'^\d+\.+$', '', sline)
if sline.count(',') <= 1:
t_sline = tuple(sline.strip().split(', '))
#print("outpib"+str(t_sline))
for concept in t_sline:
tagged = nltk.pos_tag(word_tokenize(concept))
tags = [word for word, pos in tagged if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
if len(tags) < 1:
continue
else:
parameters_list = []
compteur = 0
compt_Id += 1
# classifyT = SentenceClassifier(concept)
# polarite = classifyT.classifySentence()
tags = [word for word, pos in tagged if pos.startswith('V') or pos == 'JJR']  # reuse the tagging from above
if len(tags) < 1:
continue
classifyT = ClassifyWithIncr_it()
polarite = classifyT.main(concept)
# if polarite == 'neutre':
# classify = SentenceClassifier(concept)
# polarite = classify.classifySentence()
# print(concept)
get_parameters = ParameterExtractor(concept)
parameters = get_parameters.extract_parameters()
parameters_list.extend(parameters)
# parameters_list=", ".join(parameters_list)
# parameters_list = parameters_list
clean_concept = self.last_cleansing(concept)
validity = self.discardLines(concept, 'referencing_indices')
# if polarite == 'neutre':
# words = word_tokenize(clean_concept)
# hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
# noise_trash.append(hit)
if t_sline.index(concept) == 0 and validity is not None:
previous_polarity = polarite
values = OrderedDict({
"concept": {
"type": polarite,
"enfants": graphItemIdValue + str(compt_Id + 1),
"id": graphItemIdValue + str(compt_Id),
"sentence": clean_concept,
"source": output_file_name,
"parameters":parameters_list,
"image": urlImg,
"pdf": urlPDF
}
})
else:
print("Previous polarity is : " + str(previous_polarity))
if previous_polarity == 'partialSolution' or validity is None:
continue
else:
compteur += 1
values = OrderedDict({
"concept": {
"type": polarite,
"parents": graphItemIdValue + str(compt_Id - 1),
"id": graphItemIdValue + str(compt_Id),
"sentence": clean_concept,
"source": output_file_name,
"parameters": parameters_list,
"image": urlImg,
"pdf": urlPDF
}
})
json_string_linked = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))
output_result.append(json_string_linked)
uniq_output_content = list(set(output_content))
# Drop near-duplicate sentences (Levenshtein ratio > 0.7), keeping the longer of each pair.
for s in list(uniq_output_content):
for y in list(uniq_output_content):
if s != y:
result = Levenshtein.ratio(s, y)
if result > .7:
# print(s + " :IS SIMILAR TO: " + y)
if len(s) > len(y) and y in uniq_output_content:
uniq_output_content.remove(y)
elif len(y) > len(s) and s in uniq_output_content:
uniq_output_content.remove(s)
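# Emit standalone concepts from the deduplicated comma-free sentences.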
for concept in uniq_output_content:
tagged = nltk.pos_tag(word_tokenize(concept))
tags = [word for word, pos in tagged if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
if len(tags) < 1:
continue
parameters_list = []
concept = concept.lower()
compt_Id += 1
sline = re.sub(r'^if ', '', concept)
sline = re.sub(r'^(if|preferably) ', '', sline)
sline = re.sub(r'^\s+?said ', '', sline)
# classify = SentenceClassifier(sline)
# polarite = classify.classifySentence()
classifyT = ClassifyWithIncr_it()
polarite = classifyT.main(concept)
# if polarite =='neutre':
# classify = SentenceClassifier(sline)
# polarite = classify.classifySentence()
# print(sline)
#if polarite == 'partialSolution':
#print(sline)
#Insert a classifier here
get_parameters = ParameterExtractor(concept)
parameters = get_parameters.extract_parameters()
parameters_list.extend(parameters)
# parameters_list = ", ".join(parameters_list)
clean_concept = self.last_cleansing(sline)
# if polarite == 'neutre':
# words = word_tokenize(clean_concept)
# hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
# noise_trash.append(hit)
validity = self.discardLines(concept, 'referencing_indices')
if polarite != 'partialSolution' and validity is not None:
values = OrderedDict({
"concept": {
"type": polarite,
"id": graphItemIdValue + str(compt_Id),
"sentence": clean_concept,
"source": output_file_name,
"parameters": parameters_list,
"image": urlImg,
"pdf": urlPDF
}
})
json_string = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))
output_result.append(json_string)
output_result = list(set(output_result))
output_json = ",".join(output_result)
return output_json, total_sentences_number
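# Example usage (a minimal sketch; the folder path and the "EP1234567B1" file name are
# hypothetical and would normally be supplied by the surrounding PatentSolver pipeline):
#
#     extractor = InformationExtractor(description_text, "/path/to/project", "*.txt", "EP1234567B1")
#     output_json, sentence_count = extractor.get_from_description()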