# PatentSolver/App/bin/CorpusProcessor.py (initial commit 22738ca)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import re
import matplotlib.pyplot as plt
import numpy as np
import Levenshtein
from io import StringIO
from App.bin import constants
import hashlib
from collections import OrderedDict
from App.bin.InformationExtractor import InformationExtractor
from App.bin.ParameterExtractor import ParameterExtractor
from App.bin.TechnologyFinder import TechnologyFinder
from App.bin.InformationExtractor_Claims import InformationExtractorClaims
class CorpusProcessor(object):
    """Turn a collection of patents into a problem/parameter graph.

    Each patent is a mapping (or a JSON string encoding one) with the keys
    ``number``, ``abstract``, ``claims`` and ``description``;
    ``process_corpus_json`` additionally reads ``filename``. Both entry
    points write ``graph.json`` into ``constants.GRAPH_FOLDER/<project>/``
    and return the serialised JSON text.
    """

    # Espacenet endpoints used to link a patent's first-page image and PDF.
    _IMAGE_URL_ROOT = ('https://worldwide.espacenet.com/espacenetImage.jpg'
                       '?flavour=firstPageClipping&locale=en_EP&FT=D&')
    _PDF_URL_ROOT = ('https://worldwide.espacenet.com/publicationDetails/'
                     'originalDocument?')
    # Letters (country code), digits, optional whitespace, trailing kind code.
    _NUMBER_PATTERN = re.compile(
        r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))')
    # Lists longer than this get the aggressive pruning pass.
    _MAX_UNPRUNED_PARAMETERS = 50
    # Levenshtein ratio from which two parameters count as near duplicates.
    _NEAR_DUPLICATE_RATIO = .4

    def __init__(self, patents, input_folder, file_extension):
        """Store the corpus and the location it was read from.

        Args:
            patents: iterable of patents (dicts or JSON strings).
            input_folder: folder the patents come from; its last path
                component names the output sub-folder.
            file_extension: extension pattern of the input files; any
                leading/trailing ``*`` is stripped when building sources.
        """
        self.patents = patents
        self.input_folder = input_folder
        self.file_extension = file_extension
        print("Processing started")

    def make_graphic(self, sizes, text, colors, labels):
        """Display a ring chart with ``text`` written in its centre.

        Args:
            sizes: wedge sizes passed to ``Axes.pie``.
            text: caption drawn at the centre of the ring.
            colors: one 0-255 colour tuple per wedge.
            labels: one label per wedge.
        """
        # matplotlib expects colour channels in the 0..1 range.
        scaled_colors = [[channel / 255. for channel in color]
                         for color in colors]
        _, ax = plt.subplots()
        ax.axis('equal')
        width = 0.35
        wedges, _ = ax.pie(sizes, radius=1, pctdistance=1 - width / 2,
                           labels=labels, colors=scaled_colors,
                           startangle=180)
        # A wedge width below the radius turns the pie into a ring.
        plt.setp(wedges, width=width, edgecolor='white')
        ax.text(0, 0, text, ha='center', size=20, fontweight='bold',
                va='center')
        plt.show()

    def change_keys(self, dictionnary, number):
        """Return a copy of ``dictionnary`` with every key prefixed.

        Keys become ``"<number>-<key>"``. Nested dicts are rewritten
        recursively and receive one extra ``-`` per nesting level, because
        the already-suffixed prefix is passed down. Non-dict values are
        returned unchanged.
        """
        prefix = number + '-'
        if isinstance(dictionnary, dict):
            return {prefix + str(key): self.change_keys(value, prefix)
                    for key, value in dictionnary.items()}
        return dictionnary

    # ------------------------------------------------------------------
    # Private helpers shared by both entry points
    # ------------------------------------------------------------------

    def _graph_folder(self):
        """Return the output folder derived from the input folder name."""
        project_folder = os.path.basename(os.path.normpath(self.input_folder))
        return constants.GRAPH_FOLDER + project_folder + "/"

    @staticmethod
    def _load_patent(patent_file):
        """Return the patent as a mapping, decoding JSON text if needed."""
        if isinstance(patent_file, dict):
            return patent_file
        return json.loads(patent_file)

    @classmethod
    def _espacenet_urls(cls, number):
        """Return ``(image_url, pdf_url)`` for a publication number.

        Both URLs are empty strings when ``number`` does not look like
        ``<country letters><digits>[ ]<kind code>``.
        """
        match = cls._NUMBER_PATTERN.search(number)
        if match is None:
            return "", ""
        country_code = match.group(1)
        # The document number keeps its kind code; only whitespace goes.
        doc_number = re.sub(r'\s', '', match.group(2))
        kind_code = match.group(4)
        query = 'CC=' + country_code + '&NR=' + doc_number + '&KC=' + kind_code
        image_url = cls._IMAGE_URL_ROOT + '&' + query
        pdf_url = (cls._PDF_URL_ROOT + query
                   + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#')
        return image_url, pdf_url

    @classmethod
    def _prune_parameters(cls, candidates):
        """Return ``candidates`` de-duplicated and pruned, order preserved.

        Steps:
          1. drop single-word terms that occur inside another term;
          2. remove exact duplicates, keeping the first occurrence;
          3. only when more than ``_MAX_UNPRUNED_PARAMETERS`` terms remain:
             drop every term that has a shorter near duplicate still kept,
             then drop all remaining single-word terms.

        Every step builds a new list, so no element is skipped by removing
        from a list that is being iterated.
        """
        without_fragments = [
            term for term in candidates
            if not (len(term.split()) == 1
                    and any(term != other and term in other
                            for other in candidates))
        ]
        unique = list(dict.fromkeys(without_fragments))
        if len(unique) <= cls._MAX_UNPRUNED_PARAMETERS:
            return unique

        kept = list(unique)
        for term in unique:
            # Cheap length test first; the edit-distance ratio is costly.
            has_shorter_twin = any(
                len(term) > len(other)
                and Levenshtein.ratio(term, other) >= cls._NEAR_DUPLICATE_RATIO
                for other in kept if other != term
            )
            if has_shorter_twin:
                kept.remove(term)
        return [term for term in kept if len(term.split()) != 1]

    def _collect_parameters(self, text, known_parameters):
        """Extract parameters from each line of ``text`` and prune them.

        ``known_parameters`` is the list carried over from the patents
        already processed; the new finds are appended to it before pruning.
        """
        found = list(known_parameters)
        for line in text.splitlines():
            parameters = ParameterExtractor(line).extract_parameters()
            if parameters:
                found.extend(parameters)
        return self._prune_parameters(found)

    def _parameters_entry(self, number, source, parameters_list, links=None):
        """Serialise one patent's parameters as a ``{"concept": ...}`` text.

        Args:
            number: publication number; its lowercase form prefixes the keys.
            source: path-like identifier of the patent file.
            parameters_list: parameters to number from 1.
            links: optional extra fields (e.g. image/pdf URLs) to include.
        """
        numbered = dict(enumerate(parameters_list, 1))
        concept = {
            "source": source,
            "valeurs": self.change_keys(numbered, number.lower()),
        }
        if links:
            concept.update(links)
        return json.dumps({"concept": concept}, sort_keys=True, indent=4,
                          separators=(',', ': '))

    @staticmethod
    def _as_json_fragment(extractor_output):
        """Return extractor output as text, serialising dicts to JSON."""
        if isinstance(extractor_output, dict):
            return json.dumps(extractor_output)
        return extractor_output

    @staticmethod
    def _assemble_graph(extracted_concepts, parameters_graph):
        """Join JSON text fragments into the final graph and parse it.

        Fragments may carry trailing commas, so doubled commas and a comma
        before the closing bracket are cleaned up before parsing.

        NOTE(review): the comma clean-up runs over the whole text and would
        also alter a ``,,`` inside a string value - confirm extractor output
        never contains one.
        """
        concepts = ','.join(
            fragment for fragment in extracted_concepts if fragment.strip())
        parameters = ','.join(parameters_graph)
        raw = '{"problem_graph": [%s],"parameters": [%s]}' % (
            concepts, parameters)
        raw = re.sub(r',{2,}', ',', raw)
        raw = re.sub(r'\},\]', '}]', raw)
        return json.loads(raw)

    @staticmethod
    def _count_concept_types(concepts_json):
        """Return ``(problem_count, partial_solution_count)`` for a graph.

        Every section of the graph is scanned; an entry counts when one of
        its values is a mapping whose ``type`` is ``"problem"`` or
        ``"partialSolution"``.
        """
        problems = 0
        partial_solutions = 0
        for section in concepts_json.values():
            for element in section:
                if not isinstance(element, dict):
                    continue
                for attributes in element.values():
                    if not isinstance(attributes, dict):
                        continue
                    kind = attributes.get("type")
                    if kind == "partialSolution":
                        partial_solutions += 1
                    elif kind == "problem":
                        problems += 1
        return problems, partial_solutions

    @staticmethod
    def _write_graph(graph_folder, concepts_json):
        """Write the graph to ``graph.json`` and return the written text."""
        serialised = json.dumps(concepts_json, sort_keys=False, indent=4,
                                separators=(',', ': '))
        with open(graph_folder + "graph.json", 'w',
                  encoding='utf-8') as json_graph:
            json_graph.write(serialised)
        return serialised

    # ------------------------------------------------------------------
    # Entry points
    # ------------------------------------------------------------------

    def process_corpus(self):
        """Process every patent, analysing descriptions and claims.

        For each patent the parameters are extracted from the combined
        abstract, claims and description, and concepts are extracted from
        the description and from the claims whenever either is non-empty.
        Patents whose ``number`` is ``None`` are skipped with a message.

        Returns:
            The JSON text written to ``graph.json``.
        """
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        input_folder = self.input_folder
        file_extension = self.file_extension
        graph_folder = self._graph_folder()
        extracted_concepts = []
        parameters_graph = []
        # NOTE(review): this list is carried from one patent to the next, so
        # each patent's "valeurs" also holds earlier patents' parameters -
        # confirm this accumulation is intended.
        parameters_list = []

        for patent_file in self.patents:
            patent = self._load_patent(patent_file)
            number = patent['number']
            if number is None:
                print("Skipping a patent without a publication number")
                continue
            abstract = patent['abstract'] or ""
            claims = patent['claims'] or ""
            description = patent['description'] or ""

            image_url, pdf_url = self._espacenet_urls(number)
            parameters_list = self._collect_parameters(
                abstract + claims + description, parameters_list)
            source = input_folder + "/" + number + file_extension.strip("*")
            parameters_graph.append(self._parameters_entry(
                number, source, parameters_list,
                {"image": image_url, "pdf": pdf_url}))

            if description != "" or claims != "":
                count_description += 1
                description_extractor = InformationExtractor(
                    description, input_folder, file_extension, number)
                output_json, description_sentences = (
                    description_extractor.get_from_description())
                claims_extractor = InformationExtractorClaims(
                    claims, input_folder, file_extension, number)
                claims_result = claims_extractor.main()

                if output_json is not None:
                    extracted_concepts.append(
                        self._as_json_fragment(output_json))
                    total_sentences_number += description_sentences
                # Nothing is added to the graph when the claims extractor
                # produced no result.
                if claims_result is not None:
                    claims_json, claims_sentences = claims_result
                    count_claims += 1
                    if claims_json is not None:
                        extracted_concepts.append(
                            self._as_json_fragment(claims_json))
                        total_sentences_number += claims_sentences
            else:
                count_abstract += 1
                print("processing abstract")
            count_patent += 1

        # Technology extraction (TechnologyFinder) is currently disabled.
        concepts_json = self._assemble_graph(
            extracted_concepts, parameters_graph)
        count_concepts = len(concepts_json['problem_graph'])
        count_concepts_problem, count_concepts_solupart = (
            self._count_concept_types(concepts_json))
        json_write_to_file = self._write_graph(graph_folder, concepts_json)

        number_neutre = (count_concepts - count_concepts_problem
                         - count_concepts_solupart)
        print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (count_patent, count_abstract, count_claims, count_description))
        print("%s phrases ont été analysée(s)" % (total_sentences_number))
        print("%s concepts ont été trouvé(s) dont %s problèmes, %s solutions partielles et %s neutres" % (count_concepts, count_concepts_problem, count_concepts_solupart, number_neutre))
        return json_write_to_file

    def process_corpus_json(self):
        """Process every patent, analysing the description or the claims.

        The description is analysed when present (tagged with the patent's
        ``filename``); otherwise the claims are analysed (tagged with its
        ``number``); otherwise the patent only counts as an abstract.
        Patents whose ``number`` is ``None`` are skipped with a message.

        Returns:
            The JSON text written to ``graph.json``.
        """
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        input_folder = self.input_folder
        file_extension = self.file_extension
        graph_folder = self._graph_folder()
        extracted_concepts = []
        parameters_graph = []
        # NOTE(review): carried from one patent to the next, as in
        # process_corpus - confirm this accumulation is intended.
        parameters_list = []

        for patent_file in self.patents:
            patent = self._load_patent(patent_file)
            filename = patent['filename']
            number = patent['number']
            if number is None:
                print("Skipping a patent without a publication number")
                continue
            abstract = patent['abstract'] or ""
            claims = patent['claims'] or ""
            description = patent['description'] or ""

            parameters_list = self._collect_parameters(
                abstract + claims + description, parameters_list)
            print('{} {}'.format('Taille: ', len(parameters_list)))
            source = input_folder + "/" + number + file_extension.strip("*")
            parameters_graph.append(
                self._parameters_entry(number, source, parameters_list))

            output_json, sentences = "", 0
            if description != "":
                count_description += 1
                extractor = InformationExtractor(
                    description, input_folder, file_extension, filename)
                output_json, sentences = extractor.get_from_description()
            elif claims != "":
                count_claims += 1
                extractor = InformationExtractor(
                    claims, input_folder, file_extension, number)
                output_json, sentences = extractor.get_from_claims()
            else:
                count_abstract += 1
                print("processing abstract")

            if output_json is not None and output_json != "":
                extracted_concepts.append(self._as_json_fragment(output_json))
                total_sentences_number += sentences
            count_patent += 1

        # Technology extraction (TechnologyFinder) is currently disabled.
        concepts_json = self._assemble_graph(
            extracted_concepts, parameters_graph)
        count_concepts = len(concepts_json['problem_graph'])
        count_concepts_problem, count_concepts_solupart = (
            self._count_concept_types(concepts_json))
        json_write_to_file = self._write_graph(graph_folder, concepts_json)

        print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (
            count_patent, count_abstract, count_claims, count_description))
        print("%s phrases ont été analysée(s)" % (total_sentences_number))
        print("%s concepts ont été trouvé(s) dont %s problèmes et %s solutions partielles" % (
            count_concepts, count_concepts_problem, count_concepts_solupart))
        return json_write_to_file