#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import json
import os
import re
from collections import OrderedDict
from io import StringIO

import Levenshtein
import matplotlib.pyplot as plt

from App.bin import constants
from App.bin.InformationExtractor import InformationExtractor
from App.bin.InformationExtractor_Claims import InformationExtractorClaims
from App.bin.ParameterExtractor import ParameterExtractor
from App.bin.TechnologyFinder import TechnologyFinder
class CorpusProcessor(object):
    """Runs the extraction pipeline over a corpus of patents and aggregates
    the results into a single JSON graph."""

    def __init__(self, patents, input_folder, file_extension):
        self.patents = patents
        self.input_folder = input_folder
        self.file_extension = file_extension
        print("Processing started")
    def make_graphic(self, sizes, text, colors, labels):
        """Draw a donut chart of `sizes` with `text` centered in the hole."""
        # Scale 0-255 RGB tuples to the 0-1 range matplotlib expects.
        col = [[i / 255. for i in c] for c in colors]
        fig, ax = plt.subplots()
        ax.axis('equal')
        width = 0.35
        kwargs = dict(colors=col, startangle=180)
        outside, _ = ax.pie(sizes, radius=1, pctdistance=1 - width / 2, labels=labels, **kwargs)
        plt.setp(outside, width=width, edgecolor='white')
        kwargs = dict(size=20, fontweight='bold', va='center')
        ax.text(0, 0, text, ha='center', **kwargs)
        plt.show()
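    # Minimal usage sketch (hypothetical counts; mirrors the commented-out
    # call at the end of process_corpus):
    #
    #   processor.make_graphic([12, 8], "Ratio",
    #                          [(46, 204, 113), (245, 176, 65)],
    #                          ['Problems', 'Partial Solutions'])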
    def change_keys(self, dictionary, number):
        """Recursively prefix every key of a (possibly nested) dict with
        `number` followed by a dash."""
        number = number + '-'
        if isinstance(dictionary, dict):
            return dict((number + str(k), self.change_keys(v, number)) for k, v in dictionary.items())
        return dictionary
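    # Example (hypothetical values):
    #   change_keys({1: 'pressure', 2: 'temperature'}, 'ep1234567a1')
    #   -> {'ep1234567a1-1': 'pressure', 'ep1234567a1-2': 'temperature'}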
    def process_corpus(self):
        """Extract concepts and parameters from every patent, write the
        aggregated graph to <graph_folder>/graph.json and return it as a
        JSON string."""
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        count_concepts_solupart = 0
        count_concepts_problem = 0
        patents = self.patents
        input_folder = self.input_folder
        file_extension = self.file_extension
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
        extracted_concepts = []
        output_result = []
        parameters_graph = []
        reduced_content = []
        patent_corpus = []
        source_list = []
        parameters_list = []
        technologies_graph = []
        for patent_file in patents:
            output_json_claims = {}
            total_sentences_number_claims = 0
            if isinstance(patent_file, dict):
                patent_file = json.dumps(patent_file)
            read_patent = StringIO(patent_file)
            patent = json.load(read_patent)
            nNumber = patent['number']
            aAbstract = patent['abstract']
            cClaims = patent['claims']
            dDescription = patent['description']
            root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
            root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
            # Default to empty links so the fields below stay defined when the
            # publication number is missing or does not match the pattern.
            urlImg = ""
            urlPDF = ""
            if nNumber is not None:
                match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', nNumber)
                if match:
                    # CC for country code
                    CC = match.group(1)
                    # NR for number (digits plus kind code; spaces stripped)
                    NR = re.sub(r'\s', '', match.group(2))
                    # KC for kind code
                    KC = match.group(4)
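                    # Example (hypothetical publication number 'EP1234567 A1'):
                    #   CC='EP', NR='1234567A1', KC='A1'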
                    urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
                    urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
            # Find a more elegant way to do it
            patent_content = aAbstract + cClaims + dDescription
            patent_content = patent_content.splitlines()
            # for line in patent_content:
            #     line = self.dataCleaner(line)
            #     reduced_content.append(line)
            for line in patent_content:
                get_parameters = ParameterExtractor(line)
                parameters = get_parameters.extract_parameters()
                if parameters:
                    parameters_list.extend(parameters)
            # Drop single-word parameters that are substrings of longer ones.
            # Iterate over a snapshot: removing items from a list while
            # iterating it directly skips elements.
            for i in list(parameters_list):
                for j in parameters_list:
                    if i != j and len(i.split()) == 1:
                        if j.find(i) > -1 and i in parameters_list:
                            parameters_list.remove(i)
            parameters_list = list(set(parameters_list))
            if len(parameters_list) > 50:
                # For large lists, also drop near-duplicates (Levenshtein
                # similarity >= 0.4), keeping the shorter variant.
                for i in list(parameters_list):
                    for j in parameters_list:
                        if i != j:
                            comp = Levenshtein.ratio(i, j)
                            if comp >= 0.4 and i in parameters_list and j in parameters_list:
                                if len(i) > len(j):
                                    # print('{} is near duplicate of {}'.format(i, j))
                                    parameters_list.remove(i)
            for el in list(parameters_list):
                if len(el.split()) == 1:
                    parameters_list.remove(el)
            parameters = dict(enumerate(parameters_list, 1))
            parameters = self.change_keys(parameters, nNumber.lower())
            source = input_folder + "/" + nNumber + file_extension.strip("*")
            parameters_array = OrderedDict({
                "concept": {
                    "source": source,
                    "valeurs": parameters,
                    "image": urlImg,
                    "pdf": urlPDF
                }
            })
            pParameters = json.dumps(parameters_array, sort_keys=True, indent=4, separators=(',', ': '))
            parameters_graph.append(pParameters)
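            # Serialized shape appended to parameters_graph (values hypothetical):
            #   {"concept": {"source": "<input_folder>/EP1234567A1<ext>",
            #                "valeurs": {"ep1234567a1-1": "pressure", ...},
            #                "image": "https://worldwide.espacenet.com/...",
            #                "pdf": "https://worldwide.espacenet.com/..."}}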
            if dDescription != "" or cClaims != "":
                count_description += 1
                extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, nNumber)
                # Unpack into a dedicated local so the corpus-wide running
                # total is not overwritten on every iteration.
                output_json, sentences_in_description = extract_concepts.get_from_description()
                extract_concepts_claims = InformationExtractorClaims(cClaims, input_folder, file_extension, nNumber)
                output_json_claims_result = extract_concepts_claims.main()
                if output_json_claims_result is not None:
                    output_json_claims, total_sentences_number_claims = output_json_claims_result
                    count_claims += 1
                if output_json is not None:
                    if isinstance(output_json, dict):
                        output_json = json.dumps(output_json)
                    extracted_concepts.append(output_json)
                    total_sentences_number += sentences_in_description
                if output_json_claims is not None:
                    if isinstance(output_json_claims, dict):
                        output_json_claims = json.dumps(output_json_claims)
                    extracted_concepts.append(output_json_claims)
                    total_sentences_number += total_sentences_number_claims
            elif cClaims != "":
                count_claims += 1
                print('Processing claims')
            else:
                count_abstract += 1
                print("processing abstract")
            count_patent += 1
            # print(source)
            source_list.append(source)
            patent_corpus.append(reduced_content)
        patent_corpus = dict(zip(source_list, patent_corpus))
        '''
        get_patent_technologies = TechnologyFinder(patent_corpus)
        technologies = get_patent_technologies.get_technologies()
        for source_file, technologies_list in technologies.items():
            technologies_array = OrderedDict({
                "concept": {
                    "source": source_file,
                    "values": technologies_list
                }
            })
            tTechnologies = json.dumps(technologies_array, sort_keys=True, indent=4, separators=(',', ': '))
            technologies_graph.append(tTechnologies)
        '''
        # print(type(extracted_concepts))
        header = '{'
        graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
        parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
        # technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
        footer = '}'
        # output_result.extend((header, graph, parameters_output, technologies_output, footer))
        output_result.extend((header, graph, parameters_output, footer))
        output_result = "".join(output_result)
        output_result = re.sub(r',{2,}', ',', output_result)
        output_result = re.sub(r'\},\]', '}]', output_result)
        # exit()
        # print(output_result)
        concepts_json = json.loads(output_result)
        count_concepts = len(concepts_json['problem_graph'])
        # Tally problem / partial-solution concepts by their "type" field.
        for item, value in concepts_json.items():
            for element in value:
                for cle, valeur in element.items():
                    for k, v in valeur.items():
                        if k == "type" and v == "partialSolution":
                            count_concepts_solupart += 1
                        elif k == "type" and v == "problem":
                            count_concepts_problem += 1
        json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
        # print(concepts_json.keys())
        with open(graph_folder + "graph.json", 'w') as json_graph:
            json_graph.write(json_write_to_file)
        number_neutre = count_concepts - count_concepts_problem - count_concepts_solupart
        print("The corpus contained %s patents: %s abstract(s), %s claims section(s) and %s description(s)"
              % (count_patent, count_abstract, count_claims, count_description))
        print("%s sentence(s) analyzed" % total_sentences_number)
        print("%s concept(s) found: %s problem(s), %s partial solution(s) and %s neutral"
              % (count_concepts, count_concepts_problem, count_concepts_solupart, number_neutre))
        # Display graphics
        first_color = (46, 204, 113)
        second_color = (245, 176, 65)
        # self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",
        #                   [first_color, second_color], ['Problems', 'Partial Solutions'])
        return json_write_to_file
    def process_corpus_json(self):
        """Variant of process_corpus for patent records that already carry a
        'filename' field, presumably parsed JSON documents."""
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        count_concepts_solupart = 0
        count_concepts_problem = 0
        patents = self.patents
        input_folder = self.input_folder
        file_extension = self.file_extension
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
        extracted_concepts = []
        output_result = []
        parameters_graph = []
        reduced_content = []
        patent_corpus = []
        source_list = []
        parameters_list = []
        technologies_graph = []
        for patent_file in patents:
            # print(type(patent_file))
            # if type(patent_file) is dict:
            patent_file = json.dumps(patent_file)
            read_patent = StringIO(patent_file)
            patent = json.load(read_patent)
            # print(type(patent))
            filename = patent['filename']
            nNumber = patent['number']
            aAbstract = patent['abstract']
            cClaims = patent['claims']
            dDescription = patent['description']
            # Find a more elegant way to do it
            patent_content = aAbstract + cClaims + dDescription
            patent_content = patent_content.splitlines()
            # for line in patent_content:
            #     line = self.dataCleaner(line)
            #     reduced_content.append(line)
            for line in patent_content:
                get_parameters = ParameterExtractor(line)
                parameters = get_parameters.extract_parameters()
                if parameters:
                    parameters_list.extend(parameters)
            # Same snapshot-iteration fix as in process_corpus: removing from
            # a list while iterating it directly skips elements.
            for i in list(parameters_list):
                for j in parameters_list:
                    if i != j and len(i.split()) == 1:
                        if j.find(i) > -1 and i in parameters_list:
                            parameters_list.remove(i)
            parameters_list = list(set(parameters_list))
            if len(parameters_list) > 50:
                for i in list(parameters_list):
                    for j in parameters_list:
                        if i != j:
                            comp = Levenshtein.ratio(i, j)
                            if comp >= 0.4 and i in parameters_list and j in parameters_list:
                                if len(i) > len(j):
                                    # print('{} is near duplicate of {}'.format(i, j))
                                    parameters_list.remove(i)
            for el in list(parameters_list):
                if len(el.split()) == 1:
                    parameters_list.remove(el)
            print('Size: {}'.format(len(parameters_list)))
            parameters = dict(enumerate(parameters_list, 1))
            parameters = self.change_keys(parameters, nNumber.lower())
            source = input_folder + "/" + nNumber + file_extension.strip("*")
            parameters_array = OrderedDict({
                "concept": {
                    "source": source,
                    "valeurs": parameters
                }
            })
            pParameters = json.dumps(parameters_array, sort_keys=True, indent=4, separators=(',', ': '))
            parameters_graph.append(pParameters)
            # if dDescription != "" and cClaims != "":
            if dDescription != "":
                count_description += 1
                extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, filename)
                output_json, total_sentences_number_d = extract_concepts.get_from_description()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    total_sentences_number += total_sentences_number_d
                # count_claims += 1
                # extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
                # output_json, total_sentences_number_c = extract_concepts.get_from_claims()
                # if output_json != "":
                #     extracted_concepts.append(output_json)
                #     total_sentences_number_c += total_sentences_number_c
                #     total_sentences_number = total_sentences_number_c + total_sentences_number_d
            elif cClaims != "":
                count_claims += 1
                extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
                # Unpack into a dedicated local so the running total is not
                # overwritten, then doubled, on this branch.
                output_json, sentences_in_claims = extract_concepts.get_from_claims()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    total_sentences_number += sentences_in_claims
            # NOTE: this branch is unreachable; the first branch already
            # catches every patent with a non-empty description.
            elif dDescription != "":
                count_description += 1
                extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, nNumber)
                output_json, sentences_in_description = extract_concepts.get_from_description()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    total_sentences_number += sentences_in_description
                count_claims += 1
            else:
                count_abstract += 1
                print("processing abstract")
            count_patent += 1
            # print(source)
            # source_list.append(source)
            # patent_corpus.append(reduced_content)
        # patent_corpus = dict(zip(source_list, patent_corpus))
        '''
        get_patent_technologies = TechnologyFinder(patent_corpus)
        technologies = get_patent_technologies.get_technologies()
        for source_file, technologies_list in technologies.items():
            technologies_array = OrderedDict({
                "concept": {
                    "source": source_file,
                    "values": technologies_list
                }
            })
            tTechnologies = json.dumps(technologies_array, sort_keys=True, indent=4, separators=(',', ': '))
            technologies_graph.append(tTechnologies)
        '''
        header = '{'
        graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
        parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
        # technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
        footer = '}'
        # output_result.extend((header, graph, parameters_output, technologies_output, footer))
        output_result.extend((header, graph, parameters_output, footer))
        output_result = "".join(output_result)
        # Same trailing-comma cleanup as in process_corpus.
        output_result = re.sub(r',{2,}', ',', output_result)
        output_result = re.sub(r'\},\]', '}]', output_result)
        concepts_json = json.loads(output_result)
        count_concepts = len(concepts_json['problem_graph'])
        # Tally problem / partial-solution concepts by their "type" field.
        for item, value in concepts_json.items():
            for element in value:
                for cle, valeur in element.items():
                    for k, v in valeur.items():
                        if k == "type" and v == "partialSolution":
                            count_concepts_solupart += 1
                        elif k == "type" and v == "problem":
                            count_concepts_problem += 1
        json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
        # print(concepts_json.keys())
        with open(graph_folder + "graph.json", 'w') as json_graph:
            json_graph.write(json_write_to_file)
print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % ( | |
count_patent, count_abstract, count_claims, count_description)) | |
print("%s phrases ont été analysée(s)" % (total_sentences_number)) | |
print("%s concepts ont été trouvé(s) dont %s problèmes et %s solutions partielles" % ( | |
count_concepts, count_concepts_problem, count_concepts_solupart)) | |
# Display graphics | |
first_color = (46, 204, 113) | |
second_color = (245, 176, 65) | |
# self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",[first_color,second_color],['Problems','Partial Solutions']) | |
return json_write_to_file |
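

if __name__ == "__main__":
    # Minimal usage sketch. The path, project folder and extension below are
    # hypothetical; in the application these come from the calling pipeline
    # and App.bin.constants.
    with open("input/my_project/patents.json") as handle:
        sample_patents = json.load(handle)  # expected: list of patent dicts
    processor = CorpusProcessor(sample_patents, "input/my_project", "*.json")
    print(processor.process_corpus_json()[:200])  # preview of the graph JSON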