from App.bin.ParameterExtractor import ParameterExtractor
from App.bin import constants
import nltk
import re
import os
import json
import uuid
from collections import OrderedDict
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it


class InformationExtractorClaims(object):
    """Extracts claim concepts from the 'claims' section of a patent file."""

    def __init__(self, section, input_folder, file_extension, file_name):
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name
        # Extend the Punkt sentence tokenizer with patent-specific
        # abbreviations so they do not trigger spurious sentence breaks.
        with open(constants.ASSETS + "abbreviation_sentence_splitter") as f:
            patent_abbreviations = f.read().split()
        sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
        sentence_finder._params.abbrev_types.update(patent_abbreviations)
        self.sentence_finder = sentence_finder
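
    # Illustrative note (an assumption, not from the original file): with an
    # abbreviation such as "fig" registered above, the tokenizer keeps
    # "See Fig. 1 for details." as one sentence instead of splitting at "Fig.".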
    def clean_data(self, sentence):
        """Lower-case a sentence and strip empty parentheses, stray commas,
        leading claim numbers and redundant whitespace."""
        sentence = str(sentence).lower()
        sentence = re.sub(r'\(\s,?\s?\)', '', sentence)
        sentence = re.sub(r'\s+,', ',', sentence)
        sentence = re.sub(r'^\d+', '', sentence)
        sentence = re.sub(r'\s+', ' ', sentence)
        # re.sub always returns a string, so no None check is needed.
        return sentence
    def truncate_data(self, sentence):
        """Strip claim boilerplate ("wherein said", "characterized in that",
        ...) so only the substantive part of the claim remains."""
        sentence = str(sentence).lower()
        # Apply the more specific patterns first; otherwise the shorter
        # prefixes ("wherein", "characterized", "where") would consume them
        # and the longer patterns could never match.
        sentence = re.sub(r'wherein said\s*', '', sentence)
        sentence = re.sub(r'characterized in that said\s*|characterised in that said\s*', '', sentence)
        sentence = re.sub(r'characterized in that\s*|characterised in that\s*', '', sentence)
        sentence = re.sub(r'wherein\s*', '', sentence)
        sentence = re.sub(r'characterized\s*|characterised\s*', '', sentence)
        sentence = re.sub(r'where said\s*', '', sentence)
        sentence = re.sub(r'where\s*', '', sentence)
        sentence = re.sub(r'further comprising', 'the system or method comprises', sentence)
        sentence = re.sub(r'.*thereof\s*,?', '', sentence)
        sentence = re.sub(r'^\s+', '', sentence)
        sentence = re.sub(r'\s+\.$', '', sentence)
        return sentence
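
    # Illustrative example (an assumption, not from the original file):
    #   truncate_data("Wherein said sensor is lighter .")
    #   -> "sensor is lighter"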
    def selectLines(self, line, lexic):
        """Return the first claim-marker match from the lexicon file, or
        None if the line contains none of its patterns."""
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        # Each lexicon entry is expected to contain one capture group.
        claims_words = re.compile('|'.join(inclusion_list))
        m = re.search(claims_words, line)
        if m is not None:
            return m.group(1)
        return None
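
    # Illustrative example (an assumption: the 'claims_indices' lexicon is
    # not shown here, so suppose it holds patterns like r"(compris\w+ .*)"):
    #   selectLines("a pump comprising a valve", 'claims_indices')
    #   -> "comprising a valve"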
    def main(self):
        output_result = []
        compt_Id = 50
        count_concept = 3
        clean_content_list = []
        concept_list = []
        total_sentences_number = 0
        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension
        output_file_name = os.path.join(input_folder, file_name + file_extension.strip("*"))
        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
        urlImg = ''
        urlPDF = ''
        if file_name is not None:
            # Split the publication number into country code (CC), number (NR)
            # and kind code (KC), e.g. "EP1234567A1" -> EP / 1234567A1 / A1.
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            if match:
                CC = match.group(1)
                NR = re.sub(r'\s', '', match.group(2))
                KC = match.group(4)
                urlImg = root_img_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC
                urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
        # A random UUID serves as the graph item id (an earlier md5 of the
        # file name was computed but immediately overwritten, so it is dropped).
        graphItemIdValue = str(uuid.uuid4())
        sentence_finder = self.sentence_finder
        sentences = sentence_finder.tokenize(section.strip())
        for sentence in sentences:
            sentence = self.clean_data(sentence)
            if sentence != '':
                clean_content_list.append(sentence)
        # Keep at most count_concept (3) matching claim lines as concepts.
        for line in clean_content_list:
            if not re.match(r'^\s*$', line):
                line = self.selectLines(line, 'claims_indices')
                if line is not None and count_concept > 0:
                    line = self.truncate_data(line)
                    line = re.sub(r'in that', '', line)
                    concept_list.append(line)
                    count_concept -= 1
        total_sentences_number = len(concept_list)
        for concept in concept_list:
            if concept is not None and not re.match(r'^\s,', concept) and len(concept.split()) < 50:
                classifyT = ClassifyWithIncr_it()
                polarite = classifyT.main(concept)
                get_parameters = ParameterExtractor(concept)
                parameters = get_parameters.extract_parameters()
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": concept,
                        "source": output_file_name,
                        # Only this concept's parameters: the original extended
                        # a shared list, leaking parameters between concepts.
                        "parameters": parameters,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                compt_Id += 1  # keep ids unique across concepts
                # sort_keys must stay False, otherwise the OrderedDict ordering
                # above would be discarded.
                json_string = json.dumps(values, sort_keys=False, indent=4, separators=(',', ': '))
                output_result.append(json_string)
        output_result = list(set(output_result))
        output_json = ",".join(output_result)
        return output_json, total_sentences_number
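

# A minimal usage sketch (an assumption, not part of the original file): the
# claims text, folder and file name below are hypothetical, and the App.bin
# dependencies and NLTK punkt model must be available for this to run.
if __name__ == '__main__':
    sample_section = ("1. A device comprising a sensor, "
                      "wherein said sensor measures temperature.")
    extractor = InformationExtractorClaims(sample_section, "/tmp/patents",
                                           "*.txt", "EP1234567A1")
    concepts_json, n_sentences = extractor.main()
    print(n_sentences, concepts_json)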