PatentSolver / App /bin /FindTechnologies.py
xin's picture
initial commit
22738ca
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import math
import os
import re
import sys

import xlsxwriter
from textblob import TextBlob as tb
class FindTechnologies(object):
    """Rank the noun phrases of a folder of text documents by TF-IDF.

    The scoring helpers (`tf`, `n_containing`, `idf`, `tfidf`) are static
    methods: they take no instance and only need objects exposing a
    ``noun_phrases`` sequence (TextBlob instances in `get_technologies`).
    """

    # Locations used when get_technologies() is called without arguments.
    DEFAULT_FOLDER_PATH = "C:/Users/asouili01/Documents/PatSemBeta-v3/Data/input/Gaggenau/"
    DEFAULT_STOPWORDS_PATH = 'C:/Users/asouili01/Documents/PIXSEB/Ressources/stopwords.txt'

    # A word token, keeping apostrophe contractions ("don't") in one piece.
    _TOKEN_PATTERN = re.compile(r"\w+(?:'\w+)*")

    def __init__(self):
        print("Starting")

    @staticmethod
    def tf(word, blob):
        """Return the share of `blob`'s noun phrases that equal `word`.

        Returns 0.0 when the blob has no noun phrases at all, instead of
        dividing by zero.
        """
        phrases = blob.noun_phrases
        if not phrases:
            return 0.0
        return float(phrases.count(word)) / float(len(phrases))

    @staticmethod
    def n_containing(word, bloblist):
        """Return how many blobs in `bloblist` have `word` as a noun phrase."""
        return sum(1 for blob in bloblist if word in blob.noun_phrases)

    @staticmethod
    def idf(word, bloblist):
        """Return log(number of blobs / (1 + blobs containing `word`)).

        The +1 keeps the denominator non-zero for unseen words. Raises
        ValueError (from math.log) when `bloblist` is empty.
        """
        containing = FindTechnologies.n_containing(word, bloblist)
        return math.log(len(bloblist) / float(1 + containing))

    @staticmethod
    def tfidf(word, blob, bloblist):
        """Return tf(word, blob) multiplied by idf(word, bloblist)."""
        return FindTechnologies.tf(word, blob) * FindTechnologies.idf(word, bloblist)

    @staticmethod
    def _remove_stopwords(content, stopwords):
        """Return `content` with every word found in `stopwords` deleted.

        Matching is case-insensitive (`stopwords` must hold lowercase
        entries). Whitespace and punctuation are left where they are, so
        sentence boundaries survive the filtering.
        """
        def keep_or_drop(match):
            token = match.group(0)
            return "" if token.lower() in stopwords else token

        return FindTechnologies._TOKEN_PATTERN.sub(keep_or_drop, content)

    def get_technologies(self, folder_path=None, stopwords_path=None, top_n=5):
        """Print and return the top TF-IDF noun phrases of every document.

        Every file found under `folder_path` (sub-folders included) is read
        as UTF-8, stripped of stopwords, lowercased and turned into a
        TextBlob; each blob's noun phrases are then scored against the whole
        collection.

        Args:
            folder_path: Root folder to scan; defaults to DEFAULT_FOLDER_PATH.
            stopwords_path: Text file with one stopword per line; defaults
                to DEFAULT_STOPWORDS_PATH. Read with the platform default
                encoding.
            top_n: How many phrases to report per document.

        Returns:
            One list per document, in processing order, each holding up to
            `top_n` ``(noun_phrase, score)`` tuples sorted by descending
            score.
        """
        if folder_path is None:
            folder_path = self.DEFAULT_FOLDER_PATH
        if stopwords_path is None:
            stopwords_path = self.DEFAULT_STOPWORDS_PATH

        # Iterating the handle yields one entry per line whatever the line
        # ending, which splitting on a literal '\r\n' does not in text mode.
        with open(stopwords_path, 'r') as handle:
            stopwords = {line.strip().lower() for line in handle if line.strip()}

        bloblist = []
        for path, _dirs, files in os.walk(folder_path):
            for filename in files:
                print(filename)
                # splitext copes with names holding zero or several dots.
                name = os.path.splitext(filename)[0]
                # Join with the directory os.walk is visiting so that files
                # located in sub-folders are found as well.
                filepath = os.path.join(path, filename)
                with open(filepath, "r", encoding="utf-8") as handle:
                    content = handle.read()
                filteredcontent = self._remove_stopwords(content, stopwords)
                print('blob_' + name.lower())
                bloblist.append(tb(filteredcontent.lower()))
        print(bloblist)

        results = []
        for i, blob in enumerate(bloblist):
            print("Top words in document {}".format(i + 1))
            # dict.fromkeys drops repeated phrases while keeping first-seen
            # order, so each phrase is scored once and ties stay stable.
            scores = {word: self.tfidf(word, blob, bloblist)
                      for word in dict.fromkeys(blob.noun_phrases)}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
            top_words = sorted_words[:top_n]
            for word, score in top_words:
                print("\tWord: {}, TF-IDF: {}".format(word, round(score, 10)))
            results.append(top_words)
        return results