File size: 3,650 Bytes
e2f13a4
 
 
 
 
 
 
23a08cd
e2f13a4
936bfca
e2f13a4
936bfca
e2f13a4
 
 
 
 
23a08cd
e2f13a4
 
 
c93f011
e2f13a4
 
 
 
 
 
936bfca
e2f13a4
 
 
 
 
 
 
 
 
35244e7
e2f13a4
 
 
 
 
 
 
 
23a08cd
e2f13a4
 
 
 
 
 
 
 
10dc1f6
e2f13a4
 
 
 
 
 
 
ea28e08
e2f13a4
 
 
 
 
 
 
 
 
 
 
 
 
29edf23
e2f13a4
 
 
 
 
 
 
 
 
 
 
 
99b3c08
e2f13a4
 
 
 
 
 
 
 
 
 
 
 
 
 
776fa07
e2f13a4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from sematch.semantic.similarity import WordNetSimilarity
from vocabulary.vocabulary import Vocabulary as vb
import json
from random import randint
import spacy
import os.path

# Shared spaCy pipeline (small English model); loaded once at import time
# and used by controlledSetSpacy for vector-similarity filtering.
nlp = spacy.load('en_core_web_sm')

# Tokenize a sentence and annotate every token with its Penn Treebank
# part-of-speech tag, returning a list of (word, tag) pairs.
def tag(sentence):
    return pos_tag(word_tokenize(sentence))

# A token is a paraphrase candidate when its POS tag marks a noun (NN*),
# a base-form verb (exactly 'VB'), or an adjective (JJ*).
def paraphraseable(tag):
    if tag == 'VB':
        return True
    return tag.startswith(('NN', 'JJ'))

# Map a Penn Treebank tag onto the matching WordNet POS constant.
# Any other tag falls through and implicitly returns None.
def pos(tag):
    if tag[:2] == 'NN':
        return wn.NOUN
    if tag[:1] == 'V':
        return wn.VERB

# Collect the ascii-encoded lemma names of the FIRST WordNet synset that
# matches `word` under the POS derived from `tag`; empty set on no match.
def synonyms(word, tag):
    matchedSynsets = wn.synsets(word, pos(tag))
    if not matchedSynsets:
        return set()
    firstSynset = matchedSynsets[0]
    return set(lemma.name().encode('ascii', 'ignore') for lemma in firstSynset.lemmas())

# Fetch synonyms for `word` from the `vocabulary` dictionary service.
# Returns a set of ascii-encoded synonym strings; `vb.synonym` returns
# False on a miss, which maps to an empty set.
def dictonarySynonums(word):
	synJSON = vb.synonym(word)
	if synJSON != False:
		# Reuse the response already fetched above — the original issued a
		# second, redundant vb.synonym(word) API call here, doubling network
		# traffic and risking an inconsistent second answer.
		synonyms_lists = [dictSyno["text"].encode('ascii', 'ignore') for dictSyno in json.loads(synJSON)]
		return set(synonyms_lists)
	else:
		return set([])

# Controlled set via spaCy: keep only the candidates whose vector
# similarity to the base word reaches the 0.76 threshold. Mutates and
# returns the same set object that was passed in.
def controlledSetSpacy(word, similarWords):
	baseDoc = nlp(word.decode('utf-8', 'ignore'))
	# 0.76 is the accuracy knob for this controlled set.
	rejected = [cand for cand in similarWords
	            if baseDoc.similarity(nlp(cand.decode('utf-8', 'ignore'))) < .76]
	for cand in rejected:
		similarWords.discard(cand)
	return similarWords

# Controlled set via sematch's WordNetSimilarity ('li' measure): discard
# any candidate scoring below the 0.9996 threshold against the base word.
# Mutates and returns the same set object that was passed in.
def controlledSetWordNetSimilarity(word, similarWords):
	wns = WordNetSimilarity()
	keepThreshold = 0.9996  # accuracy knob for this controlled set
	for candidate in list(similarWords):
		if wns.word_similarity(word, candidate, 'li') < keepThreshold:
			similarWords.discard(candidate)
	return similarWords

# Generator: for every token of the sentence yield [word, candidates],
# where candidates is a list of semantically-close synonyms drawn from
# WordNet plus the dictionary service, or [] when the token is not
# paraphraseable (or nothing useful survives the similarity filter).
def synonymIfExists(sentence):
 for word, posTag in tag(sentence):
   candidates = set()
   if paraphraseable(posTag) and word not in ("i", "I"):
    candidates = synonyms(word, posTag)
    candidates.update(dictonarySynonums(word))
    if candidates:
    	candidates = controlledSetWordNetSimilarity(word, candidates)
    	# Alternative filter: candidates = controlledSetSpacy(word, candidates)
   if len(candidates) > 1:
   	yield [word, list(candidates)]
   else:
   	yield [word, []]

# Build, for each token, the de-duplicated bag [word itself + surviving
# synonyms], and return (list of bags, product of all bag sizes) — the
# product is the total number of distinct sentence variants.
def paraphrase(sentence):
	bagOfWords = []
	counter = 1
	for word, syns in synonymIfExists(sentence):
		bag = list(set([word] + syns))
		counter = counter * len(bag)
		bagOfWords.append(bag)
	return bagOfWords, counter

# Re-create the sentence `count` times, each time picking a random synonym
# (or the word itself) for every slot, then de-duplicate, print and return
# the variants. NOTE(review): `count` is the product of all bag sizes and
# can explode combinatorially on long sentences; random sampling also gives
# no guarantee that every variant is generated.
def paraPhraseThisSentence(sentence):
	ppList = []
	vList, count = paraphrase(sentence)
	allWordsCount = len(vList)
	for y in range(count):
		# Renamed from `str`, which shadowed the builtin in the original.
		pickedWords = []
		for w in range(allWordsCount):
			bag = vList[w]
			# WordNet multi-word lemmas use underscores; restore spaces.
			pickedWords.append(bag[randint(0, len(bag) - 1)].replace("_", " "))
		ppList.append(" ".join(pickedWords))
	ppList = list(set(ppList))
	print (ppList)
	return ppList

# Script-style demo invocation on a sample sentence.
# NOTE(review): this also runs on import — consider guarding it with
# `if __name__ == "__main__":` if the module is ever imported elsewhere.
paraPhraseThisSentence("Financial Institutes have always helped the society to become better version of itself.")