File size: 3,650 Bytes
e2f13a4
 
 
 
 
 
 
23a08cd
e2f13a4
936bfca
e2f13a4
936bfca
e2f13a4
 
 
 
 
23a08cd
e2f13a4
 
 
c93f011
e2f13a4
 
 
 
 
 
936bfca
e2f13a4
 
 
 
 
 
 
 
 
35244e7
e2f13a4
 
 
 
 
 
 
 
23a08cd
e2f13a4
 
 
 
 
 
 
 
10dc1f6
e2f13a4
 
 
 
 
 
 
ea28e08
e2f13a4
 
 
 
 
 
 
 
 
 
 
 
 
29edf23
e2f13a4
 
 
 
 
 
 
 
 
 
 
 
99b3c08
e2f13a4
 
 
 
 
 
 
 
 
 
 
 
 
 
776fa07
e2f13a4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet as wn
from sematch.semantic.similarity import WordNetSimilarity
from vocabulary.vocabulary import Vocabulary as vb
import json
from random import randint
import spacy
import os.path

# Shared spaCy pipeline (small English model); loaded once at import time
# and used by controlledSetSpacy for vector-similarity filtering.
nlp = spacy.load('en_core_web_sm')

# Tokenize a sentence and annotate every token with its Penn Treebank
# part-of-speech tag, returning a list of (word, tag) pairs.
def tag(sentence):
    return pos_tag(word_tokenize(sentence))

# A token is a paraphrase candidate when its POS tag marks a noun (NN*),
# a base-form verb (exactly 'VB'), or an adjective (JJ*).
def paraphraseable(tag):
    if tag == 'VB':
        return True
    return tag.startswith(('NN', 'JJ'))

# Map a Penn Treebank tag onto the matching WordNet POS constant.
# Any other tag falls through and implicitly returns None.
def pos(tag):
    if tag[:2] == 'NN':
        return wn.NOUN
    if tag[:1] == 'V':
        return wn.VERB

# Collect the ascii-encoded lemma names of the FIRST WordNet synset that
# matches `word` under the POS derived from `tag`; empty set on no match.
def synonyms(word, tag):
    matchedSynsets = wn.synsets(word, pos(tag))
    if not matchedSynsets:
        return set()
    firstSynset = matchedSynsets[0]
    return set(lemma.name().encode('ascii', 'ignore') for lemma in firstSynset.lemmas())

# Fetch synonyms for `word` from the `vocabulary` dictionary service.
# Returns a set of ascii-encoded synonym strings; `vb.synonym` returns
# False on a miss, which maps to an empty set.
def dictonarySynonums(word):
	synJSON = vb.synonym(word)
	if synJSON != False:
		# Reuse the response already fetched above — the original issued a
		# second, redundant vb.synonym(word) API call here, doubling network
		# traffic and risking an inconsistent second answer.
		synonyms_lists = [dictSyno["text"].encode('ascii', 'ignore') for dictSyno in json.loads(synJSON)]
		return set(synonyms_lists)
	else:
		return set([])

# Controlled set via spaCy: keep only the candidates whose vector
# similarity to the base word reaches the 0.76 threshold. Mutates and
# returns the same set object that was passed in.
def controlledSetSpacy(word, similarWords):
	baseDoc = nlp(word.decode('utf-8', 'ignore'))
	# 0.76 is the accuracy knob for this controlled set.
	rejected = [cand for cand in similarWords
	            if baseDoc.similarity(nlp(cand.decode('utf-8', 'ignore'))) < .76]
	for cand in rejected:
		similarWords.discard(cand)
	return similarWords

# Controlled set via sematch's WordNetSimilarity ('li' measure): discard
# any candidate scoring below the 0.9996 threshold against the base word.
# Mutates and returns the same set object that was passed in.
def controlledSetWordNetSimilarity(word, similarWords):
	wns = WordNetSimilarity()
	keepThreshold = 0.9996  # accuracy knob for this controlled set
	for candidate in list(similarWords):
		if wns.word_similarity(word, candidate, 'li') < keepThreshold:
			similarWords.discard(candidate)
	return similarWords

# Generator: for every token of the sentence yield [word, candidates],
# where candidates is a list of semantically-close synonyms drawn from
# WordNet plus the dictionary service, or [] when the token is not
# paraphraseable (or nothing useful survives the similarity filter).
def synonymIfExists(sentence):
 for word, posTag in tag(sentence):
   candidates = set()
   if paraphraseable(posTag) and word not in ("i", "I"):
    candidates = synonyms(word, posTag)
    candidates.update(dictonarySynonums(word))
    if candidates:
    	candidates = controlledSetWordNetSimilarity(word, candidates)
    	# Alternative filter: candidates = controlledSetSpacy(word, candidates)
   if len(candidates) > 1:
   	yield [word, list(candidates)]
   else:
   	yield [word, []]

# Build, for each token, the de-duplicated bag [word itself + surviving
# synonyms], and return (list of bags, product of all bag sizes) — the
# product is the total number of distinct sentence variants.
def paraphrase(sentence):
	bagOfWords = []
	counter = 1
	for word, syns in synonymIfExists(sentence):
		bag = list(set([word] + syns))
		counter = counter * len(bag)
		bagOfWords.append(bag)
	return bagOfWords, counter

# Re-create the sentence `count` times, each time picking a random synonym
# (or the word itself) for every slot, then de-duplicate, print and return
# the variants. NOTE(review): `count` is the product of all bag sizes and
# can explode combinatorially on long sentences; random sampling also gives
# no guarantee that every variant is generated.
def paraPhraseThisSentence(sentence):
	ppList = []
	vList, count = paraphrase(sentence)
	allWordsCount = len(vList)
	for y in range(count):
		# Renamed from `str`, which shadowed the builtin in the original.
		pickedWords = []
		for w in range(allWordsCount):
			bag = vList[w]
			# WordNet multi-word lemmas use underscores; restore spaces.
			pickedWords.append(bag[randint(0, len(bag) - 1)].replace("_", " "))
		ppList.append(" ".join(pickedWords))
	ppList = list(set(ppList))
	print (ppList)
	return ppList

# Script-style demo invocation on a sample sentence.
# NOTE(review): this also runs on import — consider guarding it with
# `if __name__ == "__main__":` if the module is ever imported elsewhere.
paraPhraseThisSentence("Financial Institutes have always helped the society to become better version of itself.")