Spaces:
Sleeping
Sleeping
File size: 2,257 Bytes
7a8cb87 94f57b8 7a8cb87 6062294 6483ca7 7a8cb87 211efbb 7a8cb87 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import gradio as gr
import io
def load_vectors(fname):
    """Load word vectors from a text file in fastText ``.vec`` layout.

    Every data line holds a token followed by its space-separated float
    components. An optional first line of the form ``<count> <dim>`` (two
    bare integers) is treated as the file header and skipped, and blank
    lines are ignored.

    Args:
        fname: Path of the vector file to read.

    Returns:
        A ``(data, tokens)`` tuple. ``data`` maps each token to a list of
        floats; ``tokens`` holds the same keys sorted longest first.
    """
    data = {}
    # The context manager closes the file even if a line fails to parse.
    with open(fname, 'r', encoding='utf-8', newline='\n',
              errors='ignore') as fin:
        for line_number, line in enumerate(fin):
            stripped = line.rstrip()
            if not stripped:
                continue
            tokens = stripped.split(' ')
            # Skip the "<vocabulary size> <dimension>" header so it does not
            # become a bogus one-component token.
            if (line_number == 0 and len(tokens) == 2
                    and all(part.isdigit() for part in tokens)):
                continue
            # Materialize the floats: a bare ``map`` object is a one-shot
            # iterator and would be empty on every read after the first.
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    return data, sorted(data, key=len, reverse=True)
# Load the embedding table once at import time: `vectors` maps each token to
# its stored components, `sorted_vector` lists the tokens longest first.
vectors, sorted_vector = load_vectors('wiki-news-300d-1M.vec')
class TrieNode:
    """A single node of the character trie used for tokenization.

    The attributes are declared in ``__slots__`` so that instances carry no
    per-object ``__dict__``; a trie built over a large vocabulary creates a
    very large number of these nodes.
    """

    __slots__ = ("children", "is_end_of_token")

    def __init__(self):
        # Maps one character to the child node reached by following it.
        self.children = {}
        # True when the path from the root to this node spells a whole token.
        self.is_end_of_token = False
class Trie:
    """Character trie supporting longest-prefix lookup of stored tokens."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, token):
        """Add `token` to the trie, creating nodes for unseen characters."""
        current = self.root
        for ch in token:
            child = current.children.get(ch)
            if child is None:
                child = TrieNode()
                current.children[ch] = child
            current = child
        current.is_end_of_token = True

    def search_longest_prefix(self, text, start):
        """Find the longest stored token beginning at `text[start]`.

        Returns the index of the last character of that token, or None
        when no stored token starts at `start`.
        """
        current = self.root
        best_end = None
        for position in range(start, len(text)):
            current = current.children.get(text[position])
            if current is None:
                # No stored token continues with this character.
                break
            if current.is_end_of_token:
                best_end = position
        return best_end
def word2vec(word):
    """Return the entry stored for `word` in `vectors` as a new list."""
    stored = vectors[word]
    return list(stored)
# Trie over `sorted_vector`; built on first use and reused by later calls.
_token_trie = None


def _get_token_trie():
    """Return the shared vocabulary trie, building it on the first call."""
    global _token_trie
    if _token_trie is None:
        trie = Trie()
        for token in sorted_vector:
            trie.insert(token)
        _token_trie = trie
    return _token_trie


def tokenize(text):
    """Split `text` into known tokens by greedy longest-prefix matching.

    At each position the longest token from `sorted_vector` that starts
    there is emitted; a character that starts no known token is skipped.

    Args:
        text: The string to tokenize.

    Returns:
        The list of matched tokens in the order they occur in `text`.
    """
    # The trie depends only on the vocabulary, so it is built once instead
    # of being reconstructed from the full vocabulary on every call.
    trie = _get_token_trie()
    result = []
    start = 0
    text_length = len(text)
    while start < text_length:
        longest_match = trie.search_longest_prefix(text, start)
        if longest_match is None:
            # Nothing in the vocabulary starts here; drop this character.
            start += 1
            continue
        # `longest_match` is the index of the token's last character.
        end = longest_match + 1
        result.append(text[start:end])
        start = end
    return result
def paragraph2word(paragraph):
    """Average the vectors of all tokens found in `paragraph`.

    The paragraph is tokenized, the token vectors are summed component by
    component, and each component is divided by the number of tokens.
    Returns an empty list when no token is found.
    """
    tokens = tokenize(paragraph)
    total = []
    # Accumulate the component-wise sum.
    for token in tokens:
        vec = word2vec(token)
        if not total:
            total = vec
            continue
        for idx, value in enumerate(vec):
            total[idx] += value
    # Turn the sum into a mean over the token count.
    count = len(tokens)
    return [component / count for component in total]
# Wrap paragraph2word in a Gradio interface with a text input and text output.
demo = gr.Interface(fn=paragraph2word, inputs="text", outputs="text")
# Start serving the interface.
demo.launch()
|