Spaces:
Sleeping
Sleeping
import gradio as gr | |
def load_vectors(fname):
    """Load word vectors from a text ``.vec`` file.

    The first line is a header of whitespace-separated integers (presumably
    the entry count and the vector dimension; the values are validated but
    not otherwise used). Every following line is a token followed by its
    space-separated float components.

    Args:
        fname: Path of the vector file to read.

    Returns:
        A ``(data, tokens)`` tuple. ``data`` maps each token to its vector
        as a list of floats; ``tokens`` holds the keys of ``data`` ordered
        from longest to shortest.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the header or a vector component is not numeric.
    """
    data = {}
    # The built-in ``open`` avoids depending on an ``io`` import, and the
    # ``with`` block guarantees the handle is closed even if parsing fails.
    with open(fname, 'r', encoding='utf-8', newline='\n',
              errors='ignore') as fin:
        # Consume the header so it is not parsed as a token line.
        _count, _dim = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialize the floats: a lazy ``map`` object would be
            # exhausted after its first read, leaving later lookups empty.
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    return data, sorted(data, key=len, reverse=True)
# Load the embedding table once at import time: `vectors` is the token ->
# vector mapping and `sorted_vector` is the token list returned alongside it.
vectors, sorted_vector = load_vectors(fname='../wiki-news-300d-1M.vec')
class TrieNode:
    """A single node of the character trie.

    ``children`` maps a character to the child node reached by that
    character; ``is_end_of_token`` is True when the path from the root to
    this node spells a complete token.
    """

    # One node is created per character position of every inserted token, so
    # dropping the per-instance ``__dict__`` keeps a large trie compact.
    __slots__ = ("children", "is_end_of_token")

    def __init__(self):
        self.children = {}
        self.is_end_of_token = False
class Trie:
    """Character-level prefix tree over a set of tokens."""

    def __init__(self):
        self.root = TrieNode()

    def insert(self, token):
        """Add *token* to the trie, creating missing nodes along its path."""
        cursor = self.root
        for ch in token:
            child = cursor.children.get(ch)
            if child is None:
                child = TrieNode()
                cursor.children[ch] = child
            cursor = child
        cursor.is_end_of_token = True

    def search_longest_prefix(self, text, start):
        """Find the longest stored token that begins at ``text[start]``.

        Returns the index of the LAST character of that token (inclusive),
        or ``None`` when no stored token starts at ``start``.
        """
        cursor = self.root
        best_end = None
        for idx in range(start, len(text)):
            cursor = cursor.children.get(text[idx])
            if cursor is None:
                # No stored token continues with this character.
                break
            if cursor.is_end_of_token:
                best_end = idx
        return best_end
def word2vec(word):
    """Return the vector stored for *word* as a new list.

    Returns ``None`` when *word* has no entry in the module-level
    ``vectors`` mapping.
    """
    if word in vectors:
        return list(vectors[word])
    return None
_VOCAB_TRIE = None  # built lazily on first use, then reused by every call


def _vocab_trie():
    """Return the shared vocabulary trie, building it on first use."""
    global _VOCAB_TRIE
    if _VOCAB_TRIE is None:
        trie = Trie()
        for token in sorted_vector:
            trie.insert(token)
        # Publish only the fully built trie so a reader never sees a
        # half-populated one.
        _VOCAB_TRIE = trie
    return _VOCAB_TRIE


def tokenize(text):
    """Split *text* into known vocabulary tokens by greedy longest match.

    Starting at each position, the longest token from ``sorted_vector`` that
    begins there is emitted and scanning resumes right after it. A character
    that starts no known token is skipped.

    The vocabulary trie is built once and cached rather than rebuilt on
    every call, since its contents depend only on ``sorted_vector``.

    Args:
        text: The string to tokenize.

    Returns:
        The matched tokens, in the order they occur in *text*.
    """
    trie = _vocab_trie()
    result = []
    start = 0
    while start < len(text):
        longest_match = trie.search_longest_prefix(text, start)
        if longest_match is None:
            # Nothing in the vocabulary starts here; move one character on.
            start += 1
            continue
        # ``longest_match`` is the inclusive index of the token's last char.
        result.append(text[start:longest_match + 1])
        start = longest_match + 1
    return result
def paragraph2word(paragraph):
    """Average the vectors of all tokens found in *paragraph*.

    The paragraph is tokenized, the token vectors are summed component by
    component, and each component is divided by the number of tokens.
    Returns an empty list when no token is found.
    """
    tokens = tokenize(paragraph)
    total = []
    # Accumulate the component-wise sum; the first vector seeds the total.
    for token in tokens:
        vec = word2vec(token)
        if len(total) == 0:
            total = vec
            continue
        for idx, component in enumerate(vec):
            total[idx] += component
    # Turn the sum into a mean.
    count = len(tokens)
    return [component / count for component in total]
# Wire paragraph2word into a Gradio interface with a text input and a text
# output, then start serving it.
demo = gr.Interface(
    fn=paragraph2word,
    inputs="text",
    outputs="text",
)
demo.launch()