import functools
import io

import gradio as gr

def load_vectors(fname):
    """Load word vectors from a text ``.vec`` file.

    The file starts with a header line holding two integers, followed by one
    line per token: the token itself, then its space-separated float
    components (the layout used by fastText ``.vec`` files).

    Args:
        fname: Path of the vector file to read (decoded as UTF-8, undecodable
            bytes are ignored).

    Returns:
        A ``(data, tokens)`` tuple where ``data`` maps each token to a list of
        floats and ``tokens`` lists the keys of ``data`` sorted longest first.

    Raises:
        ValueError: If the header is not exactly two integers or a component
            cannot be parsed as a float.
    """
    data = {}
    # The context manager guarantees the (potentially huge) file is closed.
    with io.open(fname, 'r', encoding='utf-8', newline='\n',
                 errors='ignore') as fin:
        # Header line: parsed only to validate the file shape; the two
        # values themselves are not needed afterwards.
        _n, _d = map(int, fin.readline().split())
        for line in fin:
            tokens = line.rstrip().split(' ')
            # Materialise the components as a list: a lazy ``map`` object can
            # only be consumed once, so repeated lookups of the same word
            # would otherwise see an empty vector.
            data[tokens[0]] = [float(value) for value in tokens[1:]]
    return data, sorted(data, key=len, reverse=True)


# Load the embeddings once at import time. ``vectors`` (token -> components)
# is read by word2vec(); ``sorted_vector`` (tokens, longest first) is read by
# tokenize().
vectors, sorted_vector = load_vectors('../wiki-news-300d-1M.vec')


class TrieNode:
    """One node of the character trie used for tokenization."""

    # A trie over a large vocabulary allocates a very large number of nodes;
    # slots drop the per-instance ``__dict__`` to keep each node small.
    __slots__ = ("children", "is_end_of_token")

    def __init__(self):
        # Maps the next character to the child node reached through it.
        self.children = {}
        # True when the path from the root to this node spells a whole
        # inserted token (not merely a prefix of one).
        self.is_end_of_token = False


class Trie:
    """Character trie over a set of tokens with longest-prefix lookup."""

    def __init__(self):
        # The root represents the empty prefix.
        self.root = TrieNode()

    def insert(self, token):
        """Add *token* to the trie, creating nodes for unseen characters."""
        current = self.root
        for ch in token:
            child = current.children.get(ch)
            if child is None:
                child = TrieNode()
                current.children[ch] = child
            current = child
        current.is_end_of_token = True

    def search_longest_prefix(self, text, start):
        """Find the longest inserted token that begins at ``text[start]``.

        Returns:
            The index of the LAST character of that token (inclusive), or
            ``None`` when no inserted token starts at ``start``.
        """
        best_end = None
        current = self.root
        for position in range(start, len(text)):
            current = current.children.get(text[position])
            if current is None:
                # No inserted token continues with this character.
                break
            if current.is_end_of_token:
                # Keep walking: a longer token may still match.
                best_end = position
        return best_end


def word2vec(word):
    """Look up the vector stored for *word*.

    Returns:
        A new list built from the entry of the module-level ``vectors``
        mapping, or ``None`` when *word* is not a key of that mapping.
    """
    try:
        stored = vectors[word]
    except KeyError:
        return None
    return list(stored)

def tokenize(text):
    """Split *text* into known tokens by greedy longest-prefix matching.

    At each position the longest token of ``sorted_vector`` that starts there
    is emitted and scanning resumes right after it. A character at which no
    token starts is skipped and does not appear in the output.

    Args:
        text: The string to tokenize.

    Returns:
        The list of matched tokens, in order of appearance.
    """
    trie = _vocabulary_trie()
    result = []
    start = 0
    text_length = len(text)
    while start < text_length:
        longest_match = trie.search_longest_prefix(text, start)
        if longest_match is None:
            # No token begins here: drop this character and move on.
            start += 1
            continue
        # search_longest_prefix returns an inclusive end index.
        end = longest_match + 1
        result.append(text[start:end])
        start = end
    return result


@functools.lru_cache(maxsize=1)
def _vocabulary_trie():
    """Build the trie over ``sorted_vector`` once and reuse it afterwards.

    Rebuilding the trie on every tokenize() call repeats one insert per
    vocabulary entry for each request; the vocabulary is loaded once at
    import, so the trie is cached. Call ``_vocabulary_trie.cache_clear()``
    if ``sorted_vector`` is ever replaced.
    """
    trie = Trie()
    for token in sorted_vector:
        trie.insert(token)
    return trie


def paragraph2word(paragraph):
    """Return the element-wise mean of the vectors of *paragraph*'s tokens.

    The paragraph is split with tokenize() and each token is looked up with
    word2vec(). Tokens without a usable vector (``None`` or empty) are
    ignored, and the mean is taken over the vectors that were found.

    Args:
        paragraph: The text to embed.

    Returns:
        A list of floats, or an empty list when no token has a vector.
    """
    merged_vector = []
    vector_count = 0
    for token in tokenize(paragraph):
        vector = word2vec(token)
        if not vector:
            # word2vec() returns None for unknown words; nothing to add.
            continue
        vector_count += 1
        if not merged_vector:
            # Copy so the in-place accumulation below never mutates a list
            # owned by someone else.
            merged_vector = list(vector)
            continue
        for i, value in enumerate(vector):
            merged_vector[i] += value
    # An empty accumulator means no vectors were found, so the comprehension
    # is empty and the division never runs with a zero count.
    return [total / vector_count for total in merged_vector]


# Expose paragraph2word() through a Gradio interface with a text input and a
# text output, then start serving it.
demo = gr.Interface(
    fn=paragraph2word,
    inputs="text",
    outputs="text",
)
demo.launch()