import io

import gradio as gr
import numpy as np


def load_vectors(fname):
    """Load fastText .vec embeddings into a dict of word -> NumPy vector."""
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The first line of a fastText .vec file is a header: "<vocab_size> <dim>".
        n, d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = np.array(list(map(float, tokens[1:])))
    return data, d


vectors, dim = load_vectors('wiki-news-300d-1M.vec')


class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_token = False


class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert(self, token):
        """Add a token to the trie, one character per node."""
        node = self.root
        for char in token:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_token = True

    def search_longest_prefix(self, text, start):
        """Return the end index (inclusive) of the longest vocabulary token
        beginning at `start`, or None if no token matches there."""
        node = self.root
        longest_match = None
        current_pos = start
        while current_pos < len(text) and text[current_pos] in node.children:
            node = node.children[text[current_pos]]
            if node.is_end_of_token:
                longest_match = current_pos
            current_pos += 1
        return longest_match


# Build the trie once at startup so tokenize() does not pay the
# construction cost for a one-million-word vocabulary on every call.
trie = Trie()
for token in vectors:
    trie.insert(token)


def word2vec(word):
    return vectors[word]


def tokenize(text):
    """Greedy longest-match segmentation of `text` against the vocabulary."""
    result = []
    start = 0
    while start < len(text):
        longest_match = trie.search_longest_prefix(text, start)
        if longest_match is not None:
            result.append(text[start:longest_match + 1])
            start = longest_match + 1
        else:
            # No vocabulary token starts at this character; skip it.
            start += 1
    return result


def paragraph2word(paragraph):
    """Embed a paragraph as the mean of its matched token vectors."""
    tokens = tokenize(paragraph)
    if not tokens:
        # No tokens found: return a zero vector of the embedding dimension.
        return np.zeros(dim).tolist()
    merged_vector = np.zeros(dim)
    for token in tokens:
        merged_vector += word2vec(token)
    # Average so paragraph length does not scale the magnitude.
    merged_vector /= len(tokens)
    return merged_vector.tolist()  # Lists serialize cleanly for Gradio output


demo = gr.Interface(fn=paragraph2word, inputs="text", outputs="text")
demo.launch()
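
# A rough usage sketch (hypothetical inputs; the actual segmentation and
# values depend on the loaded fastText vocabulary):
#
#     tokenize("the quick brown fox")
#     # -> greedy longest-match pieces, e.g. ['the', 'quick', 'brown', 'fox']
#
#     paragraph2word("the quick brown fox")
#     # -> a 300-element list: the mean of the matched token vectors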