asynchronousai committed
Commit 7a8cb87 · verified · 1 Parent(s): 7515f6e

Create app.py

Files changed (1)
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
+ import io
+ import gradio as gr
+
+ def load_vectors(fname):
+     """Load fastText-style .vec word vectors into a dict of lists."""
+     fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
+     n, d = map(int, fin.readline().split())  # header: vocabulary size, dimensionality
+     data = {}
+     for line in fin:
+         tokens = line.rstrip().split(' ')
+         # Store as a list so the vector can be read more than once.
+         data[tokens[0]] = [float(x) for x in tokens[1:]]
+     return data, sorted(data.keys(), key=len, reverse=True)
+
+ vectors, sorted_vector = load_vectors('../wiki-news-300d-1M.vec')
+
+ class TrieNode:
+     def __init__(self):
+         self.children = {}
+         self.is_end_of_token = False
+
+ class Trie:
+     def __init__(self):
+         self.root = TrieNode()
+
+     def insert(self, token):
+         node = self.root
+         for char in token:
+             if char not in node.children:
+                 node.children[char] = TrieNode()
+             node = node.children[char]
+         node.is_end_of_token = True
+
+     def search_longest_prefix(self, text, start):
+         # Return the index of the last character of the longest vocabulary
+         # token that starts at `start`, or None if nothing matches.
+         node = self.root
+         longest_match = None
+         current_pos = start
+
+         while current_pos < len(text) and text[current_pos] in node.children:
+             node = node.children[text[current_pos]]
+             if node.is_end_of_token:
+                 longest_match = current_pos
+             current_pos += 1
+
+         return longest_match
+
+ def word2vec(word):
+     if word not in vectors:
+         return None
+     return list(vectors[word])
+
+ def tokenize(text):
+     trie = Trie()
+     for token in sorted_vector:
+         trie.insert(token)
+
+     result = []
+     start = 0
+
+     # Greedily take the longest vocabulary token at each position;
+     # skip characters that start no known token.
+     while start < len(text):
+         longest_match = trie.search_longest_prefix(text, start)
+         if longest_match is not None:
+             result.append(text[start:longest_match + 1])
+             start = longest_match + 1
+         else:
+             start += 1
+
+     return result
+
+ def paragraph2word(paragraph):
+     tokens = tokenize(paragraph)
+     mergedVector = []
+
+     # Merge vectors by summing them component-wise
+     for token in tokens:
+         vector = word2vec(token)
+         if vector is None:
+             continue
+         if len(mergedVector) == 0:
+             mergedVector = vector
+         else:
+             for i in range(len(vector)):
+                 mergedVector[i] += vector[i]
+
+     # Normalize: average over the number of tokens
+     for i in range(len(mergedVector)):
+         mergedVector[i] /= len(tokens)
+
+     return mergedVector
+
+ demo = gr.Interface(fn=paragraph2word, inputs="text", outputs="text")
+ demo.launch()