dustalov committed on
Commit
168e76d
·
verified ·
1 Parent(s): c01d0d3

Add app.py

Browse files
Files changed (2) hide show
  1. README.md +5 -3
  2. app.py +237 -0
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Watset
3
- emoji: 🐨
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
@@ -10,4 +10,6 @@ pinned: false
10
  license: apache-2.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: Structure Discovery with Watset
3
+ emoji: 🔮
4
  colorFrom: yellow
5
  colorTo: purple
6
  sdk: gradio
 
10
  license: apache-2.0
11
  ---
12
 
13
+ **Watset** is a soft clustering algorithm for graphs as described in the paper
14
+ [Watset: Local-Global Graph Clustering with Applications in Sense and Frame Induction](https://doi.org/10.1162/COLI_a_00354)
15
+ ([arXiv](https://arxiv.org/abs/1808.06696)).
app.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Dmitry Ustalov
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ __author__ = 'Dmitry Ustalov'
16
+ __license__ = 'Apache 2.0'
17
+
18
+ import csv
19
+ import re
20
+ import subprocess
21
+ from dataclasses import dataclass
22
+ from tempfile import NamedTemporaryFile
23
+ from typing import Dict, IO, List, cast, Tuple, Optional
24
+
25
+ import gradio as gr
26
+ import matplotlib.pyplot as plt
27
+ import networkx as nx
28
+ import pandas as pd
29
+
30
+
31
@dataclass
class Algorithm:
    """Command-line description of one clustering algorithm configuration.

    The fields are turned into argument lists by `args_clustering` and
    `args_graph`; unset (falsy) fields produce no arguments at all.
    """

    # Sub-command name; always emitted first by args_clustering().
    name: str
    # Value for the '--mode' option.
    mode: Optional[str] = None
    # Value for the '--local' option.
    local_name: Optional[str] = None
    # Value for the '--local-params' option.
    local_params: Optional[str] = None
    # Value for the '--global' option.
    global_name: Optional[str] = None
    # Value for the '--global-params' option.
    global_params: Optional[str] = None

    def args_clustering(self) -> List[str]:
        """Return the full argument list for running the clustering itself.

        Order: name, '--mode', the local options from `args_graph`,
        '--global', '--global-params'.
        """
        mode_part = ['--mode', self.mode] if self.mode else []
        global_part = ['--global', self.global_name] if self.global_name else []
        global_params_part = ['--global-params', self.global_params] if self.global_params else []

        return [self.name, *mode_part, *self.args_graph(), *global_part, *global_params_part]

    def args_graph(self) -> List[str]:
        """Return only the local clustering options ('--local', '--local-params')."""
        options = (
            ('--local', self.local_name),
            ('--local-params', self.local_params),
        )

        # Keep a flag/value pair only when the value is set.
        return [token for flag, value in options if value for token in (flag, value)]
66
+
67
+
68
def _build_algorithms() -> Dict[str, Algorithm]:
    """Build the label -> Algorithm table in its display order.

    Watset entries come first, grouped by global CW mode (top, lin, log) and,
    within each group, ordered by local algorithm (CW_top, CW_lin, CW_log,
    MCL). The plain CW variants and MaxMax follow.
    """
    cw_modes = ('top', 'lin', 'log')

    # (label, local_name, local_params) for every supported local clustering.
    local_variants = [(f'CW_{mode}', 'cw', f'mode={mode}') for mode in cw_modes]
    local_variants.append(('MCL', 'mcl', None))

    table: Dict[str, Algorithm] = {}

    for global_mode in cw_modes:
        for label, local_name, local_params in local_variants:
            table[f'Watset[{label}, CW_{global_mode}]'] = Algorithm(
                'watset', None, local_name, local_params, 'cw', f'mode={global_mode}')

    for mode in cw_modes:
        table[f'CW_{mode}'] = Algorithm('cw', mode)

    table['MaxMax'] = Algorithm('maxmax')

    return table


# Human-readable algorithm label -> command-line configuration.
ALGORITHMS: Dict[str, Algorithm] = _build_algorithms()

# Matches labels of the form '<item>#<sense>' where both parts are decimal integers.
SENSE = re.compile(r'^(?P<item>\d+)#(?P<sense>\d+)$')
88
+
89
+
90
def visualize(G: nx.Graph, seed: int = 0) -> plt.Figure:
    """Draw the edges and node labels of `G` using a spring layout.

    Args:
        G: graph to draw.
        seed: seed passed to `nx.spring_layout` so the layout is reproducible.

    Returns:
        A Matplotlib figure (240 dpi, axes hidden) containing the drawing.
    """
    pos = nx.spring_layout(G, seed=seed)

    # Build the figure directly instead of via plt.figure(): figures created
    # through pyplot stay registered in its global state until explicitly
    # closed, so every call would leak one, and drawing on the implicit
    # "current axes" is shared state between calls.
    fig = plt.Figure(dpi=240)
    ax = fig.subplots()
    ax.set_axis_off()

    nx.draw_networkx_edges(G, pos, alpha=.15, ax=ax)
    nx.draw_networkx_labels(G, pos, ax=ax)

    return fig
99
+
100
+
101
def _run_backend(jar: str, args: List[str], timeout: int) -> None:
    """Run `java -jar <jar> <args...>` and raise `gr.Error` if it fails."""
    try:
        result = subprocess.run(['java', '-jar', jar, *args],
                                capture_output=True, text=True, timeout=timeout)
    except (subprocess.SubprocessError, OSError) as e:
        # OSError covers the case where the `java` executable cannot be started.
        raise gr.Error(f'Backend error: {e}') from e

    if result.returncode != 0:
        raise gr.Error(f'Backend error (code {result.returncode}): {result.stderr}')


def watset(G: nx.Graph, algorithm: str, seed: int = 0,
           jar: str = 'watset.jar', timeout: int = 10) -> Tuple[pd.DataFrame, Optional[nx.Graph]]:
    """Cluster `G` with the Java backend and return the clusters.

    Args:
        G: graph to cluster; it is written as a tab-separated weighted edge list.
        algorithm: key of `ALGORITHMS` selecting the configuration.
        seed: value passed to the backend as `--seed`.
        jar: path of the JAR file to run.
        timeout: per-invocation timeout in seconds.

    Returns:
        A pair `(clusters, senses)`. `clusters` has the columns `cluster`,
        `size` and `items`, where `items` is a list of strings obtained by
        splitting on ', '. `senses` is the graph produced by the backend's
        `graph` command when the selected algorithm is `watset`, else `None`.

    Raises:
        gr.Error: if the backend cannot be started, times out or exits with
            a non-zero code.
        KeyError: if `algorithm` is not a key of `ALGORITHMS`.
    """
    spec = ALGORITHMS[algorithm]

    with (NamedTemporaryFile() as graph,
          NamedTemporaryFile(mode='rb') as clusters,
          NamedTemporaryFile(mode='rb') as senses):
        nx.write_edgelist(G, graph.name, delimiter='\t', data=['weight'])

        _run_backend(jar, ['--input', graph.name, '--output', clusters.name, '--seed', str(seed),
                           *spec.args_clustering()], timeout)

        try:
            df_clusters = pd.read_csv(clusters.name, sep='\t', names=('cluster', 'size', 'items'),
                                      dtype={'cluster': int, 'size': int, 'items': str})
        except pd.errors.EmptyDataError:
            # The backend wrote nothing: report zero clusters instead of crashing.
            df_clusters = pd.DataFrame({'cluster': pd.Series(dtype=int),
                                        'size': pd.Series(dtype=int),
                                        'items': pd.Series(dtype=str)})

        df_clusters['items'] = df_clusters['items'].str.split(', ')

        if spec.name != 'watset':
            return df_clusters, None

        _run_backend(jar, ['--input', graph.name, '--output', senses.name, '--seed', str(seed),
                           'graph', *spec.args_graph()], timeout)

        # comments='\n' replaces NetworkX's default '#' comment marker, which
        # would otherwise cut node labels of the form '<item>#<sense>'.
        G_senses = nx.read_edgelist(senses.name, delimiter='\t', comments='\n', data=[('weight', float)])

        return df_clusters, G_senses
141
+
142
+
143
def handler(file: IO[bytes], algorithm: str, seed: Optional[int]) -> Tuple[pd.DataFrame, plt.Figure]:
    """Cluster the uploaded edge list and return the clusters and a plot.

    Args:
        file: uploaded file object; only its `name` (a path) is used.
        algorithm: key of `ALGORITHMS`.
        seed: seed for the backend and for the layout; `None` is treated as 0.

    Returns:
        The cluster table (with `items` mapped back to the original node
        labels and sorted) and a figure of either the sense graph, when the
        algorithm produces one, or the input graph.

    Raises:
        gr.Error: if no file was uploaded, the algorithm is unknown, the
            backend fails, or the backend returns an unexpected sense label.
    """
    if file is None:
        raise gr.Error('File must be uploaded')

    if algorithm not in ALGORITHMS:
        raise gr.Error(f'Unknown algorithm: {algorithm}')

    # An empty seed field arrives as None; str(None) must not reach the backend.
    seed = 0 if seed is None else int(seed)

    # Sniff the delimiter from the first line, reading it with the same
    # encoding (UTF-8) that nx.read_edgelist uses by default.
    with open(file.name, encoding='utf-8') as f:
        try:
            delimiter = csv.Sniffer().sniff(f.readline(4096)).delimiter
        except csv.Error:
            delimiter = ','

    # comments='\n' replaces NetworkX's default '#' comment marker.
    G: nx.Graph = nx.read_edgelist(file.name, delimiter=delimiter, comments='\n', data=[('weight', float)])

    # The backend is given integer node identifiers; `reverse` restores the labels.
    mapping = {node: i for i, node in enumerate(G)}
    reverse = {i: node for node, i in mapping.items()}

    nx.relabel_nodes(G, mapping, copy=False)

    df_clusters, G_senses = watset(G, algorithm=algorithm, seed=seed)

    nx.relabel_nodes(G, reverse, copy=False)

    df_clusters['items'] = df_clusters['items'].apply(lambda items: sorted(reverse[int(item)] for item in items))

    if G_senses is None:
        return df_clusters, visualize(G, seed=seed)

    # Sense nodes are labelled '<item id>#<sense>'; swap the id for the original label.
    sense_mapping = {}

    for node in G_senses:
        match = SENSE.match(node)

        if match is None:
            raise gr.Error(f'Unexpected sense label: {node}')

        sense_mapping[node] = f'{reverse[int(match["item"])]}#{match["sense"]}'

    nx.relabel_nodes(G_senses, sense_mapping, copy=False)

    return df_clusters, visualize(G_senses, seed=seed)
184
+
185
+
186
def main() -> None:
    """Build the Gradio interface around `handler` and launch it."""
    iface = gr.Interface(
        fn=handler,
        inputs=[
            gr.File(
                value='java.tsv',
                file_types=['.tsv', '.csv'],
                label='Graph'
            ),
            gr.Dropdown(
                # Pass a real list of labels rather than the dict itself.
                choices=list(ALGORITHMS),
                value='Watset[MCL, CW_lin]',
                label='Algorithm'
            ),
            gr.Number(
                # Explicit default so the handler receives a number even if
                # the user never touches the field.
                value=0,
                label='Seed',
                precision=0
            )
        ],
        outputs=[
            gr.Dataframe(
                headers=['cluster', 'size', 'items'],
                label='Clustering'
            ),
            gr.Plot(
                label='Graph'
            )
        ],
        title='Structure Discovery with Watset',
        description='''
**Watset** is a powerful algorithm for structure discovery in graphs.

By capturing the ambiguity of nodes in a graph, Watset efficiently finds clusters in the input data.

Whether you're working with linguistic data or other networks, Watset is the go-to solution for unlocking hidden patterns and structures.
''',
        article='''
**More Watset:**

- Paper: <https://doi.org/10.1162/COLI_a_00354> ([arXiv](https://arxiv.org/abs/1808.06696))
- Implementation: <https://github.com/nlpub/watset-java>
- Maven Central: <https://search.maven.org/artifact/org.nlpub/watset>
- conda-forge: <https://anaconda.org/conda-forge/watset>
''',
        allow_flagging='never'
    )

    iface.launch()


if __name__ == '__main__':
    main()