|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
__author__ = 'Dmitry Ustalov' |
|
__license__ = 'Apache 2.0' |
|
|
|
import csv |
|
import os |
|
import re |
|
import subprocess |
|
from dataclasses import dataclass |
|
from tempfile import NamedTemporaryFile |
|
from typing import cast, BinaryIO, Optional |
|
|
|
import gradio as gr |
|
import matplotlib.pyplot as plt |
|
import networkx as nx |
|
import pandas as pd |
|
from matplotlib.pyplot import Figure |
|
|
|
# Path to an external MCL executable, read from the MCL_BIN environment
# variable. It stays None unless the variable names an existing file that the
# current process is allowed to execute.
MCL: Optional[str] = os.environ.get('MCL_BIN')

if MCL is not None and not (os.path.isfile(MCL) and os.access(MCL, os.X_OK)):
    MCL = None
|
|
|
|
|
@dataclass
class Algorithm:
    """Command-line description of one clustering configuration.

    The fields are rendered into argument lists by `args_clustering` and
    `args_graph`. Any optional field that is unset (or otherwise falsy) is
    simply left out of the generated arguments.
    """

    # Command name; always the first element of the clustering arguments.
    name: str
    # Value emitted after `--mode`.
    mode: Optional[str] = None
    # Value emitted after `--local`.
    local_name: Optional[str] = None
    # Value emitted after `--local-params`.
    local_params: Optional[str] = None
    # Value emitted after `--global`.
    global_name: Optional[str] = None
    # Value emitted after `--global-params`.
    global_params: Optional[str] = None
    # Value emitted after `--bin`.
    bin: Optional[str] = None

    def args_clustering(self) -> list[str]:
        """Return the full argument list for a clustering run.

        The order is: command name, `--mode`, the `args_graph` options,
        `--global`, `--global-params` and finally `--bin`.
        """
        head = [self.name, '--mode', self.mode] if self.mode else [self.name]

        trailing_options = (
            ('--global', self.global_name),
            ('--global-params', self.global_params),
            ('--bin', self.bin),
        )
        tail = [token
                for flag, value in trailing_options if value
                for token in (flag, value)]

        return [*head, *self.args_graph(), *tail]

    def args_graph(self) -> list[str]:
        """Return only the `--local` / `--local-params` options."""
        local_options = (
            ('--local', self.local_name),
            ('--local-params', self.local_params),
        )

        return [token
                for flag, value in local_options if value
                for token in (flag, value)]
|
|
|
|
|
# The three `cw` modes, in the order they are listed to the user.
_CW_MODES = ('top', 'lin', 'log')

# Choices for the `--local` side of a Watset configuration, keyed by the label
# shown inside the algorithm title: label -> (local_name, local_params).
_LOCAL_CHOICES: dict[str, tuple[str, Optional[str]]] = {
    'CW_top': ('cw', 'mode=top'),
    'CW_lin': ('cw', 'mode=lin'),
    'CW_log': ('cw', 'mode=log'),
    'MCL': ('mcl', None),
}

# Every selectable algorithm, keyed by its display title. Insertion order is
# significant because the titles are offered to the user in this order.
ALGORITHMS: dict[str, Algorithm] = {
    **{f'CW_{cw_mode}': Algorithm('cw', cw_mode) for cw_mode in _CW_MODES},
    'MaxMax': Algorithm('maxmax'),
    **{
        f'Watset[{label}, CW_{cw_mode}]':
            Algorithm('watset', None, local_name, local_params, 'cw', f'mode={cw_mode}')
        for cw_mode in _CW_MODES
        for label, (local_name, local_params) in _LOCAL_CHOICES.items()
    },
}

# Configurations that need the external MCL binary are registered only when
# one has been located.
if MCL:
    ALGORITHMS.update({
        f'Watset[{label}, MCL]':
            Algorithm('watset', None, local_name, local_params, 'mcl-bin', 'bin=' + MCL)
        for label, (local_name, local_params) in _LOCAL_CHOICES.items()
    })
    ALGORITHMS['MCL'] = Algorithm('mcl-bin', bin=MCL)
|
|
|
# Matches a whole label of the form `<item>#<sense>` where both parts are runs
# of digits; the parts are exposed as the named groups `item` and `sense`.
SENSE: re.Pattern[str] = re.compile(r'^(?P<item>\d+)#(?P<sense>\d+)$')
|
|
|
|
|
|
|
def visualize(G: 'nx.Graph[str]', seed: int = 0) -> Figure:
    """Draw the edges and node labels of a graph using a spring layout.

    The figure is created directly from `Figure` and drawn through an explicit
    axes object instead of going through `plt.figure()` / `plt.axis()`.
    pyplot keeps every figure it creates registered until it is closed and
    draws on an implicit "current" figure, which leaks memory and lets
    concurrent calls interfere with each other in a long-running server.

    Args:
        G: graph to draw.
        seed: seed passed to `nx.spring_layout` so the layout is reproducible.

    Returns:
        A 240 DPI figure with the axes hidden, showing semi-transparent edges
        and the node labels (node markers themselves are not drawn).
    """
    pos = nx.spring_layout(G, seed=seed)

    fig = Figure(dpi=240)
    ax = fig.add_subplot(1, 1, 1)
    ax.axis('off')

    nx.draw_networkx_edges(G, pos, ax=ax, alpha=.15)
    nx.draw_networkx_labels(G, pos, ax=ax)

    return fig
|
|
|
|
|
|
|
def _run_watset_jar(label: str, jar: str, args: list[str], timeout: int) -> None:
    """Run `java -jar <jar> <args>` and report any failure as `gr.Error`.

    `label` prefixes the error message. `OSError` is caught in addition to
    `subprocess.SubprocessError` because a missing or non-executable `java`
    raises `FileNotFoundError` / `PermissionError`, which are not subprocess
    errors.
    """
    try:
        result = subprocess.run(['java', '-jar', jar, *args],
                                capture_output=True, text=True, timeout=timeout)
    except (subprocess.SubprocessError, OSError) as e:
        raise gr.Error(f'{label} error: {e}') from e

    if result.returncode != 0:
        raise gr.Error(f'{label} error (code {result.returncode}): {result.stderr}')


def watset(G: 'nx.Graph[str]', algorithm: str, seed: int = 0,
           jar: str = 'watset.jar', timeout: int = 10) -> tuple[pd.DataFrame, Optional['nx.Graph[str]']]:
    """Cluster a graph by invoking the Watset JAR through `java`.

    The graph is written to a temporary tab-separated edge list (with the
    `weight` edge attribute), the JAR is run on it, and the tab-separated
    output is loaded back.

    Args:
        G: graph to cluster.
        algorithm: key into `ALGORITHMS` selecting the configuration.
        seed: value passed to the JAR as `--seed`.
        jar: path of the JAR file given to `java -jar`.
        timeout: time limit, in seconds, for each `java` invocation.

    Returns:
        A pair `(clusters, senses)`. `clusters` is a data frame with the
        columns `cluster`, `size` and `items`, where `items` is split on
        `', '` into a list. `senses` is the graph produced by the additional
        `graph` command, which is run only when the selected configuration is
        named `watset`; otherwise it is `None`.

    Raises:
        KeyError: if `algorithm` is not a key of `ALGORITHMS`.
        gr.Error: if `java` cannot be started, times out, or exits with a
            non-zero status.
    """
    spec = ALGORITHMS[algorithm]

    with (NamedTemporaryFile() as graph,
          NamedTemporaryFile(mode='rb') as clusters,
          NamedTemporaryFile(mode='rb') as senses):
        nx.write_edgelist(G, graph.name, delimiter='\t', data=['weight'])

        _run_watset_jar('Clustering', jar,
                        ['--input', graph.name, '--output', clusters.name, '--seed', str(seed),
                         *spec.args_clustering()],
                        timeout)

        df_clusters = pd.read_csv(clusters, sep='\t', names=('cluster', 'size', 'items'),
                                  dtype={'cluster': int, 'size': int, 'items': str})

        df_clusters['items'] = df_clusters['items'].str.split(', ')

        if spec.name != 'watset':
            return df_clusters, None

        _run_watset_jar('Graph', jar,
                        ['--input', graph.name, '--output', senses.name, '--seed', str(seed),
                         'graph', *spec.args_graph()],
                        timeout)

        G_senses = nx.read_edgelist(senses.name, delimiter='\t', comments='\n', data=[('weight', float)])

        return df_clusters, G_senses
|
|
|
|
|
def handler(file: BinaryIO, algorithm: str, seed: int) -> tuple[pd.DataFrame, Figure]:
    """Cluster an uploaded edge list and render the resulting graph.

    The column delimiter is sniffed from the first 4096 characters of the file
    (falling back to a comma). Node labels are replaced by consecutive
    integers before clustering and restored afterwards.

    Args:
        file: uploaded file object; only its `name` (a path) is used.
        algorithm: key into `ALGORITHMS`.
        seed: seed forwarded to `watset` and `visualize`.

    Returns:
        The clusters data frame, with `items` mapped back to the original node
        labels and sorted, and a figure. The figure shows the sense graph when
        `watset` returns one, otherwise the input graph.

    Raises:
        gr.Error: if no file was uploaded, the algorithm is unknown, the edge
            list cannot be parsed, or a sense label has an unexpected form.
            Errors raised by `watset` propagate unchanged.
    """
    if file is None:
        raise gr.Error('File must be uploaded')

    if algorithm not in ALGORITHMS:
        raise gr.Error(f'Unknown algorithm: {algorithm}')

    try:
        with open(file.name) as f:
            delimiter = csv.Sniffer().sniff(f.read(4096)).delimiter
    except (csv.Error, UnicodeDecodeError):
        # Sniffing is best-effort only; the real parse below reports bad input.
        delimiter = ','

    try:
        G: 'nx.Graph[str]' = nx.read_edgelist(file.name, delimiter=delimiter, comments='\n',
                                              data=[('weight', float)])
    except (IndexError, TypeError, UnicodeDecodeError) as e:
        # read_edgelist signals a wrong column count with IndexError and a
        # non-numeric weight with TypeError; show these to the user instead
        # of failing with a raw traceback.
        raise gr.Error(f'Malformed edge list: {e}') from e

    reverse: dict[int, str] = dict(enumerate(G))
    mapping: dict[str, int] = {node: i for i, node in reverse.items()}

    nx.relabel_nodes(G, mapping, copy=False)

    df_clusters, G_senses = watset(G, algorithm=algorithm, seed=seed)

    nx.relabel_nodes(G, reverse, copy=False)

    df_clusters['items'] = df_clusters['items'].apply(lambda items: sorted(reverse[int(item)] for item in items))

    if G_senses is None:
        return df_clusters, visualize(G, seed=seed)

    # Sense nodes are labelled `<item>#<sense>` with the integer item id;
    # substitute the original node label for the id.
    sense_mapping: dict[str, str] = {}

    for node in G_senses:
        match = SENSE.match(node)

        if match is None:
            raise gr.Error(f'Unexpected sense label: {node}')

        sense_mapping[node] = f'{reverse[int(match["item"])]}#{match["sense"]}'

    nx.relabel_nodes(G_senses, sense_mapping, copy=False)

    return df_clusters, visualize(G_senses, seed=seed)
|
|
|
|
|
def main() -> None:
    """Build the Gradio interface around `handler` and launch it.

    The algorithm drop-down lists the keys of `ALGORITHMS`. Examples whose
    algorithm is not registered (the MCL-based ones when no MCL binary is
    configured) are left out, so every offered example can actually be run.
    """
    examples = [
        ['java.tsv', 'Watset[MCL, CW_lin]', 0],
        ['java.tsv', 'MaxMax', 0],
        ['bank.tsv', 'Watset[MCL, MCL]', 0],
        ['bank.tsv', 'MCL', 0],
    ]

    iface = gr.Interface(
        fn=handler,
        inputs=[
            gr.File(
                file_types=['.tsv', '.csv'],
                label='Graph'
            ),
            gr.Dropdown(
                # A real list of titles rather than the dict cast to a list.
                choices=list(ALGORITHMS),
                value='Watset[MCL, CW_lin]',
                label='Algorithm'
            ),
            gr.Number(
                label='Seed',
                precision=0
            )
        ],
        outputs=[
            gr.Dataframe(
                headers=['cluster', 'size', 'items'],
                label='Clustering'
            ),
            gr.Plot(
                label='Graph'
            )
        ],
        examples=[example for example in examples if example[1] in ALGORITHMS],
        title='Structure Discovery with Watset',
        description='''
**Watset** is a powerful algorithm for structure discovery in undirected graphs.

By capturing the ambiguity of nodes in a graph, Watset efficiently finds clusters in the input data.

As the input, this tool expects [edge list](https://en.wikipedia.org/wiki/Edge_list) as a comma-separated (CSV) file without header.
Each line of the file should contain three columns:

- `source`: edge source
- `target`: edge target
- `weight`: edge weight

Whether you're working with linguistic data or other networks, Watset is the go-to solution for unlocking hidden patterns and structures.
''',
        article='''
**More Watset:**

- Paper: <https://doi.org/10.1162/COLI_a_00354> ([arXiv](https://arxiv.org/abs/1808.06696))
- Implementation: <https://github.com/nlpub/watset-java>
- Maven Central: <https://search.maven.org/artifact/org.nlpub/watset>
- conda-forge: <https://anaconda.org/conda-forge/watset>
''',
        allow_flagging='never'
    )

    iface.launch()
|
|
|
|
|
# Launch the web interface only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()
|
|