4kasha committed
Commit 527e550 · 1 Parent(s): b4fe012
Files changed (4)
  1. aligner.py +132 -0
  2. app.py +170 -0
  3. requirements.txt +8 -0
  4. utils.py +106 -0
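The commit adds a self-contained Streamlit demo of OT-based word alignment: `aligner.py` wraps balanced, partial, and unbalanced optimal-transport solvers from POT; `app.py` builds the UI and turns subword hidden states into word-level embeddings; `utils.py` supplies the cost matrices, mass weights, and heatmap plotting; `requirements.txt` pins the dependencies.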
aligner.py ADDED
@@ -0,0 +1,132 @@
+ import numpy as np
+ import torch
+ import ot
+ from utils import (
+     compute_distance_matrix_cosine,
+     compute_distance_matrix_l2,
+     compute_weights_norm,
+     compute_weights_uniform,
+     min_max_scaling
+ )
+
+ class Aligner:
+     def __init__(self, ot_type, sinkhorn, chimera, dist_type, weight_type, distortion, thresh, tau, **kwargs):
+         self.ot_type = ot_type
+         self.sinkhorn = sinkhorn
+         self.chimera = chimera
+         self.dist_type = dist_type
+         self.weight_type = weight_type
+         self.distortion = distortion
+         self.thresh = thresh
+         self.tau = tau
+         self.epsilon = 0.1
+         self.stopThr = 1e-6
+         self.numItermax = 1000
+         self.div_type = kwargs['div_type']
+
+         self.dist_func = compute_distance_matrix_cosine if dist_type == 'cos' else compute_distance_matrix_l2
+         if weight_type == 'uniform':
+             self.weight_func = compute_weights_uniform
+         else:
+             self.weight_func = compute_weights_norm
+
+     def compute_alignment_matrixes(self, s1_vecs, s2_vecs):
+         self.align_matrixes = []
+         for vecX, vecY in zip(s1_vecs, s2_vecs):
+             P = self.compute_optimal_transport(vecX, vecY)
+             if torch.is_tensor(P):
+                 P = P.to('cpu').numpy()
+             self.align_matrixes.append(P)
+
+     def get_alignments(self, thresh, assign_cost=False):
+         assert len(self.align_matrixes) > 0
+         self.thresh = thresh
+         all_alignments = []
+         for P in self.align_matrixes:
+             alignments = self.matrix_to_alignments(P, assign_cost)
+             all_alignments.append(alignments)
+         return all_alignments
+
+     def matrix_to_alignments(self, P, assign_cost):
+         alignments = set()
+         align_pairs = np.transpose(np.nonzero(P > self.thresh))
+         if assign_cost:
+             for i_j in align_pairs:
+                 alignments.add('{0}-{1}-{2:.4f}'.format(i_j[0], i_j[1], P[i_j[0], i_j[1]]))
+         else:
+             for i_j in align_pairs:
+                 alignments.add('{0}-{1}'.format(i_j[0], i_j[1]))
+         return alignments
+
+     def compute_optimal_transport(self, s1_word_embeddings, s2_word_embeddings):
+         s1_word_embeddings = s1_word_embeddings.to(torch.float64)
+         s2_word_embeddings = s2_word_embeddings.to(torch.float64)
+
+         C = self.dist_func(s1_word_embeddings, s2_word_embeddings, self.distortion)
+         s1_weights, s2_weights = self.weight_func(s1_word_embeddings, s2_word_embeddings)
+
+         if self.ot_type == 'ot':
+             s1_weights = s1_weights / s1_weights.sum()
+             s2_weights = s2_weights / s2_weights.sum()
+             s1_weights, s2_weights, C = self.convert_to_numpy(s1_weights, s2_weights, C)
+
+             if self.sinkhorn:
+                 P = ot.bregman.sinkhorn_log(s1_weights, s2_weights, C, reg=self.epsilon,
+                                             stopThr=self.stopThr, numItermax=self.numItermax)
+             else:
+                 P = ot.emd(s1_weights, s2_weights, C)
+             # Min-max normalization
+             P = min_max_scaling(P)
+
+         elif self.ot_type == 'pot':
+             if self.chimera:
+                 # bertscore_F1 is not defined in this file; the demo always passes chimera=False
+                 m = self.tau * self.bertscore_F1(s1_word_embeddings, s2_word_embeddings)
+                 m = min(1.0, m.item())
+             else:
+                 m = self.tau
+
+             s1_weights, s2_weights, C = self.convert_to_numpy(s1_weights, s2_weights, C)
+             # To cope with rounding errors: m must not exceed the smaller total mass
+             m = np.min((np.sum(s1_weights), np.sum(s2_weights))) * m
+
+             if self.sinkhorn:
+                 P = ot.partial.entropic_partial_wasserstein(s1_weights, s2_weights, C, reg=self.epsilon, m=m,
+                                                             stopThr=self.stopThr, numItermax=self.numItermax)
+             else:
+                 P = ot.partial.partial_wasserstein(s1_weights, s2_weights, C, m=m)
+             # Min-max normalization
+             P = min_max_scaling(P)
+
+         elif 'uot' in self.ot_type:
+             if self.chimera:
+                 tau = self.tau * self.bertscore_F1(s1_word_embeddings, s2_word_embeddings)
+             else:
+                 tau = self.tau
+
+             if self.ot_type == 'uot':
+                 P = ot.unbalanced.sinkhorn_stabilized_unbalanced(s1_weights, s2_weights, C, reg=self.epsilon,
+                                                                  reg_m=tau, stopThr=self.stopThr,
+                                                                  numItermax=self.numItermax)
+             elif self.ot_type == 'uot-mm':
+                 P = ot.unbalanced.mm_unbalanced(s1_weights, s2_weights, C, reg_m=tau, div=self.div_type,
+                                                 stopThr=self.stopThr, numItermax=self.numItermax)
+             # Min-max normalization
+             P = min_max_scaling(P)
+
+         elif self.ot_type == 'none':
+             P = 1 - C
+
+         return P
+
+     def convert_to_numpy(self, s1_weights, s2_weights, C):
+         if torch.is_tensor(s1_weights):
+             s1_weights = s1_weights.to('cpu').numpy()
+             s2_weights = s2_weights.to('cpu').numpy()
+         if torch.is_tensor(C):
+             C = C.to('cpu').numpy()
+         return s1_weights, s2_weights, C
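For reference, a minimal usage sketch of `Aligner` outside the app; the random tensors stand in for real word embeddings, and the `uot` settings loosely mirror the demo's defaults rather than anything prescribed by the commit:

```python
import torch
from aligner import Aligner
from utils import device  # the helpers pin their tensors to this device

aligner = Aligner(ot_type='uot', sinkhorn=True, chimera=False, dist_type='cos',
                  weight_type='uniform', distortion=0.2, thresh=0.1, tau=0.98,
                  div_type='--')

# One "sentence pair": 5 vs. 7 toy word vectors of dimension 768
s1_vecs = [torch.randn(5, 768, device=device)]
s2_vecs = [torch.randn(7, 768, device=device)]

aligner.compute_alignment_matrixes(s1_vecs, s2_vecs)
print(aligner.align_matrixes[0].shape)     # (5, 7) soft alignment matrix
print(aligner.get_alignments(thresh=0.1))  # e.g. [{'0-3', '2-5', ...}]
```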
app.py ADDED
@@ -0,0 +1,170 @@
+ import streamlit as st
+ import random
+ import numpy as np
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ from aligner import Aligner
+ from utils import plot_align_matrix_heatmap
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch.manual_seed(42)
+ np.random.seed(42)
+ random.seed(42)
+
+
+ @st.cache_resource
+ def init_model(model_name: str):
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModel.from_pretrained(model_name, output_hidden_states=True).to(device).eval()
+     return tokenizer, model
+
+
+ @st.cache_resource(max_entries=100)
+ def init_aligner(ot_type: str, sinkhorn: bool, distortion: float, threshold: float, tau: float):
+     return Aligner(
+         ot_type=ot_type,
+         sinkhorn=sinkhorn,
+         chimera=False,
+         dist_type="cos",
+         weight_type="uniform",
+         distortion=distortion,
+         thresh=threshold,
+         tau=tau,
+         div_type="--"
+     )
+
+
+ def encode_sentence(sent, pair, tokenizer, model, layer: int):
+     if pair is None:
+         inputs = tokenizer(sent, padding=False, truncation=False, is_split_into_words=True,
+                            return_offsets_mapping=True, return_tensors="pt")
+     else:
+         inputs = tokenizer(text=sent, text_pair=pair, padding=False, truncation=True, is_split_into_words=True,
+                            return_offsets_mapping=True, return_tensors="pt")
+     with torch.no_grad():
+         outputs = model(inputs['input_ids'].to(device), inputs['attention_mask'].to(device),
+                         inputs['token_type_ids'].to(device))
+
+     return outputs.hidden_states[layer][0], inputs['input_ids'][0], inputs['offset_mapping'][0]
+
+
+ def centering(hidden_outputs):
+     """
+     hidden_outputs : [tokens, hidden_size]
+     """
+     # Subtract the mean vector over all token embeddings
+     mean_vec = torch.sum(hidden_outputs, dim=0) / hidden_outputs.shape[0]
+     hidden_outputs = hidden_outputs - mean_vec
+     return hidden_outputs
+
+
+ def convert_to_word_embeddings(offset_mapping, token_ids, hidden_tensors, tokenizer, pair):
+     word_idx = -1
+     subword_to_word_conv = np.full(hidden_tensors.shape[0], -1)
+     # Bug in the Hugging Face tokenizer? Sometimes a Metaspace token is inserted
+     metaspace = getattr(tokenizer.decoder, "replacement", None)
+     metaspace = tokenizer.decoder.prefix if metaspace is None else metaspace
+     tokenizer_bug_idxes = [i for i, x in enumerate(tokenizer.convert_ids_to_tokens(token_ids))
+                            if x == metaspace]
+
+     for subw_idx, offset in enumerate(offset_mapping):
+         if subw_idx in tokenizer_bug_idxes:
+             continue
+         elif offset[0] == offset[1]:  # Special token
+             continue
+         elif offset[0] == 0:  # First subword of a new word
+             word_idx += 1
+             subword_to_word_conv[subw_idx] = word_idx
+         else:
+             subword_to_word_conv[subw_idx] = word_idx
+
+     # Average subword embeddings to obtain one vector per word
+     word_embeddings = torch.vstack(
+         [torch.mean(hidden_tensors[subword_to_word_conv == word_idx], dim=0) for word_idx in range(word_idx + 1)])
+
+     if pair:
+         sep_tok_indices = [i for i, x in enumerate(token_ids) if x == tokenizer.sep_token_id]
+         s2_start_idx = subword_to_word_conv[
+             sep_tok_indices[0] + np.argmax(subword_to_word_conv[sep_tok_indices[0]:] > -1)]
+
+         s1_word_embeddings = word_embeddings[0:s2_start_idx, :]
+         s2_word_embeddings = word_embeddings[s2_start_idx:, :]
+
+         return s1_word_embeddings, s2_word_embeddings
+     else:
+         return word_embeddings
+
+
+ def main():
+     st.set_page_config(layout="wide")
+
+     # Sidebar
+     st.sidebar.markdown("## Settings & Parameters")
+     model_name = st.sidebar.selectbox('model', ['microsoft/deberta-v3-base', 'bert-base-uncased'])
+     layer = st.sidebar.slider('layer number for embeddings', 0, 11, value=9)
+     is_centering = st.sidebar.checkbox('centering embeddings', value=True)
+     ot_type = st.sidebar.selectbox('ot_type', ['OT', 'POT', 'UOT'])
+     ot_type = ot_type.lower()
+     sinkhorn = st.sidebar.checkbox('sinkhorn', value=True)
+     distortion = st.sidebar.slider(r'distortion: $\kappa$', 0.0, 1.0, value=0.20)
+     tau = st.sidebar.slider(r'tau: $\tau$', 0.0, 1.0, value=0.98)
+     threshold = st.sidebar.slider(r'threshold: $\lambda$', 0.0, 1.0)
+
+     # Content
+     st.markdown('## Playground: Unbalanced Optimal Transport for Unbalanced Word Alignment')
+
+     col1, col2 = st.columns(2)
+
+     with col1:
+         sent1 = st.text_area(
+             'sentence 1',
+             'By one estimate , fewer than 20,000 lions exist in the wild , a drop of about 40 percent in the past two decades .'
+         )
+     with col2:
+         sent2 = st.text_area(
+             'sentence 2',
+             'Today there are only around 20,000 wild lions left in the world .'
+         )
+
+     tokenizer, model = init_model(model_name)
+     aligner = init_aligner(ot_type, sinkhorn, distortion, threshold, tau)
+
+     with st.container():
+         st.write("word alignment matrix")
+
+         if sent1 != '' and sent2 != '':
+             sent1 = sent1.lower().split()
+             sent2 = sent2.lower().split()
+             hidden_output, input_id, offset_map = encode_sentence(sent1, sent2, tokenizer, model, layer=layer)
+             if is_centering:
+                 hidden_output = centering(hidden_output)
+             s1_vec, s2_vec = convert_to_word_embeddings(offset_map, input_id, hidden_output, tokenizer, pair=True)
+             aligner.compute_alignment_matrixes([s1_vec], [s2_vec])
+             align_matrix = aligner.align_matrixes[0]
+
+             fig = plot_align_matrix_heatmap(align_matrix.T, sent1, sent2, threshold)
+             st.pyplot(fig, dpi=300)
+
+     st.divider()
+     st.markdown("Note that the centering in this demo is applied only to the input sentences, so the variance may be large.")
+     st.subheader('Refs')
+     st.write("Yuki Arase, Han Bao, Sho Yokoi, [Unbalanced Optimal Transport for Unbalanced Word Alignment](https://arxiv.org/abs/2306.04116), ACL 2023 [[github](https://github.com/yukiar/OTAlign/tree/main)]")
+
+
+ if __name__ == '__main__':
+     main()
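The embedding pipeline also works outside Streamlit; a sketch assuming `bert-base-uncased` and pre-split, lower-cased input as in `main` (the app itself is launched with `streamlit run app.py`):

```python
import torch
from transformers import AutoTokenizer, AutoModel
from app import encode_sentence, convert_to_word_embeddings, centering, device

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True).to(device).eval()

sent1 = 'fewer than 20,000 lions exist in the wild .'.split()
sent2 = 'around 20,000 wild lions are left .'.split()

# Encode the pair jointly, then pool subwords into word vectors
hidden, ids, offsets = encode_sentence(sent1, sent2, tokenizer, model, layer=9)
hidden = centering(hidden)
s1_vec, s2_vec = convert_to_word_embeddings(offsets, ids, hidden, tokenizer, pair=True)
print(s1_vec.shape, s2_vec.shape)  # one vector per word in each sentence
```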
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ POT==0.9.0
+ sentencepiece==0.1.99
+ streamlit==1.24.0
+ tokenizers==0.13.3
+ transformers==4.30.2
+ matplotlib==3.7.1
+ torch  # imported by app.py and utils.py
+ seaborn  # imported by utils.py
utils.py ADDED
@@ -0,0 +1,106 @@
+ import numpy as np
+ import torch
+ import torch.nn.functional as F
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+ from mpl_toolkits.axes_grid1 import make_axes_locatable
+ from ot.backend import get_backend
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+ def compute_distance_matrix_cosine(s1_word_embeddings, s2_word_embeddings, distortion_ratio):
+     C = (torch.matmul(F.normalize(s1_word_embeddings), F.normalize(s2_word_embeddings).t()) + 1.0) / 2  # Range 0-1
+     C = apply_distortion(C, distortion_ratio)
+     C = min_max_scaling(C)  # Range 0-1
+     C = 1.0 - C  # Convert to distance
+     return C
+
+
+ def compute_distance_matrix_l2(s1_word_embeddings, s2_word_embeddings, distortion_ratio):
+     C = torch.cdist(s1_word_embeddings, s2_word_embeddings, p=2)
+     C = min_max_scaling(C)  # Range 0-1
+     C = 1.0 - C  # Convert to similarity
+     C = apply_distortion(C, distortion_ratio)
+     C = min_max_scaling(C)  # Range 0-1
+     C = 1.0 - C  # Convert back to distance
+     return C
+
+
+ def apply_distortion(sim_matrix, ratio):
+     shape = sim_matrix.shape
+     if (shape[0] < 2 or shape[1] < 2) or ratio == 0.0:
+         return sim_matrix
+
+     # Penalize pairs whose relative positions in the two sentences differ
+     pos_x = torch.tensor([[y / float(shape[1] - 1) for y in range(shape[1])] for x in range(shape[0])],
+                          device=device)
+     pos_y = torch.tensor([[x / float(shape[0] - 1) for x in range(shape[0])] for y in range(shape[1])],
+                          device=device)
+     distortion_mask = 1.0 - ((pos_x - pos_y.T) ** 2) * ratio
+
+     sim_matrix = torch.mul(sim_matrix, distortion_mask)
+     return sim_matrix
+
+
+ def compute_weights_norm(s1_word_embeddings, s2_word_embeddings):
+     s1_weights = torch.norm(s1_word_embeddings, dim=1)
+     s2_weights = torch.norm(s2_word_embeddings, dim=1)
+     return s1_weights, s2_weights
+
+
+ def compute_weights_uniform(s1_word_embeddings, s2_word_embeddings):
+     s1_weights = torch.ones(s1_word_embeddings.shape[0], dtype=torch.float64, device=device)
+     s2_weights = torch.ones(s2_word_embeddings.shape[0], dtype=torch.float64, device=device)
+
+     # # Uniform weights to make L2 norm=1
+     # s1_weights /= torch.linalg.norm(s1_weights)
+     # s2_weights /= torch.linalg.norm(s2_weights)
+
+     return s1_weights, s2_weights
+
+
+ def min_max_scaling(C):
+     # Min-max scaling for stabilization
+     eps = 1e-10
+     nx = get_backend(C)
+     C_min = nx.min(C)
+     C_max = nx.max(C)
+     C = (C - C_min + eps) / (C_max - C_min + eps)
+     return C
+
+
+ def plot_align_matrix_heatmap(align_matrix, sent1, sent2, thresh, **kwargs):
+     # Zero out cells at or below the threshold so they render as background
+     align_matrix = np.where(align_matrix <= thresh, 0, align_matrix)
+
+     fig, ax = plt.subplots(figsize=(10, 6))
+     sns.set(font='sans-serif', style="ticks")
+
+     _color = ['#F2F2F2', '#E0F4FA', '#BEE4F0', '#88CCE5', '#33b7df', '#1B88A6', '#105264', '#092E39']
+     _ticks = [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
+
+     divider = make_axes_locatable(ax)
+     cbar_ax = divider.append_axes("right", size="2.5%", pad=0.1)
+     fig.add_axes(cbar_ax)
+     ax = sns.heatmap(
+         align_matrix,
+         xticklabels=sent1,
+         yticklabels=sent2,
+         cmap=_color,
+         linewidths=1,
+         square=True,
+         ax=ax,
+         cbar_ax=cbar_ax,
+         **kwargs
+     )
+     ax.collections[0].colorbar.ax.yaxis.set_ticks(_ticks, minor=False)
+     ax.collections[0].colorbar.set_ticklabels(_ticks)
+     cax = ax.collections[0].colorbar.ax
+     cax.tick_params(which='major', length=3, labelsize=5)
+     ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
+     ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
+     return fig
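A quick check of the distance helpers on toy inputs; the shapes and the roughly [0, 1] range are the point, the numbers themselves are random:

```python
import torch
from utils import (compute_distance_matrix_cosine, compute_weights_uniform,
                   min_max_scaling, device)

s1 = torch.randn(4, 16, dtype=torch.float64, device=device)
s2 = torch.randn(6, 16, dtype=torch.float64, device=device)

C = compute_distance_matrix_cosine(s1, s2, distortion_ratio=0.2)
print(C.shape)                         # torch.Size([4, 6])
print(C.min().item(), C.max().item())  # distances min-max scaled into ~[0, 1]

w1, w2 = compute_weights_uniform(s1, s2)  # one unit of mass per word
print(w1.sum().item(), w2.sum().item())   # 4.0, 6.0
```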