import base64
import re
import json
import pandas as pd
import gradio as gr
import pyterrier as pt
pt.init()
import pyt_splade
from pyterrier_gradio import Demo, MarkdownFile, interface, df2code, code2md, EX_Q, EX_D
# Notebook file name and install commands that predict_query / predict_doc
# pass to code2md alongside the generated snippet.
COLAB_NAME = 'pyterrier_splade.ipynb'
COLAB_INSTALL = '\n'.join([
    '!pip install -q git+https://github.com/naver/splade',
    '!pip install -q git+https://github.com/cmacdonald/pyt_splade',
])

# One SPLADE instance per aggregation mode, created once at module load and
# shared by both predict functions.
factory_max = pyt_splade.Splade(agg='max')
factory_sum = pyt_splade.Splade(agg='sum')
def generate_vis(df, mode='Document', *, tokenize=None):
    """Render a text summary of original and expansion tokens for each row.

    For every row the text is tokenized, and every token that has a score
    but does not occur among the text's own tokens is listed as an
    "expansion token", ordered by descending score and then alphabetically.

    Args:
        df: Frame of encoder output. In ``'Query'`` mode the columns ``qid``,
            ``query`` and ``query_toks`` are read; in any other mode the
            columns ``docno``, ``text`` and ``toks`` are read. The token
            columns must hold ``{token: score}`` mappings.
        mode: ``'Query'`` selects the query columns; any other value is
            treated as document mode. Also used as the label of each entry.
        tokenize: Optional callable mapping a string to a list of tokens.
            Defaults to the tokenizer of the module-level ``factory_max``.

    Returns:
        One entry per row, joined with newlines, or ``''`` when ``df`` has
        no rows.
    """
    # A DataFrame has no unambiguous truth value, so test the row count.
    if len(df) == 0:
        return ''
    if tokenize is None:
        # Resolved only when there are rows to render.
        tokenize = factory_max.tokenizer.tokenize

    def join_tokens(tokens):
        return ' '.join(str(tok) for tok in tokens)

    is_query = mode == 'Query'
    entries = []
    for row in df.itertuples(index=False):
        if is_query:
            tok_scores, text, row_id = row.query_toks, row.query, row.qid
        else:
            tok_scores, text, row_id = row.toks, row.text, row.docno
        orig_tokens = tokenize(text)
        present = set(orig_tokens)
        # No max-score is computed here: it was never used, and max() over an
        # empty score mapping would raise ValueError.
        ranked = sorted(tok_scores.items(), key=lambda kv: (-kv[1], kv[0]))
        exp_tokens = [tok for tok, _ in ranked if tok not in present]
        entries.append(
            f'\n{mode}: {row_id}\n'
            f'{join_tokens(orig_tokens)}\n'
            f'Expansion Tokens: {join_tokens(exp_tokens)}\n'
        )
    return '\n'.join(entries)
def predict_query(input, agg):
    """Run the SPLADE query encoder and build the outputs shown by the demo.

    Args:
        input: Query frame passed to the encoder pipeline and rendered into
            the example snippet.
        agg: Aggregation mode, ``'max'`` or ``'sum'``.

    Returns:
        A 3-tuple ``(res, markdown, vis)``: the encoder output with its
        ``query_toks`` column JSON-encoded (scores rounded to 4 decimals),
        the result of ``code2md`` for the example snippet, and the text
        produced by ``generate_vis`` in ``'Query'`` mode.

    Raises:
        KeyError: If ``agg`` is neither ``'max'`` nor ``'sum'``.
    """
    # df2code is the frame-rendering helper this module imports from
    # pyterrier_gradio; no `df2list` is imported or defined in this file.
    # NOTE(review): if df2code renders a `pd.DataFrame(...)` expression, the
    # snippet may also need an `import pandas as pd` line - confirm.
    code = f'''import pyt_splade
splade = pyt_splade.Splade(agg={agg!r})
query_pipeline = splade.query_encoder()
query_pipeline({df2code(input)})
'''
    factories = {
        'max': factory_max,
        'sum': factory_sum,
    }
    pipeline = factories[agg].query_encoder()
    res = pipeline(input)
    # Build the visualisation first: generate_vis needs the raw score
    # mappings, which are replaced by JSON strings just below.
    vis = generate_vis(res, mode='Query')
    res['query_toks'] = [
        json.dumps({tok: round(score, 4) for tok, score in scores.items()})
        for scores in res['query_toks']
    ]
    return (res, code2md(code, COLAB_INSTALL, COLAB_NAME), vis)
def predict_doc(input, agg):
    """Run the SPLADE document encoder and build the outputs shown by the demo.

    Args:
        input: Document frame passed to the encoder pipeline and rendered
            into the example snippet.
        agg: Aggregation mode, ``'max'`` or ``'sum'``.

    Returns:
        A 3-tuple ``(res, markdown, vis)``: the encoder output with its
        ``toks`` column JSON-encoded (scores rounded to 4 decimals), the
        result of ``code2md`` for the example snippet, and the text produced
        by ``generate_vis`` in ``'Document'`` mode.

    Raises:
        KeyError: If ``agg`` is neither ``'max'`` nor ``'sum'``.
    """
    # df2code is the frame-rendering helper this module imports from
    # pyterrier_gradio; no `df2list` is imported or defined in this file.
    # NOTE(review): if df2code renders a `pd.DataFrame(...)` expression, the
    # snippet may also need an `import pandas as pd` line - confirm.
    code = f'''import pyt_splade
splade = pyt_splade.Splade(agg={agg!r})
doc_pipeline = splade.doc_encoder()
doc_pipeline({df2code(input)})
'''
    factories = {
        'max': factory_max,
        'sum': factory_sum,
    }
    pipeline = factories[agg].doc_encoder()
    res = pipeline(input)
    # Build the visualisation first: generate_vis needs the raw score
    # mappings, which are replaced by JSON strings just below.
    vis = generate_vis(res, mode='Document')
    res['toks'] = [
        json.dumps({tok: round(score, 4) for tok, score in scores.items()})
        for scores in res['toks']
    ]
    return (res, code2md(code, COLAB_INSTALL, COLAB_NAME), vis)
# Page sections in display order: intro text, the query-encoder demo, the
# document-encoder demo, then closing text. Each demo gets its own
# aggregation dropdown instance.
_sections = [
    MarkdownFile('README.md'),
    MarkdownFile('query.md'),
    Demo(
        predict_query,
        EX_Q,
        [gr.Dropdown(choices=['max', 'sum'], value='max', label='Aggregation')],
        scale=2/3,
    ),
    MarkdownFile('doc.md'),
    Demo(
        predict_doc,
        EX_D,
        [gr.Dropdown(choices=['max', 'sum'], value='max', label='Aggregation')],
        scale=2/3,
    ),
    MarkdownFile('wrapup.md'),
]

# Assemble the app and start it; share=False requests no public share link.
_app = interface(*_sections)
_app.launch(share=False)