from transformers import AutoModelForMaskedLM, AutoTokenizer
import spacy
import pytextrank  # registers the "textrank" spaCy pipeline factory
from nlp_entities import *
import torch
import streamlit as st
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Fine-tuned DistilBERT MLM checkpoint; output_hidden_states=True exposes the hidden
# states that the pooling helpers below turn into keyphrase and tag embeddings.
model_checkpoint = "vives/distilbert-base-uncased-finetuned-cvent-2019_2022"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


# spaCy pipeline with PyTextRank for keyphrase extraction. POS lists the parts of speech
# kept as keyphrase candidates; FILT_GROUPS lists NER labels that are meant to be filtered
# out (presumably by the helpers in nlp_entities).
FILT_GROUPS = ["CARDINAL", "TIME", "DATE", "PERCENT", "MONEY", "QUANTITY", "ORDINAL"]
POS = ["NOUN", "PROPN", "VERB"]

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("textrank", last=True, config={"pos_kept": POS, "token_lookback": 3})
all_stopwords = nlp.Defaults.stop_words

# Streamlit inputs: a comma-separated list of tags and the text to classify
tags = st.text_input("Input tags separated by commas")
text = st.text_input("Input text to classify")
# Methods for tag processing
def pool_embeddings(out, tok):
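  """Mean-pool the last-layer hidden states over the sequence dimension, ignoring padded positions via the attention mask."""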
  embeddings = out["hidden_states"][-1]
  attention_mask = tok['attention_mask']
  mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
  masked_embeddings = embeddings * mask
  summed = torch.sum(masked_embeddings, 1)
  summed_mask = torch.clamp(mask.sum(1), min=1e-9)
  mean_pooled = summed / summed_mask
  return mean_pooled

def get_transcript(file):
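    """Read a transcript JSON file and return its lower-cased transcript text."""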
    data = pd.read_json(file)  # public equivalent of the private pd.io.json.read_json
    transcript = data['results'].values[1][0]['transcript']
    transcript = transcript.lower()
    return transcript
# Preprocess tags: embed each tag once so extracted keyphrases can later be scored against them
if tags:
  tags = [x.lower().strip() for x in tags.split(",")]
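  # NOTE: when this runs, `concat_tokens` has to be the version brought in by
  # `from nlp_entities import *`; the local definition further down has not executed yet.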
  tags_tokens = concat_tokens(tags)
  tags_tokens.pop("KPS")
  with torch.no_grad():
    outputs_tags = model(**tags_tokens)
  pools_tags = pool_embeddings(outputs_tags, tags_tokens).detach().numpy()
  token_dict = {}
  for tag,embedding in zip(tags,pools_tags):
    token_dict[tag] = embedding

"""Code related with processing text, extracting KPs, and doing distance to tag"""
def concat_tokens(sentences):
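  """Tokenize a dict like {"machine learning": {"weight": 0.12}} (hypothetical example) into stacked
  input_ids/attention_mask tensors, keeping each phrase's weight under the extra 'KPS' key."""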
  tokens = {'input_ids': [], 'attention_mask': [], 'KPS': {}}
  for sentence, values in sentences.items():
      weight = values['weight']
      # encode each sentence and append to dictionary
      new_tokens = tokenizer.encode_plus(sentence, max_length=64,
                                         truncation=True, padding='max_length',
                                         return_tensors='pt')
      tokens['input_ids'].append(new_tokens['input_ids'][0])
      tokens['attention_mask'].append(new_tokens['attention_mask'][0])
      tokens['KPS'][sentence] = weight
  # reformat list of tensors into single tensor
  tokens['input_ids'] = torch.stack(tokens['input_ids'])
  tokens['attention_mask'] = torch.stack(tokens['attention_mask'])
  return tokens
  
def calculate_weighted_embed_dist(out, tokens, weight, text, kp_dict, idx, exclude_text=False, exclude_words=False):
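  """Cosine similarity between the keyphrase embedding at batch position `idx` and every tag embedding
  in `kp_dict`, scaled by the keyphrase weight; keys containing the keyphrase text (or any of its words)
  can optionally be skipped."""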
  sim_dict = {}
  pools = pool_embeddings_count(out, tokens, idx).detach().numpy()
  for key in kp_dict.keys():
    if exclude_text and text in key:
      continue
    if exclude_words and any(x in key for x in text.split(" ")):
      continue

    sim_dict[key] = cosine_similarity(
        pools,
        [kp_dict[key]] 
    )[0][0] * weight
  return sim_dict
def pool_embeddings_count(out, tok, idx):
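  """Mean-pool the last-layer hidden states for the single sequence at batch position `idx`."""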
  embeddings = out["hidden_states"][-1][idx:idx+1,:,:]
  attention_mask = tok['attention_mask'][idx]
  mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
  masked_embeddings = embeddings * mask
  summed = torch.sum(masked_embeddings, 1)
  summed_mask = torch.clamp(mask.sum(1), min=1e-9)
  mean_pooled = summed / summed_mask
  return mean_pooled

def extract_tokens(text, top_kp=30):
  """Extract keyphrases from `text` via nlp_entities, keep the `top_kp` highest-weighted ones, and tokenize them."""
  kps = return_ners_and_kp([text], ret_ne=True)['KP']
  # only keep the top_kp highest-weighted keyphrases
  kps = sorted(kps.items(), key=lambda x: x[1]['weight'], reverse=True)[:top_kp]
  kps = {x: y for x, y in kps}
  return concat_tokens(kps)

"""Process text and classify it"""
if text and tags:
  text = text.lower()
  t1_tokens = extract_tokens(text)
  t1_kps = t1_tokens.pop("KPS")
  with torch.no_grad():
    outputs = model(**t1_tokens)
  # Sum the weighted tag similarities over all extracted keyphrases
  tag_distance = None
  for i, kp in enumerate(t1_kps):
    curr = calculate_weighted_embed_dist(outputs, t1_tokens, t1_kps[kp], kp, token_dict, i,
                                         exclude_text=False, exclude_words=False)
    if tag_distance is None:
      tag_distance = curr
    else:
      tag_distance = {x: tag_distance[x] + curr[x] for x in tag_distance.keys()}
  # Rank tags by score (highest first) and display the result
  tag_distance = dict(sorted(tag_distance.items(), key=lambda x: x[1], reverse=True))
  st.json(tag_distance)