rachith committed on
Commit
2a873fa
·
1 Parent(s): bdda483

adding 2022 model

Browse files
Files changed (1) hide show
  1. app.py +26 -3
app.py CHANGED
@@ -7,7 +7,9 @@ description = "Based on TimeLMs which is a RoBERTa model finetuned on tweets at
7
  article = "This outputs the top 500 similar tokens to the input word, as a list. Stay tuned for more info"
8
 
9
  available_models = ['2019',
10
- '2020']
 
 
11
 
12
  model_2019 = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-2019-90m')
13
  tokenizers_2019 = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-2019-90m')
@@ -32,6 +34,17 @@ knn_model_2020 = NearestNeighbors(n_neighbors=500,
32
  nbrs_2020 = knn_model_2020.fit(embedding_matrix_2020)
33
  distances_2020, indices_2020 = nbrs_2020.kneighbors(embedding_matrix_2020)
34
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  title = "How does a word's meaning change with time?"
37
 
@@ -40,16 +53,26 @@ def topk(word,model):
40
 
41
  if model == '2019':
42
  index = tokenizers_2019.encode(f'{word}')
 
43
  for i in indices_2019[index[1]]:
44
  outs.append(tokenizers_2019.decode(i))
45
- print(tokenizers_2019.decode(i))
46
  return outs
47
 
48
  if model == '2020':
49
  index = tokenizers_2020.encode(f'{word}')
 
50
  for i in indices_2020[index[1]]:
51
  outs.append(tokenizers_2020.decode(i))
52
- print(tokenizers_2020.decode(i))
 
 
 
 
 
 
 
 
53
  return outs
54
 
55
  # with gr.Blocks() as demo:
 
7
  article = "This outputs the top 500 similar tokens to the input word, as a list. Stay tuned for more info"
8
 
9
  available_models = ['2019',
10
+ '2020',
11
+ '2022'
12
+ ]
13
 
14
  model_2019 = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-2019-90m')
15
  tokenizers_2019 = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-2019-90m')
 
34
  nbrs_2020 = knn_model_2020.fit(embedding_matrix_2020)
35
  distances_2020, indices_2020 = nbrs_2020.kneighbors(embedding_matrix_2020)
36
 
37
+ model_2022 = AutoModel.from_pretrained('cardiffnlp/twitter-roberta-base-2022-154m')
38
+ tokenizers_2022 = AutoTokenizer.from_pretrained('cardiffnlp/twitter-roberta-base-2022-154m')
39
+ embedding_matrix_2022 = model_2022.embeddings.word_embeddings.weight
40
+ embedding_matrix_2022 = embedding_matrix_2022.detach().numpy()
41
+ knn_model_2022 = NearestNeighbors(n_neighbors=500,
42
+ metric='cosine',
43
+ algorithm='auto',
44
+ n_jobs=3)
45
+ nbrs_2022 = knn_model_2022.fit(embedding_matrix_2022)
46
+ distances_2022, indices_2022 = nbrs_2022.kneighbors(embedding_matrix_2022)
47
+
48
 
49
  title = "How does a word's meaning change with time?"
50
 
 
53
 
54
  if model == '2019':
55
  index = tokenizers_2019.encode(f'{word}')
56
+ print(index)
57
  for i in indices_2019[index[1]]:
58
  outs.append(tokenizers_2019.decode(i))
59
+ # print(tokenizers_2019.decode(i))
60
  return outs
61
 
62
  if model == '2020':
63
  index = tokenizers_2020.encode(f'{word}')
64
+ print(index)
65
  for i in indices_2020[index[1]]:
66
  outs.append(tokenizers_2020.decode(i))
67
+ # print(tokenizers_2020.decode(i))
68
+ return outs
69
+
70
+ if model == '2022':
71
+ index = tokenizers_2022.encode(f'{word}')
72
+ print(index)
73
+ for i in indices_2022[index[1]]:
74
+ outs.append(tokenizers_2022.decode(i))
75
+ # print(tokenizers_2022.decode(i))
76
  return outs
77
 
78
  # with gr.Blocks() as demo: