marksverdhei committed on
Commit
e8bfa89
·
1 Parent(s): 78022ff

Add device

Browse files
Files changed (2) hide show
  1. app.py +1 -3
  2. resources.py +38 -0
app.py CHANGED
@@ -9,9 +9,7 @@ device = "cpu" if use_cpu else "cuda"
9
 
10
  df = load_data()
11
 
12
- encoder, tokenizer = load_model_and_tokenizer()
13
-
14
-
15
 
16
  corrector = load_corrector()
17
 
 
9
 
10
  df = load_data()
11
 
12
+ encoder, tokenizer = load_model_and_tokenizer(device)
 
 
13
 
14
  corrector = load_corrector()
15
 
resources.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import vec2text
4
+ from transformers import AutoModel, AutoTokenizer
5
+ from sklearn.decomposition import PCA
6
+ from utils import file_cache
7
+
8
+
9
+ # Caching the vec2text corrector
10
+ @st.cache_resource
11
+ def load_corrector():
12
+ return vec2text.load_pretrained_corrector("gtr-base")
13
+
14
+ # Caching the dataframe since loading from an external source can be time-consuming
15
+ @st.cache_data
16
+ def load_data():
17
+ return pd.read_csv("https://huggingface.co/datasets/marksverdhei/reddit-syac-urls/resolve/main/train.csv")
18
+
19
+
20
+ @st.cache_resource
21
+ def vector_compressor_from_config():
22
+ # Return the 2-component dimensionality reducer; PCA is used here, with UMAP kept below as a commented-out alternative
23
+ # return UMAP(n_components=2)
24
+ return PCA(n_components=2)
25
+
26
+
27
+ @st.cache_data
28
+ @file_cache(".cache/reducer_embeddings.pickle")
29
+ def reduce_embeddings(embeddings):
30
+ reducer = vector_compressor_from_config()
31
+ return reducer.fit_transform(embeddings), reducer
32
+
33
+ # Caching the model and tokenizer to avoid reloading
34
+ @st.cache_resource
35
+ def load_model_and_tokenizer(device="cpu"):
36
+ encoder = AutoModel.from_pretrained("sentence-transformers/gtr-t5-base").encoder.to(device)
37
+ tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/gtr-t5-base")
38
+ return encoder, tokenizer