awinml committed on
Commit
694ff38
·
1 Parent(s): 57e70ed

Upload 6 files

Browse files
Files changed (2) hide show
  1. app.py +9 -2
  2. utils/models.py +8 -1
app.py CHANGED
@@ -14,6 +14,7 @@ from utils.models import (
14
  tokenizer,
15
  get_data,
16
  get_instructor_embedding_model,
 
17
  preprocess_text,
18
  )
19
  from utils.retriever import (
@@ -38,6 +39,8 @@ data = get_data()
38
  col1, col2 = st.columns([3, 3], gap="medium")
39
 
40
  instructor_model = get_instructor_embedding_model()
 
 
41
 
42
  question_choice = [
43
  "What was discussed regarding Ryzen revenue performance?",
@@ -90,7 +93,7 @@ sparse_scores = np.argsort(bm25.get_scores(tokenized_query), axis=0)[::-1]
90
  indices = get_bm25_search_hits(corpus, sparse_scores, 50)
91
 
92
 
93
- json_output_embedding = instructor_model.predict(
94
  query_embedding_instruction,
95
  query_text,
96
  api_name="/predict",
@@ -99,7 +102,11 @@ json_output_embedding = instructor_model.predict(
99
  json_file = open(json_output_embedding, "r")
100
  json_dict = json.load(json_file)
101
  dense_array = np.array(json_dict["data"], dtype=np.float64)
102
- dense_embedding = dense_array.tolist()
 
 
 
 
103
 
104
  text_embedding_instructions_choice = [
105
  "Represent the financial statement for retrieval:",
 
14
  tokenizer,
15
  get_data,
16
  get_instructor_embedding_model,
17
+ get_instructor_embedding_model_api,
18
  preprocess_text,
19
  )
20
  from utils.retriever import (
 
39
  col1, col2 = st.columns([3, 3], gap="medium")
40
 
41
  instructor_model = get_instructor_embedding_model()
42
+ instructor_model_api = get_instructor_embedding_model_api()
43
+
44
 
45
  question_choice = [
46
  "What was discussed regarding Ryzen revenue performance?",
 
93
  indices = get_bm25_search_hits(corpus, sparse_scores, 50)
94
 
95
 
96
+ json_output_embedding = instructor_model_api.predict(
97
  query_embedding_instruction,
98
  query_text,
99
  api_name="/predict",
 
102
  json_file = open(json_output_embedding, "r")
103
  json_dict = json.load(json_file)
104
  dense_array = np.array(json_dict["data"], dtype=np.float64)
105
+ dense_embedding_api = dense_array.tolist()
106
+
107
+
108
+ dense_embedding = instructor_model.encode([[query_embedding_instruction, query_text]]).tolist()
109
+
110
 
111
  text_embedding_instructions_choice = [
112
  "Represent the financial statement for retrieval:",
utils/models.py CHANGED
@@ -9,6 +9,7 @@ from nltk.tokenize import word_tokenize
9
  from nltk.corpus import stopwords
10
  from nltk.stem.porter import PorterStemmer
11
  import re
 
12
 
13
 
14
  def tokenizer(
@@ -44,7 +45,7 @@ def get_data():
44
 
45
 
46
  @st.cache_resource
47
- def get_instructor_embedding_model():
48
  client = Client("https://awinml-api-instructor-xl-1.hf.space/")
49
  return client
50
 
@@ -56,3 +57,9 @@ def get_bm25_model(data):
56
  tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
57
  bm25 = BM25Plus(tokenized_corpus)
58
  return corpus, bm25
 
 
 
 
 
 
 
9
  from nltk.corpus import stopwords
10
  from nltk.stem.porter import PorterStemmer
11
  import re
12
+ from InstructorEmbedding import INSTRUCTOR
13
 
14
 
15
  def tokenizer(
 
45
 
46
 
47
  @st.cache_resource
48
+ def get_instructor_embedding_model_api():
49
  client = Client("https://awinml-api-instructor-xl-1.hf.space/")
50
  return client
51
 
 
57
  tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
58
  bm25 = BM25Plus(tokenized_corpus)
59
  return corpus, bm25
60
+
61
+
62
+ @st.cache_resource
63
+ def get_instructor_embedding_model():
64
+ model = INSTRUCTOR("hkunlp/instructor-xl")
65
+ return model