Upload 6 files
- app.py +9 -2
- utils/models.py +8 -1
app.py
CHANGED
@@ -14,6 +14,7 @@ from utils.models import (
    tokenizer,
    get_data,
    get_instructor_embedding_model,
+   get_instructor_embedding_model_api,
    preprocess_text,
)
from utils.retriever import (
@@ -38,6 +39,8 @@ data = get_data()
col1, col2 = st.columns([3, 3], gap="medium")

instructor_model = get_instructor_embedding_model()
+instructor_model_api = get_instructor_embedding_model_api()
+

question_choice = [
    "What was discussed regarding Ryzen revenue performance?",
@@ -90,7 +93,7 @@ sparse_scores = np.argsort(bm25.get_scores(tokenized_query), axis=0)[::-1]
indices = get_bm25_search_hits(corpus, sparse_scores, 50)


-json_output_embedding = instructor_model.predict(
+json_output_embedding = instructor_model_api.predict(
    query_embedding_instruction,
    query_text,
    api_name="/predict",
@@ -99,7 +102,11 @@ json_output_embedding = instructor_model.predict(
json_file = open(json_output_embedding, "r")
json_dict = json.load(json_file)
dense_array = np.array(json_dict["data"], dtype=np.float64)
-dense_embedding = dense_array.tolist()
+dense_embedding_api = dense_array.tolist()
+
+
+dense_embedding = instructor_model.encode([[query_embedding_instruction, query_text]]).tolist()
+

text_embedding_instructions_choice = [
    "Represent the financial statement for retrieval:",
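Net effect of the app.py changes: the query embedding is now produced along two parallel paths, one through the hosted Space (kept as dense_embedding_api) and one with the locally loaded INSTRUCTOR model (dense_embedding). Below is a minimal standalone sketch of those two paths, following the calls shown in the diff; the instruction and query strings are illustrative placeholders, and the Space's /predict endpoint is assumed to return a path to a JSON file whose "data" key holds the vector, which is how app.py treats it.

import json

import numpy as np
from gradio_client import Client
from InstructorEmbedding import INSTRUCTOR

# Illustrative inputs; in app.py these come from the Streamlit selectboxes.
query_embedding_instruction = "Represent the financial statement for retrieval:"
query_text = "What was discussed regarding Ryzen revenue performance?"

# Remote path: the Space returns a filepath to a JSON payload holding the embedding.
client = Client("https://awinml-api-instructor-xl-1.hf.space/")
json_path = client.predict(query_embedding_instruction, query_text, api_name="/predict")
with open(json_path, "r") as f:
    dense_embedding_api = np.array(json.load(f)["data"], dtype=np.float64).tolist()

# Local path: INSTRUCTOR encodes [instruction, text] pairs directly.
model = INSTRUCTOR("hkunlp/instructor-xl")
dense_embedding = model.encode([[query_embedding_instruction, query_text]]).tolist()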
utils/models.py
CHANGED
@@ -9,6 +9,7 @@ from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
+from InstructorEmbedding import INSTRUCTOR


def tokenizer(
@@ -44,7 +45,7 @@ def get_data():


@st.cache_resource
-def get_instructor_embedding_model():
+def get_instructor_embedding_model_api():
    client = Client("https://awinml-api-instructor-xl-1.hf.space/")
    return client

@@ -56,3 +57,9 @@ def get_bm25_model(data):
    tokenized_corpus = [doc.split(" ") for doc in corpus_clean]
    bm25 = BM25Plus(tokenized_corpus)
    return corpus, bm25
+
+
+@st.cache_resource
+def get_instructor_embedding_model():
+    model = INSTRUCTOR("hkunlp/instructor-xl")
+    return model
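Both loaders in utils/models.py are wrapped in @st.cache_resource, so the Gradio client and the locally loaded instructor-xl model are each created once per Streamlit server process and reused across script reruns. A compact sketch of the resulting pattern, consolidating the new functions with their call sites in app.py:

import streamlit as st
from gradio_client import Client
from InstructorEmbedding import INSTRUCTOR


@st.cache_resource
def get_instructor_embedding_model_api():
    # One shared client for the hosted instructor-xl Space per server process.
    return Client("https://awinml-api-instructor-xl-1.hf.space/")


@st.cache_resource
def get_instructor_embedding_model():
    # Loads the local instructor-xl weights once; later reruns reuse the same object.
    return INSTRUCTOR("hkunlp/instructor-xl")


# app.py calls both loaders at the top of the script; because of the cache,
# reruns triggered by widget interaction do not reload the model or client.
instructor_model_api = get_instructor_embedding_model_api()
instructor_model = get_instructor_embedding_model()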
|