Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -2,84 +2,66 @@
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import torch
|
|
|
|
|
5 |
from sentence_transformers.util import cos_sim
|
6 |
from sentence_transformers import SentenceTransformer
|
|
|
7 |
import gradio as gr
|
|
|
|
|
|
|
8 |
#%%
|
9 |
-
# etalon = pd.read_csv("etalon_prod.csv")
|
10 |
-
df = pd.read_csv("preprocessed_complaints.csv")
|
11 |
|
12 |
model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')
|
|
|
13 |
|
14 |
-
|
15 |
-
|
16 |
|
17 |
with open("embeddings.npy", 'rb') as f:
|
18 |
-
|
19 |
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
treshold = 0.8):
|
25 |
-
|
26 |
-
cols_for_top_k = ["Специальность врача",
|
27 |
-
"Рекомендуемые специалисты"]
|
28 |
-
|
29 |
-
usr_embeddings = model.encode(user_input)
|
30 |
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
sorted_df = df.loc[sorted_idx].copy()
|
38 |
-
sorted_df['cos_sim'] = cos_similarity
|
39 |
-
sorted_df = sorted_df[sorted_df['cos_sim'] > treshold]
|
40 |
-
|
41 |
-
result = {}
|
42 |
-
for col in cols_for_top_k:
|
43 |
-
result[col] = sorted_df[col].value_counts()[:top_k_spec].index.tolist()
|
44 |
-
result['Жалобы'] = sorted_df['Жалобы'].value_counts()[:top_k_services].index.tolist()
|
45 |
-
|
46 |
-
top_k_mkb = sorted_df['Диагноз МКБ'].value_counts()[:top_k_services].index.tolist()
|
47 |
-
result['Диагноз МКБ'] = top_k_mkb
|
48 |
-
|
49 |
-
categories = ['Инструментальная диагностика', 'Лабораторная диагностика']
|
50 |
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
for category in categories:
|
56 |
-
top_k_services_in_cat_mkb = slice_df[slice_df['service_name_category'] == category]['Рекомендации по обследованию'].value_counts()[:top_k_services].index.tolist()
|
57 |
-
temp_lst.append({category:top_k_services_in_cat_mkb})
|
58 |
-
|
59 |
-
top_k_services_lst_by_mkb.append({mkb:temp_lst})
|
60 |
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
top_k_services_lst.append({category:list_top_k_services})
|
67 |
|
|
|
68 |
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
74 |
#%%
|
75 |
gradio_app = gr.Interface(
|
76 |
get_recommend,
|
77 |
inputs=['text',
|
78 |
-
gr.Slider(minimum=1, maximum=10, step=1, label="Топ N специалистов", value=3),
|
79 |
-
gr.Slider(minimum=1, maximum=10, step=1, label="Топ N услуг", value=5),
|
80 |
gr.Slider(minimum=0, maximum=1, step=0.05, label="Порог релевантности", value=0.8)],
|
81 |
-
outputs=[gr.JSON(label='
|
82 |
-
# title="Предсказание топ-10 наиболее схожих услуг",
|
83 |
description="Введите услугу:"
|
84 |
)
|
85 |
|
|
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
import torch
|
5 |
+
import json
|
6 |
+
import re
|
7 |
from sentence_transformers.util import cos_sim
|
8 |
from sentence_transformers import SentenceTransformer
|
9 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
10 |
import gradio as gr
|
11 |
+
import nltk
|
12 |
+
nltk.download('stopwords')
|
13 |
+
from nltk.corpus import stopwords
|
14 |
#%%
|
|
|
|
|
15 |
|
16 |
# Sentence-embedding model used to encode n-grams extracted from user text.
model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')

# NLTK Russian stop words plus one project-specific extra token.
russian_stopwords = stopwords.words('russian') + ['ВАШ']

# Symptom labels. JSON is UTF-8 by specification, so decode it explicitly
# instead of relying on the platform's locale default.
with open("data/symptom_checker/top_150_symps_by_spec.json", 'r', encoding="utf-8") as f:
    symps = json.load(f)

# Precomputed embeddings that user n-grams are compared against.
# NOTE(review): presumably row i of `embs` corresponds to `symps[i]` - confirm
# against the script that produced both files.
with open("embeddings.npy", 'rb') as f:
    embs = np.load(f)
|
24 |
|
25 |
+
# Matches runs of digits, or any single character that is neither a word
# character nor whitespace (i.e. punctuation and symbols; "_" is kept).
_DIGITS_OR_PUNCT_RE = re.compile(r'\d+|[^\w\s]')


def remove_numbers(text: str) -> str:
    """Strip digits and punctuation from *text* and trim surrounding whitespace.

    Used as the ``preprocessor`` callable of the module-level
    ``CountVectorizer``. Inner whitespace is left untouched, so removing a
    token between two spaces leaves both spaces in place.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    # Digits and punctuation are disjoint character classes, so one pass
    # removes exactly what the two sequential substitutions would.
    return _DIGITS_OR_PUNCT_RE.sub('', text).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
29 |
|
30 |
+
|
31 |
+
|
32 |
+
# N-gram extractor (unigrams through trigrams) applied to the user's text.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` overrides
# the built-in strip_accents/lowercase stage, so input is presumably NOT
# lower-cased before stop-word filtering - confirm this is intended.
vectorizer = CountVectorizer(
    ngram_range=(1, 3),
    stop_words=russian_stopwords,
    preprocessor=remove_numbers,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
+
def get_symptomps_v2(text, treshold = 0.7):
    """Map free text to the known symptoms it most closely resembles.

    The text is split into 1- to 3-word n-grams by the module-level
    ``vectorizer``; each n-gram is embedded with ``model`` and compared by
    cosine similarity against the precomputed ``embs`` matrix. For every
    n-gram whose best similarity exceeds the threshold, the entry of ``symps``
    at the best-matching index is collected.

    Args:
        text: A single string or an iterable of strings.
        treshold: Minimum cosine similarity for a match. (The spelling is kept
            because it is part of the public signature.)

    Returns:
        A sorted list of unique matched symptoms, or
        ``['Симптомы не определены']`` when nothing matches or processing
        fails.
    """
    not_found = ['Симптомы не определены']
    try:
        if isinstance(text, str):
            text = [text]

        # The vectorizer is refit on each request purely to enumerate the
        # n-grams of this text; the count matrix itself is not needed.
        try:
            vectorizer.fit(text)
        except ValueError:
            # scikit-learn raises ValueError for an empty vocabulary, e.g.
            # when the text holds only stop words, digits or punctuation.
            return not_found

        ngrams = vectorizer.get_feature_names_out()
        text_emb = model.encode(ngrams, batch_size=64)

        # `cos_sim` is imported directly from sentence_transformers.util;
        # the module name `util` itself is never imported in this file.
        similarities = cos_sim(text_emb, embs).numpy()

        best_idx = np.argmax(similarities, axis=1)
        best_val = similarities[np.arange(len(best_idx)), best_idx]

        # Equivalent to masking values <= treshold with -1 and then keeping
        # rows whose maximum is > 0: the best score must beat both bounds.
        matched_idx = best_idx[best_val > max(treshold, 0)]

        outputs = [symps[idx] for idx in matched_idx]
        if not outputs:
            return not_found
        return np.unique(outputs).tolist()
    except Exception:
        # Best-effort for the UI: still return the fallback, but log the
        # failure so real bugs are no longer silently hidden.
        import logging
        logging.getLogger(__name__).exception("get_symptomps_v2 failed")
        return not_found
|
59 |
#%%
|
60 |
# Gradio UI: a text box plus a relevance-threshold slider, wired to
# get_symptomps_v2(text, treshold). The previously referenced get_recommend
# is not defined in the visible new version of the file, so passing it
# raised NameError at import time.
gradio_app = gr.Interface(
    get_symptomps_v2,
    inputs=['text',
            gr.Slider(minimum=0, maximum=1, step=0.05, label="Порог релевантности", value=0.8)],
    outputs=[gr.JSON(label='Симптомы: ')],
    description="Введите услугу:"
)
|
67 |
|