warleagle committed
Commit e32f802 · verified · 1 Parent(s): ee1dec5

Update app.py

Files changed (1): app.py (+40, -58)
app.py CHANGED
@@ -2,84 +2,66 @@
  import pandas as pd
  import numpy as np
  import torch
  from sentence_transformers.util import cos_sim
  from sentence_transformers import SentenceTransformer
  import gradio as gr
  #%%
- # etalon = pd.read_csv("etalon_prod.csv")
- df = pd.read_csv("preprocessed_complaints.csv")

  model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')

-
- unique_complaints = df['Жалобы'].unique()

  with open("embeddings.npy", 'rb') as f:
-     embeddings = np.load(f)

- #%%
- def get_recommend(user_input,
-                   top_k_spec = 3,
-                   top_k_services = 5,
-                   treshold = 0.8):
-
-     cols_for_top_k = ["Специальность врача",
-                       "Рекомендуемые специалисты"]
-
-     usr_embeddings = model.encode(user_input)

-     cos_similarity = cos_sim(usr_embeddings, embeddings).detach().numpy()
-     sorted_idx = cos_similarity[0].argsort()[::-1]
-     cos_similarity.sort()
-
-     cos_similarity = cos_similarity[0][::-1]
-
-     sorted_df = df.loc[sorted_idx].copy()
-     sorted_df['cos_sim'] = cos_similarity
-     sorted_df = sorted_df[sorted_df['cos_sim'] > treshold]
-
-     result = {}
-     for col in cols_for_top_k:
-         result[col] = sorted_df[col].value_counts()[:top_k_spec].index.tolist()
-     result['Жалобы'] = sorted_df['Жалобы'].value_counts()[:top_k_services].index.tolist()
-
-     top_k_mkb = sorted_df['Диагноз МКБ'].value_counts()[:top_k_services].index.tolist()
-     result['Диагноз МКБ'] = top_k_mkb
-
-     categories = ['Инструментальная диагностика', 'Лабораторная диагностика']

-     top_k_services_lst_by_mkb = []
-     for mkb in top_k_mkb:
-         temp_lst = []
-         slice_df = sorted_df[sorted_df['Диагноз МКБ'] == mkb]
-         for category in categories:
-             top_k_services_in_cat_mkb = slice_df[slice_df['service_name_category'] == category]['Рекомендации по обследованию'].value_counts()[:top_k_services].index.tolist()
-             temp_lst.append({category:top_k_services_in_cat_mkb})
-
-         top_k_services_lst_by_mkb.append({mkb:temp_lst})

-     top_k_services_lst = []
-
-     for category in categories:
-         slice_df = sorted_df[sorted_df['service_name_category'] == category]
-         list_top_k_services = slice_df['Рекомендации по обследованию'].value_counts()[:top_k_services].index.tolist()
-         top_k_services_lst.append({category:list_top_k_services})


-
-     result['Рекомендации по обследованию'] = top_k_services_lst
-     result['Рекомендации по обследованию по МКБ'] = top_k_services_lst_by_mkb
-
-     return result
  #%%
  gradio_app = gr.Interface(
      get_recommend,
      inputs=['text',
-             gr.Slider(minimum=1, maximum=10, step=1, label="Топ N специалистов", value=3),
-             gr.Slider(minimum=1, maximum=10, step=1, label="Топ N услуг", value=5),
              gr.Slider(minimum=0, maximum=1, step=0.05, label="Порог релевантности", value=0.8)],
-     outputs=[gr.JSON(label='Рекомендации: ')],
-     # title="Предсказание топ-10 наиболее схожих услуг",
      description="Введите услугу:"
  )
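
The removed get_recommend path ranks every row of preprocessed_complaints.csv against the user query by cosine similarity, so embeddings.npy has to hold one vector per dataframe row (that is what makes df.loc[sorted_idx] valid). The file itself is not built in app.py; a minimal sketch of how it could be regenerated with the same model, with file and column names taken from the removed code and everything else an assumption:

# Hypothetical offline step, not part of this commit: rebuild embeddings.npy
# so that row i of the array matches row i of preprocessed_complaints.csv.
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')
df = pd.read_csv("preprocessed_complaints.csv")

# Encode the complaint text column ('Жалобы'); the batch size is an arbitrary choice.
complaint_embeddings = model.encode(df['Жалобы'].tolist(), batch_size=64, show_progress_bar=True)

with open("embeddings.npy", 'wb') as f:
    np.save(f, complaint_embeddings)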
 
 
  import pandas as pd
  import numpy as np
  import torch
+ import json
+ import re
  from sentence_transformers.util import cos_sim
  from sentence_transformers import SentenceTransformer
+ from sklearn.feature_extraction.text import CountVectorizer
  import gradio as gr
+ import nltk
+ nltk.download('stopwords')
+ from nltk.corpus import stopwords
  #%%

  model = SentenceTransformer('sentence-transformers/multi-qa-distilbert-cos-v1')
+ russian_stopwords = stopwords.words('russian') + ['ВАШ']

+ with open("data/symptom_checker/top_150_symps_by_spec.json", 'r') as f:
+     symps = json.load(f)

  with open("embeddings.npy", 'rb') as f:
+     embs = np.load(f)

+ def remove_numbers(text):
+     text = re.sub(r'\d+', '', text)
+     text = re.sub(r'[^\w\s]', '', text)
+     return text.strip()

+
+
+ vectorizer = CountVectorizer(ngram_range=(1, 3),
+                              stop_words=russian_stopwords,
+                              preprocessor=remove_numbers,
+                              )

+ def get_symptomps_v2(text, treshold = 0.7):
+     try:
+         if isinstance(text, str):
+             text = [text]

+         X = vectorizer.fit_transform(text)
+         # print(vectorizer.get_feature_names_out())
+         text_emb = model.encode(vectorizer.get_feature_names_out(), batch_size=64)
+         cos_sim_m = util.cos_sim(text_emb, embs).numpy()
+         cos_sim_m = np.where(cos_sim_m > treshold, cos_sim_m, -1)

+         arg_max_idx = np.argmax(cos_sim_m, axis=1)

+         outputs = []
+         for idx, cos_sim_row in zip(arg_max_idx, cos_sim_m):
+             if cos_sim_row[idx] > 0:
+                 outputs.append(symps[idx])
+         if len(outputs) == 0:
+             return ['Симптомы не определены']
+         return np.unique(outputs).tolist()
+     except:
+         return ['Симптомы не определены']
  #%%
  gradio_app = gr.Interface(
      get_recommend,
      inputs=['text',
              gr.Slider(minimum=0, maximum=1, step=0.05, label="Порог релевантности", value=0.8)],
+     outputs=[gr.JSON(label='Симптомы: ')],
      description="Введите услугу:"
  )
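
In the new version, get_symptomps_v2 maps each best-matching row of embs back to the entry with the same index in symps, so embeddings.npy now has to stay aligned with top_150_symps_by_spec.json. The hunk shown stops at new line 67: the interface still passes the removed get_recommend, get_symptomps_v2 calls util.cos_sim while only cos_sim is imported in the lines shown, and no launch() call is visible. A minimal sketch of the glue that would serve the new symptom checker, assuming get_symptomps_v2 is the intended callback; everything here that is not in the diff is an assumption:

# Hypothetical wiring for app.py after this commit; not part of the diff above.
from sentence_transformers import util   # provides util.cos_sim used inside get_symptomps_v2
import gradio as gr

gradio_app = gr.Interface(
    get_symptomps_v2,                     # assumed callback; the committed hunk still passes get_recommend
    inputs=['text',
            gr.Slider(minimum=0, maximum=1, step=0.05, label="Порог релевантности", value=0.8)],
    outputs=[gr.JSON(label='Симптомы: ')],
    description="Введите услугу:"
)

if __name__ == "__main__":
    gradio_app.launch()                   # Gradio serves on http://127.0.0.1:7860 by default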