cogcorp committed on
Commit 30e35ef · 1 Parent(s): 64aa184

Update app.py

Files changed (1)
  1. app.py +154 -99
app.py CHANGED
@@ -3,23 +3,26 @@ import pandas as pd
  import os
  import spacy
  import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  from keras.preprocessing.text import text_to_word_sequence
- from joblib import Parallel, delayed
- import sys
- import subprocess
- import pickle

- subprocess.run(["python", "-m", "spacy", "download", "en_core_web_md"])

  nlp = spacy.load('en_core_web_md')

- # Define vendor_df and vectorizer as global variables
- vendor_df = None
- vectorizer = TfidfVectorizer()

- # Function to preprocess text
  def preprocess_text(text):
      if isinstance(text, str):
          text = text.lower()
@@ -32,97 +35,149 @@ def preprocess_text(text):
      else:
          return text

- # Function to perform semantic search
- def semantic_search(query, vendor_vectors):
      query = preprocess_text(query)
-     query_vector = vectorizer.transform([query])
-     cosine_similarities = cosine_similarity(query_vector, vendor_vectors).flatten()
-     return cosine_similarities
-
- # Function to parse number from text
- def parse_number(text):
-     if isinstance(text, str):
-         return int(''.join(filter(str.isdigit, text)))
-     else:
-         return 0

- # Function to process row
- def process_row(row, vendor_data, vendor_scores):
-     scores = {}
-     for vendor_name, vendor_vectors in vendor_data.items():
-         cosine_similarities = semantic_search(row[0], vendor_vectors)
-         most_similar_index = np.argmax(cosine_similarities)
-         vendor_score = vendor_scores[vendor_name]  # Get the vendor's score
-         scores[vendor_name] = row['score_client'] * vendor_score  # Multiply vendor score with client priority
-     row_scores = pd.Series(scores)
-     combined_row = pd.concat([row, row_scores])
-     return combined_row
-
- # Function to process file
- def process_file(vendor_name_input, mode, file):
-     global vendor_df
-     global vectorizer
-
-     # Sanitize the vendor name input
-     vendor_name = vendor_name_input.strip().lower().replace(" ", "_")
-
-     if mode == 'Upload Vendor File':
-         vendor_df = pd.read_excel(file.name)
-         vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].apply(preprocess_text)
-         vendor_df['score_vendor'] = vendor_df.iloc[:, 4].apply(parse_number)
-         vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].fillna('')
-         vectorizer.fit(vendor_df.iloc[:, 2])
-         vendor_vectors = vectorizer.transform(vendor_df.iloc[:, 2])
-
-         # Save vendor vectors as pickle file
-         vendor_vectors_path = os.path.join('data', f'{vendor_name}_vectors.pkl')
-         os.makedirs(os.path.dirname(vendor_vectors_path), exist_ok=True)
-         with open(vendor_vectors_path, 'wb') as f:
-             pickle.dump(vendor_vectors, f)
-
-         # Save vendor data (not vectors) as CSV file
-         vendor_df_path = os.path.join('data', f'{vendor_name}_data.csv')
-         vendor_df.to_csv(vendor_df_path, index=False)
-
-         # Save vendor scores as pickle file
-         vendor_scores_path = os.path.join('data', f'{vendor_name}_scores.pkl')
-         with open(vendor_scores_path, 'wb') as f:
-             pickle.dump(vendor_df['score_vendor'].to_dict(), f)
-
-         return f"Vendor data file for {vendor_name} has been uploaded and saved.", None
-
-     elif mode == 'Compare with Client File':
-         csv_files = [f for f in os.listdir('data') if f.endswith('_data.csv')]
-         vector_files = [f for f in os.listdir('data') if f.endswith('_vectors.pkl')]
-         score_files = [f for f in os.listdir('data') if f.endswith('_scores.pkl')]
-
-         if not csv_files or not vector_files or not score_files:
-             return "No vendor data found. Please upload it first.", None
-
-         vendor_data = {}
-         vendor_scores = {}
-         for csv_file, vector_file, score_file in zip(csv_files, vector_files, score_files):
-             with open(os.path.join('data', vector_file), 'rb') as f:
-                 vendor_vectors = pickle.load(f)
-             vendor_data[vendor_name] = vendor_vectors
-             with open(os.path.join('data', score_file), 'rb') as f:
-                 vendor_scores[vendor_name] = pickle.load(f)
-
-         client_df = pd.read_excel(file.name)
-         client_df.iloc[:, 2] = client_df.iloc[:, 2].fillna('3 - Medium')
-         client_df = client_df[client_df.iloc[:, 1] == 'Yes']  # Only consider rows where the second column is 'Yes'
-         client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
-         client_df['score_client'] = client_df.iloc[:, 2].apply(parse_number)
-         common_list = Parallel(n_jobs=-1)(delayed(process_row)(row, vendor_data, vendor_scores) for index, row in client_df.iterrows())
-         common_df = pd.DataFrame(common_list)
-         common_df = common_df.drop(common_df.columns[[1, 2, 3, 4]], axis=1)  # Drop the second, third, fourth and fifth columns
-         common_df.to_excel(f'client_matches.xlsx', index=False)
-         return f"Matching data for all vendors has been saved to 'client_matches.xlsx'. You can download it from the link below.", os.path.abspath('client_matches.xlsx')
-
-
-
-
-
- iface = gr.Interface(fn=process_file, inputs=["text", gr.components.Dropdown(choices=['Upload Vendor File', 'Compare with Client File']), "file"], outputs=["text", "file"])
  iface.launch()
 
  import os
  import spacy
  import numpy as np
+ import zipfile
+ import tempfile
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  from keras.preprocessing.text import text_to_word_sequence
+ import openai
+ import re

+ # vendor name (from column 0)
+
+ openai.api_key = "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"
  nlp = spacy.load('en_core_web_md')

+ vendor_df_dict = {}
+
+ # A dictionary to store the total Trellis Score of each vendor
+ total_trellis_scores = {}
+

  def preprocess_text(text):
      if isinstance(text, str):
          text = text.lower()

      else:
          return text

+ def semantic_search(query, data):
+     query = str(query)
+     data = [str(text) for text in data]
      query = preprocess_text(query)
+     data = [preprocess_text(text) for text in data]
+     vectorizer = TfidfVectorizer().fit_transform([query] + data)
+     cosine_similarities = cosine_similarity(vectorizer[0:1], vectorizer).flatten()
+     return np.argmax(cosine_similarities[1:])
+
+ def parse_score(score):
+     level_scores = {
+         'Level 1 - Basic': 1,
+         'Level 2 - Developing': 2,
+         'Level 3 - Intermediate': 3,
+         'Level 4 - Advanced': 4,
+         'Level 5 - Leading': 5,
+         '1 - Low': 1,
+         '2 - Below average': 2,
+         '3 - Average': 3,
+         '4 - Above average': 4,
+         '5 - High': 5,
+         '1 - Very Low': 1,
+         '2 - Low': 2,
+         '3 - Medium': 3,
+         '4 - High-Medium': 4,
+         '5 - Very High': 5
+     }

+     if score is None or str(score).strip() == '':
+         return 3
+
+     if isinstance(score, str):
+         score = score.replace(',', '.')
+         if score in level_scores:
+             return level_scores[score]
+         else:
+             number = re.findall(r"[-+]?\d*\.\d+|\d+", score)
+             if number:
+                 return float(number[0])
+     return 0
+
+ def load_vendor_files(zip_file_path):
+     global vendor_df_dict
+     with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
+         temp_dir = tempfile.TemporaryDirectory()
+         zip_ref.extractall(temp_dir.name)
+         for file_name in os.listdir(temp_dir.name):
+             if file_name.endswith(".xlsx"):
+                 vendor_df_dict[file_name] = pd.read_excel(os.path.join(temp_dir.name, file_name))
+                 vendor_df_dict[file_name].iloc[:, 2] = vendor_df_dict[file_name].iloc[:, 2].apply(preprocess_text)
+                 vendor_df_dict[file_name]['score_vendor'] = vendor_df_dict[file_name].iloc[:, 4].apply(parse_score).apply(float)
+
+                 vendor_df_dict[file_name]['score_vendor'] = vendor_df_dict[file_name]['score_vendor'].fillna(0)
+                 # vendor_df_dict[file_name].columns = ['ID', 'Topic', 'Vendor Question', 'Vendor Response', 'Vendor Score']
+         temp_dir.cleanup()
+
+ def process_file(client_file):
+     zip_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vendor_files.zip')
+     load_vendor_files(zip_file_path)  # Load vendor files from the zip file
+     client_df = pd.read_excel(client_file.name)
+     client_df['score_client'] = client_df.iloc[:, 2].apply(parse_score).astype(float)  # return number only from client score
+
+     client_df = client_df[client_df.iloc[:, 1] == 'Yes']
+     client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
+     client_df['score_client'] = client_df['score_client'].astype(float)
+
+     matches_found = False  # Flag to check if any matches were found
+     highest_score_vendor = None
+     highest_score = 0
+     total_scores = {}  # dictionary to store the total Trellis Score for each vendor
+
+     with pd.ExcelWriter('matches.xlsx') as writer:
+         common_list = []
+         for vendor_file, vendor_df in vendor_df_dict.items():
+             for index, row in client_df.iterrows():
+
+                 most_similar_index = semantic_search(row[0], vendor_df.iloc[:, 2])
+
+                 most_similar_row = vendor_df.iloc[most_similar_index, :]
+                 client_score = row['score_client']  # directly access the pre-parsed client score
+                 vendor_score = parse_score(most_similar_row[4])
+
+                 client_row_selected = row[[0, 2, 3]]
+                 vendor_row_selected = most_similar_row[[0, 2, 4, 5]]
+                 combined_row = pd.concat([client_row_selected, vendor_row_selected])
+
+                 trellis_score = client_score * vendor_score
+                 combined_row['Trellis Score'] = trellis_score
+                 common_list.append(combined_row)
+
+                 if trellis_score > highest_score:
+                     highest_score = trellis_score
+                     highest_score_vendor = vendor_file
+
+             if common_list:
+                 common_df = pd.DataFrame(common_list)
+                 # Compute the total Trellis Score
+                 total_trellis_score = common_df['Trellis Score'].sum()
+                 total_trellis_scores[vendor_file] = total_trellis_score  # store the total score for each vendor
+                 # Add a row with the total Trellis Score to the DataFrame
+                 common_df.loc[len(common_df.index)] = [np.nan]*len(common_df.columns)
+                 common_df.at[len(common_df.index)-1, 'Trellis Score'] = total_trellis_score
+
+                 common_df.to_excel(writer, sheet_name=os.path.splitext(vendor_file)[0][:31], index=False)
+                 common_list = []  # Reset the common_list for next vendor_file
+                 matches_found = True  # Set the flag to True as matches were found
+
+     highest_score_vendor = max(total_trellis_scores, key=total_trellis_scores.get)
+     highest_score_vendor = highest_score_vendor.split(".")[0]
+
+     if not matches_found:  # In case there were no matches
+         return "No matching data found.", None, None
+     else:
+         def gpt3_query(prompt, engine='gpt-3.5-turbo', max_tokens=100, temperature=0.3):
+             try:
+                 response = openai.ChatCompletion.create(
+                     model=engine,
+                     messages=[
+                         {"role": "system", "content": "You are a helpful AI."},
+                         {"role": "user", "content": prompt}
+                     ],
+                     max_tokens=max_tokens,
+                     temperature=temperature
+                 )
+                 return response['choices'][0]['message']['content'].strip()
+             except Exception as e:
+                 print(f"Error in gpt3_query: {str(e)}")
+                 return None
+
+         # Get GPT-3.5-turbo to create a summary text
+         summary = gpt3_query(f"Based on the Trellis Score, the best vendor is {highest_score_vendor}. Please provide a brief summary.")
+
+         return f"Matching data has been saved to 'matches.xlsx'.\n\n{summary}", os.path.abspath('matches.xlsx'), highest_score_vendor
+
+
+ iface = gr.Interface(
+     fn=process_file,
+     inputs=[gr.components.File(label="Client File")],
+     outputs=[
+         gr.components.Textbox(label="Status"),
+         gr.components.File(label="Download Match Results"),
+         gr.components.Textbox(label="Vendor with Highest Score")
+     ],
+ )

  iface.launch()
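
Note (not part of the commit): a minimal, hypothetical sketch of the scoring logic this update introduces. parse_score maps level labels such as '3 - Medium' or 'Level 4 - Advanced' to numbers (blank cells default to 3, otherwise a numeric value is extracted from the text), and each matched row's Trellis Score is the client priority multiplied by the vendor score. The labels and values below are illustrative examples only, not data from the app.

import re

# Simplified stand-in for the parse_score helper added in this commit
# (illustrative labels only; the full app defines many more).
LEVEL_SCORES = {
    '3 - Medium': 3,
    'Level 4 - Advanced': 4,
    '5 - High': 5,
}

def parse_score(score):
    # Blank or missing scores default to 3, mirroring the app's behaviour.
    if score is None or str(score).strip() == '':
        return 3
    if isinstance(score, str):
        score = score.replace(',', '.')
        if score in LEVEL_SCORES:
            return LEVEL_SCORES[score]
        # Fall back to extracting a numeric value from free-form text.
        number = re.findall(r"[-+]?\d*\.\d+|\d+", score)
        if number:
            return float(number[0])
    return 0

# Trellis Score for one matched row = client priority x vendor score.
client_priority = parse_score('3 - Medium')       # 3
vendor_score = parse_score('Level 4 - Advanced')  # 4
print(client_priority * vendor_score)             # 12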