Update app.py
Browse files
app.py
CHANGED
@@ -3,23 +3,26 @@ import pandas as pd
|
|
3 |
import os
|
4 |
import spacy
|
5 |
import numpy as np
|
|
|
|
|
6 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
from sklearn.metrics.pairwise import cosine_similarity
|
8 |
from keras.preprocessing.text import text_to_word_sequence
|
9 |
-
|
10 |
-
import
|
11 |
-
import subprocess
|
12 |
-
import pickle
|
13 |
|
14 |
-
subprocess.run(["python", "-m", "spacy", "download", "en_core_web_md"])
|
15 |
|
|
|
|
|
|
|
16 |
nlp = spacy.load('en_core_web_md')
|
17 |
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
21 |
|
22 |
-
# Function to preprocess text
|
23 |
def preprocess_text(text):
|
24 |
if isinstance(text, str):
|
25 |
text = text.lower()
|
@@ -32,97 +35,149 @@ def preprocess_text(text):
|
|
32 |
else:
|
33 |
return text
|
34 |
|
35 |
-
|
36 |
-
|
|
|
37 |
query = preprocess_text(query)
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
def
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
-
# Function to process row
|
51 |
-
def process_row(row, vendor_data, vendor_scores):
|
52 |
-
scores = {}
|
53 |
-
for vendor_name, vendor_vectors in vendor_data.items():
|
54 |
-
cosine_similarities = semantic_search(row[0], vendor_vectors)
|
55 |
-
most_similar_index = np.argmax(cosine_similarities)
|
56 |
-
vendor_score = vendor_scores[vendor_name] # Get the vendor's score
|
57 |
-
scores[vendor_name] = row['score_client'] * vendor_score # Multiply vendor score with client priority
|
58 |
-
row_scores = pd.Series(scores)
|
59 |
-
combined_row = pd.concat([row, row_scores])
|
60 |
-
return combined_row
|
61 |
-
|
62 |
-
# Function to process file
|
63 |
-
def process_file(vendor_name_input, mode, file):
|
64 |
-
global vendor_df
|
65 |
-
global vectorizer
|
66 |
-
|
67 |
-
# Sanitize the vendor name input
|
68 |
-
vendor_name = vendor_name_input.strip().lower().replace(" ", "_")
|
69 |
-
|
70 |
-
if mode == 'Upload Vendor File':
|
71 |
-
vendor_df = pd.read_excel(file.name)
|
72 |
-
vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].apply(preprocess_text)
|
73 |
-
vendor_df['score_vendor'] = vendor_df.iloc[:, 4].apply(parse_number)
|
74 |
-
vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].fillna('')
|
75 |
-
vectorizer.fit(vendor_df.iloc[:, 2])
|
76 |
-
vendor_vectors = vectorizer.transform(vendor_df.iloc[:, 2])
|
77 |
-
|
78 |
-
# Save vendor vectors as pickle file
|
79 |
-
vendor_vectors_path = os.path.join('data', f'{vendor_name}_vectors.pkl')
|
80 |
-
os.makedirs(os.path.dirname(vendor_vectors_path), exist_ok=True)
|
81 |
-
with open(vendor_vectors_path, 'wb') as f:
|
82 |
-
pickle.dump(vendor_vectors, f)
|
83 |
-
|
84 |
-
# Save vendor data (not vectors) as CSV file
|
85 |
-
vendor_df_path = os.path.join('data', f'{vendor_name}_data.csv')
|
86 |
-
vendor_df.to_csv(vendor_df_path, index=False)
|
87 |
-
|
88 |
-
# Save vendor scores as pickle file
|
89 |
-
vendor_scores_path = os.path.join('data', f'{vendor_name}_scores.pkl')
|
90 |
-
with open(vendor_scores_path, 'wb') as f:
|
91 |
-
pickle.dump(vendor_df['score_vendor'].to_dict(), f)
|
92 |
-
|
93 |
-
return f"Vendor data file for {vendor_name} has been uploaded and saved.", None
|
94 |
-
|
95 |
-
elif mode == 'Compare with Client File':
|
96 |
-
csv_files = [f for f in os.listdir('data') if f.endswith('_data.csv')]
|
97 |
-
vector_files = [f for f in os.listdir('data') if f.endswith('_vectors.pkl')]
|
98 |
-
score_files = [f for f in os.listdir('data') if f.endswith('_scores.pkl')]
|
99 |
-
|
100 |
-
if not csv_files or not vector_files or not score_files:
|
101 |
-
return "No vendor data found. Please upload it first.", None
|
102 |
-
|
103 |
-
vendor_data = {}
|
104 |
-
vendor_scores = {}
|
105 |
-
for csv_file, vector_file, score_file in zip(csv_files, vector_files, score_files):
|
106 |
-
with open(os.path.join('data', vector_file), 'rb') as f:
|
107 |
-
vendor_vectors = pickle.load(f)
|
108 |
-
vendor_data[vendor_name] = vendor_vectors
|
109 |
-
with open(os.path.join('data', score_file), 'rb') as f:
|
110 |
-
vendor_scores[vendor_name] = pickle.load(f)
|
111 |
-
|
112 |
-
client_df = pd.read_excel(file.name)
|
113 |
-
client_df.iloc[:, 2] = client_df.iloc[:, 2].fillna('3 - Medium')
|
114 |
-
client_df = client_df[client_df.iloc[:, 1] == 'Yes'] # Only consider rows where the second column is 'Yes'
|
115 |
-
client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
|
116 |
-
client_df['score_client'] = client_df.iloc[:, 2].apply(parse_number)
|
117 |
-
common_list = Parallel(n_jobs=-1)(delayed(process_row)(row, vendor_data, vendor_scores) for index, row in client_df.iterrows())
|
118 |
-
common_df = pd.DataFrame(common_list)
|
119 |
-
common_df = common_df.drop(common_df.columns[[1, 2, 3, 4]], axis=1) # Drop the second, third, fourth and fifth columns
|
120 |
-
common_df.to_excel(f'client_matches.xlsx', index=False)
|
121 |
-
return f"Matching data for all vendors has been saved to 'client_matches.xlsx'. You can download it from the link below.", os.path.abspath('client_matches.xlsx')
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
iface = gr.Interface(fn=process_file, inputs=["text", gr.components.Dropdown(choices=['Upload Vendor File', 'Compare with Client File']), "file"], outputs=["text", "file"])
|
128 |
iface.launch()
|
|
|
3 |
import os
|
4 |
import spacy
|
5 |
import numpy as np
|
6 |
+
import zipfile
|
7 |
+
import tempfile
|
8 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
9 |
from sklearn.metrics.pairwise import cosine_similarity
|
10 |
from keras.preprocessing.text import text_to_word_sequence
|
11 |
+
import openai
|
12 |
+
import re
|
|
|
|
|
13 |
|
|
|
14 |
|
15 |
+
# vendor name (from column 0)

# SECURITY: the OpenAI credential is read from the environment instead of
# being hard-coded. The key that was previously committed on this line is
# exposed in the repository history and must be revoked and rotated.
# If OPENAI_API_KEY is unset, api_key stays None and OpenAI calls will fail.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# spaCy pipeline loaded once at import time.
nlp = spacy.load('en_core_web_md')

# Vendor spreadsheets keyed by file name; filled in by load_vendor_files().
vendor_df_dict = {}

# A dictionary to store the total Trellis Score of each vendor
total_trellis_scores = {}
|
24 |
+
|
25 |
|
|
|
26 |
def preprocess_text(text):
|
27 |
if isinstance(text, str):
|
28 |
text = text.lower()
|
|
|
35 |
else:
|
36 |
return text
|
37 |
|
38 |
+
def semantic_search(query, data):
    """Return the position in ``data`` of the text most similar to ``query``.

    Both the query and every entry of ``data`` are converted with ``str()``
    and passed through ``preprocess_text``. A TF-IDF model is fitted on the
    query together with all entries, and the entry with the highest cosine
    similarity to the query wins (the first one on ties, per ``np.argmax``).

    Args:
        query: The text to look up; any object accepted by ``str()``.
        data: Iterable of candidate texts.

    Returns:
        Zero-based position of the best match within ``data``.
    """
    cleaned_query = preprocess_text(str(query))
    cleaned_docs = [preprocess_text(str(entry)) for entry in data]

    # Row 0 of the matrix is the query; rows 1.. are the candidates.
    tfidf_matrix = TfidfVectorizer().fit_transform([cleaned_query, *cleaned_docs])
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix).flatten()

    # Drop the query's similarity with itself before picking the best row.
    candidate_similarities = similarities[1:]
    return np.argmax(candidate_similarities)
|
46 |
+
|
47 |
+
# Textual rating labels mapped to their numeric score.
_LEVEL_SCORES = {
    'Level 1 - Basic': 1,
    'Level 2 - Developing': 2,
    'Level 3 - Intermediate': 3,
    'Level 4 - Advanced': 4,
    'Level 5 - Leading': 5,
    '1 - Low': 1,
    '2 - Below average': 2,
    '3 - Average': 3,
    '4 - Above average': 4,
    '5 - High': 5,
    '1 - Very Low': 1,
    '2 - Low': 2,
    '3 - Medium': 3,
    '4 - High-Medium': 4,
    '5 - Very High': 5,
}

# First decimal number (optionally signed) or plain integer inside a string.
_SCORE_NUMBER_RE = re.compile(r"[-+]?\d*\.\d+|\d+")


def parse_score(score):
    """Convert a score cell into a number.

    Rules, in order:

    * ``None`` or a blank string -> ``3`` (the default score).
    * A string: decimal commas are turned into dots, then a known label
      (e.g. ``'3 - Medium'``) is looked up; otherwise the first number
      found in the text is returned as a float, or ``0`` if there is none.
    * A real number (``int``/``float``, including NumPy scalars): returned
      as a float. Previously every non-string value yielded ``0``, which
      silently zeroed numeric spreadsheet cells.
    * NaN, booleans and anything else that is not numeric -> ``0``.

    Args:
        score: Raw cell value (string, number, ``None`` or NaN).

    Returns:
        The parsed score as an ``int`` or ``float``.
    """
    if score is None or str(score).strip() == '':
        return 3

    if isinstance(score, str):
        text = score.replace(',', '.')
        if text in _LEVEL_SCORES:
            return _LEVEL_SCORES[text]
        found = _SCORE_NUMBER_RE.search(text)
        return float(found.group(0)) if found else 0

    # Booleans are ints in Python but are not meaningful scores.
    if isinstance(score, bool):
        return 0

    try:
        value = float(score)
    except (TypeError, ValueError):
        return 0

    # NaN is the only value that differs from itself.
    # NOTE(review): NaN (an empty pandas cell) keeps yielding 0 as before,
    # while None/'' yield 3 -- confirm which default is intended.
    if value != value:
        return 0
    return value
|
78 |
+
|
79 |
+
def load_vendor_files(zip_file_path):
    """Load every vendor workbook from a zip archive into ``vendor_df_dict``.

    The archive is extracted to a temporary directory that is removed again
    when the function exits, even if reading a workbook fails. Only ``.xlsx``
    files at the top level of the archive are loaded. For each workbook:

    * column 2 is passed through ``preprocess_text``;
    * a ``score_vendor`` column is added, holding column 4 parsed with
      ``parse_score`` as floats (missing values become 0).

    The module-level ``vendor_df_dict`` is updated in place, keyed by file
    name; an entry is stored only after its workbook was fully processed.

    Args:
        zip_file_path: Path to the zip archive containing the workbooks.
    """
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref, \
            tempfile.TemporaryDirectory() as temp_dir:
        zip_ref.extractall(temp_dir)
        for file_name in os.listdir(temp_dir):
            if not file_name.endswith(".xlsx"):
                continue
            vendor_df = pd.read_excel(os.path.join(temp_dir, file_name))
            vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].apply(preprocess_text)
            vendor_df['score_vendor'] = (
                vendor_df.iloc[:, 4].apply(parse_score).apply(float).fillna(0)
            )
            #vendor_df.columns = ['ID', 'Topic', 'Vendor Question', 'Vendor Response', 'Vendor Score']
            vendor_df_dict[file_name] = vendor_df
|
93 |
+
|
94 |
+
def _gpt3_query(prompt, engine='gpt-3.5-turbo', max_tokens=100, temperature=0.3):
    """Ask the OpenAI chat-completion API for a reply to ``prompt``.

    Best effort: any failure is printed and ``None`` is returned so that a
    missing summary never blocks delivery of the match results.
    """
    try:
        response = openai.ChatCompletion.create(
            model=engine,
            messages=[
                {"role": "system", "content": "You are a helpful AI."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"Error in gpt3_query: {str(e)}")
        return None


def _match_vendor(client_df, vendor_df):
    """Build the match table of one vendor, or ``None`` if there is nothing to match.

    For every client row the most similar vendor text (vendor column 2) is
    located with ``semantic_search``; client columns 0, 2, 3 and vendor
    columns 0, 2, 4, 5 are combined, and the product of the client score and
    the vendor score is stored as ``'Trellis Score'``. A final row carrying
    the sum of all Trellis Scores is appended.
    """
    matched_rows = []
    for _, row in client_df.iterrows():
        best_position = semantic_search(row.iloc[0], vendor_df.iloc[:, 2])
        vendor_row = vendor_df.iloc[best_position, :]

        # .iloc makes the positional access explicit; plain row[0] on a
        # label-indexed Series is deprecated in pandas.
        combined_row = pd.concat(
            [row.iloc[[0, 2, 3]], vendor_row.iloc[[0, 2, 4, 5]]]
        )
        combined_row['Trellis Score'] = (
            row['score_client'] * parse_score(vendor_row.iloc[4])
        )
        matched_rows.append(combined_row)

    if not matched_rows:
        return None

    # Reset to a clean 0..n-1 index so the totals row below can never
    # collide with (and overwrite) an index label inherited from a Series name.
    matches = pd.DataFrame(matched_rows).reset_index(drop=True)
    total_trellis_score = matches['Trellis Score'].sum()
    matches.loc[len(matches.index)] = [np.nan] * len(matches.columns)
    matches.at[len(matches.index) - 1, 'Trellis Score'] = total_trellis_score
    return matches


def process_file(client_file):
    """Match a client workbook against all vendor workbooks and rank the vendors.

    Vendor workbooks are loaded from ``vendor_files.zip`` next to this file.
    Client rows whose column 1 equals ``'Yes'`` are kept; column 0 is the text
    to match and column 2 is parsed into ``score_client``. One sheet per
    vendor is written to ``matches.xlsx`` and the vendor with the highest
    total Trellis Score is reported. The module-level
    ``total_trellis_scores`` is reset and refilled on every call so results
    of earlier requests cannot leak into the ranking.

    Args:
        client_file: Uploaded file object; its ``name`` attribute is the
            path of the client workbook.

    Returns:
        A tuple ``(status_message, path_to_matches_xlsx, best_vendor_name)``;
        the last two are ``None`` when no matches were produced.
    """
    zip_file_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), 'vendor_files.zip'
    )
    load_vendor_files(zip_file_path)  # Load vendor files from zip file

    client_df = pd.read_excel(client_file.name)
    client_df['score_client'] = client_df.iloc[:, 2].apply(parse_score).astype(float)
    # .copy() so the assignment below works on an independent frame.
    client_df = client_df[client_df.iloc[:, 1] == 'Yes'].copy()
    client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)

    total_trellis_scores.clear()
    vendor_matches = {}
    for vendor_file, vendor_df in vendor_df_dict.items():
        if vendor_df.empty:
            continue  # nothing to search in
        matches = _match_vendor(client_df, vendor_df)
        if matches is None:
            continue
        vendor_matches[vendor_file] = matches
        # The last row holds the vendor's total Trellis Score.
        total_trellis_scores[vendor_file] = matches['Trellis Score'].iloc[-1]

    # Decide before opening the writer: a workbook without sheets cannot be
    # saved, and max() on an empty dict would raise.
    if not vendor_matches:
        return "No matching data found.", None, None

    with pd.ExcelWriter('matches.xlsx') as writer:
        for vendor_file, matches in vendor_matches.items():
            # Excel limits sheet names to 31 characters.
            sheet_name = os.path.splitext(vendor_file)[0][:31]
            matches.to_excel(writer, sheet_name=sheet_name, index=False)

    best_file = max(total_trellis_scores, key=total_trellis_scores.get)
    # splitext keeps dots that are part of the vendor name itself.
    highest_score_vendor = os.path.splitext(best_file)[0]

    # Get GPT-3.5-turbo to create a summary text
    summary = _gpt3_query(
        f"Based on the Trellis Score, the best vendor is {highest_score_vendor}. Please provide a brief summary."
    )

    status = "Matching data has been saved to 'matches.xlsx'."
    if summary:
        status = f"{status}\n\n{summary}"
    return status, os.path.abspath('matches.xlsx'), highest_score_vendor
|
171 |
+
|
172 |
+
|
173 |
+
# Gradio UI: a single client workbook goes in; a status text, the generated
# results workbook and the name of the best-scoring vendor come out.
iface = gr.Interface(
    process_file,
    [gr.components.File(label="Client File")],
    [
        gr.components.Textbox(label="Status"),
        gr.components.File(label="Download Match Results"),
        gr.components.Textbox(label="Vendor with Highest Score"),
    ],
)

# Start the web server for the interface.
iface.launch()
|