cogcorp committed on
Commit
8d19c24
·
1 Parent(s): 8a2b80d

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.txt +75 -0
  2. requirements.txt +9 -0
app.txt ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import os
4
+ import spacy
5
+ import numpy as np
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sklearn.metrics.pairwise import cosine_similarity
8
+ from keras.preprocessing.text import text_to_word_sequence
9
+ from joblib import Parallel, delayed
10
+
11
# Load the spaCy English pipeline once, when the module is imported.
nlp = spacy.load('en_core_web_md')

# Module-level state shared by the functions below:
#   vendor_df  - the vendor table; stays None until process_file() loads one.
#   vectorizer - TF-IDF vectorizer that process_file() fits on the vendor text.
vendor_df = None
vectorizer = TfidfVectorizer()
16
+
17
+ # Function to preprocess text
18
def preprocess_text(text):
    """Normalise a question string before TF-IDF matching.

    The text is lower-cased, punctuation/tab/newline characters are deleted,
    the result is split on single spaces, and a small set of filler words is
    dropped before the remaining tokens are re-joined with single spaces.

    Args:
        text: Value to clean. Only ``str`` values are processed.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text

    filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    stop_words = {'does', 'the', 'offer', 'do', 'you', 'require'}

    # Filter characters are deleted, not replaced by a space, so "2-factor"
    # becomes the single token "2factor".
    cleaned = text.lower().translate(str.maketrans('', '', filters))

    # Split on the space character only and discard empty tokens. On text that
    # is already lower-cased and filtered this is what keras'
    # text_to_word_sequence does, without needing keras at call time.
    tokens = [
        token
        for token in cleaned.split(' ')
        if token and token not in stop_words
    ]
    return ' '.join(tokens)
29
+
30
+ # Function to perform semantic search
31
def semantic_search(query, vendor_vectors):
    """Find the vendor entry whose TF-IDF vector is closest to ``query``.

    Args:
        query: Raw query text; it is cleaned with ``preprocess_text`` first.
        vendor_vectors: Matrix of vendor vectors produced by the module-level
            ``vectorizer``.

    Returns:
        The positional index of the row in ``vendor_vectors`` with the highest
        cosine similarity to the query.
    """
    cleaned_query = preprocess_text(query)
    similarity_matrix = cosine_similarity(
        vectorizer.transform([cleaned_query]),
        vendor_vectors,
    )
    # One query row in, so flattening yields one score per vendor row.
    scores = similarity_matrix.flatten()
    return np.argmax(scores)
36
+
37
+ # Function to parse number from text
38
def parse_number(text):
    """Extract the digits embedded in ``text`` as a single integer.

    All decimal digits in the string are concatenated in order, so
    ``"a1b2"`` yields ``12``.

    Args:
        text: Value to parse. Only ``str`` values are inspected.

    Returns:
        The integer formed by the digits of ``text``; ``0`` when ``text`` is
        not a string or contains no decimal digits.
    """
    if not isinstance(text, str):
        # NOTE(review): numeric spreadsheet cells also land here and score 0;
        # confirm that is intended.
        return 0

    # str.isdecimal (not isdigit) keeps only characters int() can parse;
    # isdigit also accepts e.g. superscript digits, which make int() raise.
    digits = ''.join(ch for ch in text if ch.isdecimal())

    # A string without digits (e.g. "N/A") used to crash on int('').
    return int(digits) if digits else 0
43
+
44
+ # Function to process row
45
def process_row(row, vendor_vectors):
    """Pair one client row with its best-matching vendor row and score it.

    Args:
        row: A pandas Series for one client row; its first element is the
            text to match and it must carry a ``score_client`` entry.
        vendor_vectors: Matrix of vendor vectors passed on to
            ``semantic_search``.

    Returns:
        A Series holding the client row followed by the matched row of the
        module-level ``vendor_df``, plus a ``score`` entry equal to
        ``score_client * score_vendor``.
    """
    # Use .iloc for positional access: integer keys on a label-indexed Series
    # are deprecated as positional lookups in recent pandas releases.
    best_index = semantic_search(row.iloc[0], vendor_vectors)
    best_vendor_row = vendor_df.iloc[best_index, :]

    combined_row = pd.concat([row, best_vendor_row])
    combined_row['score'] = (
        combined_row['score_client'] * combined_row['score_vendor']
    )
    return combined_row
51
+
52
+ # Function to process file
53
def process_file(mode, file):
    """Handle one submission from the Gradio form.

    In ``'Upload Vendor File'`` mode the spreadsheet is loaded into the
    module-level ``vendor_df``, its third column is cleaned, a
    ``score_vendor`` column is parsed from its fifth column, and the
    module-level ``vectorizer`` is fitted on the cleaned text.

    In ``'Compare with Client File'`` mode the client spreadsheet is loaded,
    rows whose second column equals ``'Yes'`` are kept, each is matched to a
    vendor row via ``process_row``, and the result is written to
    ``matches.xlsx`` in the current working directory.

    Args:
        mode: The dropdown selection.
        file: The uploaded file object; its ``name`` attribute is passed to
            ``pandas.read_excel``.

    Returns:
        A ``(message, path)`` tuple. ``path`` is the absolute path of
        ``matches.xlsx`` after a comparison and ``None`` otherwise.
    """
    global vendor_df  # Rebound when a vendor file is uploaded.
    global vectorizer  # Fitted in place on the vendor text.

    # Always return both outputs; a missing file used to raise AttributeError.
    if file is None:
        return "Please select a file to upload.", None

    if mode == 'Upload Vendor File':
        vendor_df = pd.read_excel(file.name)
        vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].apply(preprocess_text)
        vendor_df['score_vendor'] = vendor_df.iloc[:, 4].apply(parse_number)
        vectorizer.fit(vendor_df.iloc[:, 2])
        return "Vendor data file has been uploaded.", None

    if mode == 'Compare with Client File':
        # Comparing needs the vendor table and a fitted vectorizer.
        if vendor_df is None:
            return "Please upload a vendor file before comparing.", None

        client_df = pd.read_excel(file.name)
        # Only consider rows where the second column is 'Yes'. Copy so the
        # assignments below write to an independent frame, not a view of the
        # original.
        client_df = client_df[client_df.iloc[:, 1] == 'Yes'].copy()
        client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
        client_df['score_client'] = client_df.iloc[:, 2].apply(parse_number)

        vendor_vectors = vectorizer.transform(vendor_df.iloc[:, 2])
        common_list = Parallel(n_jobs=-1)(
            delayed(process_row)(row, vendor_vectors)
            for _, row in client_df.iterrows()
        )
        common_df = pd.DataFrame(common_list)
        common_df.to_excel('matches.xlsx', index=False)
        return (
            "Matching data has been saved to 'matches.xlsx'. You can download it from the link below.",
            os.path.abspath('matches.xlsx'),
        )

    # No or unknown mode used to fall through and return a bare None.
    return "Please choose a mode before submitting.", None
73
+
74
# Build the form: a mode dropdown plus a file upload in, a status message plus
# a downloadable file out. The server starts as soon as the module runs.
iface = gr.Interface(
    fn=process_file,
    inputs=[
        gr.components.Dropdown(
            choices=['Upload Vendor File', 'Compare with Client File'],
        ),
        "file",
    ],
    outputs=["text", "file"],
)
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pandas
3
+ numpy
4
+ spacy
5
+ scikit-learn
6
+ keras
7
+ joblib
8
+ openpyxl
9
+ xlrd