Upload 2 files
Browse files- app.txt +75 -0
- requirements.txt +9 -0
app.txt
ADDED
@@ -0,0 +1,75 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import os
|
4 |
+
import spacy
|
5 |
+
import numpy as np
|
6 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
7 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
8 |
+
from keras.preprocessing.text import text_to_word_sequence
|
9 |
+
from joblib import Parallel, delayed
|
10 |
+
|
11 |
+
# Load the 'en_core_web_md' spaCy pipeline once, when the module is imported.
# NOTE(review): `nlp` is not referenced by any function in the visible file --
# confirm whether it is still needed before paying the load cost.
nlp = spacy.load('en_core_web_md')

# Define vendor_df and vectorizer as global variables
# vendor_df starts empty and is assigned by process_file in
# 'Upload Vendor File' mode; the same call fits `vectorizer`, which
# semantic_search later uses to transform client queries.
vendor_df = None
vectorizer = TfidfVectorizer()
|
16 |
+
|
17 |
+
# Function to preprocess text
|
18 |
+
def preprocess_text(text):
    """Normalize free text before TF-IDF matching.

    The input is lower-cased, punctuation/tab/newline characters are treated
    as word separators, a small set of question-style filler words is
    removed, and the remaining words are re-joined with single spaces.

    Args:
        text: Value taken from a spreadsheet cell.

    Returns:
        The cleaned string when ``text`` is a ``str``. Any other value is
        returned unchanged, exactly as it was passed in.
    """
    if not isinstance(text, str):
        return text

    separators = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    stop_words = {'does', 'the', 'offer', 'do', 'you', 'require'}

    # Replace every separator with a space rather than deleting it; deleting
    # glued neighbouring words together ("backup/restore" -> "backuprestore",
    # "offer\nbackup" -> "offerbackup") and hid stop words from the filter.
    to_spaces = str.maketrans(separators, ' ' * len(separators))
    words = text.lower().translate(to_spaces).split()
    return ' '.join(word for word in words if word not in stop_words)
|
29 |
+
|
30 |
+
# Function to perform semantic search
|
31 |
+
def semantic_search(query, vendor_vectors):
    """Find the vendor row whose text is closest to ``query``.

    The query is cleaned with ``preprocess_text``, converted to a TF-IDF
    vector with the module-level ``vectorizer``, and scored against every
    row of ``vendor_vectors`` using cosine similarity.

    Args:
        query: Client-side text to look up.
        vendor_vectors: TF-IDF matrix of the vendor texts, one row per
            vendor entry.

    Returns:
        Position (row number) of the highest-scoring vendor vector, as
        returned by ``np.argmax``.
    """
    cleaned_query = preprocess_text(query)
    scores = cosine_similarity(
        vectorizer.transform([cleaned_query]),
        vendor_vectors,
    )
    # The similarity matrix has a single row (one query); flatten it so the
    # argmax is a plain position into the vendor rows.
    return np.argmax(scores.flatten())
|
36 |
+
|
37 |
+
# Function to parse number from text
|
38 |
+
def parse_number(text):
    """Extract an integer score from a spreadsheet cell value.

    Args:
        text: Cell value. Strings have all their decimal digits concatenated
            and parsed (so ``"1,000"`` yields ``1000``); numeric values are
            truncated to ``int``.

    Returns:
        The parsed integer, or ``0`` when no number can be obtained: a
        string without digits, ``NaN``/infinity, ``None``, booleans, or any
        other type.
    """
    if isinstance(text, str):
        # isdecimal() only admits characters that int() can actually parse;
        # isdigit() also accepts e.g. superscripts, which make int() raise.
        digits = ''.join(ch for ch in text if ch.isdecimal())
        # int('') raises ValueError, so a digit-free string maps to 0.
        return int(digits) if digits else 0

    # bool is a subclass of int; it is not a score, so keep returning 0.
    if isinstance(text, bool):
        return 0

    # Number-typed cells arrive as int/float (or numpy scalars) rather than
    # str; previously they were all silently turned into 0.
    if isinstance(text, (int, float, np.integer, np.floating)):
        try:
            return int(text)
        except (ValueError, OverflowError):
            # NaN (empty cell) or infinity cannot be converted.
            return 0

    return 0
|
43 |
+
|
44 |
+
# Function to process row
|
45 |
+
def process_row(row, vendor_vectors):
    """Pair one client row with its best-matching vendor row and score it.

    Args:
        row: One client row (a ``pandas.Series``) whose first value is the
            text to match and which carries a ``'score_client'`` entry.
        vendor_vectors: TF-IDF matrix of the vendor texts, passed through to
            ``semantic_search``.

    Returns:
        A ``pandas.Series`` made of the client row followed by the matched
        row from the module-level ``vendor_df``, plus a ``'score'`` entry
        equal to ``score_client * score_vendor``.
    """
    # Use .iloc for the first value: plain row[0] on a label-indexed Series
    # depends on the deprecated positional fallback of Series.__getitem__.
    best_position = semantic_search(row.iloc[0], vendor_vectors)
    best_vendor_row = vendor_df.iloc[best_position, :]
    combined_row = pd.concat([row, best_vendor_row])
    combined_row['score'] = combined_row['score_client'] * combined_row['score_vendor']
    return combined_row
|
51 |
+
|
52 |
+
# Function to process file
|
53 |
+
def process_file(mode, file):
    """Handle one interface submission: load vendor data or match a client file.

    Args:
        mode: ``'Upload Vendor File'`` to load and index the vendor workbook,
            or ``'Compare with Client File'`` to match a client workbook
            against the previously loaded vendor data.
        file: Uploaded file object; its ``name`` attribute is read as the
            path of an Excel workbook.

    Returns:
        A ``(message, path)`` tuple. ``path`` is the absolute path of
        ``matches.xlsx`` after a successful comparison and ``None`` in every
        other case.
    """
    global vendor_df  # rebound once a vendor workbook has loaded successfully

    if file is None:
        return "Please upload an Excel file.", None

    if mode == 'Upload Vendor File':
        # Build into a local first so a failure part-way through (for example
        # a workbook with too few columns) cannot leave a half-prepared
        # vendor_df behind for later comparisons.
        loaded = pd.read_excel(file.name)
        loaded.iloc[:, 2] = loaded.iloc[:, 2].apply(preprocess_text)
        loaded['score_vendor'] = loaded.iloc[:, 4].apply(parse_number)
        vectorizer.fit(loaded.iloc[:, 2])
        vendor_df = loaded
        return "Vendor data file has been uploaded.", None

    if mode == 'Compare with Client File':
        if vendor_df is None:
            # Without this guard the code below fails with an AttributeError
            # on None, which tells the user nothing.
            return "Please upload a vendor file before comparing a client file.", None

        client_df = pd.read_excel(file.name)
        # Only consider rows where the second column is 'Yes'. Take an
        # explicit copy so the assignments below modify an independent frame
        # instead of a slice of the one that was read.
        client_df = client_df[client_df.iloc[:, 1] == 'Yes'].copy()
        client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
        client_df['score_client'] = client_df.iloc[:, 2].apply(parse_number)
        vendor_vectors = vectorizer.transform(vendor_df.iloc[:, 2])
        common_list = Parallel(n_jobs=-1)(
            delayed(process_row)(row, vendor_vectors)
            for _, row in client_df.iterrows()
        )
        common_df = pd.DataFrame(common_list)
        common_df.to_excel('matches.xlsx', index=False)
        return (
            "Matching data has been saved to 'matches.xlsx'. You can download it from the link below.",
            os.path.abspath('matches.xlsx'),
        )

    # No mode selected (or an unknown one): still return both outputs rather
    # than falling through and returning None.
    return "Please select a mode.", None
|
73 |
+
|
74 |
+
# Wire process_file into a Gradio interface: a mode dropdown and a file
# upload go in, a status message and an optional result file come out.
iface = gr.Interface(
    fn=process_file,
    inputs=[
        gr.components.Dropdown(
            choices=['Upload Vendor File', 'Compare with Client File'],
        ),
        "file",
    ],
    outputs=["text", "file"],
)
iface.launch()
|
requirements.txt
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
gradio
|
2 |
+
pandas
|
3 |
+
numpy
|
4 |
+
spacy
|
5 |
+
scikit-learn
|
6 |
+
keras
|
7 |
+
joblib
|
8 |
+
openpyxl
|
9 |
+
xlrd
|