Spaces:

cogcorp
/

homework

Sleeping

App Files Files Community

cogcorp committed on Jun 11, 2023

Commit

30e35ef

1 Parent(s): 64aa184

Update app.py

Browse files

Files changed (1) hide show

app.py +154 -99

app.py CHANGED Viewed

@@ -3,23 +3,26 @@ import pandas as pd
 import os
 import spacy
 import numpy as np
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from keras.preprocessing.text import text_to_word_sequence
-from joblib import Parallel, delayed
-import sys
-import subprocess
-import pickle
-subprocess.run(["python", "-m", "spacy", "download", "en_core_web_md"])
 nlp = spacy.load('en_core_web_md')
-# Define vendor_df and vectorizer as global variables
-vendor_df = None
-vectorizer = TfidfVectorizer()
-# Function to preprocess text
 def preprocess_text(text):
     if isinstance(text, str):
         text = text.lower()
@@ -32,97 +35,149 @@ def preprocess_text(text):
     else:
         return text
-# Function to perform semantic search
-def semantic_search(query, vendor_vectors):
     query = preprocess_text(query)
-    query_vector = vectorizer.transform([query])
-    cosine_similarities = cosine_similarity(query_vector, vendor_vectors).flatten()
-    return cosine_similarities
-# Function to parse number from text
-def parse_number(text):
-    if isinstance(text, str):
-        return int(''.join(filter(str.isdigit, text)))
-    else:
-        return 0
-# Function to process row
-def process_row(row, vendor_data, vendor_scores):
-    scores = {}
-    for vendor_name, vendor_vectors in vendor_data.items():
-        cosine_similarities = semantic_search(row[0], vendor_vectors)
-        most_similar_index = np.argmax(cosine_similarities)
-        vendor_score = vendor_scores[vendor_name] # Get the vendor's score
-        scores[vendor_name] = row['score_client'] * vendor_score  # Multiply vendor score with client priority
-    row_scores = pd.Series(scores)
-    combined_row = pd.concat([row, row_scores])
-    return combined_row
-# Function to process file
-def process_file(vendor_name_input, mode, file):
-    global vendor_df
-    global vectorizer
-    # Sanitize the vendor name input
-    vendor_name = vendor_name_input.strip().lower().replace(" ", "_")
-    if mode == 'Upload Vendor File':
-        vendor_df = pd.read_excel(file.name)
-        vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].apply(preprocess_text)
-        vendor_df['score_vendor'] = vendor_df.iloc[:, 4].apply(parse_number)
-        vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].fillna('')
-        vectorizer.fit(vendor_df.iloc[:, 2])
-        vendor_vectors = vectorizer.transform(vendor_df.iloc[:, 2])
-        # Save vendor vectors as pickle file
-        vendor_vectors_path = os.path.join('data', f'{vendor_name}_vectors.pkl')
-        os.makedirs(os.path.dirname(vendor_vectors_path), exist_ok=True)
-        with open(vendor_vectors_path, 'wb') as f:
-            pickle.dump(vendor_vectors, f)
-        # Save vendor data (not vectors) as CSV file
-        vendor_df_path = os.path.join('data', f'{vendor_name}_data.csv')
-        vendor_df.to_csv(vendor_df_path, index=False)
-        # Save vendor scores as pickle file
-        vendor_scores_path = os.path.join('data', f'{vendor_name}_scores.pkl')
-        with open(vendor_scores_path, 'wb') as f:
-            pickle.dump(vendor_df['score_vendor'].to_dict(), f)
-        return f"Vendor data file for {vendor_name} has been uploaded and saved.", None
-    elif mode == 'Compare with Client File':
-        csv_files = [f for f in os.listdir('data') if f.endswith('_data.csv')]
-        vector_files = [f for f in os.listdir('data') if f.endswith('_vectors.pkl')]
-        score_files = [f for f in os.listdir('data') if f.endswith('_scores.pkl')]
-        if not csv_files or not vector_files or not score_files:
-            return "No vendor data found. Please upload it first.", None
-        vendor_data = {}
-        vendor_scores = {}
-        for csv_file, vector_file, score_file in zip(csv_files, vector_files, score_files):
-            with open(os.path.join('data', vector_file), 'rb') as f:
-                vendor_vectors = pickle.load(f)
-                vendor_data[vendor_name] = vendor_vectors
-            with open(os.path.join('data', score_file), 'rb') as f:
-                vendor_scores[vendor_name] = pickle.load(f)
-        client_df = pd.read_excel(file.name)
-        client_df.iloc[:, 2] = client_df.iloc[:, 2].fillna('3 - Medium')
-        client_df = client_df[client_df.iloc[:, 1] == 'Yes']  # Only consider rows where the second column is 'Yes'
-        client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
-        client_df['score_client'] = client_df.iloc[:, 2].apply(parse_number)
-        common_list = Parallel(n_jobs=-1)(delayed(process_row)(row, vendor_data, vendor_scores) for index, row in client_df.iterrows())
-        common_df = pd.DataFrame(common_list)
-        common_df = common_df.drop(common_df.columns[[1, 2, 3, 4]], axis=1)  # Drop the second, third, fourth and fifth columns
-        common_df.to_excel(f'client_matches.xlsx', index=False)
-        return f"Matching data for all vendors has been saved to 'client_matches.xlsx'. You can download it from the link below.", os.path.abspath('client_matches.xlsx')
-iface = gr.Interface(fn=process_file, inputs=["text", gr.components.Dropdown(choices=['Upload Vendor File', 'Compare with Client File']), "file"], outputs=["text", "file"])
 iface.launch()

 import os
 import spacy
 import numpy as np
+import zipfile
+import tempfile
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from keras.preprocessing.text import text_to_word_sequence
+import openai
+import re
+# SECURITY: never commit API credentials. The key is read from the environment
+# (configure OPENAI_API_KEY in the deployment's secret store); any key that was
+# previously committed to this repository must be treated as leaked and revoked.
+openai.api_key = os.environ.get("OPENAI_API_KEY")
 nlp = spacy.load('en_core_web_md')
+# Parsed vendor workbooks keyed by their file name inside vendor_files.zip
+vendor_df_dict = {}
+# A dictionary to store the total Trellis Score of each vendor
+total_trellis_scores = {}
 def preprocess_text(text):
     if isinstance(text, str):
         text = text.lower()
     else:
         return text
+def semantic_search(query, data):
+    """Return the position in `data` of the text most similar to `query`.
+
+    Both the query and every candidate are coerced to `str`, passed through
+    `preprocess_text`, and embedded with a TF-IDF model fitted on the query
+    and candidates together. Similarity is cosine similarity; the result is
+    the `np.argmax` index into `data` (first maximum wins on ties).
+
+    NOTE(review): with an empty `data` the final `np.argmax` receives an
+    empty array and is expected to raise ValueError - confirm callers never
+    pass an empty vendor sheet.
+    """
+    query = str(query)
+    data = [str(text) for text in data]
     query = preprocess_text(query)
+    data = [preprocess_text(text) for text in data]
+    # Despite its name, `vectorizer` holds the fitted TF-IDF matrix; row 0 is the query.
+    vectorizer = TfidfVectorizer().fit_transform([query] + data)
+    cosine_similarities = cosine_similarity(vectorizer[0:1], vectorizer).flatten()
+    # Skip entry 0 (the query compared with itself) so the index lines up with `data`.
+    return np.argmax(cosine_similarities[1:])
+def parse_score(score):
+    """Convert a score cell into a number.
+
+    Accepted inputs:
+      * missing values (None, NaN, empty/blank string) -> default of 3
+      * numeric cells (int, float, NumPy scalars)      -> the value as float
+      * known labels such as '3 - Medium'              -> their mapped level
+      * any other string                               -> first number found in
+        it (a decimal comma is treated as a decimal point), as float
+
+    Returns 0 when nothing numeric can be extracted.
+    """
+    level_scores = {
+        'Level 1 - Basic': 1,
+        'Level 2 - Developing': 2,
+        'Level 3 - Intermediate': 3,
+        'Level 4 - Advanced': 4,
+        'Level 5 - Leading': 5,
+        '1 - Low': 1,
+        '2 - Below average': 2,
+        '3 - Average': 3,
+        '4 - Above average': 4,
+        '5 - High': 5,
+        '1 - Very Low': 1,
+        '2 - Low': 2,
+        '3 - Medium': 3,
+        '4 - High-Medium': 4,
+        '5 - Very High': 5
+    }
+    default_score = 3
+    if score is None or str(score).strip() == '':
+        return default_score
+    # Excel numeric cells arrive as int/float (or NumPy scalars) and empty cells
+    # as NaN; bool is excluded because it is a subclass of int.
+    if isinstance(score, (int, float, np.integer, np.floating)) and not isinstance(score, bool):
+        return default_score if pd.isna(score) else float(score)
+    if isinstance(score, str):
+        # Strip so labels with stray whitespace still hit the lookup table.
+        score = score.strip().replace(',', '.')
+        if score in level_scores:
+            return level_scores[score]
+        number = re.findall(r"[-+]?\d*\.\d+|\d+", score)
+        if number:
+            return float(number[0])
+    return 0
+def load_vendor_files(zip_file_path):
+    """Load every .xlsx workbook in the zip archive into `vendor_df_dict`.
+
+    The archive is extracted to a temporary directory that is removed again
+    when loading finishes, including when a workbook fails to parse. For each
+    workbook, column 2 is run through `preprocess_text` and a float
+    `score_vendor` column is derived from column 4 via `parse_score`.
+    Entries are keyed by file name; existing keys are overwritten.
+    """
+    global vendor_df_dict
+    # Both context managers clean up on the error path too.
+    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref, tempfile.TemporaryDirectory() as temp_dir:
+        zip_ref.extractall(temp_dir)
+        for file_name in os.listdir(temp_dir):
+            if not file_name.endswith(".xlsx"):
+                continue
+            vendor_df = pd.read_excel(os.path.join(temp_dir, file_name))
+            vendor_df.iloc[:, 2] = vendor_df.iloc[:, 2].apply(preprocess_text)
+            vendor_df['score_vendor'] = vendor_df.iloc[:, 4].apply(parse_score).apply(float)
+            vendor_df['score_vendor'] = vendor_df['score_vendor'].fillna(0)
+            vendor_df_dict[file_name] = vendor_df
+def _gpt3_query(prompt, engine='gpt-3.5-turbo', max_tokens=100, temperature=0.3):
+    """Send `prompt` to the OpenAI chat endpoint; return the reply text or None on failure."""
+    try:
+        response = openai.ChatCompletion.create(
+            model=engine,
+            messages=[
+                {"role": "system", "content": "You are a helpful AI."},
+                {"role": "user", "content": prompt}
+            ],
+            max_tokens=max_tokens,
+            temperature=temperature
+        )
+        return response['choices'][0]['message']['content'].strip()
+    except Exception as e:
+        # Best effort: the summary is optional, so report the failure and carry on.
+        print(f"Error in gpt3_query: {str(e)}")
+        return None
+def _match_vendor(client_df, vendor_df):
+    """Pair each client row with its most similar vendor question and score the pair."""
+    matched_rows = []
+    for _, row in client_df.iterrows():
+        most_similar_index = semantic_search(row.iloc[0], vendor_df.iloc[:, 2])
+        most_similar_row = vendor_df.iloc[most_similar_index, :]
+        # Explicit positional access; integer keys on a label-indexed Series are deprecated.
+        combined_row = pd.concat([row.iloc[[0, 2, 3]], most_similar_row.iloc[[0, 2, 4, 5]]])
+        combined_row['Trellis Score'] = row['score_client'] * parse_score(most_similar_row.iloc[4])
+        matched_rows.append(combined_row)
+    return matched_rows
+def process_file(client_file):
+    """Match a client requirements workbook against every vendor workbook.
+
+    Vendor workbooks are loaded from `vendor_files.zip` next to this script.
+    Client rows whose second column equals 'Yes' are matched (by TF-IDF
+    similarity of column 0) to the closest vendor question; each pair gets a
+    Trellis Score = client score * vendor score. One sheet per vendor, ending
+    with a total row, is written to 'matches.xlsx'.
+
+    Args:
+        client_file: uploaded file object exposing the workbook path as `.name`.
+
+    Returns:
+        (status message, absolute path of 'matches.xlsx', best vendor name),
+        or ("No matching data found.", None, None) when nothing matched.
+    """
+    zip_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'vendor_files.zip')
+    load_vendor_files(zip_file_path)  # Load vendor files from zip file
+    client_df = pd.read_excel(client_file.name)
+    client_df['score_client'] = client_df.iloc[:, 2].apply(parse_score).astype(float)
+    # .copy() so the assignment below edits an independent frame, not a slice.
+    client_df = client_df[client_df.iloc[:, 1] == 'Yes'].copy()
+    client_df.iloc[:, 0] = client_df.iloc[:, 0].apply(preprocess_text)
+    vendor_sheets = {}  # vendor file name -> result sheet for this run
+    run_totals = {}  # vendor file name -> total Trellis Score for this run
+    for vendor_file, vendor_df in vendor_df_dict.items():
+        matched_rows = _match_vendor(client_df, vendor_df)
+        if not matched_rows:
+            continue
+        # Fresh 0..n-1 index so the total row below can never overwrite a match.
+        common_df = pd.DataFrame(matched_rows).reset_index(drop=True)
+        total_trellis_score = common_df['Trellis Score'].sum()
+        run_totals[vendor_file] = total_trellis_score
+        total_trellis_scores[vendor_file] = total_trellis_score  # keep the module-level record up to date
+        # Add a row with the total Trellis Score to the DataFrame
+        common_df.loc[len(common_df.index)] = [np.nan] * len(common_df.columns)
+        common_df.at[len(common_df.index) - 1, 'Trellis Score'] = total_trellis_score
+        vendor_sheets[vendor_file] = common_df
+    # Bail out before opening the writer: a workbook with no sheets cannot be
+    # saved, and max() on an empty mapping raises ValueError.
+    if not vendor_sheets:
+        return "No matching data found.", None, None
+    with pd.ExcelWriter('matches.xlsx') as writer:
+        for vendor_file, common_df in vendor_sheets.items():
+            # Excel limits sheet names to 31 characters.
+            common_df.to_excel(writer, sheet_name=os.path.splitext(vendor_file)[0][:31], index=False)
+    # splitext (as used for the sheet names) keeps vendor names that contain dots intact.
+    highest_score_vendor = os.path.splitext(max(run_totals, key=run_totals.get))[0]
+    # Get GPT-3.5-turbo to create a summary text
+    summary = _gpt3_query(f"Based on the Trellis Score, the best vendor is {highest_score_vendor}. Please provide a brief summary.")
+    status = "Matching data has been saved to 'matches.xlsx'."
+    if summary:
+        # Only append the summary when the API call produced one.
+        status = f"{status}\n\n{summary}"
+    return status, os.path.abspath('matches.xlsx'), highest_score_vendor
+# UI wiring: a single uploaded client workbook is passed to process_file, whose
+# three return values fill the status box, the download slot and the
+# best-vendor box, in that order. (`gr` is imported above the visible hunk.)
+iface = gr.Interface(
+    fn=process_file,
+    inputs=[gr.components.File(label="Client File")],
+    outputs=[
+        gr.components.Textbox(label="Status"),
+        gr.components.File(label="Download Match Results"),
+        gr.components.Textbox(label="Vendor with Highest Score")
+    ],
+)
 iface.launch()