umairahmad89 committed on
Commit
081077e
·
1 Parent(s): fbd3198

initial commit

Browse files
Files changed (2) hide show
  1. .gitignore +6 -0
  2. app.py +160 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ *.xlsx
2
+ *.csv
3
+ test*
4
+ submission/
5
+ flagged/
6
+ submission.zip
app.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import openpyxl
3
+ import csv
4
+ from sentence_transformers import SentenceTransformer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+ import numpy as np
7
+ import tempfile
8
+ import os
9
+
10
# Sentence-transformer model used by get_embeddings(); it is instantiated
# once, when this module is first executed.
model = SentenceTransformer(
    'BAAI/bge-small-en-v1.5',
)
12
+
13
def filter_excel1(excel_path, min_row, max_row):
    """Extract (category, diagnostic statement) pairs from the first worksheet.

    The category is read from the second column and the diagnostic statement
    from the sixth column of every row in the requested range.  A row whose
    category cell is empty inherits the most recent non-empty category seen
    above it.  Rows whose diagnostic-statement cell is empty are skipped, so
    a ``max_row`` that reaches past the end of the data does not contribute
    blank statements to the result.

    Args:
        excel_path: Workbook location, passed to ``openpyxl.load_workbook``.
        min_row: First worksheet row to read (inclusive).
        max_row: Last worksheet row to read (inclusive).

    Returns:
        A list whose first element is the header
        ``["category", "diagnostic_statement"]`` followed by one
        ``[category, diagnostic_statement]`` list per retained row.

    Raises:
        gr.Error: If the workbook cannot be loaded or read.
    """
    try:
        workbook = openpyxl.load_workbook(excel_path)
        sheet = workbook.worksheets[0]
        data = [["category", "diagnostic_statement"]]
        last_category = None
        for row in sheet.iter_rows(min_row=min_row, max_row=max_row):
            category = row[1].value
            diagnostic_statement = row[5].value

            # Track the running category before deciding whether to keep the
            # row, so a category-only row still applies to the rows below it.
            if category:
                last_category = category
            else:
                category = last_category

            # iter_rows() also yields the empty cells between the end of the
            # data and max_row; those have no text to embed.
            if diagnostic_statement is None or not str(diagnostic_statement).strip():
                continue

            data.append([category, diagnostic_statement])
        return data
    except Exception as e:
        raise gr.Error(f"Error processing Excel 1: {str(e)}") from e
32
+
33
def filter_excel2(excel_path, min_row, max_row, sheetname):
    """Collect (description, category) pairs from the named worksheet.

    The description comes from the first column and the category from the
    seventh column.  Any category that is not a string, or that is the
    literal ``"#N/A"``, is reported as ``"#N/A"``.  Rows without a
    description are left out.

    Args:
        excel_path: Workbook location, passed to ``openpyxl.load_workbook``.
        min_row: First worksheet row to read (inclusive).
        max_row: Last worksheet row to read (inclusive).
        sheetname: Name of the worksheet to read.

    Returns:
        A list starting with the header ``["description", "category"]``
        followed by one ``[description, category]`` list per kept row.

    Raises:
        gr.Error: If anything goes wrong while loading or reading the sheet.
    """
    try:
        workbook = openpyxl.load_workbook(excel_path)
        worksheet = workbook[sheetname]
        rows = [["description", "category"]]
        for cells in worksheet.iter_rows(min_row=min_row, max_row=max_row):
            description, category = cells[0].value, cells[6].value

            # Only a textual category other than the "#N/A" marker counts as
            # already filled in; everything else is normalised to "#N/A".
            already_filled = isinstance(category, str) and category != "#N/A"
            if not already_filled:
                category = "#N/A"

            if description:
                rows.append([description, category])
        return rows
    except Exception as e:
        raise gr.Error(f"Error processing Excel 2: {str(e)}")
55
+
56
def get_embeddings(texts):
    """Encode ``texts`` with the module-level ``model``.

    Returns whatever ``model.encode`` produces for the given input.
    """
    embeddings = model.encode(texts)
    return embeddings
58
+
59
def get_top_n_categories(query_embedding, statement_embeddings, categories, n=3):
    """Return the categories of the ``n`` statements most similar to the query.

    Similarity is the cosine similarity between ``query_embedding`` and each
    row of ``statement_embeddings``.  Results are ordered from most to least
    similar.  Categories are not de-duplicated: the same category appears
    more than once when several of its statements rank in the top ``n``.

    Args:
        query_embedding: Embedding vector of the text being classified.
        statement_embeddings: One embedding per reference statement.
        categories: Category of each reference statement, index-aligned with
            ``statement_embeddings``.
        n: Number of matches to return.  Values of zero or below yield an
            empty list.

    Returns:
        A list of at most ``n`` entries taken from ``categories``.
    """
    if n <= 0:
        # The slice ``[-0:]`` below would select *every* index rather than
        # none, so a non-positive n has to be handled up front.
        return []
    similarities = cosine_similarity([query_embedding], statement_embeddings)[0]
    top_indices = np.argsort(similarities)[-n:][::-1]
    return [categories[i] for i in top_indices]
63
+
64
def process_data(csv1_data, csv2_data):
    """Fill in the missing categories of ``csv2_data`` in place.

    ``csv1_data`` supplies the reference ``[category, diagnostic_statement]``
    rows and ``csv2_data`` the ``[description, category]`` rows to complete;
    the first element of each list is a header and is ignored.

    For every row of ``csv2_data``:
      * a description that was seen earlier reuses the category resolved for
        its first occurrence;
      * a category other than ``"#N/A"`` is kept as is;
      * otherwise the categories of the most similar reference statements
        (as returned by ``get_top_n_categories``) are joined with ``", "``
        and stored as the row's category.

    The reference statements are embedded only when the first row that needs
    a suggestion is reached, so input that is already fully categorised does
    not invoke the embedding model.

    Args:
        csv1_data: Header row followed by ``[category, statement]`` rows.
        csv2_data: Header row followed by ``[description, category]`` rows.

    Returns:
        ``csv2_data`` itself, with the category entries updated.

    Raises:
        gr.Error: If any step of the processing fails.
    """
    try:
        reference_rows = csv1_data[1:]
        diagnostic_statements = [row[1] for row in reference_rows]
        categories = [row[0] for row in reference_rows]
        statement_embeddings = None  # computed on first use

        # description -> category resolved for its first occurrence; a dict
        # replaces the repeated list scans (``in`` + ``.index``).
        resolved = {}
        for row in csv2_data[1:]:
            description = row[0]
            if description in resolved:
                row[1] = resolved[description]
                continue

            if row[1] == "#N/A":
                if statement_embeddings is None:
                    statement_embeddings = get_embeddings(diagnostic_statements)
                description_embedding = get_embeddings([description])[0]
                top_categories = get_top_n_categories(
                    description_embedding, statement_embeddings, categories
                )
                # str() keeps the join from failing on non-string categories
                # (for example numeric cells).
                row[1] = ', '.join(str(category) for category in top_categories)

            resolved[description] = row[1]
        return csv2_data
    except Exception as e:
        raise gr.Error(f"Error processing data: {str(e)}") from e
89
+
90
def update_excel(excel_path, processed_data, sheetname="1Q2024", min_row=2, max_row=None):
    """Write resolved categories back into the seventh column of a worksheet.

    Rows are visited in order; rows whose first-column cell is empty are
    skipped, and each remaining row receives the category of the next entry
    of ``processed_data``.  The sheet and row range should match the ones the
    entries of ``processed_data`` were read from, otherwise categories land
    on the wrong rows.

    Args:
        excel_path: Workbook location, passed to ``openpyxl.load_workbook``.
        processed_data: ``[description, category]`` rows without a header.
        sheetname: Worksheet to update.  Defaults to ``"1Q2024"``.
        min_row: First worksheet row to update.  Defaults to 2.
        max_row: Last worksheet row to update; ``None`` leaves the upper
            bound to openpyxl.

    Returns:
        The modified, not yet saved, openpyxl workbook.

    Raises:
        gr.Error: If the workbook cannot be loaded or updated.
    """
    try:
        workbook = openpyxl.load_workbook(excel_path)
        sheet = workbook[sheetname]
        idx = 0
        for row in sheet.iter_rows(min_row=min_row, max_row=max_row):
            description_cell = row[0]
            category_cell = row[6]
            if not description_cell.value:
                continue
            if idx >= len(processed_data):
                print(f"Warning: Not enough processed data for row {category_cell.row}")
                continue
            category_cell.value = processed_data[idx][1]
            idx += 1
        return workbook
    except Exception as e:
        raise gr.Error(f"Error updating Excel: {str(e)}") from e


def _to_row_number(value, label):
    """Coerce a numeric form input to an ``int`` row number or raise gr.Error."""
    try:
        return int(value)
    except (TypeError, ValueError) as e:
        raise gr.Error(f"{label} must be a whole number") from e


def process_files(excel1, excel2, min_row1, max_row1, min_row2, max_row2, sheetname):
    """Run the whole pipeline and return the path of the updated workbook.

    Reads the reference statements from ``excel1``, reads the rows to
    complete from sheet ``sheetname`` of ``excel2``, resolves the missing
    categories, writes them back to that same sheet and row range, and saves
    the result to a new temporary ``.xlsx`` file.

    Args:
        excel1: Source workbook with categories and diagnostic statements.
        excel2: Workbook whose category column is to be filled.
        min_row1: First row to read from ``excel1``.
        max_row1: Last row to read from ``excel1``.
        min_row2: First row to read from / write to ``excel2``.
        max_row2: Last row to read from / write to ``excel2``.
        sheetname: Worksheet of ``excel2`` to process.

    Returns:
        Path of the temporary file holding the updated workbook.

    Raises:
        gr.Error: If any stage fails; the message names the failing stage.
    """
    try:
        gr.Info("Starting processing...")

        # The number inputs may arrive as floats (TODO confirm for the Gradio
        # version in use); openpyxl needs integer row bounds.
        min_row1 = _to_row_number(min_row1, "Min Row for Excel 1")
        max_row1 = _to_row_number(max_row1, "Max Row for Excel 1")
        min_row2 = _to_row_number(min_row2, "Min Row for Excel 2")
        max_row2 = _to_row_number(max_row2, "Max Row for Excel 2")

        # Process Excel 1
        gr.Info("Processing Excel 1...")
        csv1_data = filter_excel1(excel1, min_row1, max_row1)

        # Process Excel 2
        gr.Info("Processing Excel 2...")
        csv2_data = filter_excel2(excel2, min_row2, max_row2, sheetname)

        # Process data
        gr.Info("Running similarity search...")
        processed_data = process_data(csv1_data, csv2_data)

        # Update Excel 2 on the same sheet and row range that was read, so
        # every category is written next to the description it belongs to.
        gr.Info("Updating Excel file...")
        updated_excel = update_excel(
            excel2,
            processed_data[1:],
            sheetname=sheetname,
            min_row=min_row2,
            max_row=max_row2,
        )

        # Save the updated Excel file.  The temporary file is only used to
        # reserve a name; it is closed before saving because a still-open
        # NamedTemporaryFile cannot be reopened by name on Windows.
        gr.Info("Saving updated Excel file...")
        with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp:
            output_path = tmp.name
        updated_excel.save(output_path)

        gr.Info("Processing complete!")
        return output_path
    except gr.Error:
        # Re-raise Gradio errors unchanged so they are displayed in the interface
        raise
    except Exception as e:
        # Catch any other unexpected errors
        raise gr.Error(f"An unexpected error occurred: {str(e)}") from e
142
+
143
# Gradio interface: the input components are listed in the same order as the
# parameters of process_files.
_input_components = [
    gr.File(label="Upload Source Excel (Excel 1)"),
    gr.File(label="Upload Excel to be Filled (Excel 2)"),
    gr.Number(label="Min Row for Excel 1", value=2),
    gr.Number(label="Max Row for Excel 1", value=1000),
    gr.Number(label="Min Row for Excel 2", value=2),
    gr.Number(label="Max Row for Excel 2", value=3009),
    gr.Textbox(label="Sheet Name for Excel 2"),
]
_output_component = gr.File(label="Download Updated Excel")

iface = gr.Interface(
    fn=process_files,
    inputs=_input_components,
    outputs=_output_component,
    title="Excel Processor",
    description="Upload two Excel files, specify row ranges, and download the processed Excel file.",
)

# Start the web UI.
iface.launch()