heymenn committed on
Commit cd97f48 · verified · 1 Parent(s): 06cf435

Delete app

app/__init__.py DELETED
File without changes
app/core.py DELETED
@@ -1,17 +0,0 @@
-from app.services.utils import *
-from app.services.processor import *
-
-def process_input(data):
-    prompt = set_prompt(data)
-    constraints = retrieve_constraints(prompt)
-    constraints_stemmed = stem(constraints, "constraints")
-    save_dataframe(constraints_stemmed, "constraints_stemmed.xlsx")
-    df = load_technologies()
-    global_tech, keys, original_tech = preprocess_tech_data(df)
-    save_dataframe(global_tech, "global_tech.xlsx")
-    result_similarities, matrix = get_contrastive_similarities(global_tech, constraints_stemmed)
-    save_to_pickle(result_similarities)
-    best_combinations = find_best_list_combinations(constraints_stemmed, global_tech, matrix)
-    best_technologies_id = select_technologies(best_combinations)
-    best_technologies = get_technologies_by_id(best_technologies_id, global_tech)
-    return best_technologies
app/services/processor.py DELETED
@@ -1,217 +0,0 @@
-from app.services.utils import tech_to_dict, stem
-import requests as r
-import json
-import nltk
-import itertools
-import numpy as np
-
-from sentence_transformers import *
-model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-
-def retrieve_constraints(prompt):
-    request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role":"user", "content":prompt}]}
-    response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
-
-    decoded_content = json.loads(response.content.decode())
-    llm_response = decoded_content["content"][0]["message"]["content"]
-
-    start_marker = '{'
-    end_marker = '}'
-    start_index = llm_response.find(start_marker) + len(start_marker)
-    end_index = llm_response.find(end_marker, start_index)
-    json_str = llm_response[start_index:end_index].strip()
-
-    constraints_json = json.loads("{"+json_str+"}")
-
-    return constraints_json
-
-
-def preprocess_tech_data(_df):
-    if _df is None or "description" not in _df.columns:
-        return [], []
-
-    technologies_list = _df["description"].to_list()
-    tech_dict_raw = tech_to_dict(technologies_list)
-
-    tech_dict_filtered = [
-        t for t in tech_dict_raw if (
-            len(t.get("title", "")) >= 5 and
-            len(t.get("advantages", "")) >= 5 and
-            len(t.get("key_components", "")) >= 5
-        )
-    ]
-
-    if not tech_dict_filtered:
-        return [], []
-
-    processed_tech_wt = stem(tech_dict_filtered,"technologies")
-
-    for t_item_wt in processed_tech_wt:
-        kc = t_item_wt.get("key_components")
-        if isinstance(kc, str):
-            t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
-        else:
-            t_item_wt["key_components"] = ""
-
-    original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
-
-
-    _keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
-    return processed_tech_wt, _keys, original_tech_for_display
-
-
-def remove_over_repeated_technologies(result):
-    total_lists = len(result)
-    tech_title = {}
-
-    for idx, item in enumerate(result):
-        for tech in item['technologies']:
-            tech_title[tech[0]['title']] = 0 if tech[0]['title'] not in tech_title else tech_title[tech[0]['title']] + 1
-
-    threshold = total_lists * 0.3
-    print(threshold)
-    print(tech_title)
-    to_delete = []
-    for tech, lists in tech_title.items():
-        if lists > threshold:
-            print(f"This technology have been found over repeated : " + tech)
-            to_delete.append(tech)
-
-    for idx, item in enumerate(result):
-        result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
-
-    return result
-
-def get_contrastive_similarities(global_tech, constraints):
-    selected_pairs = []
-    matrix = []
-
-    for i, constraint in enumerate(constraints):
-        print(constraint)
-        for j, tech2 in enumerate(global_tech):
-            if i >= j:
-                continue
-
-            purpose_sim = model.similarity(model.encode(constraint["description"]), model.encode(tech2["purpose"]))
-
-            print(f"Constraint: {constraint}, Tech 2: {tech2['title']}")
-            print(f"Purpose Similarity: {purpose_sim}")
-            selected_pairs.append({
-                "constraint": constraint,
-                "id2": tech2["id"],
-                "similarity": purpose_sim
-            })
-            if purpose_sim == np.float32(None):
-                purpose_sim = 0.0
-            matrix.append(purpose_sim)
-
-    return selected_pairs,matrix
-
-
-def find_best_list_combinations(list1: list[str], list2: list[str], matrix) -> list[dict]:
-    if not list1 or not list2:
-        print("Warning: One or both input lists are empty. Returning an empty list.")
-        return []
-
-    MIN_SIMILARITY = 0.3
-    MAX_SIMILARITY = 0.8
-
-    possible_matches_for_each_l1 = []
-    for i in range(len(list1)):
-        valid_matches_for_l1_element = []
-        for j in range(len(list2)):
-            score = matrix[i, j]
-
-            if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
-                valid_matches_for_l1_element.append((list2[j], score))
-
-        if not valid_matches_for_l1_element:
-            print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
-                  f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
-                  "Returning an empty list as no complete combinations can be formed.")
-
-        else:
-            possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
-
-    result = []
-    for tech_list, problem in possible_matches_for_each_l1:
-        sorted_list = sorted(
-            tech_list,
-            key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
-            reverse=True
-        )
-        top5 = sorted_list[:5]
-        result.append({
-            'technologies': top5,
-            'problem': problem
-        })
-
-    result = remove_over_repeated_technologies(result)
-    return result
-
-
-def select_technologies(problem_technology_list):
-    distinct_techs = set()
-    candidate_map = []
-
-    for problem_data in problem_technology_list:
-        cand_dict = {}
-        for tech_info, sim in problem_data['technologies']:
-            tech_id = tech_info['id']
-            distinct_techs.add(tech_id)
-            cand_dict[tech_id] = float(sim)
-        candidate_map.append(cand_dict)
-
-    distinct_techs = sorted(list(distinct_techs))
-    n = len(problem_technology_list)
-
-    if n == 0:
-        return set()
-
-    min_k = None
-    best_set = None
-    best_avg = -1
-
-    print(f"Distinct technologies: {distinct_techs}")
-    print(f"Candidate map: {candidate_map}")
-    print(f"Number of problems: {n}")
-
-    for k in range(1, len(distinct_techs)+1):
-        if min_k is not None and k > min_k:
-            break
-
-        for T in itertools.combinations(distinct_techs, k):
-            total_sim = 0.0
-            covered = True
-            print(f"Trying combination: {T}")
-            for i in range(n):
-                max_sim = -1.0
-                found = False
-                for tech in T:
-                    if tech in candidate_map[i]:
-                        found = True
-                        sim_val = candidate_map[i][tech]
-                        if sim_val > max_sim:
-                            max_sim = sim_val
-                if not found:
-                    covered = False
-                    break
-                else:
-                    total_sim += max_sim
-
-            if covered:
-                avg_sim = total_sim / n
-                if min_k is None or k < min_k:
-                    min_k = k
-                    best_set = T
-                    best_avg = avg_sim
-                elif k == min_k and avg_sim > best_avg:
-                    best_set = T
-                    best_avg = avg_sim
-
-        if min_k is not None and k == min_k:
-            break
-
-    if best_set is None:
-        return set()
-    return set(best_set)
app/services/technologies_database.xlsx DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:370d7a151085850b5fb7a6f9de41313e83686e4da434b6e8be94da38838c1ef7
-size 213138
app/services/utils.py DELETED
@@ -1,110 +0,0 @@
-import pickle
-import numpy as np
-import pandas as pd
-
-import nltk
-from nltk.stem import *
-nltk.download("punkt_tab")
-
-
-def set_prompt(InputData):
-    prompt = """Task : Find all the constraints in this technical problem making sure each are premised on the problem only.
-    Take into account different technical domains to encompass the whole problem.
-    Output each constraints in a json such as : ({"title of the constraints1":"description1","title of the constraintsN":"descriptionN"})
-    Technical problem :
-    """ + InputData['problem']
-    return prompt
-
-def load_technologies():
-    df = pd.read_excel('technologies_database.xlsx')
-    return df
-
-def tech_to_dict(technologies):
-    tech_dict = []
-    for index, tech in enumerate(technologies):
-        if not tech.find("<title>") > 1:
-            tab = tech.split("\n")
-            tab.pop(0)
-            tab.pop(len(tab)-1)
-            tech_dict.append({"title": tab[0][tab[0].find(": ")+2:],
-                              "purpose": tab[1][tab[1].find(": ")+2:],
-                              "key_components": tab[2][tab[2].find(": ")+2:],
-                              "advantages": tab[3][tab[3].find(": ")+2:],
-                              "limitations": tab[4][tab[4].find(": ")+2:],
-                              "id": index})
-    return tech_dict
-
-def save_dataframe(df, title):
-    pd.DataFrame(df).to_excel(title)
-    return title
-
-def stem(data,data_type):
-    stemmer = SnowballStemmer("english")
-    processed_data = []
-    if data_type == "technologies":
-        for t_item in data:
-            processed_data.append({
-                "title": stemmer.stem(t_item["title"]),
-                "purpose": stemmer.stem(t_item["purpose"]),
-                "key_components": stemmer.stem(t_item["key_components"]),
-                "advantages": stemmer.stem(t_item["advantages"]),
-                "limitations": stemmer.stem(t_item["limitations"]),
-                "id": t_item["id"]
-            })
-    else:
-        for t_item in data:
-            print(t_item)
-            processed_data.append({
-                "title": stemmer.stem(t_item),
-                "description": stemmer.stem(data[t_item])
-            })
-
-    return processed_data
-
-
-def get_technologies_by_id(id_list, technologies):
-    result = []
-    id_set = set(id_list)
-    for tech in technologies:
-        if tech.get('id') in id_set:
-            result.append(tech)
-    return result
-
-def save_to_pickle(result_similarites):
-
-    constraint_titles = sorted(list(set([item['constraint']['title'] for item in result_similarites])))
-    max_id2 = max([item['id2'] for item in result_similarites])
-
-    row_label_to_index = {title: i for i, title in enumerate(constraint_titles)}
-    col_labels = list(range(1, max_id2 + 1))
-
-    num_rows = len(constraint_titles)
-    num_cols = max_id2
-
-    matrix = np.full((num_rows, num_cols), np.nan, dtype=np.float32)
-
-    for item in result_similarites:
-        row_idx = row_label_to_index[item['constraint']['title']]
-        col_idx = item['id2'] - 1 #
-        similarity_value = item['similarity'].item()
-
-        matrix[row_idx, col_idx] = similarity_value
-
-    print(f"Successfully created matrix with shape: {matrix.shape}")
-    print(f"Number of rows (unique constraints): {num_rows}")
-    print(f"Number of columns (max id2): {num_cols}")
-    print("\nExample 5x5 block of the created matrix (NaN for missing values):")
-    print(matrix[:5, :5])
-
-    output_filename = "cosine_similarity_matrix_with_labels.pkl"
-    data_to_save = {
-        'matrix': matrix,
-        'row_labels': constraint_titles,
-        'col_labels': col_labels
-    }
-
-    with open(output_filename, 'wb') as f:
-        pickle.dump(data_to_save, f)
-
-    print(f"\nMatrix and labels saved to {output_filename}")
-    return output_filename