# insight-finder/src/services/processor.py
from src.services.utils import tech_to_dict, stem, set_gemini
import requests as r
import json
import nltk
import itertools
import numpy as np
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def retrieve_constraints(prompt):
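    """Send the prompt to the hosted LLM endpoint and parse the JSON object embedded in its reply."""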
request_input = {"models": ["meta-llama/llama-4-scout-17b-16e-instruct"], "messages": [{"role":"user", "content":prompt}]}
response = r.post("https://organizedprogrammers-bettergroqinterface.hf.space/chat", json=request_input)
decoded_content = json.loads(response.content.decode())
llm_response = decoded_content["content"]
print(f"llm response : {llm_response}")
start_marker = '{'
end_marker = '}'
start_index = llm_response.find(start_marker) + len(start_marker)
end_index = llm_response.find(end_marker, start_index)
json_str = llm_response[start_index:end_index].strip()
constraints_json = json.loads("{"+json_str+"}")
print(f"Whats returned : {constraints_json}")
return constraints_json
def preprocess_tech_data(_df):
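    """Filter technology descriptions, stem them, normalize key_components, and return (processed, keys, originals)."""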
if _df is None or "description" not in _df.columns:
        return [], [], []
technologies_list = _df["description"].to_list()
tech_dict_raw = tech_to_dict(technologies_list)
tech_dict_filtered = [
t for t in tech_dict_raw if (
len(t.get("title", "")) >= 5 and
len(t.get("advantages", "")) >= 5 and
len(t.get("key_components", "")) >= 5
)
]
if not tech_dict_filtered:
        return [], [], []
processed_tech_wt = stem(tech_dict_filtered,"technologies")
for t_item_wt in processed_tech_wt:
kc = t_item_wt.get("key_components")
if isinstance(kc, str):
t_item_wt["key_components"] = ''.join(nltk.sent_tokenize(kc))
else:
t_item_wt["key_components"] = ""
original_tech_for_display = tech_dict_filtered[:len(processed_tech_wt)]
_keys = list(processed_tech_wt[0].keys()) if processed_tech_wt else []
return processed_tech_wt, _keys, original_tech_for_display
def remove_over_repeated_technologies(result):
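    """Drop technologies whose repeat count across the problem lists exceeds 30% of the number of lists."""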
total_lists = len(result)
tech_title = {}
    for item in result:
        for tech in item['technologies']:
            title = tech[0]['title']
            # Count repeat occurrences (the first appearance counts as 0).
            tech_title[title] = tech_title.get(title, -1) + 1
threshold = total_lists * 0.3
    print(f"Repetition threshold: {threshold}")
    print(f"Technology repeat counts: {tech_title}")
to_delete = []
for tech, lists in tech_title.items():
if lists > threshold:
print(f"This technology have been found over repeated : " + tech)
to_delete.append(tech)
for idx, item in enumerate(result):
result[idx]['technologies'] = [tech for tech in item['technologies'] if tech[0]['title'] not in to_delete]
return result
def get_contrastive_similarities(constraints, pre_encoded_tech_data, pre_encoded_tech_embeddings):
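    """Encode the constraint descriptions and score each one against the pre-encoded technology embeddings, returning (selected pairs, similarity matrix)."""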
selected_pairs = []
matrix = []
constraint_descriptions = [c["description"] for c in constraints]
constraint_embeddings = model.encode(constraint_descriptions, show_progress_bar=False)
for i, constraint in enumerate(constraints):
constraint_embedding = constraint_embeddings[i]
constraint_matrix = []
for j, tech2 in enumerate(pre_encoded_tech_data):
tech_embedding = pre_encoded_tech_embeddings[j]
            purpose_sim = float(model.similarity(constraint_embedding, tech_embedding))
            if np.isnan(purpose_sim):
                purpose_sim = 0.0
selected_pairs.append({
"constraint": constraint,
"id2": tech2["id"],
"similarity": purpose_sim
})
constraint_matrix.append(purpose_sim)
matrix.append(constraint_matrix)
return selected_pairs, matrix
def find_best_list_combinations(list1: list, list2: list, matrix) -> list[dict]:
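    """For each element of list1, keep the top-5 list2 matches whose similarity lies within [MIN_SIMILARITY, MAX_SIMILARITY], then prune over-repeated technologies."""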
if not list1 or not list2:
print("Warning: One or both input lists are empty. Returning an empty list.")
return []
MIN_SIMILARITY = 0.3
MAX_SIMILARITY = 0.8
possible_matches_for_each_l1 = []
for i in range(len(list1)):
valid_matches_for_l1_element = []
for j in range(len(list2)):
score = matrix[i][j]
if MIN_SIMILARITY <= score <= MAX_SIMILARITY:
valid_matches_for_l1_element.append((list2[j], score))
if not valid_matches_for_l1_element:
print(f"No valid matches found in list2 for '{list1[i]}' from list1 "
f"(score between {MIN_SIMILARITY} and {MAX_SIMILARITY}). "
"Returning an empty list as no complete combinations can be formed.")
else:
possible_matches_for_each_l1.append((valid_matches_for_l1_element, list1[i]))
result = []
for tech_list, problem in possible_matches_for_each_l1:
sorted_list = sorted(
tech_list,
key=lambda x: x[1].item() if hasattr(x[1], 'item') else float(x[1]),
reverse=True
)
top5 = sorted_list[:5]
result.append({
'technologies': top5,
'problem': problem
})
result = remove_over_repeated_technologies(result)
return result
def select_technologies(problem_technology_list):
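    """Exhaustively search for the smallest technology set that covers every problem, breaking ties by the highest average similarity."""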
distinct_techs = set()
candidate_map = []
for problem_data in problem_technology_list:
cand_dict = {}
for tech_info, sim in problem_data['technologies']:
tech_id = tech_info['id']
distinct_techs.add(tech_id)
cand_dict[tech_id] = float(sim)
candidate_map.append(cand_dict)
distinct_techs = sorted(list(distinct_techs))
n = len(problem_technology_list)
if n == 0:
return set()
min_k = None
best_set = None
best_avg = -1
print(f"Distinct technologies: {distinct_techs}")
print(f"Candidate map: {candidate_map}")
print(f"Number of problems: {n}")
for k in range(1, len(distinct_techs)+1):
if min_k is not None and k > min_k:
break
for T in itertools.combinations(distinct_techs, k):
total_sim = 0.0
covered = True
for i in range(n):
max_sim = -1.0
found = False
for tech in T:
if tech in candidate_map[i]:
found = True
sim_val = candidate_map[i][tech]
if sim_val > max_sim:
max_sim = sim_val
if not found:
covered = False
break
else:
total_sim += max_sim
if covered:
avg_sim = total_sim / n
if min_k is None or k < min_k:
min_k = k
best_set = T
best_avg = avg_sim
elif k == min_k and avg_sim > best_avg:
best_set = T
best_avg = avg_sim
if min_k is not None and k == min_k:
break
if best_set is None:
return set()
return set(best_set)
def load_titles(techno, data_type):
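    """Extract technology titles from either a list of pydantic objects or a dict with a "technologies" list."""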
if data_type == "pydantic":
technology_titles = [tech.title for tech in techno]
else: # data_type == "dict"
technologies = techno["technologies"]
technology_titles = [tech["title"] for tech in technologies]
return technology_titles
def search_prior_art(technologies_input: list, data: str, data_type: str, techno_type: str):
"""
Searches for prior art patents online that solve a given technical problem
using a set of specified technologies, leveraging the Gemini model's search capabilities.
"""
technology_titles = load_titles(technologies_input, techno_type)
if data_type == "problem":
prompt = f"Find prior art patents or research paper online that address the technical problem: '{data}'. " \
elif data_type == "constraints":
prompt = f"Find prior art patents or research paper online that address those constraints: '{data}'. " \
prompt += f"Using any combination of the following technologies: {', '.join(technology_titles)}. " \
f"Specifically look for patents that integrate multiple of these technologies. " \
f"Indicate for each document found what technologies is used inside of it from the provided list"
    client, config = set_gemini()
response = client.models.generate_content(
model="gemini-2.5-flash",
contents=prompt,
config=config,
)
return response
def add_citations_and_collect_uris(response):
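    """Insert numbered citation links from the Gemini grounding metadata into the response text and collect the cited URIs."""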
try:
print(response)
text = response.text
supports = response.candidates[0].grounding_metadata.grounding_supports
chunks = response.candidates[0].grounding_metadata.grounding_chunks
sorted_supports = sorted(supports, key=lambda s: s.segment.end_index, reverse=True)
uris_added = set()
for support in sorted_supports:
end_index = support.segment.end_index
if support.grounding_chunk_indices:
citation_links = []
for i in support.grounding_chunk_indices:
if i < len(chunks):
uri = chunks[i].web.uri
# Add URI only if not already in text or collected
if uri not in text and uri not in uris_added:
citation_links.append(f"[{i + 1}]({uri})")
uris_added.add(uri)
if citation_links:
citation_string = ", ".join(citation_links)
text = text[:end_index] + citation_string + text[end_index:]
return {"content": text,"uris": list(uris_added)}
except Exception as e:
print(f"Error : {e}")
return {"content": e, "uris": []}