Spaces:
Sleeping
Sleeping
import logging | |
from enum import Enum | |
def configure_logging(level=logging.INFO, config_file_path='./common.log'):
    """Configure the root logger to write to the console and to a log file.

    Both handlers share a single formatter, so console and file records have
    the same layout.

    Args:
        level: Threshold applied to the root logger and to both handlers.
        config_file_path: Path of the log file; records are appended to it.
    """
    # The date format has to live on the Formatter itself: ``basicConfig``
    # applies its ``format``/``datefmt`` arguments only to handlers that do
    # not have a formatter yet. Without ``datefmt`` here, the default asctime
    # (which already ends in milliseconds) would be followed by a second
    # ``.%(msecs)03d`` group.
    formatter = logging.Formatter(
        fmt="[%(asctime)s.%(msecs)03d] %(module)30s:%(lineno)4d %(levelname)-7s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    console_handler = logging.StreamHandler()
    console_handler.setLevel(level)
    console_handler.setFormatter(formatter)

    # ``basicConfig`` raises ValueError when ``filename`` is combined with
    # ``handlers``, so file output gets an explicit handler instead.
    # ``delay=True`` postpones opening the file until the first record is
    # emitted, so no file handle is left open when ``basicConfig`` does
    # nothing (it is a no-op if the root logger already has handlers).
    file_handler = logging.FileHandler(config_file_path, mode="a", delay=True)
    file_handler.setLevel(level)
    file_handler.setFormatter(formatter)

    logging.basicConfig(level=level, handlers=[console_handler, file_handler])
def get_elastic_query(query):
    """Build a fuzzy ``multi_match`` search body over the ``text`` field.

    Args:
        query: Search phrase; it is interpolated into a string.

    Returns:
        dict: Request body with a single ``multi_match`` clause using
        ``AUTO`` fuzziness and the ``russian`` analyzer.
    """
    text_match = {
        "query": f"{query}",
        "fields": ["text"],
        "fuzziness": "AUTO",
        "analyzer": "russian",
    }
    return {"query": {"multi_match": text_match}}
def get_elastic_people_query(query):
    """Build the people-search request body.

    The body is a ``bool``/``should`` query with four fuzzy ``multi_match``
    clauses: one on ``person_name`` and three nested ones (business
    processes, organisational structure position, business curator company).
    The curator clause is boosted by 30 when the lower-cased query contains
    one of the curator markers, and by 15 otherwise.

    Args:
        query: Search phrase; must support ``.lower()``.

    Returns:
        dict: Request body with ``query`` and a ``min_score`` of 13.0.
    """
    lowered = query.lower()
    curator_markers = ("бизнес куратор", "бизнес-куратор", "куратор")
    if any(marker in lowered for marker in curator_markers):
        curator_boost = 30
    else:
        curator_boost = 15

    def fuzzy_match(fields):
        # Every clause shares the same phrase, fuzziness and analyzer.
        return {
            "multi_match": {
                "query": f"{query}",
                "fields": fields,
                "fuzziness": "AUTO",
                "analyzer": "standard",
            }
        }

    def nested_match(path, fields):
        return {"nested": {"path": path, "query": fuzzy_match(fields)}}

    # NOTE(review): "organizatinal_structure" is reproduced exactly as in the
    # original; presumably it matches the index mapping — confirm before
    # correcting the spelling.
    should_clauses = [
        fuzzy_match(["person_name^3"]),
        nested_match(
            "business_processes",
            [
                "business_processes.production_activities_section",
                "business_processes.processes_name",
            ],
        ),
        nested_match(
            "organizatinal_structure",
            ["organizatinal_structure.position^2"],
        ),
        nested_match(
            "business_curator",
            [f"business_curator.company_name^{curator_boost}"],
        ),
    ]
    return {
        "query": {"bool": {"should": should_clauses}},
        "min_score": 13.0,
    }
def get_elastic_group_query(query):
    """Build the group-search request body over ``group_name``.

    Two ``should`` clauses are combined: a fuzzy match of the caller's phrase
    and a fixed-phrase match with a 0.1 boost.

    Args:
        query: Search phrase; it is interpolated into a string.

    Returns:
        dict: Request body with ``query`` and a ``min_score`` of 7.5.
    """
    phrase_clause = {
        "multi_match": {
            "query": f"{query}",
            "fields": ["group_name"],
            "fuzziness": "AUTO",
            "analyzer": "standard",
        }
    }
    # NOTE(review): presumably this low-boost clause adjusts scoring for the
    # wording shared by many group names — confirm the intent.
    fixed_wording_clause = {
        "multi_match": {
            "query": "персонального состава Персональный состав Комитета ПАО ГМК Норильский никель Рабочей группы",
            "fields": ["group_name"],
            "operator": "or",
            "boost": 0.1,
        }
    }
    bool_query = {"should": [phrase_clause, fixed_wording_clause]}
    return {"query": {"bool": bool_query}, "min_score": 7.5}
def get_elastic_rocks_nn_query(query):
    """Build a ``function_score`` request body over division/company names.

    The inner query is a fuzzy ``multi_match`` on ``division_name``,
    ``division_name_2`` and ``company_name`` with ``custom_analyzer``. One
    scoring function applies a weight of 0.5 to the document whose ``_id`` is
    ``"3"``, combined with the query score via ``boost_mode: multiply``.

    Args:
        query: Search phrase; it is interpolated into a string.

    Returns:
        dict: Request body with ``query`` and a ``min_score`` of 0.5.
    """
    name_match = {
        "multi_match": {
            "query": f"{query}",
            "fields": ["division_name", "division_name_2", "company_name"],
            "fuzziness": "AUTO",
            "analyzer": "custom_analyzer",
        }
    }
    # NOTE(review): why document "3" is down-weighted is not visible here.
    down_weight_doc = {"filter": {"term": {"_id": "3"}}, "weight": 0.5}
    scored = {
        "query": name_match,
        "functions": [down_weight_doc],
        "boost_mode": "multiply",
    }
    return {"query": {"function_score": scored}, "min_score": 0.5}
def get_elastic_segmentation_query(query):
    """Build the segmentation-model search request body.

    Two ``should`` clauses are combined: a fuzzy ``russian``-analyzed match of
    the caller's phrase on the segmentation and company fields, and a
    fixed-phrase match on the segmentation fields with a 0.1 boost.

    Args:
        query: Search phrase; it is interpolated into a string.

    Returns:
        dict: Request body with ``query`` and a ``min_score`` of 1.0.
    """
    segmentation_fields = ["segmentation_model", "segmentation_model2"]
    phrase_clause = {
        "multi_match": {
            "query": f"{query}",
            "fields": segmentation_fields + ["company_name"],
            "fuzziness": "AUTO",
            "analyzer": "russian",
        }
    }
    fixed_wording_clause = {
        "multi_match": {
            "query": "модели сегментации модель сегментации",
            # Copy so the two clauses never share one list object.
            "fields": list(segmentation_fields),
            "operator": "or",
            "boost": 0.1,
        }
    }
    return {
        "query": {"bool": {"should": [phrase_clause, fixed_wording_clause]}},
        "min_score": 1.0,
    }
def get_elastic_abbreviation_query(query):
    """Build a fuzzy ``multi_match`` search body for abbreviations.

    Args:
        query: Search phrase; it is interpolated into a string.

    Returns:
        dict: Request body with one ``multi_match`` clause on the ``text``
        field, using ``AUTO`` fuzziness and the ``russian`` analyzer.
    """
    body = {"query": {"multi_match": {}}}
    clause = body["query"]["multi_match"]
    # Keys are inserted in the same order as the original literal.
    clause["query"] = f"{query}"
    clause["fuzziness"] = "AUTO"
    clause["fields"] = ["text"]
    clause["analyzer"] = "russian"
    return body
def combine_answer(answer):
    """Group answer chunks by their source document.

    Args:
        answer: Mapping of search name to a mapping of result key to chunk
            dict. Each chunk dict must provide ``doc_name``, ``title`` and
            ``index_answer``. The ``people_search`` entry is skipped.

    Returns:
        list[dict]: One ``{"filename", "title", "chunks"}`` dict per document,
        in the order documents were first encountered. ``title`` comes from
        the first chunk seen for that document.

    Raises:
        KeyError: If a chunk lacks ``doc_name``, ``title`` or ``index_answer``.
    """
    combined = {}
    # Assumes ``index_answer`` values are hashable (e.g. ints) — TODO confirm.
    seen_indexes = set()
    for search_name, results in answer.items():
        if search_name == 'people_search':
            continue
        for chunk in results.values():
            filename = chunk["doc_name"]
            title = chunk["title"]
            index = chunk['index_answer']
            if filename not in combined:
                combined[filename] = {
                    "filename": filename,
                    "title": title,
                    "chunks": [chunk],
                }
            elif index not in seen_indexes:
                combined[filename]["chunks"].append(chunk)
            # Otherwise the chunk repeats an index already seen for a known
            # document: skip it. Re-creating the entry here (as the previous
            # code did) would throw away the chunks gathered so far.
            seen_indexes.add(index)
    return list(combined.values())
class TypeQuestion(Enum):
    """Closed set of question-type tags.

    Each member's value is a bracketed tag string: "[1]", "[2]" or "[3]".
    NOTE(review): what each type denotes is not visible in this module —
    confirm against the code that consumes these tags.
    """

    TYPE_ONE = "[1]"
    TYPE_TWO = "[2]"
    TYPE_THREE = "[3]"
def get_source_format(filename: str) -> str:
    """Return the upper-cased text after the last dot of *filename*.

    A name that contains no dot is returned upper-cased in full, and a name
    ending in a dot yields an empty string.
    """
    _, _, suffix = filename.rpartition('.')
    return suffix.upper()