mtDNALocation / NER /word2Vec /heuristic.py
VyLala's picture
update new codes
7fc87fe verified
raw
history blame
2.06 kB
import logging
from datetime import datetime
class HeuristicManager:
    """Decide, from a few simple heuristics, whether a model should be retrained.

    Three independent checks are combined by :meth:`should_retrain`:

    * :meth:`check_similarity` -- probe terms that are out of vocabulary or
      whose best neighbour similarity is below a threshold.
    * :meth:`check_metadata` -- keyword scan of the metadata.
    * :meth:`check_new_data_volume` -- whether enough new data is available.

    The first two produce reasons *to* retrain; the last one is a gate that
    must pass before retraining is considered worthwhile.
    """

    def __init__(self, model, log_file="heuristic_log.txt", min_similarity_threshold=0.5, min_new_data_len=50):
        """Store the configuration and set up file logging.

        Args:
            model: Object exposing ``wv.most_similar(term)``
                (presumably a gensim Word2Vec model -- confirm with caller).
            log_file: Path of the file passed to ``logging.basicConfig``.
            min_similarity_threshold: A term whose top neighbour similarity is
                below this value is reported by :meth:`check_similarity`.
            min_new_data_len: Minimum ``len(new_data)`` required before
                retraining is considered justified.
        """
        self.model = model
        self.min_similarity_threshold = min_similarity_threshold
        self.min_new_data_len = min_new_data_len
        self.log_file = log_file
        # NOTE(review): basicConfig configures the *root* logger and does
        # nothing if the root logger already has handlers, so ``log_file`` is
        # only honoured for the first configuration in the process.
        logging.basicConfig(filename=self.log_file, level=logging.INFO)

    def log(self, message: str) -> None:
        """Write a timestamped *message* to the logging system and to stdout."""
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        line = f"[{timestamp}] {message}"
        logging.info(line)
        print(line)

    def check_similarity(self, test_terms) -> list:
        """Return one message per probe term that looks poorly modelled.

        A term is reported when it is missing from the vocabulary (the model
        raises ``KeyError``) or when the similarity of its closest neighbour
        is below ``min_similarity_threshold``.
        """
        triggers = []
        for term in test_terms:
            try:
                # most_similar returns (word, similarity) pairs; take the
                # similarity score of the first (best) entry.
                sim = self.model.wv.most_similar(term)[0][1]
            except KeyError:
                triggers.append(f"'{term}' not in vocabulary")
                continue
            if sim < self.min_similarity_threshold:
                triggers.append(f"Low similarity for '{term}': {sim}")
        return triggers

    def check_metadata(self, metadata) -> list:
        """Return a trigger message if *metadata* mentions a watched keyword.

        The metadata is stringified and lower-cased, then scanned for the
        substrings ``"haplogroup b"``, ``"eastasia"`` and ``"asian"``.
        """
        # NOTE(review): this is plain substring matching, so e.g. "caucasian"
        # also matches the "asian" keyword -- confirm whether that is intended.
        keywords = ("haplogroup b", "eastasia", "asian")
        text = str(metadata).lower()
        if any(keyword in text for keyword in keywords):
            return ["Detected new haplogroup or regional bias: 'Asian' or 'B'"]
        return []

    def check_new_data_volume(self, new_data) -> list:
        """Return a message if *new_data* is too small to justify retraining.

        Returns:
            ``["Not enough new data to justify retraining"]`` when
            ``len(new_data) < min_new_data_len``, otherwise an empty list.
        """
        if len(new_data) < self.min_new_data_len:
            return ["Not enough new data to justify retraining"]
        return []

    def should_retrain(self, test_terms, new_data, metadata) -> bool:
        """Run all heuristics, log the outcome and return the decision.

        Retraining is recommended only when at least one similarity or
        metadata trigger fired AND there is enough new data. Insufficient
        data is a reason *not* to retrain, so it vetoes the decision rather
        than counting as a trigger.

        Returns:
            ``True`` if retraining should happen, ``False`` otherwise.
        """
        triggers = self.check_similarity(test_terms) + self.check_metadata(metadata)
        blockers = self.check_new_data_volume(new_data)

        if triggers and not blockers:
            self.log("Retraining triggered due to:")
            for trigger in triggers:
                self.log(f" - {trigger}")
            return True

        self.log("No retraining needed.")
        # Surface why retraining was held back, if the volume gate failed.
        for blocker in blockers:
            self.log(f" - {blocker}")
        return False