SciPIP / src /utils /hash.py
lihuigu
init commit
e17c9f2
raw
history blame
2.37 kB
import re
import os
import hashlib
import struct
from collections import Counter
def check_env():
    """Ensure every required environment variable is present and non-empty.

    The variables are checked in the order listed below; the first one that
    is missing from the environment or set to an empty string is reported.

    Raises:
        ValueError: If a required variable is unset or empty.
    """
    required_names = (
        "NEO4J_URL",
        "NEO4J_USERNAME",
        "NEO4J_PASSWD",
        "MODEL_NAME",
        "MODEL_TYPE",
        "MODEL_API_KEY",
        "BASE_URL",
    )
    # `get` yields None for a missing variable and "" for an empty one;
    # both are falsy, so one truthiness test covers the two failure cases.
    first_unset = next(
        (name for name in required_names if not os.environ.get(name)),
        None,
    )
    if first_unset is not None:
        raise ValueError(f"{first_unset} is not set...")
def generate_hash_id(input_string):
    """Derive a non-negative integer ID from a string, ignoring letter case.

    The string is lower-cased, UTF-8 encoded and hashed with SHA-256. The
    first 8 bytes of the digest are interpreted as a big-endian signed
    64-bit integer, and the absolute value of that integer is returned.

    Args:
        input_string: Text to hash, or ``None``.

    Returns:
        The integer ID, or ``None`` when ``input_string`` is ``None``.
    """
    if input_string is None:
        return None
    normalized = input_string.lower().encode("utf-8")
    leading_bytes = hashlib.sha256(normalized).digest()[:8]
    signed_value = int.from_bytes(leading_bytes, byteorder="big", signed=True)
    return abs(signed_value)
def extract_ref_id(text, references):
    """Turn the bracketed numeric citations in ``text`` into reference hash IDs.

    Citations are bracket groups such as ``[5]`` or ``[2, 3, 8]``. Every cited
    number is counted across the whole text, then each group is pruned:

    * if every number in the group is cited exactly once in the text, only
      the group's first number is kept;
    * otherwise the numbers cited exactly once are dropped and the numbers
      cited more than once are kept.

    The surviving numbers, in ascending order, are used to index
    ``references`` and each selected entry is passed to ``generate_hash_id``.

    Args:
        text: String to scan for citations.
        references: ``paper["references"]``. It is indexed directly with the
            cited number, with no offset applied.
            NOTE(review): confirm against the caller that citation numbers
            line up with this container's indices/keys, and that its entries
            are strings (or ``None``) as ``generate_hash_id`` expects.

    Returns:
        A list of hash IDs, one per surviving citation number. The list is
        empty when ``text`` contains no citations.

    Raises:
        IndexError / KeyError: If a surviving citation number is not a valid
            index/key of ``references``.
    """
    # Matches "[n]" or "[n, m, ...]"; each bracket becomes a list of ints,
    # e.g. "[15, 16] ... [5]" -> [[15, 16], [5]].
    citation_groups = [
        [int(number) for number in re.findall(r"\d+", bracket)]
        for bracket in re.findall(r"\[\d+(?:,\s*\d+)*\]", text)
    ]
    if not citation_groups:
        # No citations at all: nothing to look up, so return an empty result
        # instead of falling through with no counts defined.
        return []

    # Count how often each number is cited, keyed in ascending order so the
    # returned IDs follow ascending citation number.
    ref_counts = dict(
        sorted(
            Counter(
                number for group in citation_groups for number in group
            ).items()
        )
    )

    # For groups citing several references, keep only the most-cited ones.
    for group in citation_groups:
        cited_once = [number for number in group if ref_counts[number] == 1]
        if len(cited_once) == len(group):
            # Nothing in this group is cited elsewhere: keep its first number.
            cited_once = cited_once[1:]
        for number in cited_once:
            del ref_counts[number]

    return [generate_hash_id(references[idx]) for idx in ref_counts]
if __name__ == "__main__":
    # Example usage: hash a sample string and show the resulting ID.
    print("INT64 Hash ID:", generate_hash_id("example_string"))