Spaces:

lihuigu
/

SciPIP

Running

lihuigu

init commit

e17c9f2 9 months ago

2.37 kB

	import re
	import os
	import hashlib
	import struct
	from collections import Counter


	def check_env():
	    """Ensure every required environment variable is present and non-empty.

	    The Neo4j and model-related variables named below are checked in
	    order.

	    Raises:
	        ValueError: naming the first variable that is unset or set to the
	            empty string.
	    """
	    required = (
	        "NEO4J_URL",
	        "NEO4J_USERNAME",
	        "NEO4J_PASSWD",
	        "MODEL_NAME",
	        "MODEL_TYPE",
	        "MODEL_API_KEY",
	        "BASE_URL",
	    )
	    # An unset variable yields None and an empty one yields "": both falsy.
	    first_missing = next(
	        (name for name in required if not os.environ.get(name)), None
	    )
	    if first_missing is not None:
	        raise ValueError(f"{first_missing} is not set...")


	def generate_hash_id(input_string):
	    """Derive a non-negative integer id from a string, ignoring case.

	    The string is lower-cased, UTF-8 encoded and hashed with SHA-256. The
	    first 8 bytes of the digest are read as a big-endian signed 64-bit
	    integer and its absolute value is returned.

	    Args:
	        input_string: Text to hash, or None.

	    Returns:
	        The integer id, or None when ``input_string`` is None.
	    """
	    if input_string is None:
	        return None
	    normalized = input_string.lower().encode("utf-8")
	    digest = hashlib.sha256(normalized).digest()
	    # ">q" consumes exactly the first 8 bytes of the 32-byte digest.
	    (signed_value,) = struct.unpack_from(">q", digest)
	    # NOTE: abs() of -2**63 is 2**63, one past the signed 64-bit maximum.
	    return abs(signed_value)


	def extract_ref_id(text, references):
	    """Return hash ids of the references cited by bracketed numbers in ``text``.

	    Citation groups such as ``[5]``, ``[15, 16]`` or ``[2, 3, 8]`` are
	    collected from the text. For each group:

	    * if at least one number in the group is cited more than once across
	      the whole text, every number of the group that is cited only once is
	      dropped;
	    * if every number in the group is cited only once, only the first
	      number of the group is kept.

	    The surviving numbers, in ascending order, are used to index
	    ``references`` and each entry is passed to ``generate_hash_id``.

	    Args:
	        text: Text to scan for citations.
	        references: paper["references"]. It is indexed directly with the
	            citation number (no offset is applied).
	            NOTE(review): presumably a list of reference titles; confirm
	            the element type and whether citation numbers are meant to be
	            0- or 1-based.

	    Returns:
	        A list of hash ids, empty when ``text`` contains no citation group.
	    """
	    # One or more comma-separated numbers inside square brackets.
	    pattern = r"\[\d+(?:,\s*\d+)*\]"
	    ref_list = re.findall(pattern, text)
	    # e.g. ref_list == ['[15, 16]', '[5]', '[2, 3, 8]']
	    if not ref_list:
	        # Nothing cited: there are no counts to build and nothing to look up.
	        return []

	    # Numbers of each citation group, in order of appearance.
	    groups = [[int(num) for num in re.findall(r"\d+", ref)] for ref in ref_list]

	    # Count how often each number is cited, ordered by number.
	    counter = Counter()
	    for group in groups:
	        counter.update(group)
	    ref_counts = dict(sorted(counter.items()))

	    # Thin out each group so that weakly cited numbers are discarded.
	    for group in groups:
	        cited_once = [num for num in group if ref_counts[num] == 1]
	        if len(cited_once) == len(group):
	            # No number stands out in this group: keep its first one.
	            cited_once = cited_once[1:]
	        for num in cited_once:
	            del ref_counts[num]

	    return [generate_hash_id(references[idx]) for idx in ref_counts]


	if __name__ == "__main__":
	    # Example usage: hash a sample string and print the resulting id.
	    sample = "example_string"
	    print("INT64 Hash ID:", generate_hash_id(sample))