|
import re |
|
import os |
|
import hashlib |
|
import struct |
|
from collections import Counter |
|
|
|
|
|
def check_env(env_name_list=None):
    """Ensure that the required environment variables are set and non-empty.

    Args:
        env_name_list: Optional iterable of environment variable names to
            check. When omitted, the built-in list is used: NEO4J_URL,
            NEO4J_USERNAME, NEO4J_PASSWD, MODEL_NAME, MODEL_TYPE,
            MODEL_API_KEY and BASE_URL.

    Raises:
        ValueError: For the first name, in iteration order, that is either
            absent from the environment or set to an empty string.
    """
    if env_name_list is None:
        env_name_list = (
            "NEO4J_URL",
            "NEO4J_USERNAME",
            "NEO4J_PASSWD",
            "MODEL_NAME",
            "MODEL_TYPE",
            "MODEL_API_KEY",
            "BASE_URL",
        )
    for env_name in env_name_list:
        # .get() yields None when missing and "" when empty; both are falsy.
        if not os.environ.get(env_name):
            raise ValueError(f"{env_name} is not set...")
|
|
|
|
|
def generate_hash_id(input_string):
    """Derive a non-negative 64-bit integer ID from a string.

    The string is lower-cased, UTF-8 encoded and hashed with SHA-256. The
    first eight digest bytes are read as a big-endian signed 64-bit integer
    and its magnitude is returned, so inputs differing only in letter case
    map to the same ID.

    Args:
        input_string: Text to hash, or None.

    Returns:
        None when ``input_string`` is None; otherwise an int in the range
        ``0 .. 2**63 - 1``.
    """
    if input_string is None:
        return None
    digest = hashlib.sha256(input_string.lower().encode("utf-8")).digest()
    (signed_value,) = struct.unpack(">q", digest[:8])
    # abs(-2**63) is 2**63, one past the largest signed 64-bit value; clamp
    # so the result always fits in an int64.
    return min(abs(signed_value), 2**63 - 1)
|
|
|
|
|
def extract_ref_id(text, references):
    """Turn bracketed numeric citations in ``text`` into reference hash IDs.

    Citation groups such as ``[5]`` or ``[2, 3, 8]`` are located in ``text``
    and every number is counted across the whole text. Within each group the
    numbers that are cited exactly once overall are dropped; if *all* numbers
    of a group are cited only once, the group's first number is kept so the
    group still contributes one reference. The surviving numbers, in
    ascending order, are used to index ``references`` and each selected entry
    is passed to ``generate_hash_id``.

    Args:
        text: Text to scan for citation groups.
        references: paper["references"]; indexed directly with the cited
            numbers (no offset is applied).
            NOTE(review): citation numbers are commonly 1-based -- confirm
            that direct indexing matches how the caller builds this
            container.

    Returns:
        A list with one hash ID per retained citation number, ordered by
        citation number. The list is empty when ``text`` contains no
        citation groups.

    Raises:
        IndexError or KeyError: If a retained citation number is not a valid
            index/key of ``references``.
    """
    # Parse each bracket group once into its list of integers.
    groups = [
        [int(token) for token in re.findall(r"\d+", group)]
        for group in re.findall(r"\[\d+(?:,\s*\d+)*\]", text)
    ]
    if not groups:
        # Nothing cited: there are no counts to filter and nothing to hash.
        return []

    ref_counts = Counter(num for numbers in groups for num in numbers)

    for numbers in groups:
        cited_once = [num for num in numbers if ref_counts[num] == 1]
        if len(cited_once) == len(numbers):
            # Every number in this group is cited once: keep the first one.
            cited_once = cited_once[1:]
        for num in cited_once:
            del ref_counts[num]

    return [generate_hash_id(references[idx]) for idx in sorted(ref_counts)]
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke check: hash a sample string and print the resulting ID.
    sample = "example_string"
    print("INT64 Hash ID:", generate_hash_id(sample))
|
|