Spaces:

lihuigu
/

SciPIP

Running

File size: 2,368 Bytes

e17c9f2

import re
import os
import hashlib
import struct
from collections import Counter


def check_env():
    """Verify that every required environment variable is set and non-empty.

    The variables are checked in the order listed below; the first one that
    is either absent from ``os.environ`` or set to an empty string aborts
    the check.

    Raises:
        ValueError: Naming the first variable that is missing or empty.
    """
    required_vars = (
        "NEO4J_URL",
        "NEO4J_USERNAME",
        "NEO4J_PASSWD",
        "MODEL_NAME",
        "MODEL_TYPE",
        "MODEL_API_KEY",
        "BASE_URL",
    )
    # A missing variable yields None and an empty one yields ""; both are falsy.
    first_unset = next(
        (name for name in required_vars if not os.environ.get(name)),
        None,
    )
    if first_unset is not None:
        raise ValueError(f"{first_unset} is not set...")


def generate_hash_id(input_string):
    """Derive a non-negative integer ID from a string.

    The string is lower-cased, encoded as UTF-8 and hashed with SHA-256.
    The first 8 bytes of the digest are read as a big-endian signed 64-bit
    integer, and the absolute value of that integer is returned.

    Args:
        input_string: Text to hash, or ``None``.

    Returns:
        The integer ID, or ``None`` when *input_string* is ``None``.
    """
    if input_string is None:
        return None
    normalized = input_string.lower().encode("utf-8")
    leading_bytes = hashlib.sha256(normalized).digest()[:8]
    # Big-endian, signed: the same interpretation as struct format ">q".
    signed_value = int.from_bytes(leading_bytes, byteorder="big", signed=True)
    return abs(signed_value)


def extract_ref_id(text, references):
    """Return hash IDs for the references cited in *text*.

    Citation markers are bracketed, comma-separated numbers such as ``[5]``
    or ``[2, 3, 8]``. The cited numbers are filtered per marker:

    * a number cited more than once across *text* is always kept;
    * if every number in a marker is cited exactly once, only the first
      number of that marker is kept;
    * otherwise the once-cited numbers of that marker are dropped.

    Each surviving number is used directly as a subscript into *references*
    and the entry found there is passed to ``generate_hash_id``.

    Args:
        text: Text to scan for citation markers.
        references: paper["references"].
            NOTE(review): cited numbers are used as-is as the subscript;
            confirm whether this container is 0- or 1-based.

    Returns:
        A list of hash IDs ordered by ascending citation number. The list
        is empty when *text* contains no citation marker.

    Raises:
        LookupError: Propagated from ``references[number]`` when a cited
            number has no entry (``IndexError`` for a list, ``KeyError``
            for a dict).
    """
    # Each marker, e.g. "[15, 16]", becomes a list of ints, e.g. [15, 16].
    markers = re.findall(r"\[\d+(?:,\s*\d+)*\]", text)
    groups = [[int(num) for num in re.findall(r"\d+", marker)] for marker in markers]

    # Built unconditionally so text without markers yields an empty result
    # instead of failing on an unassigned name.
    citation_counts = Counter(num for group in groups for num in group)

    dropped = set()
    for group in groups:
        cited_once = [num for num in group if citation_counts[num] == 1]
        if len(cited_once) == len(group):
            # Nothing in this marker is cited elsewhere: keep its first number.
            cited_once = cited_once[1:]
        dropped.update(cited_once)

    return [
        generate_hash_id(references[num])
        for num in sorted(citation_counts)
        if num not in dropped
    ]


if __name__ == "__main__":
    # Example usage: hash a sample string and print the resulting ID.
    print("INT64 Hash ID:", generate_hash_id("example_string"))