CarlosMalaga commited on
Commit
3376207
1 Parent(s): e883357

Delete relik

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. relik/__init__.py +0 -1
  2. relik/common/__init__.py +0 -0
  3. relik/common/log.py +0 -97
  4. relik/common/upload.py +0 -128
  5. relik/common/utils.py +0 -609
  6. relik/inference/__init__.py +0 -0
  7. relik/inference/annotator.py +0 -428
  8. relik/inference/data/__init__.py +0 -0
  9. relik/inference/data/objects.py +0 -64
  10. relik/inference/data/tokenizers/__init__.py +0 -89
  11. relik/inference/data/tokenizers/base_tokenizer.py +0 -84
  12. relik/inference/data/tokenizers/regex_tokenizer.py +0 -73
  13. relik/inference/data/tokenizers/spacy_tokenizer.py +0 -228
  14. relik/inference/data/tokenizers/whitespace_tokenizer.py +0 -70
  15. relik/inference/data/window/__init__.py +0 -0
  16. relik/inference/data/window/manager.py +0 -262
  17. relik/inference/gerbil.py +0 -254
  18. relik/inference/preprocessing.py +0 -4
  19. relik/inference/serve/__init__.py +0 -0
  20. relik/inference/serve/backend/__init__.py +0 -0
  21. relik/inference/serve/backend/relik.py +0 -210
  22. relik/inference/serve/backend/retriever.py +0 -206
  23. relik/inference/serve/backend/utils.py +0 -29
  24. relik/inference/serve/frontend/__init__.py +0 -0
  25. relik/inference/serve/frontend/relik.py +0 -231
  26. relik/inference/serve/frontend/style.css +0 -33
  27. relik/reader/__init__.py +0 -0
  28. relik/reader/conf/config.yaml +0 -14
  29. relik/reader/conf/data/base.yaml +0 -21
  30. relik/reader/conf/data/re.yaml +0 -54
  31. relik/reader/conf/training/base.yaml +0 -12
  32. relik/reader/conf/training/re.yaml +0 -12
  33. relik/reader/data/__init__.py +0 -0
  34. relik/reader/data/patches.py +0 -51
  35. relik/reader/data/relik_reader_data.py +0 -965
  36. relik/reader/data/relik_reader_data_utils.py +0 -51
  37. relik/reader/data/relik_reader_sample.py +0 -49
  38. relik/reader/lightning_modules/__init__.py +0 -0
  39. relik/reader/lightning_modules/relik_reader_pl_module.py +0 -50
  40. relik/reader/lightning_modules/relik_reader_re_pl_module.py +0 -54
  41. relik/reader/pytorch_modules/__init__.py +0 -0
  42. relik/reader/pytorch_modules/base.py +0 -248
  43. relik/reader/pytorch_modules/hf/__init__.py +0 -2
  44. relik/reader/pytorch_modules/hf/configuration_relik.py +0 -33
  45. relik/reader/pytorch_modules/hf/modeling_relik.py +0 -981
  46. relik/reader/pytorch_modules/optim/__init__.py +0 -6
  47. relik/reader/pytorch_modules/optim/adamw_with_warmup.py +0 -66
  48. relik/reader/pytorch_modules/optim/layer_wise_lr_decay.py +0 -104
  49. relik/reader/pytorch_modules/span.py +0 -367
  50. relik/reader/relik_reader.py +0 -629
relik/__init__.py DELETED
@@ -1 +0,0 @@
1
- from relik.retriever.pytorch_modules.model import GoldenRetriever
 
 
relik/common/__init__.py DELETED
File without changes
relik/common/log.py DELETED
@@ -1,97 +0,0 @@
1
- import logging
2
- import sys
3
- import threading
4
- from typing import Optional
5
-
6
- from rich import get_console
7
-
8
- _lock = threading.Lock()
9
- _default_handler: Optional[logging.Handler] = None
10
-
11
- _default_log_level = logging.WARNING
12
-
13
- # fancy logger
14
- _console = get_console()
15
-
16
-
17
- def _get_library_name() -> str:
18
- return __name__.split(".")[0]
19
-
20
-
21
- def _get_library_root_logger() -> logging.Logger:
22
- return logging.getLogger(_get_library_name())
23
-
24
-
25
- def _configure_library_root_logger() -> None:
26
- global _default_handler
27
-
28
- with _lock:
29
- if _default_handler:
30
- # This library has already configured the library root logger.
31
- return
32
- _default_handler = logging.StreamHandler() # Set sys.stderr as stream.
33
- _default_handler.flush = sys.stderr.flush
34
-
35
- # Apply our default configuration to the library root logger.
36
- library_root_logger = _get_library_root_logger()
37
- library_root_logger.addHandler(_default_handler)
38
- library_root_logger.setLevel(_default_log_level)
39
- library_root_logger.propagate = False
40
-
41
-
42
- def _reset_library_root_logger() -> None:
43
- global _default_handler
44
-
45
- with _lock:
46
- if not _default_handler:
47
- return
48
-
49
- library_root_logger = _get_library_root_logger()
50
- library_root_logger.removeHandler(_default_handler)
51
- library_root_logger.setLevel(logging.NOTSET)
52
- _default_handler = None
53
-
54
-
55
- def set_log_level(level: int, logger: logging.Logger = None) -> None:
56
- """
57
- Set the log level.
58
- Args:
59
- level (:obj:`int`):
60
- Logging level.
61
- logger (:obj:`logging.Logger`):
62
- Logger to set the log level.
63
- """
64
- if not logger:
65
- _configure_library_root_logger()
66
- logger = _get_library_root_logger()
67
- logger.setLevel(level)
68
-
69
-
70
- def get_logger(
71
- name: Optional[str] = None,
72
- level: Optional[int] = None,
73
- formatter: Optional[str] = None,
74
- ) -> logging.Logger:
75
- """
76
- Return a logger with the specified name.
77
- """
78
-
79
- if name is None:
80
- name = _get_library_name()
81
-
82
- _configure_library_root_logger()
83
-
84
- if level is not None:
85
- set_log_level(level)
86
-
87
- if formatter is None:
88
- formatter = logging.Formatter(
89
- "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
90
- )
91
- _default_handler.setFormatter(formatter)
92
-
93
- return logging.getLogger(name)
94
-
95
-
96
- def get_console_logger():
97
- return _console
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
relik/common/upload.py DELETED
@@ -1,128 +0,0 @@
1
- import argparse
2
- import json
3
- import logging
4
- import os
5
- import tempfile
6
- import zipfile
7
- from datetime import datetime
8
- from pathlib import Path
9
- from typing import Optional, Union
10
-
11
- import huggingface_hub
12
-
13
- from relik.common.log import get_logger
14
- from relik.common.utils import SAPIENZANLP_DATE_FORMAT, get_md5
15
-
16
- logger = get_logger(level=logging.DEBUG)
17
-
18
-
19
- def create_info_file(tmpdir: Path):
20
- logger.debug("Computing md5 of model.zip")
21
- md5 = get_md5(tmpdir / "model.zip")
22
- date = datetime.now().strftime(SAPIENZANLP_DATE_FORMAT)
23
-
24
- logger.debug("Dumping info.json file")
25
- with (tmpdir / "info.json").open("w") as f:
26
- json.dump(dict(md5=md5, upload_date=date), f, indent=2)
27
-
28
-
29
- def zip_run(
30
- dir_path: Union[str, os.PathLike],
31
- tmpdir: Union[str, os.PathLike],
32
- zip_name: str = "model.zip",
33
- ) -> Path:
34
- logger.debug(f"zipping {dir_path} to {tmpdir}")
35
- # creates a zip version of the provided dir_path
36
- run_dir = Path(dir_path)
37
- zip_path = tmpdir / zip_name
38
-
39
- with zipfile.ZipFile(zip_path, "w") as zip_file:
40
- # fully zip the run directory maintaining its structure
41
- for file in run_dir.rglob("*.*"):
42
- if file.is_dir():
43
- continue
44
-
45
- zip_file.write(file, arcname=file.relative_to(run_dir))
46
-
47
- return zip_path
48
-
49
-
50
- def upload(
51
- model_dir: Union[str, os.PathLike],
52
- model_name: str,
53
- organization: Optional[str] = None,
54
- repo_name: Optional[str] = None,
55
- commit: Optional[str] = None,
56
- archive: bool = False,
57
- ):
58
- token = huggingface_hub.HfFolder.get_token()
59
- if token is None:
60
- print(
61
- "No HuggingFace token found. You need to execute `huggingface-cli login` first!"
62
- )
63
- return
64
-
65
- repo_id = repo_name or model_name
66
- if organization is not None:
67
- repo_id = f"{organization}/{repo_id}"
68
- with tempfile.TemporaryDirectory() as tmpdir:
69
- api = huggingface_hub.HfApi()
70
- repo_url = api.create_repo(
71
- token=token,
72
- repo_id=repo_id,
73
- exist_ok=True,
74
- )
75
- repo = huggingface_hub.Repository(
76
- str(tmpdir), clone_from=repo_url, use_auth_token=token
77
- )
78
-
79
- tmp_path = Path(tmpdir)
80
- if archive:
81
- # otherwise we zip the model_dir
82
- logger.debug(f"Zipping {model_dir} to {tmp_path}")
83
- zip_run(model_dir, tmp_path)
84
- create_info_file(tmp_path)
85
- else:
86
- # if the user wants to upload a transformers model, we don't need to zip it
87
- # we just need to copy the files to the tmpdir
88
- logger.debug(f"Copying {model_dir} to {tmpdir}")
89
- os.system(f"cp -r {model_dir}/* {tmpdir}")
90
-
91
- # this method automatically puts large files (>10MB) into git lfs
92
- repo.push_to_hub(commit_message=commit or "Automatic push from sapienzanlp")
93
-
94
-
95
- def parse_args() -> argparse.Namespace:
96
- parser = argparse.ArgumentParser()
97
- parser.add_argument(
98
- "model_dir", help="The directory of the model you want to upload"
99
- )
100
- parser.add_argument("model_name", help="The model you want to upload")
101
- parser.add_argument(
102
- "--organization",
103
- help="the name of the organization where you want to upload the model",
104
- )
105
- parser.add_argument(
106
- "--repo_name",
107
- help="Optional name to use when uploading to the HuggingFace repository",
108
- )
109
- parser.add_argument(
110
- "--commit", help="Commit message to use when pushing to the HuggingFace Hub"
111
- )
112
- parser.add_argument(
113
- "--archive",
114
- action="store_true",
115
- help="""
116
- Whether to compress the model directory before uploading it.
117
- If True, the model directory will be zipped and the zip file will be uploaded.
118
- If False, the model directory will be uploaded as is.""",
119
- )
120
- return parser.parse_args()
121
-
122
-
123
- def main():
124
- upload(**vars(parse_args()))
125
-
126
-
127
- if __name__ == "__main__":
128
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
relik/common/utils.py DELETED
@@ -1,609 +0,0 @@
1
- import importlib.util
2
- import json
3
- import logging
4
- import os
5
- import shutil
6
- import tarfile
7
- import tempfile
8
- from functools import partial
9
- from hashlib import sha256
10
- from pathlib import Path
11
- from typing import Any, BinaryIO, Dict, List, Optional, Union
12
- from urllib.parse import urlparse
13
- from zipfile import ZipFile, is_zipfile
14
-
15
- import huggingface_hub
16
- import requests
17
- import tqdm
18
- from filelock import FileLock
19
- from transformers.utils.hub import cached_file as hf_cached_file
20
-
21
- from relik.common.log import get_logger
22
-
23
- # name constants
24
- WEIGHTS_NAME = "weights.pt"
25
- ONNX_WEIGHTS_NAME = "weights.onnx"
26
- CONFIG_NAME = "config.yaml"
27
- LABELS_NAME = "labels.json"
28
-
29
- # SAPIENZANLP_USER_NAME = "sapienzanlp"
30
- SAPIENZANLP_USER_NAME = "riccorl"
31
- SAPIENZANLP_HF_MODEL_REPO_URL = "riccorl/{model_id}"
32
- SAPIENZANLP_HF_MODEL_REPO_ARCHIVE_URL = (
33
- f"{SAPIENZANLP_HF_MODEL_REPO_URL}/resolve/main/model.zip"
34
- )
35
- # path constants
36
- SAPIENZANLP_CACHE_DIR = os.getenv("SAPIENZANLP_CACHE_DIR", Path.home() / ".sapienzanlp")
37
- SAPIENZANLP_DATE_FORMAT = "%Y-%m-%d %H-%M-%S"
38
-
39
-
40
- logger = get_logger(__name__)
41
-
42
-
43
- def sapienzanlp_model_urls(model_id: str) -> str:
44
- """
45
- Returns the URL for a possible SapienzaNLP valid model.
46
-
47
- Args:
48
- model_id (:obj:`str`):
49
- A SapienzaNLP model id.
50
-
51
- Returns:
52
- :obj:`str`: The url for the model id.
53
- """
54
- # check if there is already the namespace of the user
55
- if "/" in model_id:
56
- return model_id
57
- return SAPIENZANLP_HF_MODEL_REPO_URL.format(model_id=model_id)
58
-
59
-
60
- def is_package_available(package_name: str) -> bool:
61
- """
62
- Check if a package is available.
63
-
64
- Args:
65
- package_name (`str`): The name of the package to check.
66
- """
67
- return importlib.util.find_spec(package_name) is not None
68
-
69
-
70
- def load_json(path: Union[str, Path]) -> Any:
71
- """
72
- Load a json file provided in input.
73
-
74
- Args:
75
- path (`Union[str, Path]`): The path to the json file to load.
76
-
77
- Returns:
78
- `Any`: The loaded json file.
79
- """
80
- with open(path, encoding="utf8") as f:
81
- return json.load(f)
82
-
83
-
84
- def dump_json(document: Any, path: Union[str, Path], indent: Optional[int] = None):
85
- """
86
- Dump input to json file.
87
-
88
- Args:
89
- document (`Any`): The document to dump.
90
- path (`Union[str, Path]`): The path to dump the document to.
91
- indent (`Optional[int]`): The indent to use for the json file.
92
-
93
- """
94
- with open(path, "w", encoding="utf8") as outfile:
95
- json.dump(document, outfile, indent=indent)
96
-
97
-
98
- def get_md5(path: Path):
99
- """
100
- Get the MD5 value of a path.
101
- """
102
- import hashlib
103
-
104
- with path.open("rb") as fin:
105
- data = fin.read()
106
- return hashlib.md5(data).hexdigest()
107
-
108
-
109
- def file_exists(path: Union[str, os.PathLike]) -> bool:
110
- """
111
- Check if the file at :obj:`path` exists.
112
-
113
- Args:
114
- path (:obj:`str`, :obj:`os.PathLike`):
115
- Path to check.
116
-
117
- Returns:
118
- :obj:`bool`: :obj:`True` if the file exists.
119
- """
120
- return Path(path).exists()
121
-
122
-
123
- def dir_exists(path: Union[str, os.PathLike]) -> bool:
124
- """
125
- Check if the directory at :obj:`path` exists.
126
-
127
- Args:
128
- path (:obj:`str`, :obj:`os.PathLike`):
129
- Path to check.
130
-
131
- Returns:
132
- :obj:`bool`: :obj:`True` if the directory exists.
133
- """
134
- return Path(path).is_dir()
135
-
136
-
137
- def is_remote_url(url_or_filename: Union[str, Path]):
138
- """
139
- Returns :obj:`True` if the input path is an url.
140
-
141
- Args:
142
- url_or_filename (:obj:`str`, :obj:`Path`):
143
- path to check.
144
-
145
- Returns:
146
- :obj:`bool`: :obj:`True` if the input path is an url, :obj:`False` otherwise.
147
-
148
- """
149
- if isinstance(url_or_filename, Path):
150
- url_or_filename = str(url_or_filename)
151
- parsed = urlparse(url_or_filename)
152
- return parsed.scheme in ("http", "https")
153
-
154
-
155
- def url_to_filename(resource: str, etag: str = None) -> str:
156
- """
157
- Convert a `resource` into a hashed filename in a repeatable way.
158
- If `etag` is specified, append its hash to the resources's, delimited
159
- by a period.
160
- """
161
- resource_bytes = resource.encode("utf-8")
162
- resource_hash = sha256(resource_bytes)
163
- filename = resource_hash.hexdigest()
164
-
165
- if etag:
166
- etag_bytes = etag.encode("utf-8")
167
- etag_hash = sha256(etag_bytes)
168
- filename += "." + etag_hash.hexdigest()
169
-
170
- return filename
171
-
172
-
173
- def download_resource(
174
- url: str,
175
- temp_file: BinaryIO,
176
- headers=None,
177
- ):
178
- """
179
- Download remote file.
180
- """
181
-
182
- if headers is None:
183
- headers = {}
184
-
185
- r = requests.get(url, stream=True, headers=headers)
186
- r.raise_for_status()
187
- content_length = r.headers.get("Content-Length")
188
- total = int(content_length) if content_length is not None else None
189
- progress = tqdm(
190
- unit="B",
191
- unit_scale=True,
192
- total=total,
193
- desc="Downloading",
194
- disable=logger.level in [logging.NOTSET],
195
- )
196
- for chunk in r.iter_content(chunk_size=1024):
197
- if chunk: # filter out keep-alive new chunks
198
- progress.update(len(chunk))
199
- temp_file.write(chunk)
200
- progress.close()
201
-
202
-
203
- def download_and_cache(
204
- url: Union[str, Path],
205
- cache_dir: Union[str, Path] = None,
206
- force_download: bool = False,
207
- ):
208
- if cache_dir is None:
209
- cache_dir = SAPIENZANLP_CACHE_DIR
210
- if isinstance(url, Path):
211
- url = str(url)
212
-
213
- # check if cache dir exists
214
- Path(cache_dir).mkdir(parents=True, exist_ok=True)
215
-
216
- # check if file is private
217
- headers = {}
218
- try:
219
- r = requests.head(url, allow_redirects=False, timeout=10)
220
- r.raise_for_status()
221
- except requests.exceptions.HTTPError:
222
- if r.status_code == 401:
223
- hf_token = huggingface_hub.HfFolder.get_token()
224
- if hf_token is None:
225
- raise ValueError(
226
- "You need to login to HuggingFace to download this model "
227
- "(use the `huggingface-cli login` command)"
228
- )
229
- headers["Authorization"] = f"Bearer {hf_token}"
230
-
231
- etag = None
232
- try:
233
- r = requests.head(url, allow_redirects=True, timeout=10, headers=headers)
234
- r.raise_for_status()
235
- etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
236
- # We favor a custom header indicating the etag of the linked resource, and
237
- # we fallback to the regular etag header.
238
- # If we don't have any of those, raise an error.
239
- if etag is None:
240
- raise OSError(
241
- "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
242
- )
243
- # In case of a redirect,
244
- # save an extra redirect on the request.get call,
245
- # and ensure we download the exact atomic version even if it changed
246
- # between the HEAD and the GET (unlikely, but hey).
247
- if 300 <= r.status_code <= 399:
248
- url = r.headers["Location"]
249
- except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
250
- # Actually raise for those subclasses of ConnectionError
251
- raise
252
- except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
253
- # Otherwise, our Internet connection is down.
254
- # etag is None
255
- pass
256
-
257
- # get filename from the url
258
- filename = url_to_filename(url, etag)
259
- # get cache path to put the file
260
- cache_path = cache_dir / filename
261
-
262
- # the file is already here, return it
263
- if file_exists(cache_path) and not force_download:
264
- logger.info(
265
- f"{url} found in cache, set `force_download=True` to force the download"
266
- )
267
- return cache_path
268
-
269
- cache_path = str(cache_path)
270
- # Prevent parallel downloads of the same file with a lock.
271
- lock_path = cache_path + ".lock"
272
- with FileLock(lock_path):
273
- # If the download just completed while the lock was activated.
274
- if file_exists(cache_path) and not force_download:
275
- # Even if returning early like here, the lock will be released.
276
- return cache_path
277
-
278
- temp_file_manager = partial(
279
- tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
280
- )
281
-
282
- # Download to temporary file, then copy to cache dir once finished.
283
- # Otherwise, you get corrupt cache entries if the download gets interrupted.
284
- with temp_file_manager() as temp_file:
285
- logger.info(
286
- f"{url} not found in cache or `force_download` set to `True`, downloading to {temp_file.name}"
287
- )
288
- download_resource(url, temp_file, headers)
289
-
290
- logger.info(f"storing {url} in cache at {cache_path}")
291
- os.replace(temp_file.name, cache_path)
292
-
293
- # NamedTemporaryFile creates a file with hardwired 0600 perms (ignoring umask), so fixing it.
294
- umask = os.umask(0o666)
295
- os.umask(umask)
296
- os.chmod(cache_path, 0o666 & ~umask)
297
-
298
- logger.info(f"creating metadata file for {cache_path}")
299
- meta = {"url": url} # , "etag": etag}
300
- meta_path = cache_path + ".json"
301
- with open(meta_path, "w") as meta_file:
302
- json.dump(meta, meta_file)
303
-
304
- return cache_path
305
-
306
-
307
- def download_from_hf(
308
- path_or_repo_id: Union[str, Path],
309
- filenames: Optional[List[str]],
310
- cache_dir: Union[str, Path] = None,
311
- force_download: bool = False,
312
- resume_download: bool = False,
313
- proxies: Optional[Dict[str, str]] = None,
314
- use_auth_token: Optional[Union[bool, str]] = None,
315
- revision: Optional[str] = None,
316
- local_files_only: bool = False,
317
- subfolder: str = "",
318
- ):
319
- if isinstance(path_or_repo_id, Path):
320
- path_or_repo_id = str(path_or_repo_id)
321
-
322
- downloaded_paths = []
323
- for filename in filenames:
324
- downloaded_path = hf_cached_file(
325
- path_or_repo_id,
326
- filename,
327
- cache_dir=cache_dir,
328
- force_download=force_download,
329
- proxies=proxies,
330
- resume_download=resume_download,
331
- use_auth_token=use_auth_token,
332
- revision=revision,
333
- local_files_only=local_files_only,
334
- subfolder=subfolder,
335
- )
336
- downloaded_paths.append(downloaded_path)
337
-
338
- # we want the folder where the files are downloaded
339
- # the best guess is the parent folder of the first file
340
- probably_the_folder = Path(downloaded_paths[0]).parent
341
- return probably_the_folder
342
-
343
-
344
- def model_name_or_path_resolver(model_name_or_dir: Union[str, os.PathLike]) -> str:
345
- """
346
- Resolve a model name or directory to a model archive name or directory.
347
-
348
- Args:
349
- model_name_or_dir (:obj:`str` or :obj:`os.PathLike`):
350
- A model name or directory.
351
-
352
- Returns:
353
- :obj:`str`: The model archive name or directory.
354
- """
355
- if is_remote_url(model_name_or_dir):
356
- # if model_name_or_dir is a URL
357
- # download it and try to load
358
- model_archive = model_name_or_dir
359
- elif Path(model_name_or_dir).is_dir() or Path(model_name_or_dir).is_file():
360
- # if model_name_or_dir is a local directory or
361
- # an archive file try to load it
362
- model_archive = model_name_or_dir
363
- else:
364
- # probably model_name_or_dir is a sapienzanlp model id
365
- # guess the url and try to download
366
- model_name_or_dir_ = model_name_or_dir
367
- # raise ValueError(f"Providing a model id is not supported yet.")
368
- model_archive = sapienzanlp_model_urls(model_name_or_dir_)
369
-
370
- return model_archive
371
-
372
-
373
- def from_cache(
374
- url_or_filename: Union[str, Path],
375
- cache_dir: Union[str, Path] = None,
376
- force_download: bool = False,
377
- resume_download: bool = False,
378
- proxies: Optional[Dict[str, str]] = None,
379
- use_auth_token: Optional[Union[bool, str]] = None,
380
- revision: Optional[str] = None,
381
- local_files_only: bool = False,
382
- subfolder: str = "",
383
- filenames: Optional[List[str]] = None,
384
- ) -> Path:
385
- """
386
- Given something that could be either a local path or a URL (or a SapienzaNLP model id),
387
- determine which one and return a path to the corresponding file.
388
-
389
- Args:
390
- url_or_filename (:obj:`str` or :obj:`Path`):
391
- A path to a local file or a URL (or a SapienzaNLP model id).
392
- cache_dir (:obj:`str` or :obj:`Path`, `optional`):
393
- Path to a directory in which a downloaded file will be cached.
394
- force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
395
- Whether or not to re-download the file even if it already exists.
396
- resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
397
- Whether or not to delete incompletely received files. Attempts to resume the download if such a file
398
- exists.
399
- proxies (:obj:`Dict[str, str]`, `optional`):
400
- A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
401
- 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
402
- use_auth_token (:obj:`Union[bool, str]`, `optional`):
403
- Optional string or boolean to use as Bearer token for remote files. If :obj:`True`, will get token from
404
- :obj:`~transformers.hf_api.HfApi`. If :obj:`str`, will use that string as token.
405
- revision (:obj:`str`, `optional`):
406
- The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
407
- git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
408
- identifier allowed by git.
409
- local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
410
- Whether or not to raise an error if the file to be downloaded is local.
411
- subfolder (:obj:`str`, `optional`):
412
- In case the relevant file is in a subfolder of the URL, specify it here.
413
- filenames (:obj:`List[str]`, `optional`):
414
- List of filenames to look for in the directory structure.
415
-
416
- Returns:
417
- :obj:`Path`: Path to the cached file.
418
- """
419
-
420
- url_or_filename = model_name_or_path_resolver(url_or_filename)
421
-
422
- if cache_dir is None:
423
- cache_dir = SAPIENZANLP_CACHE_DIR
424
-
425
- if file_exists(url_or_filename):
426
- logger.info(f"{url_or_filename} is a local path or file")
427
- output_path = url_or_filename
428
- elif is_remote_url(url_or_filename):
429
- # URL, so get it from the cache (downloading if necessary)
430
- output_path = download_and_cache(
431
- url_or_filename,
432
- cache_dir=cache_dir,
433
- force_download=force_download,
434
- )
435
- else:
436
- if filenames is None:
437
- filenames = [WEIGHTS_NAME, CONFIG_NAME, LABELS_NAME]
438
- output_path = download_from_hf(
439
- url_or_filename,
440
- filenames,
441
- cache_dir,
442
- force_download,
443
- resume_download,
444
- proxies,
445
- use_auth_token,
446
- revision,
447
- local_files_only,
448
- subfolder,
449
- )
450
-
451
- # if is_hf_hub_url(url_or_filename):
452
- # HuggingFace Hub
453
- # output_path = hf_hub_download_url(url_or_filename)
454
- # elif is_remote_url(url_or_filename):
455
- # # URL, so get it from the cache (downloading if necessary)
456
- # output_path = download_and_cache(
457
- # url_or_filename,
458
- # cache_dir=cache_dir,
459
- # force_download=force_download,
460
- # )
461
- # elif file_exists(url_or_filename):
462
- # logger.info(f"{url_or_filename} is a local path or file")
463
- # # File, and it exists.
464
- # output_path = url_or_filename
465
- # elif urlparse(url_or_filename).scheme == "":
466
- # # File, but it doesn't exist.
467
- # raise EnvironmentError(f"file {url_or_filename} not found")
468
- # else:
469
- # # Something unknown
470
- # raise ValueError(
471
- # f"unable to parse {url_or_filename} as a URL or as a local path"
472
- # )
473
-
474
- if dir_exists(output_path) or (
475
- not is_zipfile(output_path) and not tarfile.is_tarfile(output_path)
476
- ):
477
- return Path(output_path)
478
-
479
- # Path where we extract compressed archives
480
- # for now it will extract it in the same folder
481
- # maybe implement extraction in the sapienzanlp folder
482
- # when using local archive path?
483
- logger.info("Extracting compressed archive")
484
- output_dir, output_file = os.path.split(output_path)
485
- output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
486
- output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
487
-
488
- # already extracted, do not extract
489
- if (
490
- os.path.isdir(output_path_extracted)
491
- and os.listdir(output_path_extracted)
492
- and not force_download
493
- ):
494
- return Path(output_path_extracted)
495
-
496
- # Prevent parallel extractions
497
- lock_path = output_path + ".lock"
498
- with FileLock(lock_path):
499
- shutil.rmtree(output_path_extracted, ignore_errors=True)
500
- os.makedirs(output_path_extracted)
501
- if is_zipfile(output_path):
502
- with ZipFile(output_path, "r") as zip_file:
503
- zip_file.extractall(output_path_extracted)
504
- zip_file.close()
505
- elif tarfile.is_tarfile(output_path):
506
- tar_file = tarfile.open(output_path)
507
- tar_file.extractall(output_path_extracted)
508
- tar_file.close()
509
- else:
510
- raise EnvironmentError(
511
- f"Archive format of {output_path} could not be identified"
512
- )
513
-
514
- # remove lock file, is it safe?
515
- os.remove(lock_path)
516
-
517
- return Path(output_path_extracted)
518
-
519
-
520
- def is_str_a_path(maybe_path: str) -> bool:
521
- """
522
- Check if a string is a path.
523
-
524
- Args:
525
- maybe_path (`str`): The string to check.
526
-
527
- Returns:
528
- `bool`: `True` if the string is a path, `False` otherwise.
529
- """
530
- # first check if it is a path
531
- if Path(maybe_path).exists():
532
- return True
533
- # check if it is a relative path
534
- if Path(os.path.join(os.getcwd(), maybe_path)).exists():
535
- return True
536
- # otherwise it is not a path
537
- return False
538
-
539
-
540
- def relative_to_absolute_path(path: str) -> os.PathLike:
541
- """
542
- Convert a relative path to an absolute path.
543
-
544
- Args:
545
- path (`str`): The relative path to convert.
546
-
547
- Returns:
548
- `os.PathLike`: The absolute path.
549
- """
550
- if not is_str_a_path(path):
551
- raise ValueError(f"{path} is not a path")
552
- if Path(path).exists():
553
- return Path(path).absolute()
554
- if Path(os.path.join(os.getcwd(), path)).exists():
555
- return Path(os.path.join(os.getcwd(), path)).absolute()
556
- raise ValueError(f"{path} is not a path")
557
-
558
-
559
- def to_config(object_to_save: Any) -> Dict[str, Any]:
560
- """
561
- Convert an object to a dictionary.
562
-
563
- Returns:
564
- `Dict[str, Any]`: The dictionary representation of the object.
565
- """
566
-
567
- def obj_to_dict(obj):
568
- match obj:
569
- case dict():
570
- data = {}
571
- for k, v in obj.items():
572
- data[k] = obj_to_dict(v)
573
- return data
574
-
575
- case list() | tuple():
576
- return [obj_to_dict(x) for x in obj]
577
-
578
- case object(__dict__=_):
579
- data = {
580
- "_target_": f"{obj.__class__.__module__}.{obj.__class__.__name__}",
581
- }
582
- for k, v in obj.__dict__.items():
583
- if not k.startswith("_"):
584
- data[k] = obj_to_dict(v)
585
- return data
586
-
587
- case _:
588
- return obj
589
-
590
- return obj_to_dict(object_to_save)
591
-
592
-
593
- def get_callable_from_string(callable_fn: str) -> Any:
594
- """
595
- Get a callable from a string.
596
-
597
- Args:
598
- callable_fn (`str`):
599
- The string representation of the callable.
600
-
601
- Returns:
602
- `Any`: The callable.
603
- """
604
- # separate the function name from the module name
605
- module_name, function_name = callable_fn.rsplit(".", 1)
606
- # import the module
607
- module = importlib.import_module(module_name)
608
- # get the function
609
- return getattr(module, function_name)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
relik/inference/__init__.py DELETED
File without changes
relik/inference/annotator.py DELETED
@@ -1,428 +0,0 @@
1
- import os
2
- from pathlib import Path
3
- from typing import Any, Callable, Dict, Optional, Union
4
-
5
- import hydra
6
- from omegaconf import OmegaConf
7
- from relik.retriever.indexers.faiss import FaissDocumentIndex
8
- from relik.retriever.pytorch_modules.hf import GoldenRetrieverModel
9
- from rich.pretty import pprint
10
-
11
- from relik.common.log import get_console_logger, get_logger
12
- from relik.common.upload import upload
13
- from relik.common.utils import CONFIG_NAME, from_cache, get_callable_from_string
14
- from relik.inference.data.objects import EntitySpan, RelikOutput
15
- from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
16
- from relik.inference.data.window.manager import WindowManager
17
- from relik.reader.pytorch_modules.span import RelikReaderForSpanExtraction
18
- from relik.reader.relik_reader import RelikReader
19
- from relik.retriever.data.utils import batch_generator
20
- from relik.retriever.indexers.base import BaseDocumentIndex
21
- from relik.retriever.pytorch_modules.model import GoldenRetriever
22
-
23
- logger = get_logger(__name__)
24
- console_logger = get_console_logger()
25
-
26
-
27
- class Relik:
28
- """
29
- Relik main class. It is a wrapper around a retriever and a reader.
30
-
31
- Args:
32
- retriever (`Optional[GoldenRetriever]`, `optional`):
33
- The retriever to use. If `None`, a retriever will be instantiated from the
34
- provided `question_encoder`, `passage_encoder` and `document_index`.
35
- Defaults to `None`.
36
- question_encoder (`Optional[Union[str, GoldenRetrieverModel]]`, `optional`):
37
- The question encoder to use. If `retriever` is `None`, a retriever will be
38
- instantiated from this parameter. Defaults to `None`.
39
- passage_encoder (`Optional[Union[str, GoldenRetrieverModel]]`, `optional`):
40
- The passage encoder to use. If `retriever` is `None`, a retriever will be
41
- instantiated from this parameter. Defaults to `None`.
42
- document_index (`Optional[Union[str, BaseDocumentIndex]]`, `optional`):
43
- The document index to use. If `retriever` is `None`, a retriever will be
44
- instantiated from this parameter. Defaults to `None`.
45
- reader (`Optional[Union[str, RelikReader]]`, `optional`):
46
- The reader to use. If `None`, a reader will be instantiated from the
47
- provided `reader`. Defaults to `None`.
48
- retriever_device (`str`, `optional`, defaults to `cpu`):
49
- The device to use for the retriever.
50
-
51
- """
52
-
53
- def __init__(
54
- self,
55
- retriever: GoldenRetriever | None = None,
56
- question_encoder: str | GoldenRetrieverModel | None = None,
57
- passage_encoder: str | GoldenRetrieverModel | None = None,
58
- document_index: str | BaseDocumentIndex | None = None,
59
- reader: str | RelikReader | None = None,
60
- device: str = "cpu",
61
- retriever_device: str | None = None,
62
- document_index_device: str | None = None,
63
- reader_device: str | None = None,
64
- precision: int = 32,
65
- retriever_precision: int | None = None,
66
- document_index_precision: int | None = None,
67
- reader_precision: int | None = None,
68
- reader_kwargs: dict | None = None,
69
- retriever_kwargs: dict | None = None,
70
- candidates_preprocessing_fn: str | Callable | None = None,
71
- top_k: int | None = None,
72
- window_size: int | None = None,
73
- window_stride: int | None = None,
74
- **kwargs,
75
- ) -> None:
76
- # retriever
77
- retriever_device = retriever_device or device
78
- document_index_device = document_index_device or device
79
- retriever_precision = retriever_precision or precision
80
- document_index_precision = document_index_precision or precision
81
- if retriever is None and question_encoder is None:
82
- raise ValueError(
83
- "Either `retriever` or `question_encoder` must be provided"
84
- )
85
- if retriever is None:
86
- self.retriever_kwargs = dict(
87
- question_encoder=question_encoder,
88
- passage_encoder=passage_encoder,
89
- document_index=document_index,
90
- device=retriever_device,
91
- precision=retriever_precision,
92
- index_device=document_index_device,
93
- index_precision=document_index_precision,
94
- )
95
- # overwrite default_retriever_kwargs with retriever_kwargs
96
- self.retriever_kwargs.update(retriever_kwargs or {})
97
- retriever = GoldenRetriever(**self.retriever_kwargs)
98
- retriever.training = False
99
- retriever.eval()
100
- self.retriever = retriever
101
-
102
- # reader
103
- self.reader_device = reader_device or device
104
- self.reader_precision = reader_precision or precision
105
- self.reader_kwargs = reader_kwargs
106
- if isinstance(reader, str):
107
- reader_kwargs = reader_kwargs or {}
108
- reader = RelikReaderForSpanExtraction(reader, **reader_kwargs)
109
- self.reader = reader
110
-
111
- # windowization stuff
112
- self.tokenizer = SpacyTokenizer(language="en")
113
- self.window_manager: WindowManager | None = None
114
-
115
- # candidates preprocessing
116
- # TODO: maybe move this logic somewhere else
117
- candidates_preprocessing_fn = candidates_preprocessing_fn or (lambda x: x)
118
- if isinstance(candidates_preprocessing_fn, str):
119
- candidates_preprocessing_fn = get_callable_from_string(
120
- candidates_preprocessing_fn
121
- )
122
- self.candidates_preprocessing_fn = candidates_preprocessing_fn
123
-
124
- # inference params
125
- self.top_k = top_k
126
- self.window_size = window_size
127
- self.window_stride = window_stride
128
-
129
- def __call__(
130
- self,
131
- text: Union[str, list],
132
- top_k: Optional[int] = None,
133
- window_size: Optional[int] = None,
134
- window_stride: Optional[int] = None,
135
- retriever_batch_size: Optional[int] = 32,
136
- reader_batch_size: Optional[int] = 32,
137
- return_also_windows: bool = False,
138
- **kwargs,
139
- ) -> Union[RelikOutput, list[RelikOutput]]:
140
- """
141
- Annotate a text with entities.
142
-
143
- Args:
144
- text (`str` or `list`):
145
- The text to annotate. If a list is provided, each element of the list
146
- will be annotated separately.
147
- top_k (`int`, `optional`, defaults to `None`):
148
- The number of candidates to retrieve for each window.
149
- window_size (`int`, `optional`, defaults to `None`):
150
- The size of the window. If `None`, the whole text will be annotated.
151
- window_stride (`int`, `optional`, defaults to `None`):
152
- The stride of the window. If `None`, there will be no overlap between windows.
153
- retriever_batch_size (`int`, `optional`, defaults to `None`):
154
- The batch size to use for the retriever. The whole input is the batch for the retriever.
155
- reader_batch_size (`int`, `optional`, defaults to `None`):
156
- The batch size to use for the reader. The whole input is the batch for the reader.
157
- return_also_windows (`bool`, `optional`, defaults to `False`):
158
- Whether to return the windows in the output.
159
- **kwargs:
160
- Additional keyword arguments to pass to the retriever and the reader.
161
-
162
- Returns:
163
- `RelikOutput` or `list[RelikOutput]`:
164
- The annotated text. If a list was provided as input, a list of
165
- `RelikOutput` objects will be returned.
166
- """
167
- if top_k is None:
168
- top_k = self.top_k or 100
169
- if window_size is None:
170
- window_size = self.window_size
171
- if window_stride is None:
172
- window_stride = self.window_stride
173
-
174
- if isinstance(text, str):
175
- text = [text]
176
-
177
- if window_size is not None:
178
- if self.window_manager is None:
179
- self.window_manager = WindowManager(self.tokenizer)
180
-
181
- if window_size == "sentence":
182
- # todo: implement sentence windowizer
183
- raise NotImplementedError("Sentence windowizer not implemented yet")
184
-
185
- # if window_size < window_stride:
186
- # raise ValueError(
187
- # f"Window size ({window_size}) must be greater than window stride ({window_stride})"
188
- # )
189
-
190
- # window generator
191
- windows = [
192
- window
193
- for doc_id, t in enumerate(text)
194
- for window in self.window_manager.create_windows(
195
- t,
196
- window_size=window_size,
197
- stride=window_stride,
198
- doc_id=doc_id,
199
- )
200
- ]
201
-
202
- # retrieve candidates first
203
- windows_candidates = []
204
- # TODO: Move batching inside retriever
205
- for batch in batch_generator(windows, batch_size=retriever_batch_size):
206
- retriever_out = self.retriever.retrieve([b.text for b in batch], k=top_k)
207
- windows_candidates.extend(
208
- [[p.label for p in predictions] for predictions in retriever_out]
209
- )
210
-
211
- # add passage to the windows
212
- for window, candidates in zip(windows, windows_candidates):
213
- window.window_candidates = [
214
- self.candidates_preprocessing_fn(c) for c in candidates
215
- ]
216
-
217
- windows = self.reader.read(samples=windows, max_batch_size=reader_batch_size)
218
- windows = self.window_manager.merge_windows(windows)
219
-
220
- # transform predictions into RelikOutput objects
221
- output = []
222
- for w in windows:
223
- sample_output = RelikOutput(
224
- text=text[w.doc_id],
225
- labels=sorted(
226
- [
227
- EntitySpan(
228
- start=ss, end=se, label=sl, text=text[w.doc_id][ss:se]
229
- )
230
- for ss, se, sl in w.predicted_window_labels_chars
231
- ],
232
- key=lambda x: x.start,
233
- ),
234
- )
235
- output.append(sample_output)
236
-
237
- if return_also_windows:
238
- for i, sample_output in enumerate(output):
239
- sample_output.windows = [w for w in windows if w.doc_id == i]
240
-
241
- # if only one text was provided, return a single RelikOutput object
242
- if len(output) == 1:
243
- return output[0]
244
-
245
- return output
246
-
247
- @classmethod
248
- def from_pretrained(
249
- cls,
250
- model_name_or_dir: Union[str, os.PathLike],
251
- config_kwargs: Optional[Dict] = None,
252
- config_file_name: str = CONFIG_NAME,
253
- *args,
254
- **kwargs,
255
- ) -> "Relik":
256
- cache_dir = kwargs.pop("cache_dir", None)
257
- force_download = kwargs.pop("force_download", False)
258
-
259
- model_dir = from_cache(
260
- model_name_or_dir,
261
- filenames=[config_file_name],
262
- cache_dir=cache_dir,
263
- force_download=force_download,
264
- )
265
-
266
- config_path = model_dir / config_file_name
267
- if not config_path.exists():
268
- raise FileNotFoundError(
269
- f"Model configuration file not found at {config_path}."
270
- )
271
-
272
- # overwrite config with config_kwargs
273
- config = OmegaConf.load(config_path)
274
- if config_kwargs is not None:
275
- # TODO: check merging behavior
276
- config = OmegaConf.merge(config, OmegaConf.create(config_kwargs))
277
- # do we want to print the config? I like it
278
- pprint(OmegaConf.to_container(config), console=console_logger, expand_all=True)
279
-
280
- # load relik from config
281
- relik = hydra.utils.instantiate(config, *args, **kwargs)
282
-
283
- return relik
284
-
285
- def save_pretrained(
286
- self,
287
- output_dir: Union[str, os.PathLike],
288
- config: Optional[Dict[str, Any]] = None,
289
- config_file_name: Optional[str] = None,
290
- save_weights: bool = False,
291
- push_to_hub: bool = False,
292
- model_id: Optional[str] = None,
293
- organization: Optional[str] = None,
294
- repo_name: Optional[str] = None,
295
- **kwargs,
296
- ):
297
- """
298
- Save the configuration of Relik to the specified directory as a YAML file.
299
-
300
- Args:
301
- output_dir (`str`):
302
- The directory to save the configuration file to.
303
- config (`Optional[Dict[str, Any]]`, `optional`):
304
- The configuration to save. If `None`, the current configuration will be
305
- saved. Defaults to `None`.
306
- config_file_name (`Optional[str]`, `optional`):
307
- The name of the configuration file. Defaults to `config.yaml`.
308
- save_weights (`bool`, `optional`):
309
- Whether to save the weights of the model. Defaults to `False`.
310
- push_to_hub (`bool`, `optional`):
311
- Whether to push the saved model to the hub. Defaults to `False`.
312
- model_id (`Optional[str]`, `optional`):
313
- The id of the model to push to the hub. If `None`, the name of the
314
- directory will be used. Defaults to `None`.
315
- organization (`Optional[str]`, `optional`):
316
- The organization to push the model to. Defaults to `None`.
317
- repo_name (`Optional[str]`, `optional`):
318
- The name of the repository to push the model to. Defaults to `None`.
319
- **kwargs:
320
- Additional keyword arguments to pass to `OmegaConf.save`.
321
- """
322
- if config is None:
323
- # create a default config
324
- config = {
325
- "_target_": f"{self.__class__.__module__}.{self.__class__.__name__}"
326
- }
327
- if self.retriever is not None:
328
- if self.retriever.question_encoder is not None:
329
- config[
330
- "question_encoder"
331
- ] = self.retriever.question_encoder.name_or_path
332
- if self.retriever.passage_encoder is not None:
333
- config[
334
- "passage_encoder"
335
- ] = self.retriever.passage_encoder.name_or_path
336
- if self.retriever.document_index is not None:
337
- config["document_index"] = self.retriever.document_index.name_or_dir
338
- if self.reader is not None:
339
- config["reader"] = self.reader.model_path
340
-
341
- config["retriever_kwargs"] = self.retriever_kwargs
342
- config["reader_kwargs"] = self.reader_kwargs
343
- # expand the fn as to be able to save it and load it later
344
- config[
345
- "candidates_preprocessing_fn"
346
- ] = f"{self.candidates_preprocessing_fn.__module__}.{self.candidates_preprocessing_fn.__name__}"
347
-
348
- # these are model-specific and should be saved
349
- config["top_k"] = self.top_k
350
- config["window_size"] = self.window_size
351
- config["window_stride"] = self.window_stride
352
-
353
- config_file_name = config_file_name or CONFIG_NAME
354
-
355
- # create the output directory
356
- output_dir = Path(output_dir)
357
- output_dir.mkdir(parents=True, exist_ok=True)
358
-
359
- logger.info(f"Saving relik config to {output_dir / config_file_name}")
360
- # pretty print the config
361
- pprint(config, console=console_logger, expand_all=True)
362
- OmegaConf.save(config, output_dir / config_file_name)
363
-
364
- if save_weights:
365
- model_id = model_id or output_dir.name
366
- retriever_model_id = model_id + "-retriever"
367
- # save weights
368
- logger.info(f"Saving retriever to {output_dir / retriever_model_id}")
369
- self.retriever.save_pretrained(
370
- output_dir / retriever_model_id,
371
- question_encoder_name=retriever_model_id + "-question-encoder",
372
- passage_encoder_name=retriever_model_id + "-passage-encoder",
373
- document_index_name=retriever_model_id + "-index",
374
- push_to_hub=push_to_hub,
375
- organization=organization,
376
- repo_name=repo_name,
377
- **kwargs,
378
- )
379
- reader_model_id = model_id + "-reader"
380
- logger.info(f"Saving reader to {output_dir / reader_model_id}")
381
- self.reader.save_pretrained(
382
- output_dir / reader_model_id,
383
- push_to_hub=push_to_hub,
384
- organization=organization,
385
- repo_name=repo_name,
386
- **kwargs,
387
- )
388
-
389
- if push_to_hub:
390
- # push to hub
391
- logger.info(f"Pushing to hub")
392
- model_id = model_id or output_dir.name
393
- upload(output_dir, model_id, organization=organization, repo_name=repo_name)
394
-
395
-
396
- def main():
397
- from pprint import pprint
398
-
399
- document_index = FaissDocumentIndex.from_pretrained(
400
- "/root/relik-spaces/models/relik-retriever-small-aida-blink-pretrain-omniencoder/document_index",
401
- config_kwargs={"_target_": "relik.retriever.indexers.faiss.FaissDocumentIndex", "index_type": "IVFx,Flat"},
402
- )
403
-
404
- relik = Relik(
405
- question_encoder="/root/relik-spaces/models/relik-retriever-small-aida-blink-pretrain-omniencoder/question_encoder",
406
- document_index=document_index,
407
- reader="/root/relik-spaces/models/relik-reader-aida-deberta-small",
408
- device="cuda",
409
- precision=16,
410
- top_k=100,
411
- window_size=32,
412
- window_stride=16,
413
- candidates_preprocessing_fn="relik.inference.preprocessing.wikipedia_title_and_openings_preprocessing",
414
- )
415
-
416
- input_text = """
417
- Bernie Ecclestone, the former boss of Formula One, has admitted fraud after failing to declare more than £400m held in a trust in Singapore.
418
- The 92-year-old billionaire did not disclose the trust to the government in July 2015.
419
- Appearing at Southwark Crown Court on Thursday, he told the judge "I plead guilty" after having previously pleaded not guilty.
420
- Ecclestone had been due to go on trial next month.
421
- """
422
-
423
- preds = relik(input_text)
424
- pprint(preds)
425
-
426
-
427
- if __name__ == "__main__":
428
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
relik/inference/data/__init__.py DELETED
File without changes
relik/inference/data/objects.py DELETED
@@ -1,64 +0,0 @@
1
- from __future__ import annotations
2
-
3
- from dataclasses import dataclass
4
- from typing import List, NamedTuple, Optional
5
-
6
- from relik.reader.pytorch_modules.hf.modeling_relik import RelikReaderSample
7
-
8
-
9
- @dataclass
10
- class Word:
11
- """
12
- A word representation that includes text, index in the sentence, POS tag, lemma,
13
- dependency relation, and similar information.
14
-
15
- # Parameters
16
- text : `str`, optional
17
- The text representation.
18
- index : `int`, optional
19
- The word offset in the sentence.
20
- lemma : `str`, optional
21
- The lemma of this word.
22
- pos : `str`, optional
23
- The coarse-grained part of speech of this word.
24
- dep : `str`, optional
25
- The dependency relation for this word.
26
-
27
- input_id : `int`, optional
28
- Integer representation of the word, used to pass it to a model.
29
- token_type_id : `int`, optional
30
- Token type id used by some transformers.
31
- attention_mask: `int`, optional
32
- Attention mask used by transformers, indicates to the model which tokens should
33
- be attended to, and which should not.
34
- """
35
-
36
- text: str
37
- index: int
38
- start_char: Optional[int] = None
39
- end_char: Optional[int] = None
40
- # preprocessing fields
41
- lemma: Optional[str] = None
42
- pos: Optional[str] = None
43
- dep: Optional[str] = None
44
- head: Optional[int] = None
45
-
46
- def __str__(self):
47
- return self.text
48
-
49
- def __repr__(self):
50
- return self.__str__()
51
-
52
-
53
- class EntitySpan(NamedTuple):
54
- start: int
55
- end: int
56
- label: str
57
- text: str
58
-
59
-
60
- @dataclass
61
- class RelikOutput:
62
- text: str
63
- labels: List[EntitySpan]
64
- windows: Optional[List[RelikReaderSample]] = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
relik/inference/data/tokenizers/__init__.py DELETED
@@ -1,89 +0,0 @@
1
- SPACY_LANGUAGE_MAPPER = {
2
- "ca": "ca_core_news_sm",
3
- "da": "da_core_news_sm",
4
- "de": "de_core_news_sm",
5
- "el": "el_core_news_sm",
6
- "en": "en_core_web_sm",
7
- "es": "es_core_news_sm",
8
- "fr": "fr_core_news_sm",
9
- "it": "it_core_news_sm",
10
- "ja": "ja_core_news_sm",
11
- "lt": "lt_core_news_sm",
12
- "mk": "mk_core_news_sm",
13
- "nb": "nb_core_news_sm",
14
- "nl": "nl_core_news_sm",
15
- "pl": "pl_core_news_sm",
16
- "pt": "pt_core_news_sm",
17
- "ro": "ro_core_news_sm",
18
- "ru": "ru_core_news_sm",
19
- "xx": "xx_sent_ud_sm",
20
- "zh": "zh_core_web_sm",
21
- "ca_core_news_sm": "ca_core_news_sm",
22
- "ca_core_news_md": "ca_core_news_md",
23
- "ca_core_news_lg": "ca_core_news_lg",
24
- "ca_core_news_trf": "ca_core_news_trf",
25
- "da_core_news_sm": "da_core_news_sm",
26
- "da_core_news_md": "da_core_news_md",
27
- "da_core_news_lg": "da_core_news_lg",
28
- "da_core_news_trf": "da_core_news_trf",
29
- "de_core_news_sm": "de_core_news_sm",
30
- "de_core_news_md": "de_core_news_md",
31
- "de_core_news_lg": "de_core_news_lg",
32
- "de_dep_news_trf": "de_dep_news_trf",
33
- "el_core_news_sm": "el_core_news_sm",
34
- "el_core_news_md": "el_core_news_md",
35
- "el_core_news_lg": "el_core_news_lg",
36
- "en_core_web_sm": "en_core_web_sm",
37
- "en_core_web_md": "en_core_web_md",
38
- "en_core_web_lg": "en_core_web_lg",
39
- "en_core_web_trf": "en_core_web_trf",
40
- "es_core_news_sm": "es_core_news_sm",
41
- "es_core_news_md": "es_core_news_md",
42
- "es_core_news_lg": "es_core_news_lg",
43
- "es_dep_news_trf": "es_dep_news_trf",
44
- "fr_core_news_sm": "fr_core_news_sm",
45
- "fr_core_news_md": "fr_core_news_md",
46
- "fr_core_news_lg": "fr_core_news_lg",
47
- "fr_dep_news_trf": "fr_dep_news_trf",
48
- "it_core_news_sm": "it_core_news_sm",
49
- "it_core_news_md": "it_core_news_md",
50
- "it_core_news_lg": "it_core_news_lg",
51
- "ja_core_news_sm": "ja_core_news_sm",
52
- "ja_core_news_md": "ja_core_news_md",
53
- "ja_core_news_lg": "ja_core_news_lg",
54
- "ja_dep_news_trf": "ja_dep_news_trf",
55
- "lt_core_news_sm": "lt_core_news_sm",
56
- "lt_core_news_md": "lt_core_news_md",
57
- "lt_core_news_lg": "lt_core_news_lg",
58
- "mk_core_news_sm": "mk_core_news_sm",
59
- "mk_core_news_md": "mk_core_news_md",
60
- "mk_core_news_lg": "mk_core_news_lg",
61
- "nb_core_news_sm": "nb_core_news_sm",
62
- "nb_core_news_md": "nb_core_news_md",
63
- "nb_core_news_lg": "nb_core_news_lg",
64
- "nl_core_news_sm": "nl_core_news_sm",
65
- "nl_core_news_md": "nl_core_news_md",
66
- "nl_core_news_lg": "nl_core_news_lg",
67
- "pl_core_news_sm": "pl_core_news_sm",
68
- "pl_core_news_md": "pl_core_news_md",
69
- "pl_core_news_lg": "pl_core_news_lg",
70
- "pt_core_news_sm": "pt_core_news_sm",
71
- "pt_core_news_md": "pt_core_news_md",
72
- "pt_core_news_lg": "pt_core_news_lg",
73
- "ro_core_news_sm": "ro_core_news_sm",
74
- "ro_core_news_md": "ro_core_news_md",
75
- "ro_core_news_lg": "ro_core_news_lg",
76
- "ru_core_news_sm": "ru_core_news_sm",
77
- "ru_core_news_md": "ru_core_news_md",
78
- "ru_core_news_lg": "ru_core_news_lg",
79
- "xx_ent_wiki_sm": "xx_ent_wiki_sm",
80
- "xx_sent_ud_sm": "xx_sent_ud_sm",
81
- "zh_core_web_sm": "zh_core_web_sm",
82
- "zh_core_web_md": "zh_core_web_md",
83
- "zh_core_web_lg": "zh_core_web_lg",
84
- "zh_core_web_trf": "zh_core_web_trf",
85
- }
86
-
87
- from relik.inference.data.tokenizers.regex_tokenizer import RegexTokenizer
88
- from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
89
- from relik.inference.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
relik/inference/data/tokenizers/base_tokenizer.py DELETED
@@ -1,84 +0,0 @@
1
- from typing import List, Union
2
-
3
- from relik.inference.data.objects import Word
4
-
5
-
6
- class BaseTokenizer:
7
- """
8
- A :obj:`Tokenizer` splits strings of text into single words, optionally adds
9
- pos tags and perform lemmatization.
10
- """
11
-
12
- def __call__(
13
- self,
14
- texts: Union[str, List[str], List[List[str]]],
15
- is_split_into_words: bool = False,
16
- **kwargs
17
- ) -> List[List[Word]]:
18
- """
19
- Tokenize the input into single words.
20
-
21
- Args:
22
- texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
23
- Text to tag. It can be a single string, a batch of string and pre-tokenized strings.
24
- is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
25
- If :obj:`True` and the input is a string, the input is split on spaces.
26
-
27
- Returns:
28
- :obj:`List[List[Word]]`: The input text tokenized in single words.
29
- """
30
- raise NotImplementedError
31
-
32
- def tokenize(self, text: str) -> List[Word]:
33
- """
34
- Implements splitting words into tokens.
35
-
36
- Args:
37
- text (:obj:`str`):
38
- Text to tokenize.
39
-
40
- Returns:
41
- :obj:`List[Word]`: The input text tokenized in single words.
42
-
43
- """
44
- raise NotImplementedError
45
-
46
- def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
47
- """
48
- Implements batch splitting words into tokens.
49
-
50
- Args:
51
- texts (:obj:`List[str]`):
52
- Batch of text to tokenize.
53
-
54
- Returns:
55
- :obj:`List[List[Word]]`: The input batch tokenized in single words.
56
-
57
- """
58
- return [self.tokenize(text) for text in texts]
59
-
60
- @staticmethod
61
- def check_is_batched(
62
- texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
63
- ):
64
- """
65
- Check if input is batched or a single sample.
66
-
67
- Args:
68
- texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
69
- Text to check.
70
- is_split_into_words (:obj:`bool`):
71
- If :obj:`True` and the input is a string, the input is split on spaces.
72
-
73
- Returns:
74
- :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
75
- """
76
- return bool(
77
- (not is_split_into_words and isinstance(texts, (list, tuple)))
78
- or (
79
- is_split_into_words
80
- and isinstance(texts, (list, tuple))
81
- and texts
82
- and isinstance(texts[0], (list, tuple))
83
- )
84
- )
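
The interface above is what the concrete tokenizers below implement. A minimal, standalone sketch of the same contract (illustrative only: it returns plain (text, index, start_char, end_char) tuples instead of relik's Word objects, so it runs without the deleted package):

    from typing import List, Tuple


    class ToyWhitespaceTokenizer:
        """Illustrative stand-in for a BaseTokenizer subclass."""

        def tokenize(self, text: str) -> List[Tuple[str, int, int, int]]:
            tokens, cursor = [], 0
            for i, tok in enumerate(text.split()):
                start = text.index(tok, cursor)  # first occurrence after the previous token
                end = start + len(tok)
                tokens.append((tok, i, start, end))
                cursor = end
            return tokens

        def tokenize_batch(self, texts: List[str]) -> List[List[Tuple[str, int, int, int]]]:
            return [self.tokenize(t) for t in texts]


    print(ToyWhitespaceTokenizer().tokenize("Mary sold the car to John."))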
relik/inference/data/tokenizers/regex_tokenizer.py DELETED
@@ -1,73 +0,0 @@
1
- import re
2
- from typing import List, Union
3
-
4
- from overrides import overrides
5
-
6
- from relik.inference.data.objects import Word
7
- from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
8
-
9
-
10
- class RegexTokenizer(BaseTokenizer):
11
- """
12
- A :obj:`Tokenizer` that splits the text based on a simple regex.
13
- """
14
-
15
- def __init__(self):
16
- super(RegexTokenizer, self).__init__()
17
- # regex for splitting on spaces and punctuation and new lines
18
- # self._regex = re.compile(r"\S+|[\[\](),.!?;:\"]|\\n")
19
- self._regex = re.compile(
20
- r"\w+|\$[\d\.]+|\S+", re.UNICODE | re.MULTILINE | re.DOTALL
21
- )
22
-
23
- def __call__(
24
- self,
25
- texts: Union[str, List[str], List[List[str]]],
26
- is_split_into_words: bool = False,
27
- **kwargs,
28
- ) -> List[List[Word]]:
29
- """
30
- Tokenize the input into single words by splitting using a simple regex.
31
-
32
- Args:
33
- texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
34
- Text to tag. It can be a single string, a batch of strings, or pre-tokenized strings.
35
- is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
36
- If :obj:`True` and the input is a string, the input is split on spaces.
37
-
38
- Returns:
39
- :obj:`List[List[Word]]`: The input text tokenized in single words.
40
-
41
- Example::
42
-
43
- >>> from relik.inference.data.tokenizers.regex_tokenizer import RegexTokenizer
44
-
45
- >>> regex_tokenizer = RegexTokenizer()
46
- >>> regex_tokenizer("Mary sold the car to John.")
47
-
48
- """
49
- # check if input is batched or a single sample
50
- is_batched = self.check_is_batched(texts, is_split_into_words)
51
-
52
- if is_batched:
53
- tokenized = self.tokenize_batch(texts)
54
- else:
55
- tokenized = self.tokenize(texts)
56
-
57
- return tokenized
58
-
59
- @overrides
60
- def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
61
- if not isinstance(text, (str, list)):
62
- raise ValueError(
63
- f"text must be either `str` or `list`, found: `{type(text)}`"
64
- )
65
-
66
- if isinstance(text, list):
67
- text = " ".join(text)
68
- return [
69
- Word(t[0], i, start_char=t[1], end_char=t[2])
70
- for i, t in enumerate(
71
- (m.group(0), m.start(), m.end()) for m in self._regex.finditer(text)
72
- )
73
- ]
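
A standalone sketch of what the regex split above yields, i.e. (token, start_char, end_char) triples; the Word wrapper is dropped so the snippet runs without the deleted package, and the pattern is the one from RegexTokenizer.__init__:

    import re

    pattern = re.compile(r"\w+|\$[\d\.]+|\S+", re.UNICODE | re.MULTILINE | re.DOTALL)
    text = "Mary sold the car to John."
    print([(m.group(0), m.start(), m.end()) for m in pattern.finditer(text)])
    # [('Mary', 0, 4), ('sold', 5, 9), ('the', 10, 13), ('car', 14, 17), ('to', 18, 20), ('John', 21, 25), ('.', 25, 26)]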
relik/inference/data/tokenizers/spacy_tokenizer.py DELETED
@@ -1,228 +0,0 @@
1
- import logging
2
- from typing import Dict, List, Tuple, Union
3
-
4
- import spacy
5
-
6
- # from ipa.common.utils import load_spacy
7
- from overrides import overrides
8
- from spacy.cli.download import download as spacy_download
9
- from spacy.tokens import Doc
10
-
11
- from relik.common.log import get_logger
12
- from relik.inference.data.objects import Word
13
- from relik.inference.data.tokenizers import SPACY_LANGUAGE_MAPPER
14
- from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
15
-
16
- logger = get_logger(level=logging.DEBUG)
17
-
18
- # Spacy and Stanza stuff
19
-
20
- LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool], spacy.Language] = {}
21
-
22
-
23
- def load_spacy(
24
- language: str,
25
- pos_tags: bool = False,
26
- lemma: bool = False,
27
- parse: bool = False,
28
- split_on_spaces: bool = False,
29
- ) -> spacy.Language:
30
- """
31
- Download and load spacy model.
32
-
33
- Args:
34
- language (:obj:`str`, defaults to :obj:`en`):
35
- Language of the text to tokenize.
36
- pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
37
- If :obj:`True`, performs POS tagging with spacy model.
38
- lemma (:obj:`bool`, optional, defaults to :obj:`False`):
39
- If :obj:`True`, performs lemmatization with spacy model.
40
- parse (:obj:`bool`, optional, defaults to :obj:`False`):
41
- If :obj:`True`, performs dependency parsing with spacy model.
42
- split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
43
- If :obj:`True`, will split by spaces without performing tokenization.
44
-
45
- Returns:
46
- :obj:`spacy.Language`: The spacy model loaded.
47
- """
48
- exclude = ["vectors", "textcat", "ner"]
49
- if not pos_tags:
50
- exclude.append("tagger")
51
- if not lemma:
52
- exclude.append("lemmatizer")
53
- if not parse:
54
- exclude.append("parser")
55
-
56
- # check if the model is already loaded
57
- # if so, there is no need to reload it
58
- spacy_params = (language, pos_tags, lemma, parse, split_on_spaces)
59
- if spacy_params not in LOADED_SPACY_MODELS:
60
- try:
61
- spacy_tagger = spacy.load(language, exclude=exclude)
62
- except OSError:
63
- logger.warning(
64
- "Spacy model '%s' not found. Downloading and installing.", language
65
- )
66
- spacy_download(language)
67
- spacy_tagger = spacy.load(language, exclude=exclude)
68
-
69
- # if everything is disabled, return only the tokenizer
70
- # for faster tokenization
71
- # TODO: is it really faster?
72
- # if len(exclude) >= 6:
73
- # spacy_tagger = spacy_tagger.tokenizer
74
- LOADED_SPACY_MODELS[spacy_params] = spacy_tagger
75
-
76
- return LOADED_SPACY_MODELS[spacy_params]
77
-
78
-
79
- class SpacyTokenizer(BaseTokenizer):
80
- """
81
- A :obj:`Tokenizer` that uses SpaCy to tokenize and preprocess the text. It returns :obj:`Word` objects.
82
-
83
- Args:
84
- language (:obj:`str`, optional, defaults to :obj:`en`):
85
- Language of the text to tokenize.
86
- return_pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
87
- If :obj:`True`, performs POS tagging with spacy model.
88
- return_lemmas (:obj:`bool`, optional, defaults to :obj:`False`):
89
- If :obj:`True`, performs lemmatization with spacy model.
90
- return_deps (:obj:`bool`, optional, defaults to :obj:`False`):
91
- If :obj:`True`, performs dependency parsing with spacy model.
92
- split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
93
- If :obj:`True`, will split by spaces without performing tokenization.
94
- use_gpu (:obj:`bool`, optional, defaults to :obj:`False`):
95
- If :obj:`True`, will load the spaCy model on GPU.
96
- """
97
-
98
- def __init__(
99
- self,
100
- language: str = "en",
101
- return_pos_tags: bool = False,
102
- return_lemmas: bool = False,
103
- return_deps: bool = False,
104
- split_on_spaces: bool = False,
105
- use_gpu: bool = False,
106
- ):
107
- super(SpacyTokenizer, self).__init__()
108
- if language not in SPACY_LANGUAGE_MAPPER:
109
- raise ValueError(
110
- f"`{language}` language not supported. The supported "
111
- f"languages are: {list(SPACY_LANGUAGE_MAPPER.keys())}."
112
- )
113
- if use_gpu:
114
- # load the model on GPU
115
- # if the GPU is not available or not correctly configured,
116
- # it will raise an error
117
- spacy.require_gpu()
118
- self.spacy = load_spacy(
119
- SPACY_LANGUAGE_MAPPER[language],
120
- return_pos_tags,
121
- return_lemmas,
122
- return_deps,
123
- split_on_spaces,
124
- )
125
- self.split_on_spaces = split_on_spaces
126
-
127
- def __call__(
128
- self,
129
- texts: Union[str, List[str], List[List[str]]],
130
- is_split_into_words: bool = False,
131
- **kwargs,
132
- ) -> Union[List[Word], List[List[Word]]]:
133
- """
134
- Tokenize the input into single words using SpaCy models.
135
-
136
- Args:
137
- texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
138
- Text to tag. It can be a single string, a batch of strings, or pre-tokenized strings.
139
- is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
140
- If :obj:`True` and the input is a string, the input is split on spaces.
141
-
142
- Returns:
143
- :obj:`List[List[Word]]`: The input text tokenized in single words.
144
-
145
- Example::
146
-
147
- >>> from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
148
-
149
- >>> spacy_tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
150
- >>> spacy_tokenizer("Mary sold the car to John.")
151
-
152
- """
153
- # check if input is batched or a single sample
154
- is_batched = self.check_is_batched(texts, is_split_into_words)
155
- if is_batched:
156
- tokenized = self.tokenize_batch(texts)
157
- else:
158
- tokenized = self.tokenize(texts)
159
- return tokenized
160
-
161
- @overrides
162
- def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
163
- if self.split_on_spaces:
164
- if isinstance(text, str):
165
- text = text.split(" ")
166
- spaces = [True] * len(text)
167
- text = Doc(self.spacy.vocab, words=text, spaces=spaces)
168
- return self._clean_tokens(self.spacy(text))
169
-
170
- @overrides
171
- def tokenize_batch(
172
- self, texts: Union[List[str], List[List[str]]]
173
- ) -> List[List[Word]]:
174
- if self.split_on_spaces:
175
- if isinstance(texts[0], str):
176
- texts = [text.split(" ") for text in texts]
177
- spaces = [[True] * len(text) for text in texts]
178
- texts = [
179
- Doc(self.spacy.vocab, words=text, spaces=space)
180
- for text, space in zip(texts, spaces)
181
- ]
182
- return [self._clean_tokens(tokens) for tokens in self.spacy.pipe(texts)]
183
-
184
- @staticmethod
185
- def _clean_tokens(tokens: Doc) -> List[Word]:
186
- """
187
- Converts spaCy tokens to :obj:`Word`.
188
-
189
- Args:
190
- tokens (:obj:`spacy.tokens.Doc`):
191
- Tokens from SpaCy model.
192
-
193
- Returns:
194
- :obj:`List[Word]`: The SpaCy model output converted into :obj:`Word` objects.
195
- """
196
- words = [
197
- Word(
198
- token.text,
199
- token.i,
200
- token.idx,
201
- token.idx + len(token),
202
- token.lemma_,
203
- token.pos_,
204
- token.dep_,
205
- token.head.i,
206
- )
207
- for token in tokens
208
- ]
209
- return words
210
-
211
-
212
- class WhitespaceSpacyTokenizer:
213
- """Simple white space tokenizer for SpaCy."""
214
-
215
- def __init__(self, vocab):
216
- self.vocab = vocab
217
-
218
- def __call__(self, text):
219
- if isinstance(text, str):
220
- words = text.split(" ")
221
- elif isinstance(text, list):
222
- words = text
223
- else:
224
- raise ValueError(
225
- f"text must be either `str` or `list`, found: `{type(text)}`"
226
- )
227
- spaces = [True] * len(words)
228
- return Doc(self.vocab, words=words, spaces=spaces)
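
A hedged usage sketch of the deleted SpacyTokenizer; it assumes an installed relik checkout that still ships this module and that SPACY_LANGUAGE_MAPPER maps the short code "en" to a downloadable spaCy pipeline, as the class docstring suggests:

    from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer

    tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
    words = tokenizer("Mary sold the car to John.")  # single string -> one list of Word objects
    print(words)

Note the module-level LOADED_SPACY_MODELS cache: pipelines are keyed on (language, pos_tags, lemma, parse, split_on_spaces), so two tokenizers created with the same settings share a single spaCy model within the process.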
relik/inference/data/tokenizers/whitespace_tokenizer.py DELETED
@@ -1,70 +0,0 @@
1
- import re
2
- from typing import List, Union
3
-
4
- from overrides import overrides
5
-
6
- from relik.inference.data.objects import Word
7
- from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
8
-
9
-
10
- class WhitespaceTokenizer(BaseTokenizer):
11
- """
12
- A :obj:`Tokenizer` that splits the text on spaces.
13
- """
14
-
15
- def __init__(self):
16
- super(WhitespaceTokenizer, self).__init__()
17
- self.whitespace_regex = re.compile(r"\S+")
18
-
19
- def __call__(
20
- self,
21
- texts: Union[str, List[str], List[List[str]]],
22
- is_split_into_words: bool = False,
23
- **kwargs,
24
- ) -> List[List[Word]]:
25
- """
26
- Tokenize the input into single words by splitting on spaces.
27
-
28
- Args:
29
- texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
30
- Text to tag. It can be a single string, a batch of strings, or pre-tokenized strings.
31
- is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
32
- If :obj:`True` and the input is a string, the input is split on spaces.
33
-
34
- Returns:
35
- :obj:`List[List[Word]]`: The input text tokenized in single words.
36
-
37
- Example::
38
-
39
- >>> from relik.inference.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
40
-
41
- >>> whitespace_tokenizer = WhitespaceTokenizer()
42
- >>> whitespace_tokenizer("Mary sold the car to John .")
43
-
44
- """
45
- # check if input is batched or a single sample
46
- is_batched = self.check_is_batched(texts, is_split_into_words)
47
-
48
- if is_batched:
49
- tokenized = self.tokenize_batch(texts)
50
- else:
51
- tokenized = self.tokenize(texts)
52
-
53
- return tokenized
54
-
55
- @overrides
56
- def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
57
- if not isinstance(text, (str, list)):
58
- raise ValueError(
59
- f"text must be either `str` or `list`, found: `{type(text)}`"
60
- )
61
-
62
- if isinstance(text, list):
63
- text = " ".join(text)
64
- return [
65
- Word(t[0], i, start_char=t[1], end_char=t[2])
66
- for i, t in enumerate(
67
- (m.group(0), m.start(), m.end())
68
- for m in self.whitespace_regex.finditer(text)
69
- )
70
- ]
relik/inference/data/window/__init__.py DELETED
File without changes
relik/inference/data/window/manager.py DELETED
@@ -1,262 +0,0 @@
1
- import collections
2
- import itertools
3
- from dataclasses import dataclass
4
- from typing import List, Optional, Set, Tuple
5
-
6
- from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
7
- from relik.reader.data.relik_reader_sample import RelikReaderSample
8
-
9
-
10
- @dataclass
11
- class Window:
12
- doc_id: int
13
- window_id: int
14
- text: str
15
- tokens: List[str]
16
- doc_topic: Optional[str]
17
- offset: int
18
- token2char_start: dict
19
- token2char_end: dict
20
- window_candidates: Optional[List[str]] = None
21
-
22
-
23
- class WindowManager:
24
- def __init__(self, tokenizer: BaseTokenizer) -> None:
25
- self.tokenizer = tokenizer
26
-
27
- def tokenize(self, document: str) -> Tuple[List[str], List[Tuple[int, int]]]:
28
- tokenized_document = self.tokenizer(document)
29
- tokens = []
30
- tokens_char_mapping = []
31
- for token in tokenized_document:
32
- tokens.append(token.text)
33
- tokens_char_mapping.append((token.start_char, token.end_char))
34
- return tokens, tokens_char_mapping
35
-
36
- def create_windows(
37
- self,
38
- document: str,
39
- window_size: int,
40
- stride: int,
41
- doc_id: int = 0,
42
- doc_topic: str = None,
43
- ) -> List[RelikReaderSample]:
44
- document_tokens, tokens_char_mapping = self.tokenize(document)
45
- if doc_topic is None:
46
- doc_topic = document_tokens[0] if len(document_tokens) > 0 else ""
47
- document_windows = []
48
- if len(document_tokens) <= window_size:
49
- text = document
50
- # relik_reader_sample = RelikReaderSample()
51
- document_windows.append(
52
- # Window(
53
- RelikReaderSample(
54
- doc_id=doc_id,
55
- window_id=0,
56
- text=text,
57
- tokens=document_tokens,
58
- doc_topic=doc_topic,
59
- offset=0,
60
- token2char_start={
61
- str(i): tokens_char_mapping[i][0]
62
- for i in range(len(document_tokens))
63
- },
64
- token2char_end={
65
- str(i): tokens_char_mapping[i][1]
66
- for i in range(len(document_tokens))
67
- },
68
- )
69
- )
70
- else:
71
- for window_id, i in enumerate(range(0, len(document_tokens), stride)):
72
- # if the last stride is smaller than the window size, then we can
73
- # include more tokens from the previous window.
74
- if i != 0 and i + window_size > len(document_tokens):
75
- overflowing_tokens = i + window_size - len(document_tokens)
76
- if overflowing_tokens >= stride:
77
- break
78
- i -= overflowing_tokens
79
-
80
- involved_token_indices = list(
81
- range(i, min(i + window_size, len(document_tokens) - 1))
82
- )
83
- window_tokens = [document_tokens[j] for j in involved_token_indices]
84
- window_text_start = tokens_char_mapping[involved_token_indices[0]][0]
85
- window_text_end = tokens_char_mapping[involved_token_indices[-1]][1]
86
- text = document[window_text_start:window_text_end]
87
- document_windows.append(
88
- # Window(
89
- RelikReaderSample(
90
- # dict(
91
- doc_id=doc_id,
92
- window_id=window_id,
93
- text=text,
94
- tokens=window_tokens,
95
- doc_topic=doc_topic,
96
- offset=window_text_start,
97
- token2char_start={
98
- str(i): tokens_char_mapping[ti][0]
99
- for i, ti in enumerate(involved_token_indices)
100
- },
101
- token2char_end={
102
- str(i): tokens_char_mapping[ti][1]
103
- for i, ti in enumerate(involved_token_indices)
104
- },
105
- # )
106
- )
107
- )
108
- return document_windows
109
-
110
- def merge_windows(
111
- self, windows: List[RelikReaderSample]
112
- ) -> List[RelikReaderSample]:
113
- windows_by_doc_id = collections.defaultdict(list)
114
- for window in windows:
115
- windows_by_doc_id[window.doc_id].append(window)
116
-
117
- merged_window_by_doc = {
118
- doc_id: self.merge_doc_windows(doc_windows)
119
- for doc_id, doc_windows in windows_by_doc_id.items()
120
- }
121
-
122
- return list(merged_window_by_doc.values())
123
-
124
- def merge_doc_windows(self, windows: List[RelikReaderSample]) -> RelikReaderSample:
125
- if len(windows) == 1:
126
- return windows[0]
127
-
128
- if len(windows) > 0 and getattr(windows[0], "offset", None) is not None:
129
- windows = sorted(windows, key=(lambda x: x.offset))
130
-
131
- window_accumulator = windows[0]
132
-
133
- for next_window in windows[1:]:
134
- window_accumulator = self._merge_window_pair(
135
- window_accumulator, next_window
136
- )
137
-
138
- return window_accumulator
139
-
140
- def _merge_tokens(
141
- self, window1: RelikReaderSample, window2: RelikReaderSample
142
- ) -> Tuple[list, dict, dict]:
143
- w1_tokens = window1.tokens[1:-1]
144
- w2_tokens = window2.tokens[1:-1]
145
-
146
- # find intersection
147
- tokens_intersection = None
148
- for k in reversed(range(1, len(w1_tokens))):
149
- if w1_tokens[-k:] == w2_tokens[:k]:
150
- tokens_intersection = k
151
- break
152
- assert tokens_intersection is not None, (
153
- f"{window1.doc_id} - {window1.sent_id} - {window1.offset}"
154
- + f" {window2.doc_id} - {window2.sent_id} - {window2.offset}\n"
155
- + f"w1 tokens: {w1_tokens}\n"
156
- + f"w2 tokens: {w2_tokens}\n"
157
- )
158
-
159
- final_tokens = (
160
- [window1.tokens[0]] # CLS
161
- + w1_tokens
162
- + w2_tokens[tokens_intersection:]
163
- + [window1.tokens[-1]] # SEP
164
- )
165
-
166
- w2_starting_offset = len(w1_tokens) - tokens_intersection
167
-
168
- def merge_char_mapping(t2c1: dict, t2c2: dict) -> dict:
169
- final_t2c = dict()
170
- final_t2c.update(t2c1)
171
- for t, c in t2c2.items():
172
- t = int(t)
173
- if t < tokens_intersection:
174
- continue
175
- final_t2c[str(t + w2_starting_offset)] = c
176
- return final_t2c
177
-
178
- return (
179
- final_tokens,
180
- merge_char_mapping(window1.token2char_start, window2.token2char_start),
181
- merge_char_mapping(window1.token2char_end, window2.token2char_end),
182
- )
183
-
184
- def _merge_span_annotation(
185
- self, span_annotation1: List[list], span_annotation2: List[list]
186
- ) -> List[list]:
187
- uniq_store = set()
188
- final_span_annotation_store = []
189
- for span_annotation in itertools.chain(span_annotation1, span_annotation2):
190
- span_annotation_id = tuple(span_annotation)
191
- if span_annotation_id not in uniq_store:
192
- uniq_store.add(span_annotation_id)
193
- final_span_annotation_store.append(span_annotation)
194
- return sorted(final_span_annotation_store, key=lambda x: x[0])
195
-
196
- def _merge_predictions(
197
- self,
198
- window1: RelikReaderSample,
199
- window2: RelikReaderSample,
200
- ) -> Tuple[Set[Tuple[int, int, str]], dict]:
201
- merged_predictions = window1.predicted_window_labels_chars.union(
202
- window2.predicted_window_labels_chars
203
- )
204
-
205
- span_title_probabilities = dict()
206
- # probabilities
207
- for span_prediction, predicted_probs in itertools.chain(
208
- window1.probs_window_labels_chars.items(),
209
- window2.probs_window_labels_chars.items(),
210
- ):
211
- if span_prediction not in span_title_probabilities:
212
- span_title_probabilities[span_prediction] = predicted_probs
213
-
214
- return merged_predictions, span_title_probabilities
215
-
216
- def _merge_window_pair(
217
- self,
218
- window1: RelikReaderSample,
219
- window2: RelikReaderSample,
220
- ) -> RelikReaderSample:
221
- merging_output = dict()
222
-
223
- if getattr(window1, "doc_id", None) is not None:
224
- assert window1.doc_id == window2.doc_id
225
-
226
- if getattr(window1, "offset", None) is not None:
227
- assert (
228
- window1.offset < window2.offset
229
- ), f"window 2 offset ({window2.offset}) is smaller that window 1 offset({window1.offset})"
230
-
231
- merging_output["doc_id"] = window1.doc_id
232
- merging_output["offset"] = window2.offset
233
-
234
- m_tokens, m_token2char_start, m_token2char_end = self._merge_tokens(
235
- window1, window2
236
- )
237
-
238
- window_labels = None
239
- if getattr(window1, "window_labels", None) is not None:
240
- window_labels = self._merge_span_annotation(
241
- window1.window_labels, window2.window_labels
242
- )
243
- (
244
- predicted_window_labels_chars,
245
- probs_window_labels_chars,
246
- ) = self._merge_predictions(
247
- window1,
248
- window2,
249
- )
250
-
251
- merging_output.update(
252
- dict(
253
- tokens=m_tokens,
254
- token2char_start=m_token2char_start,
255
- token2char_end=m_token2char_end,
256
- window_labels=window_labels,
257
- predicted_window_labels_chars=predicted_window_labels_chars,
258
- probs_window_labels_chars=probs_window_labels_chars,
259
- )
260
- )
261
-
262
- return RelikReaderSample(**merging_output)
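
The windowing arithmetic in create_windows (fixed window_size, overlapping stride, and a last window that is shifted back rather than allowed to overflow) can be checked standalone on token indices. Note that the deleted code caps the window end at len(document_tokens) - 1, which drops the final token; the sketch below uses the full length instead:

    def window_spans(n_tokens: int, window_size: int, stride: int):
        """Token-index spans produced by the sliding-window logic above (illustrative)."""
        spans = []
        for i in range(0, n_tokens, stride):
            if i != 0 and i + window_size > n_tokens:
                overflow = i + window_size - n_tokens
                if overflow >= stride:  # window would only repeat already-covered tokens
                    break
                i -= overflow           # shift the last window back instead of overflowing
            spans.append((i, min(i + window_size, n_tokens)))
        return spans

    print(window_spans(10, window_size=6, stride=4))  # [(0, 6), (4, 10)]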
relik/inference/gerbil.py DELETED
@@ -1,254 +0,0 @@
1
- import argparse
2
- import json
3
- import os
4
- import re
5
- import sys
6
- from http.server import BaseHTTPRequestHandler, HTTPServer
7
- from typing import Iterator, List, Optional, Tuple
8
-
9
- from relik.inference.annotator import Relik
10
- from relik.inference.data.objects import RelikOutput
11
-
12
- # sys.path += ['../']
13
- sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))
14
-
15
-
16
- import logging
17
-
18
- logger = logging.getLogger(__name__)
19
-
20
-
21
- class GerbilAlbyManager:
22
- def __init__(
23
- self,
24
- annotator: Optional[Relik] = None,
25
- response_logger_dir: Optional[str] = None,
26
- ) -> None:
27
- self.annotator = annotator
28
- self.response_logger_dir = response_logger_dir
29
- self.predictions_counter = 0
30
- self.labels_mapping = None
31
-
32
- def annotate(self, document: str):
33
- relik_output: RelikOutput = self.annotator(document)
34
- annotations = [(ss, se, l) for ss, se, l, _ in relik_output.labels]
35
- if self.labels_mapping is not None:
36
- return [
37
- (ss, se, self.labels_mapping.get(l, l)) for ss, se, l in annotations
38
- ]
39
- return annotations
40
-
41
- def set_mapping_file(self, mapping_file_path: str):
42
- with open(mapping_file_path) as f:
43
- labels_mapping = json.load(f)
44
- self.labels_mapping = {v: k for k, v in labels_mapping.items()}
45
-
46
- def write_response_bundle(
47
- self,
48
- document: str,
49
- new_document: str,
50
- annotations: list,
51
- mapped_annotations: list,
52
- ) -> None:
53
- if self.response_logger_dir is None:
54
- return
55
-
56
- if not os.path.isdir(self.response_logger_dir):
57
- os.mkdir(self.response_logger_dir)
58
-
59
- with open(
60
- f"{self.response_logger_dir}/{self.predictions_counter}.json", "w"
61
- ) as f:
62
- out_json_obj = dict(
63
- document=document,
64
- new_document=new_document,
65
- annotations=annotations,
66
- mapped_annotations=mapped_annotations,
67
- )
68
-
69
- out_json_obj["span_annotations"] = [
70
- (ss, se, document[ss:se], label) for (ss, se, label) in annotations
71
- ]
72
-
73
- out_json_obj["span_mapped_annotations"] = [
74
- (ss, se, new_document[ss:se], label)
75
- for (ss, se, label) in mapped_annotations
76
- ]
77
-
78
- json.dump(out_json_obj, f, indent=2)
79
-
80
- self.predictions_counter += 1
81
-
82
-
83
- manager = GerbilAlbyManager()
84
-
85
-
86
- def preprocess_document(document: str) -> Tuple[str, List[Tuple[int, int]]]:
87
- pattern_subs = {
88
- "-LPR- ": " (",
89
- "-RPR-": ")",
90
- "\n\n": "\n",
91
- "-LRB-": "(",
92
- "-RRB-": ")",
93
- '","': ",",
94
- }
95
-
96
- document_acc = document
97
- curr_offset = 0
98
- char2offset = []
99
-
100
- matchings = re.finditer("({})".format("|".join(pattern_subs)), document)
101
- for span_matching in sorted(matchings, key=lambda x: x.span()[0]):
102
- span_start, span_end = span_matching.span()
103
- span_start -= curr_offset
104
- span_end -= curr_offset
105
-
106
- span_text = document_acc[span_start:span_end]
107
- span_sub = pattern_subs[span_text]
108
- document_acc = document_acc[:span_start] + span_sub + document_acc[span_end:]
109
-
110
- offset = len(span_text) - len(span_sub)
111
- curr_offset += offset
112
-
113
- char2offset.append((span_start + len(span_sub), curr_offset))
114
-
115
- return document_acc, char2offset
116
-
117
-
118
- def map_back_annotations(
119
- annotations: List[Tuple[int, int, str]], char_mapping: List[Tuple[int, int]]
120
- ) -> Iterator[Tuple[int, int, str]]:
121
- def map_char(char_idx: int) -> int:
122
- current_offset = 0
123
- for offset_idx, offset_value in char_mapping:
124
- if char_idx >= offset_idx:
125
- current_offset = offset_value
126
- else:
127
- break
128
- return char_idx + current_offset
129
-
130
- for ss, se, label in annotations:
131
- yield map_char(ss), map_char(se), label
132
-
133
-
134
- def annotate(document: str) -> List[Tuple[int, int, str]]:
135
- new_document, mapping = preprocess_document(document)
136
- logger.info("Mapping: " + str(mapping))
137
- logger.info("Document: " + str(document))
138
- annotations = [
139
- (cs, ce, label.replace(" ", "_"))
140
- for cs, ce, label in manager.annotate(new_document)
141
- ]
142
- logger.info("New document: " + str(new_document))
143
- mapped_annotations = (
144
- list(map_back_annotations(annotations, mapping))
145
- if len(mapping) > 0
146
- else annotations
147
- )
148
-
149
- logger.info(
150
- "Annotations: "
151
- + str([(ss, se, document[ss:se], ann) for ss, se, ann in mapped_annotations])
152
- )
153
-
154
- manager.write_response_bundle(
155
- document, new_document, mapped_annotations, annotations
156
- )
157
-
158
- if not all(
159
- [
160
- new_document[ss:se] == document[mss:mse]
161
- for (mss, mse, _), (ss, se, _) in zip(mapped_annotations, annotations)
162
- ]
163
- ):
164
- diff_mappings = [
165
- (new_document[ss:se], document[mss:mse])
166
- for (mss, mse, _), (ss, se, _) in zip(mapped_annotations, annotations)
167
- ]
168
- return None
169
- assert all(
170
- [
171
- document[mss:mse] == new_document[ss:se]
172
- for (mss, mse, _), (ss, se, _) in zip(mapped_annotations, annotations)
173
- ]
174
- ), (mapped_annotations, annotations)
175
-
176
- return [(cs, ce - cs, label) for cs, ce, label in mapped_annotations]
177
-
178
-
179
- class GetHandler(BaseHTTPRequestHandler):
180
- def do_POST(self):
181
- content_length = int(self.headers["Content-Length"])
182
- post_data = self.rfile.read(content_length)
183
- self.send_response(200)
184
- self.end_headers()
185
- doc_text = read_json(post_data)
186
- # try:
187
- response = annotate(doc_text)
188
-
189
- self.wfile.write(bytes(json.dumps(response), "utf-8"))
190
- return
191
-
192
-
193
- def read_json(post_data):
194
- data = json.loads(post_data.decode("utf-8"))
195
- # logger.info("received data:", data)
196
- text = data["text"]
197
- # spans = [(int(j["start"]), int(j["length"])) for j in data["spans"]]
198
- return text
199
-
200
-
201
- def parse_args() -> argparse.Namespace:
202
- parser = argparse.ArgumentParser()
203
- parser.add_argument("--relik-model-name", required=True)
204
- parser.add_argument("--responses-log-dir")
205
- parser.add_argument("--log-file", default="logs/logging.txt")
206
- parser.add_argument("--mapping-file")
207
- return parser.parse_args()
208
-
209
-
210
- def main():
211
- args = parse_args()
212
-
213
- # init manager
214
- manager.response_logger_dir = args.responses_log_dir
215
- # manager.annotator = Relik.from_pretrained(args.relik_model_name)
216
-
217
- print("Debugging: not using your relik model but a hardcoded one.")
218
- manager.annotator = Relik(
219
- question_encoder="riccorl/relik-retriever-aida-blink-pretrain-omniencoder",
220
- document_index="riccorl/index-relik-retriever-aida-blink-pretrain-omniencoder",
221
- reader="relik/reader/models/relik-reader-deberta-base-new-data",
222
- window_size=32,
223
- window_stride=16,
224
- candidates_preprocessing_fn=(lambda x: x.split("<def>")[0].strip()),
225
- )
226
-
227
- if args.mapping_file is not None:
228
- manager.set_mapping_file(args.mapping_file)
229
-
230
- port = 6654
231
- server = HTTPServer(("localhost", port), GetHandler)
232
- logger.info(f"Starting server at http://localhost:{port}")
233
-
234
- # Create a file handler and set its level
235
- file_handler = logging.FileHandler(args.log_file)
236
- file_handler.setLevel(logging.DEBUG)
237
-
238
- # Create a log formatter and set it on the handler
239
- formatter = logging.Formatter(
240
- "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
241
- )
242
- file_handler.setFormatter(formatter)
243
-
244
- # Add the file handler to the logger
245
- logger.addHandler(file_handler)
246
-
247
- try:
248
- server.serve_forever()
249
- except KeyboardInterrupt:
250
- exit(0)
251
-
252
-
253
- if __name__ == "__main__":
254
- main()
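
preprocess_document rewrites GERBIL markup (e.g. "-LRB-" becomes "(") and records, for each substitution, the position in the rewritten text after which character offsets change and by how much; map_back_annotations then projects predicted spans back onto the original input. A standalone sketch of that projection with a single substitution:

    def map_char(char_idx, char_mapping):
        """char_mapping holds (position_in_new_text, cumulative_offset) pairs, as built by preprocess_document."""
        current = 0
        for pos, offset in char_mapping:
            if char_idx >= pos:
                current = offset
            else:
                break
        return char_idx + current

    old_text = "Rome -LRB-Italy) is nice"
    new_text = "Rome (Italy) is nice"   # "-LRB-" rewritten to "(" -> 4 chars shorter
    char_mapping = [(6, 4)]              # offsets after position 6 of the new text grow by 4
    start, end = 6, 11                   # "Italy" in the rewritten text
    print(old_text[map_char(start, char_mapping):map_char(end, char_mapping)])  # Italy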
relik/inference/preprocessing.py DELETED
@@ -1,4 +0,0 @@
1
- def wikipedia_title_and_openings_preprocessing(
2
- wikipedia_title_and_openings: str, separator: str = " <def>"
3
- ):
4
- return wikipedia_title_and_openings.split(separator, 1)[0]
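
A quick check of the helper above (assuming a checkout that still ships this module); the " <def>" separator splits a retrieved candidate into its Wikipedia title and opening text:

    from relik.inference.preprocessing import wikipedia_title_and_openings_preprocessing

    print(wikipedia_title_and_openings_preprocessing("Rome <def> Rome is the capital of Italy."))  # Rome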
relik/inference/serve/__init__.py DELETED
File without changes
relik/inference/serve/backend/__init__.py DELETED
File without changes
relik/inference/serve/backend/relik.py DELETED
@@ -1,210 +0,0 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import List, Optional, Union
4
-
5
- from relik.common.utils import is_package_available
6
- from relik.inference.annotator import Relik
7
-
8
- if not is_package_available("fastapi"):
9
- raise ImportError(
10
- "FastAPI is not installed. Please install FastAPI with `pip install relik[serve]`."
11
- )
12
- from fastapi import FastAPI, HTTPException
13
-
14
- if not is_package_available("ray"):
15
- raise ImportError(
16
- "Ray is not installed. Please install Ray with `pip install relik[serve]`."
17
- )
18
- from ray import serve
19
-
20
- from relik.common.log import get_logger
21
- from relik.inference.serve.backend.utils import (
22
- RayParameterManager,
23
- ServerParameterManager,
24
- )
25
- from relik.retriever.data.utils import batch_generator
26
-
27
- logger = get_logger(__name__, level=logging.INFO)
28
-
29
- VERSION = {} # type: ignore
30
- with open(
31
- Path(__file__).parent.parent.parent.parent / "version.py", "r"
32
- ) as version_file:
33
- exec(version_file.read(), VERSION)
34
-
35
- # Env variables for server
36
- SERVER_MANAGER = ServerParameterManager()
37
- RAY_MANAGER = RayParameterManager()
38
-
39
- app = FastAPI(
40
- title="ReLiK",
41
- version=VERSION["VERSION"],
42
- description="ReLiK REST API",
43
- )
44
-
45
-
46
- @serve.deployment(
47
- ray_actor_options={
48
- "num_gpus": RAY_MANAGER.num_gpus
49
- if (
50
- SERVER_MANAGER.retriver_device == "cuda"
51
- or SERVER_MANAGER.reader_device == "cuda"
52
- )
53
- else 0
54
- },
55
- autoscaling_config={
56
- "min_replicas": RAY_MANAGER.min_replicas,
57
- "max_replicas": RAY_MANAGER.max_replicas,
58
- },
59
- )
60
- @serve.ingress(app)
61
- class RelikServer:
62
- def __init__(
63
- self,
64
- question_encoder: str,
65
- document_index: str,
66
- passage_encoder: Optional[str] = None,
67
- reader_encoder: Optional[str] = None,
68
- top_k: int = 100,
69
- retriver_device: str = "cpu",
70
- reader_device: str = "cpu",
71
- index_device: Optional[str] = None,
72
- precision: int = 32,
73
- index_precision: Optional[int] = None,
74
- use_faiss: bool = False,
75
- window_batch_size: int = 32,
76
- window_size: int = 32,
77
- window_stride: int = 16,
78
- split_on_spaces: bool = False,
79
- ):
80
- # parameters
81
- self.question_encoder = question_encoder
82
- self.passage_encoder = passage_encoder
83
- self.reader_encoder = reader_encoder
84
- self.document_index = document_index
85
- self.top_k = top_k
86
- self.retriver_device = retriver_device
87
- self.index_device = index_device or retriver_device
88
- self.reader_device = reader_device
89
- self.precision = precision
90
- self.index_precision = index_precision or precision
91
- self.use_faiss = use_faiss
92
- self.window_batch_size = window_batch_size
93
- self.window_size = window_size
94
- self.window_stride = window_stride
95
- self.split_on_spaces = split_on_spaces
96
-
97
- # log stuff for debugging
98
- logger.info("Initializing RelikServer with parameters:")
99
- logger.info(f"QUESTION_ENCODER: {self.question_encoder}")
100
- logger.info(f"PASSAGE_ENCODER: {self.passage_encoder}")
101
- logger.info(f"READER_ENCODER: {self.reader_encoder}")
102
- logger.info(f"DOCUMENT_INDEX: {self.document_index}")
103
- logger.info(f"TOP_K: {self.top_k}")
104
- logger.info(f"RETRIEVER_DEVICE: {self.retriver_device}")
105
- logger.info(f"READER_DEVICE: {self.reader_device}")
106
- logger.info(f"INDEX_DEVICE: {self.index_device}")
107
- logger.info(f"PRECISION: {self.precision}")
108
- logger.info(f"INDEX_PRECISION: {self.index_precision}")
109
- logger.info(f"WINDOW_BATCH_SIZE: {self.window_batch_size}")
110
- logger.info(f"SPLIT_ON_SPACES: {self.split_on_spaces}")
111
-
112
- self.relik = Relik(
113
- question_encoder=self.question_encoder,
114
- passage_encoder=self.passage_encoder,
115
- document_index=self.document_index,
116
- reader=self.reader_encoder,
117
- retriever_device=self.retriver_device,
118
- document_index_device=self.index_device,
119
- reader_device=self.reader_device,
120
- retriever_precision=self.precision,
121
- document_index_precision=self.index_precision,
122
- reader_precision=self.precision,
123
- )
124
-
125
- # @serve.batch()
126
- async def handle_batch(self, documents: List[str]) -> List:
127
- return self.relik(
128
- documents,
129
- top_k=self.top_k,
130
- window_size=self.window_size,
131
- window_stride=self.window_stride,
132
- batch_size=self.window_batch_size,
133
- )
134
-
135
- @app.post("/api/entities")
136
- async def entities_endpoint(
137
- self,
138
- documents: Union[str, List[str]],
139
- ):
140
- try:
141
- # normalize input
142
- if isinstance(documents, str):
143
- documents = [documents]
144
- # get predictions for the documents
149
- return await self.handle_batch(documents)
150
- except Exception as e:
151
- # log the entire stack trace
152
- logger.exception(e)
153
- raise HTTPException(status_code=500, detail=f"Server Error: {e}")
154
-
155
- @app.post("/api/gerbil")
156
- async def gerbil_endpoint(self, documents: Union[str, List[str]]):
157
- try:
158
- # normalize input
159
- if isinstance(documents, str):
160
- documents = [documents]
161
-
162
- # output list
163
- windows_passages = []
164
- # split documents into windows
165
- document_windows = [
166
- window
167
- for doc_id, document in enumerate(documents)
168
- for window in self.window_manager(
169
- self.tokenizer,
170
- document,
171
- window_size=self.window_size,
172
- stride=self.window_stride,
173
- doc_id=doc_id,
174
- )
175
- ]
176
-
177
- # get text and topic from document windows and create new list
178
- model_inputs = [
179
- (window.text, window.doc_topic) for window in document_windows
180
- ]
181
-
182
- # batch generator
183
- for batch in batch_generator(
184
- model_inputs, batch_size=self.window_batch_size
185
- ):
186
- text, text_pair = zip(*batch)
187
- batch_predictions = await self.handle_batch_retriever(text, text_pair)
188
- windows_passages.extend(
189
- [
190
- [p.label for p in predictions]
191
- for predictions in batch_predictions
192
- ]
193
- )
194
-
195
- # add passage to document windows
196
- for window, passages in zip(document_windows, windows_passages):
197
- # clean up passages (remove everything after first <def> tag if present)
198
- passages = [c.split(" <def>", 1)[0] for c in passages]
199
- window.window_candidates = passages
200
-
201
- # return document windows
202
- return document_windows
203
-
204
- except Exception as e:
205
- # log the entire stack trace
206
- logger.exception(e)
207
- raise HTTPException(status_code=500, detail=f"Server Error: {e}")
208
-
209
-
210
- server = RelikServer.bind(**vars(SERVER_MANAGER))
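
Once the deployment is running (for example through Ray Serve with the bound `server` object above), the entities endpoint accepts a plain JSON string as its body. A hedged client sketch; the host and port are assumptions (8000 matches the default used by the Streamlit frontend below), not values taken from this file:

    import requests

    response = requests.post(
        "http://localhost:8000/api/entities",  # assumed Ray Serve address
        json="Obama went to Rome for a quick vacation.",
    )
    response.raise_for_status()
    print(response.json())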
relik/inference/serve/backend/retriever.py DELETED
@@ -1,206 +0,0 @@
1
- import logging
2
- from pathlib import Path
3
- from typing import List, Optional, Union
4
-
5
- from relik.common.utils import is_package_available
6
-
7
- if not is_package_available("fastapi"):
8
- raise ImportError(
9
- "FastAPI is not installed. Please install FastAPI with `pip install relik[serve]`."
10
- )
11
- from fastapi import FastAPI, HTTPException
12
-
13
- if not is_package_available("ray"):
14
- raise ImportError(
15
- "Ray is not installed. Please install Ray with `pip install relik[serve]`."
16
- )
17
- from ray import serve
18
-
19
- from relik.common.log import get_logger
20
- from relik.inference.data.tokenizers import SpacyTokenizer, WhitespaceTokenizer
21
- from relik.inference.data.window.manager import WindowManager
22
- from relik.inference.serve.backend.utils import (
23
- RayParameterManager,
24
- ServerParameterManager,
25
- )
26
- from relik.retriever.data.utils import batch_generator
27
- from relik.retriever.pytorch_modules import GoldenRetriever
28
-
29
- logger = get_logger(__name__, level=logging.INFO)
30
-
31
- VERSION = {} # type: ignore
32
- with open(Path(__file__).parent.parent.parent / "version.py", "r") as version_file:
33
- exec(version_file.read(), VERSION)
34
-
35
- # Env variables for server
36
- SERVER_MANAGER = ServerParameterManager()
37
- RAY_MANAGER = RayParameterManager()
38
-
39
- app = FastAPI(
40
- title="Golden Retriever",
41
- version=VERSION["VERSION"],
42
- description="Golden Retriever REST API",
43
- )
44
-
45
-
46
- @serve.deployment(
47
- ray_actor_options={
48
- "num_gpus": RAY_MANAGER.num_gpus if SERVER_MANAGER.device == "cuda" else 0
49
- },
50
- autoscaling_config={
51
- "min_replicas": RAY_MANAGER.min_replicas,
52
- "max_replicas": RAY_MANAGER.max_replicas,
53
- },
54
- )
55
- @serve.ingress(app)
56
- class GoldenRetrieverServer:
57
- def __init__(
58
- self,
59
- question_encoder: str,
60
- document_index: str,
61
- passage_encoder: Optional[str] = None,
62
- top_k: int = 100,
63
- device: str = "cpu",
64
- index_device: Optional[str] = None,
65
- precision: int = 32,
66
- index_precision: Optional[int] = None,
67
- use_faiss: bool = False,
68
- window_batch_size: int = 32,
69
- window_size: int = 32,
70
- window_stride: int = 16,
71
- split_on_spaces: bool = False,
72
- ):
73
- # parameters
74
- self.question_encoder = question_encoder
75
- self.passage_encoder = passage_encoder
76
- self.document_index = document_index
77
- self.top_k = top_k
78
- self.device = device
79
- self.index_device = index_device or device
80
- self.precision = precision
81
- self.index_precision = index_precision or precision
82
- self.use_faiss = use_faiss
83
- self.window_batch_size = window_batch_size
84
- self.window_size = window_size
85
- self.window_stride = window_stride
86
- self.split_on_spaces = split_on_spaces
87
-
88
- # log stuff for debugging
89
- logger.info("Initializing GoldenRetrieverServer with parameters:")
90
- logger.info(f"QUESTION_ENCODER: {self.question_encoder}")
91
- logger.info(f"PASSAGE_ENCODER: {self.passage_encoder}")
92
- logger.info(f"DOCUMENT_INDEX: {self.document_index}")
93
- logger.info(f"TOP_K: {self.top_k}")
94
- logger.info(f"DEVICE: {self.device}")
95
- logger.info(f"INDEX_DEVICE: {self.index_device}")
96
- logger.info(f"PRECISION: {self.precision}")
97
- logger.info(f"INDEX_PRECISION: {self.index_precision}")
98
- logger.info(f"WINDOW_BATCH_SIZE: {self.window_batch_size}")
99
- logger.info(f"SPLIT_ON_SPACES: {self.split_on_spaces}")
100
-
101
- self.retriever = GoldenRetriever(
102
- question_encoder=self.question_encoder,
103
- passage_encoder=self.passage_encoder,
104
- document_index=self.document_index,
105
- device=self.device,
106
- index_device=self.index_device,
107
- index_precision=self.index_precision,
108
- )
109
- self.retriever.eval()
110
-
111
- if self.split_on_spaces:
112
- logger.info("Using WhitespaceTokenizer")
113
- self.tokenizer = WhitespaceTokenizer()
114
- # logger.info("Using RegexTokenizer")
115
- # self.tokenizer = RegexTokenizer()
116
- else:
117
- logger.info("Using SpacyTokenizer")
118
- self.tokenizer = SpacyTokenizer(language="en")
119
-
120
- self.window_manager = WindowManager(tokenizer=self.tokenizer)
121
-
122
- # @serve.batch()
123
- async def handle_batch(
124
- self, documents: List[str], document_topics: List[str]
125
- ) -> List:
126
- return self.retriever.retrieve(
127
- documents, text_pair=document_topics, k=self.top_k, precision=self.precision
128
- )
129
-
130
- @app.post("/api/retrieve")
131
- async def retrieve_endpoint(
132
- self,
133
- documents: Union[str, List[str]],
134
- document_topics: Optional[Union[str, List[str]]] = None,
135
- ):
136
- try:
137
- # normalize input
138
- if isinstance(documents, str):
139
- documents = [documents]
140
- if document_topics is not None:
141
- if isinstance(document_topics, str):
142
- document_topics = [document_topics]
143
- assert len(documents) == len(document_topics)
144
- # get predictions
145
- return await self.handle_batch(documents, document_topics)
146
- except Exception as e:
147
- # log the entire stack trace
148
- logger.exception(e)
149
- raise HTTPException(status_code=500, detail=f"Server Error: {e}")
150
-
151
- @app.post("/api/gerbil")
152
- async def gerbil_endpoint(self, documents: Union[str, List[str]]):
153
- try:
154
- # normalize input
155
- if isinstance(documents, str):
156
- documents = [documents]
157
-
158
- # output list
159
- windows_passages = []
160
- # split documents into windows
161
- document_windows = [
162
- window
163
- for doc_id, document in enumerate(documents)
164
- for window in self.window_manager(
165
- self.tokenizer,
166
- document,
167
- window_size=self.window_size,
168
- stride=self.window_stride,
169
- doc_id=doc_id,
170
- )
171
- ]
172
-
173
- # get text and topic from document windows and create new list
174
- model_inputs = [
175
- (window.text, window.doc_topic) for window in document_windows
176
- ]
177
-
178
- # batch generator
179
- for batch in batch_generator(
180
- model_inputs, batch_size=self.window_batch_size
181
- ):
182
- text, text_pair = zip(*batch)
183
- batch_predictions = await self.handle_batch(text, text_pair)
184
- windows_passages.extend(
185
- [
186
- [p.label for p in predictions]
187
- for predictions in batch_predictions
188
- ]
189
- )
190
-
191
- # add passage to document windows
192
- for window, passages in zip(document_windows, windows_passages):
193
- # clean up passages (remove everything after first <def> tag if present)
194
- passages = [c.split(" <def>", 1)[0] for c in passages]
195
- window.window_candidates = passages
196
-
197
- # return document windows
198
- return document_windows
199
-
200
- except Exception as e:
201
- # log the entire stack trace
202
- logger.exception(e)
203
- raise HTTPException(status_code=500, detail=f"Server Error: {e}")
204
-
205
-
206
- server = GoldenRetrieverServer.bind(**vars(SERVER_MANAGER))
relik/inference/serve/backend/utils.py DELETED
@@ -1,29 +0,0 @@
1
- import os
2
- from dataclasses import dataclass
3
- from typing import Union
4
-
5
-
6
- @dataclass
7
- class ServerParameterManager:
8
- retriver_device: str = os.environ.get("RETRIEVER_DEVICE", "cpu")
9
- reader_device: str = os.environ.get("READER_DEVICE", "cpu")
10
- index_device: str = os.environ.get("INDEX_DEVICE", retriver_device)
11
- precision: Union[str, int] = os.environ.get("PRECISION", "fp32")
12
- index_precision: Union[str, int] = os.environ.get("INDEX_PRECISION", precision)
13
- question_encoder: str = os.environ.get("QUESTION_ENCODER", None)
14
- passage_encoder: str = os.environ.get("PASSAGE_ENCODER", None)
15
- document_index: str = os.environ.get("DOCUMENT_INDEX", None)
16
- reader_encoder: str = os.environ.get("READER_ENCODER", None)
17
- top_k: int = int(os.environ.get("TOP_K", 100))
18
- use_faiss: bool = os.environ.get("USE_FAISS", False)
19
- window_batch_size: int = int(os.environ.get("WINDOW_BATCH_SIZE", 32))
20
- window_size: int = int(os.environ.get("WINDOW_SIZE", 32))
21
- window_stride: int = int(os.environ.get("WINDOW_STRIDE", 16))
22
- split_on_spaces: bool = os.environ.get("SPLIT_ON_SPACES", False)
23
-
24
-
25
- class RayParameterManager:
26
- def __init__(self) -> None:
27
- self.num_gpus = int(os.environ.get("NUM_GPUS", 1))
28
- self.min_replicas = int(os.environ.get("MIN_REPLICAS", 1))
29
- self.max_replicas = int(os.environ.get("MAX_REPLICAS", 1))
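
Both servers are configured purely through environment variables read by ServerParameterManager, so the variables have to be set before the backend module is imported. A minimal sketch; the model identifiers are the ones hardcoded in gerbil.py above, used here only as an example:

    import os

    # must be set before importing relik.inference.serve.backend.relik / .retriever
    os.environ["QUESTION_ENCODER"] = "riccorl/relik-retriever-aida-blink-pretrain-omniencoder"
    os.environ["DOCUMENT_INDEX"] = "riccorl/index-relik-retriever-aida-blink-pretrain-omniencoder"
    os.environ["RETRIEVER_DEVICE"] = "cuda"
    os.environ["TOP_K"] = "100"

    from relik.inference.serve.backend.utils import ServerParameterManager

    print(vars(ServerParameterManager()))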
relik/inference/serve/frontend/__init__.py DELETED
File without changes
relik/inference/serve/frontend/relik.py DELETED
@@ -1,231 +0,0 @@
1
- import os
2
- import re
3
- import time
4
- from pathlib import Path
5
-
6
- import requests
7
- import streamlit as st
8
- from spacy import displacy
9
- from streamlit_extras.badges import badge
10
- from streamlit_extras.stylable_container import stylable_container
11
-
12
- RELIK = os.getenv("RELIK", "http://localhost:8000/api/entities")
13
-
14
- import random
15
-
16
-
17
- def get_random_color(ents):
18
- colors = {}
19
- random_colors = generate_pastel_colors(len(ents))
20
- for ent in ents:
21
- colors[ent] = random_colors.pop(random.randint(0, len(random_colors) - 1))
22
- return colors
23
-
24
-
25
- def floatrange(start, stop, steps):
26
- if int(steps) == 1:
27
- return [stop]
28
- return [
29
- start + float(i) * (stop - start) / (float(steps) - 1) for i in range(steps)
30
- ]
31
-
32
-
33
- def hsl_to_rgb(h, s, l):
34
- def hue_2_rgb(v1, v2, v_h):
35
- while v_h < 0.0:
36
- v_h += 1.0
37
- while v_h > 1.0:
38
- v_h -= 1.0
39
- if 6 * v_h < 1.0:
40
- return v1 + (v2 - v1) * 6.0 * v_h
41
- if 2 * v_h < 1.0:
42
- return v2
43
- if 3 * v_h < 2.0:
44
- return v1 + (v2 - v1) * ((2.0 / 3.0) - v_h) * 6.0
45
- return v1
46
-
47
- # if not (0 <= s <= 1): raise ValueError, "s (saturation) parameter must be between 0 and 1."
48
- # if not (0 <= l <= 1): raise ValueError, "l (lightness) parameter must be between 0 and 1."
49
-
50
- r, b, g = (l * 255,) * 3
51
- if s != 0.0:
52
- if l < 0.5:
53
- var_2 = l * (1.0 + s)
54
- else:
55
- var_2 = (l + s) - (s * l)
56
- var_1 = 2.0 * l - var_2
57
- r = 255 * hue_2_rgb(var_1, var_2, h + (1.0 / 3.0))
58
- g = 255 * hue_2_rgb(var_1, var_2, h)
59
- b = 255 * hue_2_rgb(var_1, var_2, h - (1.0 / 3.0))
60
-
61
- return int(round(r)), int(round(g)), int(round(b))
62
-
63
-
64
- def generate_pastel_colors(n):
65
- """Return different pastel colours.
66
-
67
- Input:
68
- n (integer) : The number of colors to return
69
-
70
- Output:
71
- A list of colors in HTML notation (eg.['#cce0ff', '#ffcccc', '#ccffe0', '#f5ccff', '#f5ffcc'])
72
-
73
- Example:
74
- >>> print(generate_pastel_colors(5))
75
- ['#cce0ff', '#f5ccff', '#ffcccc', '#f5ffcc', '#ccffe0']
76
- """
77
- if n == 0:
78
- return []
79
-
80
- # To generate colors, we use the HSL colorspace (see http://en.wikipedia.org/wiki/HSL_color_space)
81
- start_hue = 0.6 # 0=red 1/3=0.333=green 2/3=0.666=blue
82
- saturation = 1.0
83
- lightness = 0.8
84
- # We take points around the chromatic circle (hue):
85
- # (Note: we generate n+1 colors, then drop the last one ([:-1]) because
86
- # it equals the first one (hue 0 = hue 1))
87
- return [
88
- "#%02x%02x%02x" % hsl_to_rgb(hue, saturation, lightness)
89
- for hue in floatrange(start_hue, start_hue + 1, n + 1)
90
- ][:-1]
91
-
92
-
93
- def set_sidebar(css):
94
- white_link_wrapper = "<link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css'><a href='{}'>{}</a>"
95
- with st.sidebar:
96
- st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
97
- st.image(
98
- "http://nlp.uniroma1.it/static/website/sapienza-nlp-logo-wh.svg",
99
- use_column_width=True,
100
- )
101
- st.markdown("## ReLiK")
102
- st.write(
103
- f"""
104
- - {white_link_wrapper.format("#", "<i class='fa-solid fa-file'></i>&nbsp; Paper")}
105
- - {white_link_wrapper.format("https://github.com/SapienzaNLP/relik", "<i class='fa-brands fa-github'></i>&nbsp; GitHub")}
106
- - {white_link_wrapper.format("https://hub.docker.com/repository/docker/sapienzanlp/relik", "<i class='fa-brands fa-docker'></i>&nbsp; Docker Hub")}
107
- """,
108
- unsafe_allow_html=True,
109
- )
110
- st.markdown("## Sapienza NLP")
111
- st.write(
112
- f"""
113
- - {white_link_wrapper.format("https://nlp.uniroma1.it", "<i class='fa-solid fa-globe'></i>&nbsp; Webpage")}
114
- - {white_link_wrapper.format("https://github.com/SapienzaNLP", "<i class='fa-brands fa-github'></i>&nbsp; GitHub")}
115
- - {white_link_wrapper.format("https://twitter.com/SapienzaNLP", "<i class='fa-brands fa-twitter'></i>&nbsp; Twitter")}
116
- - {white_link_wrapper.format("https://www.linkedin.com/company/79434450", "<i class='fa-brands fa-linkedin'></i>&nbsp; LinkedIn")}
117
- """,
118
- unsafe_allow_html=True,
119
- )
120
-
121
-
122
- def get_el_annotations(response):
123
- # swap labels key with ents
124
- response["ents"] = response.pop("labels")
125
- label_in_text = set(l["label"] for l in response["ents"])
126
- options = {"ents": label_in_text, "colors": get_random_color(label_in_text)}
127
- return response, options
128
-
129
-
130
- def set_intro(css):
131
- # intro
132
- st.markdown("# ReLik")
133
- st.markdown(
134
- "### Retrieve, Read and LinK: Fast and Accurate Entity Linking and Relation Extraction on an Academic Budget"
135
- )
136
- # st.markdown(
137
- # "This is a front-end for the paper [Universal Semantic Annotator: the First Unified API "
138
- # "for WSD, SRL and Semantic Parsing](https://www.researchgate.net/publication/360671045_Universal_Semantic_Annotator_the_First_Unified_API_for_WSD_SRL_and_Semantic_Parsing), which will be presented at LREC 2022 by "
139
- # "[Riccardo Orlando](https://riccorl.github.io), [Simone Conia](https://c-simone.github.io/), "
140
- # "[Stefano Faralli](https://corsidilaurea.uniroma1.it/it/users/stefanofaralliuniroma1it), and [Roberto Navigli](https://www.diag.uniroma1.it/navigli/)."
141
- # )
142
- badge(type="github", name="sapienzanlp/relik")
143
- badge(type="pypi", name="relik")
144
-
145
-
146
- def run_client():
147
- with open(Path(__file__).parent / "style.css") as f:
148
- css = f.read()
149
-
150
- st.set_page_config(
151
- page_title="ReLik",
152
- page_icon="🦮",
153
- layout="wide",
154
- )
155
- set_sidebar(css)
156
- set_intro(css)
157
-
158
- # text input
159
- text = st.text_area(
160
- "Enter Text Below:",
161
- value="Obama went to Rome for a quick vacation.",
162
- height=200,
163
- max_chars=500,
164
- )
165
-
166
- with stylable_container(
167
- key="annotate_button",
168
- css_styles="""
169
- button {
170
- background-color: #802433;
171
- color: white;
172
- border-radius: 25px;
173
- }
174
- """,
175
- ):
176
- submit = st.button("Annotate")
177
- # submit = st.button("Run")
178
-
179
- # ReLik API call
180
- if submit:
181
- text = text.strip()
182
- if text:
183
- st.markdown("####")
184
- st.markdown("#### Entity Linking")
185
- with st.spinner(text="In progress"):
186
- response = requests.post(RELIK, json=text)
187
- if response.status_code != 200:
188
- st.error("Error: {}".format(response.status_code))
189
- else:
190
- response = response.json()
191
-
192
- # Entity Linking
193
- # with stylable_container(
194
- # key="container_with_border",
195
- # css_styles="""
196
- # {
197
- # border: 1px solid rgba(49, 51, 63, 0.2);
198
- # border-radius: 0.5rem;
199
- # padding: 0.5rem;
200
- # padding-bottom: 2rem;
201
- # }
202
- # """,
203
- # ):
204
- # st.markdown("##")
205
- dict_of_ents, options = get_el_annotations(response=response)
206
- display = displacy.render(
207
- dict_of_ents, manual=True, style="ent", options=options
208
- )
209
- display = display.replace("\n", " ")
210
- # wsd_display = re.sub(
211
- # r"(wiki::\d+\w)",
212
- # r"<a href='https://babelnet.org/synset?id=\g<1>&orig=\g<1>&lang={}'>\g<1></a>".format(
213
- # language.upper()
214
- # ),
215
- # wsd_display,
216
- # )
217
- with st.container():
218
- st.write(display, unsafe_allow_html=True)
219
-
220
- st.markdown("####")
221
- st.markdown("#### Relation Extraction")
222
-
223
- with st.container():
224
- st.write("Coming :)", unsafe_allow_html=True)
225
-
226
- else:
227
- st.error("Please enter some text.")
228
-
229
-
230
- if __name__ == "__main__":
231
- run_client()
relik/inference/serve/frontend/style.css DELETED
@@ -1,33 +0,0 @@
1
- /* Sidebar */
2
- .eczjsme11 {
3
- background-color: #802433;
4
- }
5
-
6
- .st-emotion-cache-10oheav h2 {
7
- color: white;
8
- }
9
-
10
- .st-emotion-cache-10oheav li {
11
- color: white;
12
- }
13
-
14
- /* Main */
15
- a:link {
16
- text-decoration: none;
17
- color: white;
18
- }
19
-
20
- a:visited {
21
- text-decoration: none;
22
- color: white;
23
- }
24
-
25
- a:hover {
26
- text-decoration: none;
27
- color: rgba(255, 255, 255, 0.871);
28
- }
29
-
30
- a:active {
31
- text-decoration: none;
32
- color: white;
33
- }
relik/reader/__init__.py DELETED
File without changes
relik/reader/conf/config.yaml DELETED
@@ -1,14 +0,0 @@
1
- # Required to make the "experiments" dir the default one for the output of the models
2
- hydra:
3
- run:
4
- dir: ./experiments/${model_name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
5
-
6
- model_name: relik-reader-deberta-base # used to name the model in wandb and output dir
7
- project_name: relik-reader # used to name the project in wandb
8
-
9
-
10
- defaults:
11
- - _self_
12
- - training: base
13
- - model: base
14
- - data: base
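As a hedged sketch of how a Hydra config like this is usually consumed (the entry-point function name is hypothetical; the config path and the defaults list come from the file above):

    import hydra
    from omegaconf import DictConfig, OmegaConf

    @hydra.main(config_path="relik/reader/conf", config_name="config", version_base=None)  # version_base needs Hydra >= 1.2
    def train(cfg: DictConfig) -> None:
        # cfg.training, cfg.model and cfg.data are composed from the defaults list above
        print(OmegaConf.to_yaml(cfg))

    if __name__ == "__main__":
        train()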
relik/reader/conf/data/base.yaml DELETED
@@ -1,21 +0,0 @@
1
- train_dataset_path: "relik/reader/data/train.jsonl"
2
- val_dataset_path: "relik/reader/data/testa.jsonl"
3
-
4
- train_dataset:
5
- _target_: "relik.reader.relik_reader_data.RelikDataset"
6
- transformer_model: "${model.model.transformer_model}"
7
- materialize_samples: False
8
- shuffle_candidates: 0.5
9
- random_drop_gold_candidates: 0.05
10
- noise_param: 0.0
11
- for_inference: False
12
- tokens_per_batch: 4096
13
- special_symbols: null
14
-
15
- val_dataset:
16
- _target_: "relik.reader.relik_reader_data.RelikDataset"
17
- transformer_model: "${model.model.transformer_model}"
18
- materialize_samples: False
19
- shuffle_candidates: False
20
- for_inference: True
21
- special_symbols: null
relik/reader/conf/data/re.yaml DELETED
@@ -1,54 +0,0 @@
1
- train_dataset_path: "relik/reader/data/nyt-alby+/train.jsonl"
2
- val_dataset_path: "relik/reader/data/nyt-alby+/valid.jsonl"
3
- test_dataset_path: "relik/reader/data/nyt-alby+/test.jsonl"
4
-
5
- relations_definitions:
6
- /people/person/nationality: "nationality"
7
- /sports/sports_team/location: "sports team location"
8
- /location/country/administrative_divisions: "administrative divisions"
9
- /business/company/major_shareholders: "shareholders"
10
- /people/ethnicity/people: "ethnicity"
11
- /people/ethnicity/geographic_distribution: "geographic distribution"
12
- /business/company_shareholder/major_shareholder_of: "major shareholder"
13
- /location/location/contains: "location"
14
- /business/company/founders: "founders"
15
- /business/person/company: "company"
16
- /business/company/advisors: "advisor"
17
- /people/deceased_person/place_of_death: "place of death"
18
- /business/company/industry: "industry"
19
- /people/person/ethnicity: "ethnic background"
20
- /people/person/place_of_birth: "place of birth"
21
- /location/administrative_division/country: "country of an administration division"
22
- /people/person/place_lived: "place lived"
23
- /sports/sports_team_location/teams: "sports team"
24
- /people/person/children: "child"
25
- /people/person/religion: "religion"
26
- /location/neighborhood/neighborhood_of: "neighborhood"
27
- /location/country/capital: "capital"
28
- /business/company/place_founded: "company founded location"
29
- /people/person/profession: "occupation"
30
-
31
- train_dataset:
32
- _target_: "relik.reader.relik_reader_re_data.RelikREDataset"
33
- transformer_model: "${model.model.transformer_model}"
34
- materialize_samples: False
35
- shuffle_candidates: False
36
- flip_candidates: 1.0
37
- noise_param: 0.0
38
- for_inference: False
39
- tokens_per_batch: 4096
40
- min_length: -1
41
- special_symbols: null
42
- relations_definitions: ${data.relations_definitions}
43
- sorting_fields:
44
- - "predictable_candidates"
45
- val_dataset:
46
- _target_: "relik.reader.relik_reader_re_data.RelikREDataset"
47
- transformer_model: "${model.model.transformer_model}"
48
- materialize_samples: False
49
- shuffle_candidates: False
50
- flip_candidates: False
51
- for_inference: True
52
- min_length: -1
53
- special_symbols: null
54
- relations_definitions: ${data.relations_definitions}
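The `_target_` entries above are meant for hydra.utils.instantiate; a minimal sketch, assuming the composed config is available as `cfg` and that the special symbols are built elsewhere (their construction is not shown in this file):

    from hydra.utils import instantiate

    # special_symbols is an assumption here; in the reader it is derived from the relation definitions
    train_dataset = instantiate(
        cfg.data.train_dataset,
        dataset_path=cfg.data.train_dataset_path,
        special_symbols=special_symbols,
    )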
relik/reader/conf/training/base.yaml DELETED
@@ -1,12 +0,0 @@
1
- seed: 94
2
-
3
- trainer:
4
- _target_: lightning.Trainer
5
- devices:
6
- - 0
7
- precision: "16-mixed"
8
- max_steps: 50000
9
- val_check_interval: 1.0
10
- num_sanity_val_steps: 0
11
- limit_val_batches: 1
12
- gradient_clip_val: 1.0
relik/reader/conf/training/re.yaml DELETED
@@ -1,12 +0,0 @@
1
- seed: 15
2
-
3
- trainer:
4
- _target_: lightning.Trainer
5
- devices:
6
- - 0
7
- precision: "16-mixed"
8
- max_steps: 100000
9
- val_check_interval: 1.0
10
- num_sanity_val_steps: 0
11
- limit_val_batches: 1
12
- gradient_clip_val: 1.0
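Both training configs declare a lightning.Trainer target, so they can be instantiated the same way; a short sketch, assuming `cfg` is the composed Hydra config:

    from hydra.utils import instantiate

    trainer = instantiate(cfg.training.trainer)
    # equivalent to lightning.Trainer(devices=[0], precision="16-mixed", max_steps=100000, ...)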
relik/reader/data/__init__.py DELETED
File without changes
relik/reader/data/patches.py DELETED
@@ -1,51 +0,0 @@
1
- from typing import List
2
-
3
- from relik.reader.data.relik_reader_sample import RelikReaderSample
4
- from relik.reader.utils.special_symbols import NME_SYMBOL
5
-
6
-
7
- def merge_patches_predictions(sample) -> None:
8
- sample._d["predicted_window_labels"] = dict()
9
- predicted_window_labels = sample._d["predicted_window_labels"]
10
-
11
- sample._d["span_title_probabilities"] = dict()
12
- span_title_probabilities = sample._d["span_title_probabilities"]
13
-
14
- span2title = dict()
15
- for _, patch_info in sorted(sample.patches.items(), key=lambda x: x[0]):
16
- # selecting span predictions
17
- for predicted_title, predicted_spans in patch_info[
18
- "predicted_window_labels"
19
- ].items():
20
- for pred_span in predicted_spans:
21
- pred_span = tuple(pred_span)
22
- curr_title = span2title.get(pred_span)
23
- if curr_title is None or curr_title == NME_SYMBOL:
24
- span2title[pred_span] = predicted_title
25
- # else:
26
- # print("Merging at patch level")
27
-
28
- # selecting span predictions probability
29
- for predicted_span, titles_probabilities in patch_info[
30
- "span_title_probabilities"
31
- ].items():
32
- if predicted_span not in span_title_probabilities:
33
- span_title_probabilities[predicted_span] = titles_probabilities
34
-
35
- for span, title in span2title.items():
36
- if title not in predicted_window_labels:
37
- predicted_window_labels[title] = list()
38
- predicted_window_labels[title].append(span)
39
-
40
-
41
- def remove_duplicate_samples(
42
- samples: List[RelikReaderSample],
43
- ) -> List[RelikReaderSample]:
44
- seen_sample = set()
45
- samples_store = []
46
- for sample in samples:
47
- sample_id = f"{sample.doc_id}#{sample.sent_id}#{sample.offset}"
48
- if sample_id not in seen_sample:
49
- seen_sample.add(sample_id)
50
- samples_store.append(sample)
51
- return samples_store
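A usage sketch for the two helpers above, assuming `samples` is a list of RelikReaderSample objects whose `patches`, `doc_id`, `sent_id` and `offset` fields are populated:

    from relik.reader.data.patches import merge_patches_predictions, remove_duplicate_samples

    samples = remove_duplicate_samples(samples)  # drop windows seen more than once
    for sample in samples:
        merge_patches_predictions(sample)  # collapse per-patch predictions into sample-level ones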
relik/reader/data/relik_reader_data.py DELETED
@@ -1,965 +0,0 @@
1
- import logging
2
- from typing import (
3
- Any,
4
- Callable,
5
- Dict,
6
- Generator,
7
- Iterable,
8
- Iterator,
9
- List,
10
- NamedTuple,
11
- Optional,
12
- Tuple,
13
- Union,
14
- )
15
-
16
- import numpy as np
17
- import torch
18
- from torch.utils.data import IterableDataset
19
- from tqdm import tqdm
20
- from transformers import AutoTokenizer, PreTrainedTokenizer
21
-
22
- from relik.reader.data.relik_reader_data_utils import (
23
- add_noise_to_value,
24
- batchify,
25
- chunks,
26
- flatten,
27
- )
28
- from relik.reader.data.relik_reader_sample import (
29
- RelikReaderSample,
30
- load_relik_reader_samples,
31
- )
32
- from relik.reader.utils.special_symbols import NME_SYMBOL
33
-
34
- logger = logging.getLogger(__name__)
35
-
36
-
37
- def preprocess_dataset(
38
- input_dataset: Iterable[dict],
39
- transformer_model: str,
40
- add_topic: bool,
41
- ) -> Iterable[dict]:
42
- tokenizer = AutoTokenizer.from_pretrained(transformer_model)
43
- for dataset_elem in tqdm(input_dataset, desc="Preprocessing input dataset"):
44
- if len(dataset_elem["tokens"]) == 0:
45
- print(
46
- f"Dataset element with doc id: {dataset_elem['doc_id']}",
47
- f"and offset {dataset_elem['offset']} does not contain any token",
48
- "Skipping it",
49
- )
50
- continue
51
-
52
- new_dataset_elem = dict(
53
- doc_id=dataset_elem["doc_id"],
54
- offset=dataset_elem["offset"],
55
- )
56
-
57
- tokenization_out = tokenizer(
58
- dataset_elem["tokens"],
59
- return_offsets_mapping=True,
60
- add_special_tokens=False,
61
- )
62
-
63
- window_tokens = tokenization_out.input_ids
64
- window_tokens = flatten(window_tokens)
65
-
66
- offsets_mapping = [
67
- [
68
- (
69
- ss + dataset_elem["token2char_start"][str(i)],
70
- se + dataset_elem["token2char_start"][str(i)],
71
- )
72
- for ss, se in tokenization_out.offset_mapping[i]
73
- ]
74
- for i in range(len(dataset_elem["tokens"]))
75
- ]
76
-
77
- offsets_mapping = flatten(offsets_mapping)
78
-
79
- assert len(offsets_mapping) == len(window_tokens)
80
-
81
- window_tokens = (
82
- [tokenizer.cls_token_id] + window_tokens + [tokenizer.sep_token_id]
83
- )
84
-
85
- topic_offset = 0
86
- if add_topic:
87
- topic_tokens = tokenizer(
88
- dataset_elem["doc_topic"], add_special_tokens=False
89
- ).input_ids
90
- topic_offset = len(topic_tokens)
91
- new_dataset_elem["topic_tokens"] = topic_offset
92
- window_tokens = window_tokens[:1] + topic_tokens + window_tokens[1:]
93
-
94
- new_dataset_elem.update(
95
- dict(
96
- tokens=window_tokens,
97
- token2char_start={
98
- str(i): s
99
- for i, (s, _) in enumerate(offsets_mapping, start=topic_offset)
100
- },
101
- token2char_end={
102
- str(i): e
103
- for i, (_, e) in enumerate(offsets_mapping, start=topic_offset)
104
- },
105
- window_candidates=dataset_elem["window_candidates"],
106
- window_candidates_scores=dataset_elem.get("window_candidates_scores"),
107
- )
108
- )
109
-
110
- if "window_labels" in dataset_elem:
111
- window_labels = [
112
- (s, e, l.replace("_", " ")) for s, e, l in dataset_elem["window_labels"]
113
- ]
114
-
115
- new_dataset_elem["window_labels"] = window_labels
116
-
117
- if not all(
118
- [
119
- s in new_dataset_elem["token2char_start"].values()
120
- for s, _, _ in new_dataset_elem["window_labels"]
121
- ]
122
- ):
123
- print(
124
- "Mismatching token start char mapping with labels",
125
- new_dataset_elem["token2char_start"],
126
- new_dataset_elem["window_labels"],
127
- dataset_elem["tokens"],
128
- )
129
- continue
130
-
131
- if not all(
132
- [
133
- e in new_dataset_elem["token2char_end"].values()
134
- for _, e, _ in new_dataset_elem["window_labels"]
135
- ]
136
- ):
137
- print(
138
- "Mismatching token end char mapping with labels",
139
- new_dataset_elem["token2char_end"],
140
- new_dataset_elem["window_labels"],
141
- dataset_elem["tokens"],
142
- )
143
- continue
144
-
145
- yield new_dataset_elem
146
-
147
-
148
- def preprocess_sample(
149
- relik_sample: RelikReaderSample,
150
- tokenizer,
151
- lowercase_policy: float,
152
- add_topic: bool = False,
153
- ) -> None:
154
- if len(relik_sample.tokens) == 0:
155
- return
156
-
157
- if lowercase_policy > 0:
158
- lc_tokens = np.random.uniform(0, 1, len(relik_sample.tokens)) < lowercase_policy
159
- relik_sample.tokens = [
160
- t.lower() if lc else t for t, lc in zip(relik_sample.tokens, lc_tokens)
161
- ]
162
-
163
- tokenization_out = tokenizer(
164
- relik_sample.tokens,
165
- return_offsets_mapping=True,
166
- add_special_tokens=False,
167
- )
168
-
169
- window_tokens = tokenization_out.input_ids
170
- window_tokens = flatten(window_tokens)
171
-
172
- offsets_mapping = [
173
- [
174
- (
175
- ss + relik_sample.token2char_start[str(i)],
176
- se + relik_sample.token2char_start[str(i)],
177
- )
178
- for ss, se in tokenization_out.offset_mapping[i]
179
- ]
180
- for i in range(len(relik_sample.tokens))
181
- ]
182
-
183
- offsets_mapping = flatten(offsets_mapping)
184
-
185
- assert len(offsets_mapping) == len(window_tokens)
186
-
187
- window_tokens = [tokenizer.cls_token_id] + window_tokens + [tokenizer.sep_token_id]
188
-
189
- topic_offset = 0
190
- if add_topic:
191
- topic_tokens = tokenizer(
192
- relik_sample.doc_topic, add_special_tokens=False
193
- ).input_ids
194
- topic_offset = len(topic_tokens)
195
- relik_sample.topic_tokens = topic_offset
196
- window_tokens = window_tokens[:1] + topic_tokens + window_tokens[1:]
197
-
198
- relik_sample._d.update(
199
- dict(
200
- tokens=window_tokens,
201
- token2char_start={
202
- str(i): s
203
- for i, (s, _) in enumerate(offsets_mapping, start=topic_offset)
204
- },
205
- token2char_end={
206
- str(i): e
207
- for i, (_, e) in enumerate(offsets_mapping, start=topic_offset)
208
- },
209
- )
210
- )
211
-
212
- if "window_labels" in relik_sample._d:
213
- relik_sample.window_labels = [
214
- (s, e, l.replace("_", " ")) for s, e, l in relik_sample.window_labels
215
- ]
216
-
217
-
218
- class TokenizationOutput(NamedTuple):
219
- input_ids: torch.Tensor
220
- attention_mask: torch.Tensor
221
- token_type_ids: torch.Tensor
222
- prediction_mask: torch.Tensor
223
- special_symbols_mask: torch.Tensor
224
-
225
-
226
- class RelikDataset(IterableDataset):
227
- def __init__(
228
- self,
229
- dataset_path: Optional[str],
230
- materialize_samples: bool,
231
- transformer_model: Union[str, PreTrainedTokenizer],
232
- special_symbols: List[str],
233
- shuffle_candidates: Optional[Union[bool, float]] = False,
234
- for_inference: bool = False,
235
- noise_param: float = 0.1,
236
- sorting_fields: Optional[List[str]] = None,
237
- tokens_per_batch: int = 2048,
238
- batch_size: int = None,
239
- max_batch_size: int = 128,
240
- section_size: int = 50_000,
241
- prebatch: bool = True,
242
- random_drop_gold_candidates: float = 0.0,
243
- use_nme: bool = True,
244
- max_subwords_per_candidate: int = 22,
245
- mask_by_instances: bool = False,
246
- min_length: int = 5,
247
- max_length: int = 2048,
248
- model_max_length: int = 1000,
249
- split_on_cand_overload: bool = True,
250
- skip_empty_training_samples: bool = False,
251
- drop_last: bool = False,
252
- samples: Optional[Iterator[RelikReaderSample]] = None,
253
- lowercase_policy: float = 0.0,
254
- **kwargs,
255
- ):
256
- super().__init__(**kwargs)
257
- self.dataset_path = dataset_path
258
- self.materialize_samples = materialize_samples
259
- self.samples: Optional[List[RelikReaderSample]] = None
260
- if self.materialize_samples:
261
- self.samples = list()
262
-
263
- if isinstance(transformer_model, str):
264
- self.tokenizer = self._build_tokenizer(transformer_model, special_symbols)
265
- else:
266
- self.tokenizer = transformer_model
267
- self.special_symbols = special_symbols
268
- self.shuffle_candidates = shuffle_candidates
269
- self.for_inference = for_inference
270
- self.noise_param = noise_param
271
- self.batching_fields = ["input_ids"]
272
- self.sorting_fields = (
273
- sorting_fields if sorting_fields is not None else self.batching_fields
274
- )
275
-
276
- self.tokens_per_batch = tokens_per_batch
277
- self.batch_size = batch_size
278
- self.max_batch_size = max_batch_size
279
- self.section_size = section_size
280
- self.prebatch = prebatch
281
-
282
- self.random_drop_gold_candidates = random_drop_gold_candidates
283
- self.use_nme = use_nme
284
- self.max_subwords_per_candidate = max_subwords_per_candidate
285
- self.mask_by_instances = mask_by_instances
286
- self.min_length = min_length
287
- self.max_length = max_length
288
- self.model_max_length = (
289
- model_max_length
290
- if model_max_length < self.tokenizer.model_max_length
291
- else self.tokenizer.model_max_length
292
- )
293
-
294
- # retrocompatibility workaround
295
- self.transformer_model = (
296
- transformer_model
297
- if isinstance(transformer_model, str)
298
- else transformer_model.name_or_path
299
- )
300
- self.split_on_cand_overload = split_on_cand_overload
301
- self.skip_empty_training_samples = skip_empty_training_samples
302
- self.drop_last = drop_last
303
- self.lowercase_policy = lowercase_policy
304
- self.samples = samples
305
-
306
- def _build_tokenizer(self, transformer_model: str, special_symbols: List[str]):
307
- return AutoTokenizer.from_pretrained(
308
- transformer_model,
309
- additional_special_tokens=[ss for ss in special_symbols],
310
- add_prefix_space=True,
311
- )
312
-
313
- @property
314
- def fields_batcher(self) -> Dict[str, Union[None, Callable[[list], Any]]]:
315
- fields_batchers = {
316
- "input_ids": lambda x: batchify(
317
- x, padding_value=self.tokenizer.pad_token_id
318
- ),
319
- "attention_mask": lambda x: batchify(x, padding_value=0),
320
- "token_type_ids": lambda x: batchify(x, padding_value=0),
321
- "prediction_mask": lambda x: batchify(x, padding_value=1),
322
- "global_attention": lambda x: batchify(x, padding_value=0),
323
- "token2word": None,
324
- "sample": None,
325
- "special_symbols_mask": lambda x: batchify(x, padding_value=False),
326
- "start_labels": lambda x: batchify(x, padding_value=-100),
327
- "end_labels": lambda x: batchify(x, padding_value=-100),
328
- "predictable_candidates_symbols": None,
329
- "predictable_candidates": None,
330
- "patch_offset": None,
331
- "optimus_labels": None,
332
- }
333
-
334
- if "roberta" in self.transformer_model:
335
- del fields_batchers["token_type_ids"]
336
-
337
- return fields_batchers
338
-
339
- def _build_input_ids(
340
- self, sentence_input_ids: List[int], candidates_input_ids: List[List[int]]
341
- ) -> List[int]:
342
- return (
343
- [self.tokenizer.cls_token_id]
344
- + sentence_input_ids
345
- + [self.tokenizer.sep_token_id]
346
- + flatten(candidates_input_ids)
347
- + [self.tokenizer.sep_token_id]
348
- )
349
-
350
- def _get_special_symbols_mask(self, input_ids: torch.Tensor) -> torch.Tensor:
351
- special_symbols_mask = input_ids >= (
352
- len(self.tokenizer) - len(self.special_symbols)
353
- )
354
- special_symbols_mask[0] = True
355
- return special_symbols_mask
356
-
357
- def _build_tokenizer_essentials(
358
- self, input_ids, original_sequence, sample
359
- ) -> TokenizationOutput:
360
- input_ids = torch.tensor(input_ids, dtype=torch.long)
361
- attention_mask = torch.ones_like(input_ids)
362
-
363
- total_sequence_len = len(input_ids)
364
- predictable_sentence_len = len(original_sequence)
365
-
366
- # token type ids
367
- token_type_ids = torch.cat(
368
- [
369
- input_ids.new_zeros(
370
- predictable_sentence_len + 2
371
- ), # original sentence bpes + CLS and SEP
372
- input_ids.new_ones(total_sequence_len - predictable_sentence_len - 2),
373
- ]
374
- )
375
-
376
- # prediction mask -> boolean on tokens that are predictable
377
-
378
- prediction_mask = torch.tensor(
379
- [1]
380
- + ([0] * predictable_sentence_len)
381
- + ([1] * (total_sequence_len - predictable_sentence_len - 1))
382
- )
383
-
384
- # add topic tokens to the prediction mask so that they cannot be predicted
385
- # or optimized during training
386
- topic_tokens = getattr(sample, "topic_tokens", None)
387
- if topic_tokens is not None:
388
- prediction_mask[1 : 1 + topic_tokens] = 1
389
-
390
- # If mask by instances is active the prediction mask is applied to everything
391
- # that is not indicated as an instance in the training set.
392
- if self.mask_by_instances:
393
- char_start2token = {
394
- cs: int(tok) for tok, cs in sample.token2char_start.items()
395
- }
396
- char_end2token = {ce: int(tok) for tok, ce in sample.token2char_end.items()}
397
- instances_mask = torch.ones_like(prediction_mask)
398
- for _, span_info in sample.instance_id2span_data.items():
399
- span_info = span_info[0]
400
- token_start = char_start2token[span_info[0]] + 1 # +1 for the CLS
401
- token_end = char_end2token[span_info[1]] + 1 # +1 for the CLS
402
- instances_mask[token_start : token_end + 1] = 0
403
-
404
- prediction_mask += instances_mask
405
- prediction_mask[prediction_mask > 1] = 1
406
-
407
- assert len(prediction_mask) == len(input_ids)
408
-
409
- # special symbols mask
410
- special_symbols_mask = self._get_special_symbols_mask(input_ids)
411
-
412
- return TokenizationOutput(
413
- input_ids,
414
- attention_mask,
415
- token_type_ids,
416
- prediction_mask,
417
- special_symbols_mask,
418
- )
419
-
420
- def _build_labels(
421
- self,
422
- sample,
423
- tokenization_output: TokenizationOutput,
424
- predictable_candidates: List[str],
425
- ) -> Tuple[torch.Tensor, torch.Tensor]:
426
- start_labels = [0] * len(tokenization_output.input_ids)
427
- end_labels = [0] * len(tokenization_output.input_ids)
428
-
429
- char_start2token = {v: int(k) for k, v in sample.token2char_start.items()}
430
- char_end2token = {v: int(k) for k, v in sample.token2char_end.items()}
431
- for cs, ce, gold_candidate_title in sample.window_labels:
432
- if gold_candidate_title not in predictable_candidates:
433
- if self.use_nme:
434
- gold_candidate_title = NME_SYMBOL
435
- else:
436
- continue
437
- # +1 is to account for the CLS token
438
- start_bpe = char_start2token[cs] + 1
439
- end_bpe = char_end2token[ce] + 1
440
- class_index = predictable_candidates.index(gold_candidate_title)
441
- if (
442
- start_labels[start_bpe] == 0 and end_labels[end_bpe] == 0
443
- ): # prevent from having entities that ends with the same label
444
- start_labels[start_bpe] = class_index + 1 # +1 for the NONE class
445
- end_labels[end_bpe] = class_index + 1 # +1 for the NONE class
446
- else:
447
- print(
448
- "Found entity with the same last subword, it will not be included."
449
- )
450
- print(
451
- cs,
452
- ce,
453
- gold_candidate_title,
454
- start_labels,
455
- end_labels,
456
- sample.doc_id,
457
- )
458
-
459
- ignored_labels_indices = tokenization_output.prediction_mask == 1
460
-
461
- start_labels = torch.tensor(start_labels, dtype=torch.long)
462
- start_labels[ignored_labels_indices] = -100
463
-
464
- end_labels = torch.tensor(end_labels, dtype=torch.long)
465
- end_labels[ignored_labels_indices] = -100
466
-
467
- return start_labels, end_labels
468
-
469
- def produce_sample_bag(
470
- self, sample, predictable_candidates: List[str], candidates_starting_offset: int
471
- ) -> Optional[Tuple[dict, list, int]]:
472
- # input sentence tokenization
473
- input_subwords = sample.tokens[1:-1] # removing special tokens
474
- candidates_symbols = self.special_symbols[candidates_starting_offset:]
475
-
476
- predictable_candidates = list(predictable_candidates)
477
- original_predictable_candidates = list(predictable_candidates)
478
-
479
- # add NME as a possible candidate
480
- if self.use_nme:
481
- predictable_candidates.insert(0, NME_SYMBOL)
482
-
483
- # candidates encoding
484
- candidates_symbols = candidates_symbols[: len(predictable_candidates)]
485
- candidates_encoding_result = self.tokenizer.batch_encode_plus(
486
- [
487
- "{} {}".format(cs, ct) if ct != NME_SYMBOL else NME_SYMBOL
488
- for cs, ct in zip(candidates_symbols, predictable_candidates)
489
- ],
490
- add_special_tokens=False,
491
- ).input_ids
492
-
493
- if (
494
- self.max_subwords_per_candidate is not None
495
- and self.max_subwords_per_candidate > 0
496
- ):
497
- candidates_encoding_result = [
498
- cer[: self.max_subwords_per_candidate]
499
- for cer in candidates_encoding_result
500
- ]
501
-
502
- # drop candidates if the number of input tokens is too long for the model
503
- if (
504
- sum(map(len, candidates_encoding_result))
505
- + len(input_subwords)
506
- + 20 # + 20 special tokens
507
- > self.model_max_length
508
- ):
509
- acceptable_tokens_from_candidates = (
510
- self.model_max_length - 20 - len(input_subwords)
511
- )
512
- i = 0
513
- cum_len = 0
514
- while (
515
- cum_len + len(candidates_encoding_result[i])
516
- < acceptable_tokens_from_candidates
517
- ):
518
- cum_len += len(candidates_encoding_result[i])
519
- i += 1
520
-
521
- candidates_encoding_result = candidates_encoding_result[:i]
522
- candidates_symbols = candidates_symbols[:i]
523
- predictable_candidates = predictable_candidates[:i]
524
-
525
- # final input_ids build
526
- input_ids = self._build_input_ids(
527
- sentence_input_ids=input_subwords,
528
- candidates_input_ids=candidates_encoding_result,
529
- )
530
-
531
- # complete input building (e.g. attention / prediction mask)
532
- tokenization_output = self._build_tokenizer_essentials(
533
- input_ids, input_subwords, sample
534
- )
535
-
536
- output_dict = {
537
- "input_ids": tokenization_output.input_ids,
538
- "attention_mask": tokenization_output.attention_mask,
539
- "token_type_ids": tokenization_output.token_type_ids,
540
- "prediction_mask": tokenization_output.prediction_mask,
541
- "special_symbols_mask": tokenization_output.special_symbols_mask,
542
- "sample": sample,
543
- "predictable_candidates_symbols": candidates_symbols,
544
- "predictable_candidates": predictable_candidates,
545
- }
546
-
547
- # labels creation
548
- if sample.window_labels is not None:
549
- start_labels, end_labels = self._build_labels(
550
- sample,
551
- tokenization_output,
552
- predictable_candidates,
553
- )
554
- output_dict.update(start_labels=start_labels, end_labels=end_labels)
555
-
556
- if (
557
- "roberta" in self.transformer_model
558
- or "longformer" in self.transformer_model
559
- ):
560
- del output_dict["token_type_ids"]
561
-
562
- predictable_candidates_set = set(predictable_candidates)
563
- remaining_candidates = [
564
- candidate
565
- for candidate in original_predictable_candidates
566
- if candidate not in predictable_candidates_set
567
- ]
568
- total_used_candidates = (
569
- candidates_starting_offset
570
- + len(predictable_candidates)
571
- - (1 if self.use_nme else 0)
572
- )
573
-
574
- if self.use_nme:
575
- assert predictable_candidates[0] == NME_SYMBOL
576
-
577
- return output_dict, remaining_candidates, total_used_candidates
578
-
579
- def __iter__(self):
580
- dataset_iterator = self.dataset_iterator_func()
581
-
582
- current_dataset_elements = []
583
-
584
- i = None
585
- for i, dataset_elem in enumerate(dataset_iterator, start=1):
586
- if (
587
- self.section_size is not None
588
- and len(current_dataset_elements) == self.section_size
589
- ):
590
- for batch in self.materialize_batches(current_dataset_elements):
591
- yield batch
592
- current_dataset_elements = []
593
-
594
- current_dataset_elements.append(dataset_elem)
595
-
596
- if i % 50_000 == 0:
597
- logger.info(f"Processed: {i} number of elements")
598
-
599
- if len(current_dataset_elements) != 0:
600
- for batch in self.materialize_batches(current_dataset_elements):
601
- yield batch
602
-
603
- if i is not None:
604
- logger.info(f"Dataset finished: {i} number of elements processed")
605
- else:
606
- logger.warning("Dataset empty")
607
-
608
- def dataset_iterator_func(self):
609
- skipped_instances = 0
610
- data_samples = (
611
- load_relik_reader_samples(self.dataset_path)
612
- if self.samples is None
613
- else self.samples
614
- )
615
- for sample in data_samples:
616
- preprocess_sample(
617
- sample, self.tokenizer, lowercase_policy=self.lowercase_policy
618
- )
619
- current_patch = 0
620
- sample_bag, used_candidates = None, None
621
- remaining_candidates = list(sample.window_candidates)
622
-
623
- if not self.for_inference:
624
- # randomly drop gold candidates at training time
625
- if (
626
- self.random_drop_gold_candidates > 0.0
627
- and np.random.uniform() < self.random_drop_gold_candidates
628
- and len(set(ct for _, _, ct in sample.window_labels)) > 1
629
- ):
630
- # selecting candidates to drop
631
- np.random.shuffle(sample.window_labels)
632
- n_dropped_candidates = np.random.randint(
633
- 0, len(sample.window_labels) - 1
634
- )
635
- dropped_candidates = [
636
- label_elem[-1]
637
- for label_elem in sample.window_labels[:n_dropped_candidates]
638
- ]
639
- dropped_candidates = set(dropped_candidates)
640
-
641
- # saving NMEs because they should not be dropped
642
- if NME_SYMBOL in dropped_candidates:
643
- dropped_candidates.remove(NME_SYMBOL)
644
-
645
- # sample update
646
- sample.window_labels = [
647
- (s, e, _l)
648
- if _l not in dropped_candidates
649
- else (s, e, NME_SYMBOL)
650
- for s, e, _l in sample.window_labels
651
- ]
652
- remaining_candidates = [
653
- wc
654
- for wc in remaining_candidates
655
- if wc not in dropped_candidates
656
- ]
657
-
658
- # shuffle candidates
659
- if (
660
- isinstance(self.shuffle_candidates, bool)
661
- and self.shuffle_candidates
662
- ) or (
663
- isinstance(self.shuffle_candidates, float)
664
- and np.random.uniform() < self.shuffle_candidates
665
- ):
666
- np.random.shuffle(remaining_candidates)
667
-
668
- while len(remaining_candidates) != 0:
669
- sample_bag = self.produce_sample_bag(
670
- sample,
671
- predictable_candidates=remaining_candidates,
672
- candidates_starting_offset=used_candidates
673
- if used_candidates is not None
674
- else 0,
675
- )
676
- if sample_bag is not None:
677
- sample_bag, remaining_candidates, used_candidates = sample_bag
678
- if (
679
- self.for_inference
680
- or not self.skip_empty_training_samples
681
- or (
682
- (
683
- sample_bag.get("start_labels") is not None
684
- and torch.any(sample_bag["start_labels"] > 1).item()
685
- )
686
- or (
687
- sample_bag.get("optimus_labels") is not None
688
- and len(sample_bag["optimus_labels"]) > 0
689
- )
690
- )
691
- ):
692
- sample_bag["patch_offset"] = current_patch
693
- current_patch += 1
694
- yield sample_bag
695
- else:
696
- skipped_instances += 1
697
- if skipped_instances % 1000 == 0 and skipped_instances != 0:
698
- logger.info(
699
- f"Skipped {skipped_instances} instances since they did not have any gold labels..."
700
- )
701
-
702
- # Just use the first fitting candidates if split on
703
- # cand is not True
704
- if not self.split_on_cand_overload:
705
- break
706
-
707
- def preshuffle_elements(self, dataset_elements: List):
708
- # This shuffling is done so that when using the sorting function,
709
- # if it is deterministic given a collection and its order, we will
710
- # make the whole operation not deterministic anymore.
711
- # Basically, the aim is not to build every time the same batches.
712
- if not self.for_inference:
713
- dataset_elements = np.random.permutation(dataset_elements)
714
-
715
- sorting_fn = (
716
- lambda elem: add_noise_to_value(
717
- sum(len(elem[k]) for k in self.sorting_fields),
718
- noise_param=self.noise_param,
719
- )
720
- if not self.for_inference
721
- else sum(len(elem[k]) for k in self.sorting_fields)
722
- )
723
-
724
- dataset_elements = sorted(dataset_elements, key=sorting_fn)
725
-
726
- if self.for_inference:
727
- return dataset_elements
728
-
729
- ds = list(chunks(dataset_elements, 64))
730
- np.random.shuffle(ds)
731
- return flatten(ds)
732
-
733
- def materialize_batches(
734
- self, dataset_elements: List[Dict[str, Any]]
735
- ) -> Generator[Dict[str, Any], None, None]:
736
- if self.prebatch:
737
- dataset_elements = self.preshuffle_elements(dataset_elements)
738
-
739
- current_batch = []
740
-
741
- # function that creates a batch from the 'current_batch' list
742
- def output_batch() -> Dict[str, Any]:
743
- assert (
744
- len(
745
- set([len(elem["predictable_candidates"]) for elem in current_batch])
746
- )
747
- == 1
748
- ), " ".join(
749
- map(
750
- str, [len(elem["predictable_candidates"]) for elem in current_batch]
751
- )
752
- )
753
-
754
- batch_dict = dict()
755
-
756
- de_values_by_field = {
757
- fn: [de[fn] for de in current_batch if fn in de]
758
- for fn in self.fields_batcher
759
- }
760
-
761
- # in case you provide fields batchers but in the batch
762
- # there are no elements for that field
763
- de_values_by_field = {
764
- fn: fvs for fn, fvs in de_values_by_field.items() if len(fvs) > 0
765
- }
766
-
767
- assert len(set([len(v) for v in de_values_by_field.values()]))
768
-
769
- # todo: maybe we should report the user about possible
770
- # fields filtering due to "None" instances
771
- de_values_by_field = {
772
- fn: fvs
773
- for fn, fvs in de_values_by_field.items()
774
- if all([fv is not None for fv in fvs])
775
- }
776
-
777
- for field_name, field_values in de_values_by_field.items():
778
- field_batch = (
779
- self.fields_batcher[field_name](field_values)
780
- if self.fields_batcher[field_name] is not None
781
- else field_values
782
- )
783
-
784
- batch_dict[field_name] = field_batch
785
-
786
- return batch_dict
787
-
788
- max_len_discards, min_len_discards = 0, 0
789
-
790
- should_token_batch = self.batch_size is None
791
-
792
- curr_pred_elements = -1
793
- for de in dataset_elements:
794
- if (
795
- should_token_batch
796
- and self.max_batch_size != -1
797
- and len(current_batch) == self.max_batch_size
798
- ) or (not should_token_batch and len(current_batch) == self.batch_size):
799
- yield output_batch()
800
- current_batch = []
801
- curr_pred_elements = -1
802
-
803
- too_long_fields = [
804
- k
805
- for k in de
806
- if self.max_length != -1
807
- and torch.is_tensor(de[k])
808
- and len(de[k]) > self.max_length
809
- ]
810
- if len(too_long_fields) > 0:
811
- max_len_discards += 1
812
- continue
813
-
814
- too_short_fields = [
815
- k
816
- for k in de
817
- if self.min_length != -1
818
- and torch.is_tensor(de[k])
819
- and len(de[k]) < self.min_length
820
- ]
821
- if len(too_short_fields) > 0:
822
- min_len_discards += 1
823
- continue
824
-
825
- if should_token_batch:
826
- de_len = sum(len(de[k]) for k in self.batching_fields)
827
-
828
- future_max_len = max(
829
- de_len,
830
- max(
831
- [
832
- sum(len(bde[k]) for k in self.batching_fields)
833
- for bde in current_batch
834
- ],
835
- default=0,
836
- ),
837
- )
838
-
839
- future_tokens_per_batch = future_max_len * (len(current_batch) + 1)
840
-
841
- num_predictable_candidates = len(de["predictable_candidates"])
842
-
843
- if len(current_batch) > 0 and (
844
- future_tokens_per_batch >= self.tokens_per_batch
845
- or (
846
- num_predictable_candidates != curr_pred_elements
847
- and curr_pred_elements != -1
848
- )
849
- ):
850
- yield output_batch()
851
- current_batch = []
852
-
853
- current_batch.append(de)
854
- curr_pred_elements = len(de["predictable_candidates"])
855
-
856
- if len(current_batch) != 0 and not self.drop_last:
857
- yield output_batch()
858
-
859
- if max_len_discards > 0:
860
- if self.for_inference:
861
- logger.warning(
862
- f"WARNING: Inference mode is True but {max_len_discards} samples longer than max length were "
863
- f"found. The {max_len_discards} samples will be DISCARDED. If you are doing some kind of evaluation"
864
- f", this can INVALIDATE results. This might happen if the max length was not set to -1 or if the "
865
- f"sample length exceeds the maximum length supported by the current model."
866
- )
867
- else:
868
- logger.warning(
869
- f"During iteration, {max_len_discards} elements were "
870
- f"discarded since longer than max length {self.max_length}"
871
- )
872
-
873
- if min_len_discards > 0:
874
- if self.for_inference:
875
- logger.warning(
876
- f"WARNING: Inference mode is True but {min_len_discards} samples shorter than min length were "
877
- f"found. The {min_len_discards} samples will be DISCARDED. If you are doing some kind of evaluation"
878
- f", this can INVALIDATE results. This might happen if the min length was not set to -1 or if the "
879
- f"sample length is shorter than the minimum length supported by the current model."
880
- )
881
- else:
882
- logger.warning(
883
- f"During iteration, {min_len_discards} elements were "
884
- f"discarded since shorter than min length {self.min_length}"
885
- )
886
-
887
- @staticmethod
888
- def convert_tokens_to_char_annotations(
889
- sample: RelikReaderSample,
890
- remove_nmes: bool = True,
891
- ) -> RelikReaderSample:
892
- """
893
- Converts the token annotations to char annotations.
894
-
895
- Args:
896
- sample (:obj:`RelikReaderSample`):
897
- The sample to convert.
898
- remove_nmes (:obj:`bool`, `optional`, defaults to :obj:`True`):
899
- Whether to remove the NMEs from the annotations.
900
- Returns:
901
- :obj:`RelikReaderSample`: The converted sample.
902
- """
903
- char_annotations = set()
904
- for (
905
- predicted_entity,
906
- predicted_spans,
907
- ) in sample.predicted_window_labels.items():
908
- if predicted_entity == NME_SYMBOL and remove_nmes:
909
- continue
910
-
911
- for span_start, span_end in predicted_spans:
912
- span_start = sample.token2char_start[str(span_start)]
913
- span_end = sample.token2char_end[str(span_end)]
914
-
915
- char_annotations.add((span_start, span_end, predicted_entity))
916
-
917
- char_probs_annotations = dict()
918
- for (
919
- span_start,
920
- span_end,
921
- ), candidates_probs in sample.span_title_probabilities.items():
922
- span_start = sample.token2char_start[str(span_start)]
923
- span_end = sample.token2char_end[str(span_end)]
924
- char_probs_annotations[(span_start, span_end)] = {
925
- title for title, _ in candidates_probs
926
- }
927
-
928
- sample.predicted_window_labels_chars = char_annotations
929
- sample.probs_window_labels_chars = char_probs_annotations
930
-
931
- return sample
932
-
933
- @staticmethod
934
- def merge_patches_predictions(sample) -> None:
935
- sample._d["predicted_window_labels"] = dict()
936
- predicted_window_labels = sample._d["predicted_window_labels"]
937
-
938
- sample._d["span_title_probabilities"] = dict()
939
- span_title_probabilities = sample._d["span_title_probabilities"]
940
-
941
- span2title = dict()
942
- for _, patch_info in sorted(sample.patches.items(), key=lambda x: x[0]):
943
- # selecting span predictions
944
- for predicted_title, predicted_spans in patch_info[
945
- "predicted_window_labels"
946
- ].items():
947
- for pred_span in predicted_spans:
948
- pred_span = tuple(pred_span)
949
- curr_title = span2title.get(pred_span)
950
- if curr_title is None or curr_title == NME_SYMBOL:
951
- span2title[pred_span] = predicted_title
952
- # else:
953
- # print("Merging at patch level")
954
-
955
- # selecting span predictions probability
956
- for predicted_span, titles_probabilities in patch_info[
957
- "span_title_probabilities"
958
- ].items():
959
- if predicted_span not in span_title_probabilities:
960
- span_title_probabilities[predicted_span] = titles_probabilities
961
-
962
- for span, title in span2title.items():
963
- if title not in predicted_window_labels:
964
- predicted_window_labels[title] = list()
965
- predicted_window_labels[title].append(span)
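A hedged construction sketch for the dataset above. The special-symbol strings are hypothetical, and the import path follows the file location shown here (the YAML configs reference it as relik.reader.relik_reader_data instead):

    from relik.reader.data.relik_reader_data import RelikDataset

    dataset = RelikDataset(
        dataset_path="relik/reader/data/testa.jsonl",
        materialize_samples=False,
        transformer_model="microsoft/deberta-v3-base",
        special_symbols=[f"[E-{i}]" for i in range(101)],  # hypothetical special-symbol names
        for_inference=True,
    )
    for batch in dataset:  # IterableDataset: each item is an already-collated dict of tensors
        input_ids = batch["input_ids"]
        break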
relik/reader/data/relik_reader_data_utils.py DELETED
@@ -1,51 +0,0 @@
1
- from typing import List
2
-
3
- import numpy as np
4
- import torch
5
-
6
-
7
- def flatten(lsts: List[list]) -> list:
8
- acc_lst = list()
9
- for lst in lsts:
10
- acc_lst.extend(lst)
11
- return acc_lst
12
-
13
-
14
- def batchify(tensors: List[torch.Tensor], padding_value: int = 0) -> torch.Tensor:
15
- return torch.nn.utils.rnn.pad_sequence(
16
- tensors, batch_first=True, padding_value=padding_value
17
- )
18
-
19
-
20
- def batchify_matrices(tensors: List[torch.Tensor], padding_value: int) -> torch.Tensor:
21
- x = max([t.shape[0] for t in tensors])
22
- y = max([t.shape[1] for t in tensors])
23
- out_matrix = torch.zeros((len(tensors), x, y))
24
- out_matrix += padding_value
25
- for i, tensor in enumerate(tensors):
26
- out_matrix[i][0 : tensor.shape[0], 0 : tensor.shape[1]] = tensor
27
- return out_matrix
28
-
29
-
30
- def batchify_tensor(tensors: List[torch.Tensor], padding_value: int) -> torch.Tensor:
31
- x = max([t.shape[0] for t in tensors])
32
- y = max([t.shape[1] for t in tensors])
33
- rest = tensors[0].shape[2]
34
- out_matrix = torch.zeros((len(tensors), x, y, rest))
35
- out_matrix += padding_value
36
- for i, tensor in enumerate(tensors):
37
- out_matrix[i][0 : tensor.shape[0], 0 : tensor.shape[1], :] = tensor
38
- return out_matrix
39
-
40
-
41
- def chunks(lst: list, chunk_size: int) -> List[list]:
42
- chunks_acc = list()
43
- for i in range(0, len(lst), chunk_size):
44
- chunks_acc.append(lst[i : i + chunk_size])
45
- return chunks_acc
46
-
47
-
48
- def add_noise_to_value(value: int, noise_param: float):
49
- noise_value = value * noise_param
50
- noise = np.random.uniform(-noise_value, noise_value)
51
- return max(1, value + noise)
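For example, batchify pads a list of 1-D tensors to a common length (a small check of the behaviour defined above):

    import torch
    from relik.reader.data.relik_reader_data_utils import batchify

    batch = batchify([torch.tensor([1, 2, 3]), torch.tensor([4])], padding_value=0)
    # batch.shape == (2, 3); the second row becomes [4, 0, 0]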
relik/reader/data/relik_reader_sample.py DELETED
@@ -1,49 +0,0 @@
1
- import json
2
- from typing import Iterable
3
-
4
-
5
- class RelikReaderSample:
6
- def __init__(self, **kwargs):
7
- super().__setattr__("_d", {})
8
- self._d = kwargs
9
-
10
- def __getattribute__(self, item):
11
- return super(RelikReaderSample, self).__getattribute__(item)
12
-
13
- def __getattr__(self, item):
14
- if item.startswith("__") and item.endswith("__"):
15
- # this is likely some python library-specific variable (such as __deepcopy__ for copy)
16
- # better follow standard behavior here
17
- raise AttributeError(item)
18
- elif item in self._d:
19
- return self._d[item]
20
- else:
21
- return None
22
-
23
- def __setattr__(self, key, value):
24
- if key in self._d:
25
- self._d[key] = value
26
- else:
27
- super().__setattr__(key, value)
28
-
29
- def to_jsons(self) -> str:
30
- if "predicted_window_labels" in self._d:
31
- new_obj = {
32
- k: v
33
- for k, v in self._d.items()
34
- if k != "predicted_window_labels" and k != "span_title_probabilities"
35
- }
36
- new_obj["predicted_window_labels"] = [
37
- [ss, se, pred_title]
38
- for (ss, se), pred_title in self.predicted_window_labels_chars
39
- ]
- return json.dumps(new_obj)
40
- else:
41
- return json.dumps(self._d)
42
-
43
-
44
- def load_relik_reader_samples(path: str) -> Iterable[RelikReaderSample]:
45
- with open(path) as f:
46
- for line in f:
47
- jsonl_line = json.loads(line.strip())
48
- relik_reader_sample = RelikReaderSample(**jsonl_line)
49
- yield relik_reader_sample
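Samples are read lazily from JSONL; a short sketch using the validation path referenced in the data config:

    from relik.reader.data.relik_reader_sample import load_relik_reader_samples

    for sample in load_relik_reader_samples("relik/reader/data/testa.jsonl"):
        print(sample.doc_id, sample.offset)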
relik/reader/lightning_modules/__init__.py DELETED
File without changes
relik/reader/lightning_modules/relik_reader_pl_module.py DELETED
@@ -1,50 +0,0 @@
1
- from typing import Any, Optional
2
-
3
- import lightning
4
- from lightning.pytorch.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
5
-
6
- from relik.reader.relik_reader_core import RelikReaderCoreModel
7
-
8
-
9
- class RelikReaderPLModule(lightning.LightningModule):
10
- def __init__(
11
- self,
12
- cfg: dict,
13
- transformer_model: str,
14
- additional_special_symbols: int,
15
- num_layers: Optional[int] = None,
16
- activation: str = "gelu",
17
- linears_hidden_size: Optional[int] = 512,
18
- use_last_k_layers: int = 1,
19
- training: bool = False,
20
- *args: Any,
21
- **kwargs: Any
22
- ):
23
- super().__init__(*args, **kwargs)
24
- self.save_hyperparameters()
25
- self.relik_reader_core_model = RelikReaderCoreModel(
26
- transformer_model,
27
- additional_special_symbols,
28
- num_layers,
29
- activation,
30
- linears_hidden_size,
31
- use_last_k_layers,
32
- training=training,
33
- )
34
- self.optimizer_factory = None
35
-
36
- def training_step(self, batch: dict, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
37
- relik_output = self.relik_reader_core_model(**batch)
38
- self.log("train-loss", relik_output["loss"])
39
- return relik_output["loss"]
40
-
41
- def validation_step(
42
- self, batch: dict, *args: Any, **kwargs: Any
43
- ) -> Optional[STEP_OUTPUT]:
44
- return
45
-
46
- def set_optimizer_factory(self, optimizer_factory) -> None:
47
- self.optimizer_factory = optimizer_factory
48
-
49
- def configure_optimizers(self) -> OptimizerLRScheduler:
50
- return self.optimizer_factory(self.relik_reader_core_model)
relik/reader/lightning_modules/relik_reader_re_pl_module.py DELETED
@@ -1,54 +0,0 @@
1
- from typing import Any, Optional
2
-
3
- import lightning
4
- from lightning.pytorch.utilities.types import STEP_OUTPUT, OptimizerLRScheduler
5
-
6
- from relik.reader.relik_reader_re import RelikReaderForTripletExtraction
7
-
8
-
9
- class RelikReaderREPLModule(lightning.LightningModule):
10
- def __init__(
11
- self,
12
- cfg: dict,
13
- transformer_model: str,
14
- additional_special_symbols: int,
15
- num_layers: Optional[int] = None,
16
- activation: str = "gelu",
17
- linears_hidden_size: Optional[int] = 512,
18
- use_last_k_layers: int = 1,
19
- training: bool = False,
20
- *args: Any,
21
- **kwargs: Any
22
- ):
23
- super().__init__(*args, **kwargs)
24
- self.save_hyperparameters()
25
-
26
- self.relik_reader_re_model = RelikReaderForTripletExtraction(
27
- transformer_model,
28
- additional_special_symbols,
29
- num_layers,
30
- activation,
31
- linears_hidden_size,
32
- use_last_k_layers,
33
- training=training,
34
- )
35
- self.optimizer_factory = None
36
-
37
- def training_step(self, batch: dict, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
38
- relik_output = self.relik_reader_re_model(**batch)
39
- self.log("train-loss", relik_output["loss"])
40
- self.log("train-start_loss", relik_output["ned_start_loss"])
41
- self.log("train-end_loss", relik_output["ned_end_loss"])
42
- self.log("train-relation_loss", relik_output["re_loss"])
43
- return relik_output["loss"]
44
-
45
- def validation_step(
46
- self, batch: dict, *args: Any, **kwargs: Any
47
- ) -> Optional[STEP_OUTPUT]:
48
- return
49
-
50
- def set_optimizer_factory(self, optimizer_factory) -> None:
51
- self.optimizer_factory = optimizer_factory
52
-
53
- def configure_optimizers(self) -> OptimizerLRScheduler:
54
- return self.optimizer_factory(self.relik_reader_re_model)
relik/reader/pytorch_modules/__init__.py DELETED
File without changes
relik/reader/pytorch_modules/base.py DELETED
@@ -1,248 +0,0 @@
1
- import logging
2
- import os
3
- from pathlib import Path
4
- from typing import Any, Dict, List
5
-
6
- import torch
7
- import transformers as tr
8
- from torch.utils.data import IterableDataset
9
- from transformers import AutoConfig
10
-
11
- from relik.common.log import get_console_logger, get_logger
12
- from relik.common.utils import get_callable_from_string
13
- from relik.reader.pytorch_modules.hf.modeling_relik import (
14
- RelikReaderConfig,
15
- RelikReaderSample,
16
- )
17
-
18
- console_logger = get_console_logger()
19
- logger = get_logger(__name__, level=logging.INFO)
20
-
21
-
22
- class RelikReaderBase(torch.nn.Module):
23
- default_reader_class: str | None = None
24
- default_data_class: str | None = None
25
-
26
- def __init__(
27
- self,
28
- transformer_model: str | tr.PreTrainedModel | None = None,
29
- additional_special_symbols: int = 0,
30
- num_layers: int | None = None,
31
- activation: str = "gelu",
32
- linears_hidden_size: int | None = 512,
33
- use_last_k_layers: int = 1,
34
- training: bool = False,
35
- device: str | torch.device | None = None,
36
- precision: int = 32,
37
- tokenizer: str | tr.PreTrainedTokenizer | None = None,
38
- dataset: IterableDataset | str | None = None,
39
- default_reader_class: tr.PreTrainedModel | str | None = None,
40
- **kwargs,
41
- ) -> None:
42
- super().__init__()
43
-
44
- self.default_reader_class = default_reader_class or self.default_reader_class
45
-
46
- if self.default_reader_class is None:
47
- raise ValueError("You must specify a default reader class.")
48
-
49
- # get the callable for the default reader class
50
- self.default_reader_class: tr.PreTrainedModel = get_callable_from_string(
51
- self.default_reader_class
52
- )
53
-
54
- if isinstance(transformer_model, str):
55
- config = AutoConfig.from_pretrained(
56
- transformer_model, trust_remote_code=True
57
- )
58
- if "relik-reader" in config.model_type:
59
- transformer_model = self.default_reader_class.from_pretrained(
60
- transformer_model, **kwargs
61
- )
62
- else:
63
- reader_config = RelikReaderConfig(
64
- transformer_model=transformer_model,
65
- additional_special_symbols=additional_special_symbols,
66
- num_layers=num_layers,
67
- activation=activation,
68
- linears_hidden_size=linears_hidden_size,
69
- use_last_k_layers=use_last_k_layers,
70
- training=training,
71
- )
72
- transformer_model = self.default_reader_class(reader_config)
73
-
74
- self.relik_reader_model = transformer_model
75
- self.relik_reader_model_config = self.relik_reader_model.config
76
-
77
- # get the tokenizer
78
- self._tokenizer = tokenizer
79
-
80
- # and instantiate the dataset class
81
- self.dataset: IterableDataset | None = dataset
82
-
83
- # move the model to the device
84
- self.to(device or torch.device("cpu"))
85
-
86
- # set the precision
87
- self.precision = precision
88
-
89
- def forward(self, **kwargs) -> Dict[str, Any]:
90
- return self.relik_reader_model(**kwargs)
91
-
92
- def _read(self, *args, **kwargs) -> Any:
93
- raise NotImplementedError
94
-
95
- @torch.no_grad()
96
- @torch.inference_mode()
97
- def read(
98
- self,
99
- text: List[str] | List[List[str]] | None = None,
100
- samples: List[RelikReaderSample] | None = None,
101
- input_ids: torch.Tensor | None = None,
102
- attention_mask: torch.Tensor | None = None,
103
- token_type_ids: torch.Tensor | None = None,
104
- prediction_mask: torch.Tensor | None = None,
105
- special_symbols_mask: torch.Tensor | None = None,
106
- candidates: List[List[str]] | None = None,
107
- max_length: int = 1000,
108
- max_batch_size: int = 128,
109
- token_batch_size: int = 2048,
110
- precision: int | str | None = None,
111
- progress_bar: bool = False,
112
- *args,
113
- **kwargs,
114
- ) -> List[RelikReaderSample] | List[List[RelikReaderSample]]:
115
- """
116
- Reads the given text.
117
-
118
- Args:
119
- text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
120
- The text to read in tokens. If a list of list of tokens is provided, each
121
- inner list is considered a sentence.
122
- samples (:obj:`List[RelikReaderSample]`, `optional`):
123
- The samples to read. If provided, `text` and `candidates` are ignored.
124
- input_ids (:obj:`torch.Tensor`, `optional`):
125
- The input ids of the text.
126
- attention_mask (:obj:`torch.Tensor`, `optional`):
127
- The attention mask of the text.
128
- token_type_ids (:obj:`torch.Tensor`, `optional`):
129
- The token type ids of the text.
130
- prediction_mask (:obj:`torch.Tensor`, `optional`):
131
- The prediction mask of the text.
132
- special_symbols_mask (:obj:`torch.Tensor`, `optional`):
133
- The special symbols mask of the text.
134
- candidates (:obj:`List[List[str]]`, `optional`):
135
- The candidates of the text.
136
- max_length (:obj:`int`, `optional`, defaults to 1000):
137
- The maximum length of the text.
138
- max_batch_size (:obj:`int`, `optional`, defaults to 128):
139
- The maximum batch size.
140
- token_batch_size (:obj:`int`, `optional`):
141
- The maximum number of tokens per batch.
142
- precision (:obj:`int` or :obj:`str`, `optional`):
143
- The precision to use. If not provided, the default is 32 bit.
144
- progress_bar (:obj:`bool`, `optional`, defaults to :obj:`False`):
145
- Whether to show a progress bar.
146
-
147
- Returns:
148
- The predicted labels for each sample.
149
- """
150
- if text is None and input_ids is None and samples is None:
151
- raise ValueError(
152
- "Either `text` or `input_ids` or `samples` must be provided."
153
- )
154
- if (input_ids is None and samples is None) and (
155
- text is None or candidates is None
156
- ):
157
- raise ValueError(
158
- "`text` and `candidates` must be provided to return the predictions when "
159
- "`input_ids` and `samples` is not provided."
160
- )
161
- if text is not None and samples is None:
162
- if len(text) != len(candidates):
163
- raise ValueError("`text` and `candidates` must have the same length.")
164
- if isinstance(text[0], str): # change to list of text
165
- text = [text]
166
- candidates = [candidates]
167
-
168
- samples = [
169
- RelikReaderSample(tokens=t, candidates=c)
170
- for t, c in zip(text, candidates)
171
- ]
172
-
173
- return self._read(
174
- samples,
175
- input_ids,
176
- attention_mask,
177
- token_type_ids,
178
- prediction_mask,
179
- special_symbols_mask,
180
- max_length,
181
- max_batch_size,
182
- token_batch_size,
183
- precision or self.precision,
184
- progress_bar,
185
- *args,
186
- **kwargs,
187
- )
188
-
189
- @property
190
- def device(self) -> torch.device:
191
- """
192
- The device of the model.
193
- """
194
- return next(self.parameters()).device
195
-
196
- @property
197
- def tokenizer(self) -> tr.PreTrainedTokenizer:
198
- """
199
- The tokenizer.
200
- """
201
- if self._tokenizer:
202
- return self._tokenizer
203
-
204
- self._tokenizer = tr.AutoTokenizer.from_pretrained(
205
- self.relik_reader_model.config.name_or_path
206
- )
207
- return self._tokenizer
208
-
209
- def save_pretrained(
210
- self,
211
- output_dir: str | os.PathLike,
212
- model_name: str | None = None,
213
- push_to_hub: bool = False,
214
- **kwargs,
215
- ) -> None:
216
- """
217
- Saves the model to the given path.
218
-
219
- Args:
220
- output_dir (`str` or :obj:`os.PathLike`):
221
- The path to save the model to.
222
- model_name (`str`, `optional`):
223
- The name of the model. If not provided, the model will be saved as
224
- `default_reader_class.__name__`.
225
- push_to_hub (`bool`, `optional`, defaults to `False`):
226
- Whether to push the model to the HuggingFace Hub.
227
- **kwargs:
228
- Additional keyword arguments to pass to the `save_pretrained` method
229
- """
230
- # create the output directory
231
- output_dir = Path(output_dir)
232
- output_dir.mkdir(parents=True, exist_ok=True)
233
-
234
- model_name = model_name or self.default_reader_class.__name__
235
-
236
- logger.info(f"Saving reader to {output_dir / model_name}")
237
-
238
- # save the model
239
- self.relik_reader_model.register_for_auto_class()
240
- self.relik_reader_model.save_pretrained(
241
- output_dir / model_name, push_to_hub=push_to_hub, **kwargs
242
- )
243
-
244
- if self.tokenizer:
245
- logger.info("Saving also the tokenizer")
246
- self.tokenizer.save_pretrained(
247
- output_dir / model_name, push_to_hub=push_to_hub, **kwargs
248
- )
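The read() entry point above accepts either pre-tokenized text plus candidates or ready-made samples; a hedged sketch, assuming `reader` is an already-loaded concrete subclass (for example the span reader):

    predictions = reader.read(
        text=[["Obama", "went", "to", "Rome", "."]],
        candidates=[["Barack Obama", "Rome"]],
        progress_bar=False,
    )
    for sample in predictions:
        print(sample.predicted_window_labels_chars)  # assuming the subclass attaches char-level predictions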
relik/reader/pytorch_modules/hf/__init__.py DELETED
@@ -1,2 +0,0 @@
1
- from .configuration_relik import RelikReaderConfig
2
- from .modeling_relik import RelikReaderREModel
relik/reader/pytorch_modules/hf/configuration_relik.py DELETED
@@ -1,33 +0,0 @@
1
- from typing import Optional
2
-
3
- from transformers import AutoConfig
4
- from transformers.configuration_utils import PretrainedConfig
5
-
6
-
7
- class RelikReaderConfig(PretrainedConfig):
8
- model_type = "relik-reader"
9
-
10
- def __init__(
11
- self,
12
- transformer_model: str = "microsoft/deberta-v3-base",
13
- additional_special_symbols: int = 101,
14
- num_layers: Optional[int] = None,
15
- activation: str = "gelu",
16
- linears_hidden_size: Optional[int] = 512,
17
- use_last_k_layers: int = 1,
18
- training: bool = False,
19
- default_reader_class: Optional[str] = None,
20
- **kwargs
21
- ) -> None:
22
- self.transformer_model = transformer_model
23
- self.additional_special_symbols = additional_special_symbols
24
- self.num_layers = num_layers
25
- self.activation = activation
26
- self.linears_hidden_size = linears_hidden_size
27
- self.use_last_k_layers = use_last_k_layers
28
- self.training = training
29
- self.default_reader_class = default_reader_class
30
- super().__init__(**kwargs)
31
-
32
-
33
- AutoConfig.register("relik-reader", RelikReaderConfig)
 
relik/reader/pytorch_modules/hf/modeling_relik.py DELETED
@@ -1,981 +0,0 @@
1
- from typing import Any, Dict, Optional
2
-
3
- import torch
4
- from transformers import AutoModel, PreTrainedModel
5
- from transformers.activations import ClippedGELUActivation, GELUActivation
6
- from transformers.configuration_utils import PretrainedConfig
7
- from transformers.modeling_utils import PoolerEndLogits
8
-
9
- from .configuration_relik import RelikReaderConfig
10
-
11
-
12
- class RelikReaderSample:
13
- def __init__(self, **kwargs):
14
- super().__setattr__("_d", {})
15
- self._d = kwargs
16
-
17
- def __getattribute__(self, item):
18
- return super(RelikReaderSample, self).__getattribute__(item)
19
-
20
- def __getattr__(self, item):
21
- if item.startswith("__") and item.endswith("__"):
22
- # this is likely some python library-specific variable (such as __deepcopy__ for copy)
23
- # better follow standard behavior here
24
- raise AttributeError(item)
25
- elif item in self._d:
26
- return self._d[item]
27
- else:
28
- return None
29
-
30
- def __setattr__(self, key, value):
31
- if key in self._d:
32
- self._d[key] = value
33
- else:
34
- super().__setattr__(key, value)
35
-
36
-
37
- activation2functions = {
38
- "relu": torch.nn.ReLU(),
39
- "gelu": GELUActivation(),
40
- "gelu_10": ClippedGELUActivation(-10, 10),
41
- }
42
-
43
-
44
- class PoolerEndLogitsBi(PoolerEndLogits):
45
- def __init__(self, config: PretrainedConfig):
46
- super().__init__(config)
47
- self.dense_1 = torch.nn.Linear(config.hidden_size, 2)
48
-
49
- def forward(
50
- self,
51
- hidden_states: torch.FloatTensor,
52
- start_states: Optional[torch.FloatTensor] = None,
53
- start_positions: Optional[torch.LongTensor] = None,
54
- p_mask: Optional[torch.FloatTensor] = None,
55
- ) -> torch.FloatTensor:
56
- if p_mask is not None:
57
- p_mask = p_mask.unsqueeze(-1)
58
- logits = super().forward(
59
- hidden_states,
60
- start_states,
61
- start_positions,
62
- p_mask,
63
- )
64
- return logits
65
-
66
-
67
- class RelikReaderSpanModel(PreTrainedModel):
68
- config_class = RelikReaderConfig
69
-
70
- def __init__(self, config: RelikReaderConfig, *args, **kwargs):
71
- super().__init__(config)
72
- # Transformer model declaration
73
- self.config = config
74
- self.transformer_model = (
75
- AutoModel.from_pretrained(self.config.transformer_model)
76
- if self.config.num_layers is None
77
- else AutoModel.from_pretrained(
78
- self.config.transformer_model, num_hidden_layers=self.config.num_layers
79
- )
80
- )
81
- self.transformer_model.resize_token_embeddings(
82
- self.transformer_model.config.vocab_size
83
- + self.config.additional_special_symbols
84
- )
85
-
86
- self.activation = self.config.activation
87
- self.linears_hidden_size = self.config.linears_hidden_size
88
- self.use_last_k_layers = self.config.use_last_k_layers
89
-
90
- # named entity detection layers
91
- self.ned_start_classifier = self._get_projection_layer(
92
- self.activation, last_hidden=2, layer_norm=False
93
- )
94
- self.ned_end_classifier = PoolerEndLogits(self.transformer_model.config)
95
-
96
- # END entity disambiguation layer
97
- self.ed_start_projector = self._get_projection_layer(self.activation)
98
- self.ed_end_projector = self._get_projection_layer(self.activation)
99
-
100
- self.training = self.config.training
101
-
102
- # criterion
103
- self.criterion = torch.nn.CrossEntropyLoss()
104
-
105
- def _get_projection_layer(
106
- self,
107
- activation: str,
108
- last_hidden: Optional[int] = None,
109
- input_hidden=None,
110
- layer_norm: bool = True,
111
- ) -> torch.nn.Sequential:
112
- head_components = [
113
- torch.nn.Dropout(0.1),
114
- torch.nn.Linear(
115
- self.transformer_model.config.hidden_size * self.use_last_k_layers
116
- if input_hidden is None
117
- else input_hidden,
118
- self.linears_hidden_size,
119
- ),
120
- activation2functions[activation],
121
- torch.nn.Dropout(0.1),
122
- torch.nn.Linear(
123
- self.linears_hidden_size,
124
- self.linears_hidden_size if last_hidden is None else last_hidden,
125
- ),
126
- ]
127
-
128
- if layer_norm:
129
- head_components.append(
130
- torch.nn.LayerNorm(
131
- self.linears_hidden_size if last_hidden is None else last_hidden,
132
- self.transformer_model.config.layer_norm_eps,
133
- )
134
- )
135
-
136
- return torch.nn.Sequential(*head_components)
137
-
138
- def _mask_logits(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
139
- mask = mask.unsqueeze(-1)
140
- if next(self.parameters()).dtype == torch.float16:
141
- logits = logits * (1 - mask) - 65500 * mask
142
- else:
143
- logits = logits * (1 - mask) - 1e30 * mask
144
- return logits
145
-
146
- def _get_model_features(
147
- self,
148
- input_ids: torch.Tensor,
149
- attention_mask: torch.Tensor,
150
- token_type_ids: Optional[torch.Tensor],
151
- ):
152
- model_input = {
153
- "input_ids": input_ids,
154
- "attention_mask": attention_mask,
155
- "output_hidden_states": self.use_last_k_layers > 1,
156
- }
157
-
158
- if token_type_ids is not None:
159
- model_input["token_type_ids"] = token_type_ids
160
-
161
- model_output = self.transformer_model(**model_input)
162
-
163
- if self.use_last_k_layers > 1:
164
- model_features = torch.cat(
165
- model_output[1][-self.use_last_k_layers :], dim=-1
166
- )
167
- else:
168
- model_features = model_output[0]
169
-
170
- return model_features
171
-
172
- def compute_ned_end_logits(
173
- self,
174
- start_predictions,
175
- start_labels,
176
- model_features,
177
- prediction_mask,
178
- batch_size,
179
- ) -> Optional[torch.Tensor]:
180
- # todo: maybe when constraining on the spans,
181
- # we should not use a prediction_mask for the end tokens.
182
- # at least we should not during training imo
183
- start_positions = start_labels if self.training else start_predictions
184
- start_positions_indices = (
185
- torch.arange(start_positions.size(1), device=start_positions.device)
186
- .unsqueeze(0)
187
- .expand(batch_size, -1)[start_positions > 0]
188
- ).to(start_positions.device)
189
-
190
- if len(start_positions_indices) > 0:
191
- expanded_features = torch.cat(
192
- [
193
- model_features[i].unsqueeze(0).expand(x, -1, -1)
194
- for i, x in enumerate(torch.sum(start_positions > 0, dim=-1))
195
- if x > 0
196
- ],
197
- dim=0,
198
- ).to(start_positions_indices.device)
199
-
200
- expanded_prediction_mask = torch.cat(
201
- [
202
- prediction_mask[i].unsqueeze(0).expand(x, -1)
203
- for i, x in enumerate(torch.sum(start_positions > 0, dim=-1))
204
- if x > 0
205
- ],
206
- dim=0,
207
- ).to(expanded_features.device)
208
-
209
- end_logits = self.ned_end_classifier(
210
- hidden_states=expanded_features,
211
- start_positions=start_positions_indices,
212
- p_mask=expanded_prediction_mask,
213
- )
214
-
215
- return end_logits
216
-
217
- return None
218
-
219
- def compute_classification_logits(
220
- self,
221
- model_features,
222
- special_symbols_mask,
223
- prediction_mask,
224
- batch_size,
225
- start_positions=None,
226
- end_positions=None,
227
- ) -> torch.Tensor:
228
- if start_positions is None or end_positions is None:
229
- start_positions = torch.zeros_like(prediction_mask)
230
- end_positions = torch.zeros_like(prediction_mask)
231
-
232
- model_start_features = self.ed_start_projector(model_features)
233
- model_end_features = self.ed_end_projector(model_features)
234
- model_end_features[start_positions > 0] = model_end_features[end_positions > 0]
235
-
236
- model_ed_features = torch.cat(
237
- [model_start_features, model_end_features], dim=-1
238
- )
239
-
240
- # computing ed features
241
- classes_representations = torch.sum(special_symbols_mask, dim=1)[0].item()
242
- special_symbols_representation = model_ed_features[special_symbols_mask].view(
243
- batch_size, classes_representations, -1
244
- )
245
-
246
- logits = torch.bmm(
247
- model_ed_features,
248
- torch.permute(special_symbols_representation, (0, 2, 1)),
249
- )
250
-
251
- logits = self._mask_logits(logits, prediction_mask)
252
-
253
- return logits
254
-
255
- def forward(
256
- self,
257
- input_ids: torch.Tensor,
258
- attention_mask: torch.Tensor,
259
- token_type_ids: Optional[torch.Tensor] = None,
260
- prediction_mask: Optional[torch.Tensor] = None,
261
- special_symbols_mask: Optional[torch.Tensor] = None,
262
- start_labels: Optional[torch.Tensor] = None,
263
- end_labels: Optional[torch.Tensor] = None,
264
- use_predefined_spans: bool = False,
265
- *args,
266
- **kwargs,
267
- ) -> Dict[str, Any]:
268
- batch_size, seq_len = input_ids.shape
269
-
270
- model_features = self._get_model_features(
271
- input_ids, attention_mask, token_type_ids
272
- )
273
-
274
- ned_start_labels = None
275
-
276
- # named entity detection if required
277
- if use_predefined_spans: # no need to compute spans
278
- ned_start_logits, ned_start_probabilities, ned_start_predictions = (
279
- None,
280
- None,
281
- torch.clone(start_labels)
282
- if start_labels is not None
283
- else torch.zeros_like(input_ids),
284
- )
285
- ned_end_logits, ned_end_probabilities, ned_end_predictions = (
286
- None,
287
- None,
288
- torch.clone(end_labels)
289
- if end_labels is not None
290
- else torch.zeros_like(input_ids),
291
- )
292
-
293
- ned_start_predictions[ned_start_predictions > 0] = 1
294
- ned_end_predictions[ned_end_predictions > 0] = 1
295
-
296
- else: # compute spans
297
- # start boundary prediction
298
- ned_start_logits = self.ned_start_classifier(model_features)
299
- ned_start_logits = self._mask_logits(ned_start_logits, prediction_mask)
300
- ned_start_probabilities = torch.softmax(ned_start_logits, dim=-1)
301
- ned_start_predictions = ned_start_probabilities.argmax(dim=-1)
302
-
303
- # end boundary prediction
304
- ned_start_labels = (
305
- torch.zeros_like(start_labels) if start_labels is not None else None
306
- )
307
-
308
- if ned_start_labels is not None:
309
- ned_start_labels[start_labels == -100] = -100
310
- ned_start_labels[start_labels > 0] = 1
311
-
312
- ned_end_logits = self.compute_ned_end_logits(
313
- ned_start_predictions,
314
- ned_start_labels,
315
- model_features,
316
- prediction_mask,
317
- batch_size,
318
- )
319
-
320
- if ned_end_logits is not None:
321
- ned_end_probabilities = torch.softmax(ned_end_logits, dim=-1)
322
- ned_end_predictions = torch.argmax(ned_end_probabilities, dim=-1)
323
- else:
324
- ned_end_logits, ned_end_probabilities = None, None
325
- ned_end_predictions = ned_start_predictions.new_zeros(batch_size)
326
-
327
- # flattening end predictions
328
- # (flattening can happen only if the
329
- # end boundaries were not predicted using the gold labels)
330
- if not self.training:
331
- flattened_end_predictions = torch.clone(ned_start_predictions)
332
- flattened_end_predictions[flattened_end_predictions > 0] = 0
333
-
334
- batch_start_predictions = list()
335
- for elem_idx in range(batch_size):
336
- batch_start_predictions.append(
337
- torch.where(ned_start_predictions[elem_idx] > 0)[0].tolist()
338
- )
339
-
340
- # check that the total number of start predictions
341
- # is equal to the end predictions
342
- total_start_predictions = sum(map(len, batch_start_predictions))
343
- total_end_predictions = len(ned_end_predictions)
344
- assert (
345
- total_start_predictions == 0
346
- or total_start_predictions == total_end_predictions
347
- ), (
348
- f"Total number of start predictions = {total_start_predictions}. "
349
- f"Total number of end predictions = {total_end_predictions}"
350
- )
351
-
352
- curr_end_pred_num = 0
353
- for elem_idx, bsp in enumerate(batch_start_predictions):
354
- for sp in bsp:
355
- ep = ned_end_predictions[curr_end_pred_num].item()
356
- if ep < sp:
357
- ep = sp
358
-
359
- # if we already set this span throw it (no overlap)
360
- if flattened_end_predictions[elem_idx, ep] == 1:
361
- ned_start_predictions[elem_idx, sp] = 0
362
- else:
363
- flattened_end_predictions[elem_idx, ep] = 1
364
-
365
- curr_end_pred_num += 1
366
-
367
- ned_end_predictions = flattened_end_predictions
368
-
369
- start_position, end_position = (
370
- (start_labels, end_labels)
371
- if self.training
372
- else (ned_start_predictions, ned_end_predictions)
373
- )
374
-
375
- # Entity disambiguation
376
- ed_logits = self.compute_classification_logits(
377
- model_features,
378
- special_symbols_mask,
379
- prediction_mask,
380
- batch_size,
381
- start_position,
382
- end_position,
383
- )
384
- ed_probabilities = torch.softmax(ed_logits, dim=-1)
385
- ed_predictions = torch.argmax(ed_probabilities, dim=-1)
386
-
387
- # output build
388
- output_dict = dict(
389
- batch_size=batch_size,
390
- ned_start_logits=ned_start_logits,
391
- ned_start_probabilities=ned_start_probabilities,
392
- ned_start_predictions=ned_start_predictions,
393
- ned_end_logits=ned_end_logits,
394
- ned_end_probabilities=ned_end_probabilities,
395
- ned_end_predictions=ned_end_predictions,
396
- ed_logits=ed_logits,
397
- ed_probabilities=ed_probabilities,
398
- ed_predictions=ed_predictions,
399
- )
400
-
401
- # compute loss if labels
402
- if start_labels is not None and end_labels is not None and self.training:
403
- # named entity detection loss
404
-
405
- # start
406
- if ned_start_logits is not None:
407
- ned_start_loss = self.criterion(
408
- ned_start_logits.view(-1, ned_start_logits.shape[-1]),
409
- ned_start_labels.view(-1),
410
- )
411
- else:
412
- ned_start_loss = 0
413
-
414
- # end
415
- if ned_end_logits is not None:
416
- ned_end_labels = torch.zeros_like(end_labels)
417
- ned_end_labels[end_labels == -100] = -100
418
- ned_end_labels[end_labels > 0] = 1
419
-
420
- ned_end_loss = self.criterion(
421
- ned_end_logits,
422
- (
423
- torch.arange(
424
- ned_end_labels.size(1), device=ned_end_labels.device
425
- )
426
- .unsqueeze(0)
427
- .expand(batch_size, -1)[ned_end_labels > 0]
428
- ).to(ned_end_labels.device),
429
- )
430
-
431
- else:
432
- ned_end_loss = 0
433
-
434
- # entity disambiguation loss
435
- start_labels[ned_start_labels != 1] = -100
436
- ed_labels = torch.clone(start_labels)
437
- ed_labels[end_labels > 0] = end_labels[end_labels > 0]
438
- ed_loss = self.criterion(
439
- ed_logits.view(-1, ed_logits.shape[-1]),
440
- ed_labels.view(-1),
441
- )
442
-
443
- output_dict["ned_start_loss"] = ned_start_loss
444
- output_dict["ned_end_loss"] = ned_end_loss
445
- output_dict["ed_loss"] = ed_loss
446
-
447
- output_dict["loss"] = ned_start_loss + ned_end_loss + ed_loss
448
-
449
- return output_dict
450
-
451
-
452
- class RelikReaderREModel(PreTrainedModel):
453
- config_class = RelikReaderConfig
454
-
455
- def __init__(self, config, *args, **kwargs):
456
- super().__init__(config)
457
- # Transformer model declaration
458
- # self.transformer_model_name = transformer_model
459
- self.config = config
460
- self.transformer_model = (
461
- AutoModel.from_pretrained(config.transformer_model)
462
- if config.num_layers is None
463
- else AutoModel.from_pretrained(
464
- config.transformer_model, num_hidden_layers=config.num_layers
465
- )
466
- )
467
- self.transformer_model.resize_token_embeddings(
468
- self.transformer_model.config.vocab_size + config.additional_special_symbols
469
- )
470
-
471
- # named entity detection layers
472
- self.ned_start_classifier = self._get_projection_layer(
473
- config.activation, last_hidden=2, layer_norm=False
474
- )
475
-
476
- self.ned_end_classifier = PoolerEndLogitsBi(self.transformer_model.config)
477
-
478
- self.entity_type_loss = (
479
- config.entity_type_loss if hasattr(config, "entity_type_loss") else False
480
- )
481
- self.relation_disambiguation_loss = (
482
- config.relation_disambiguation_loss
483
- if hasattr(config, "relation_disambiguation_loss")
484
- else False
485
- )
486
-
487
- input_hidden_ents = 2 * self.transformer_model.config.hidden_size
488
-
489
- self.re_subject_projector = self._get_projection_layer(
490
- config.activation, input_hidden=input_hidden_ents
491
- )
492
- self.re_object_projector = self._get_projection_layer(
493
- config.activation, input_hidden=input_hidden_ents
494
- )
495
- self.re_relation_projector = self._get_projection_layer(config.activation)
496
-
497
- if self.entity_type_loss or self.relation_disambiguation_loss:
498
- self.re_entities_projector = self._get_projection_layer(
499
- config.activation,
500
- input_hidden=2 * self.transformer_model.config.hidden_size,
501
- )
502
- self.re_definition_projector = self._get_projection_layer(
503
- config.activation,
504
- )
505
-
506
- self.re_classifier = self._get_projection_layer(
507
- config.activation,
508
- input_hidden=config.linears_hidden_size,
509
- last_hidden=2,
510
- layer_norm=False,
511
- )
512
-
513
- if self.entity_type_loss or self.relation_disambiguation_loss:
514
- self.re_ed_classifier = self._get_projection_layer(
515
- config.activation,
516
- input_hidden=config.linears_hidden_size,
517
- last_hidden=2,
518
- layer_norm=False,
519
- )
520
-
521
- self.training = config.training
522
-
523
- # criterion
524
- self.criterion = torch.nn.CrossEntropyLoss()
525
-
526
- def _get_projection_layer(
527
- self,
528
- activation: str,
529
- last_hidden: Optional[int] = None,
530
- input_hidden=None,
531
- layer_norm: bool = True,
532
- ) -> torch.nn.Sequential:
533
- head_components = [
534
- torch.nn.Dropout(0.1),
535
- torch.nn.Linear(
536
- self.transformer_model.config.hidden_size
537
- * self.config.use_last_k_layers
538
- if input_hidden is None
539
- else input_hidden,
540
- self.config.linears_hidden_size,
541
- ),
542
- activation2functions[activation],
543
- torch.nn.Dropout(0.1),
544
- torch.nn.Linear(
545
- self.config.linears_hidden_size,
546
- self.config.linears_hidden_size if last_hidden is None else last_hidden,
547
- ),
548
- ]
549
-
550
- if layer_norm:
551
- head_components.append(
552
- torch.nn.LayerNorm(
553
- self.config.linears_hidden_size
554
- if last_hidden is None
555
- else last_hidden,
556
- self.transformer_model.config.layer_norm_eps,
557
- )
558
- )
559
-
560
- return torch.nn.Sequential(*head_components)
561
-
562
- def _mask_logits(self, logits: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
563
- mask = mask.unsqueeze(-1)
564
- if next(self.parameters()).dtype == torch.float16:
565
- logits = logits * (1 - mask) - 65500 * mask
566
- else:
567
- logits = logits * (1 - mask) - 1e30 * mask
568
- return logits
569
-
570
- def _get_model_features(
571
- self,
572
- input_ids: torch.Tensor,
573
- attention_mask: torch.Tensor,
574
- token_type_ids: Optional[torch.Tensor],
575
- ):
576
- model_input = {
577
- "input_ids": input_ids,
578
- "attention_mask": attention_mask,
579
- "output_hidden_states": self.config.use_last_k_layers > 1,
580
- }
581
-
582
- if token_type_ids is not None:
583
- model_input["token_type_ids"] = token_type_ids
584
-
585
- model_output = self.transformer_model(**model_input)
586
-
587
- if self.config.use_last_k_layers > 1:
588
- model_features = torch.cat(
589
- model_output[1][-self.config.use_last_k_layers :], dim=-1
590
- )
591
- else:
592
- model_features = model_output[0]
593
-
594
- return model_features
595
-
596
- def compute_ned_end_logits(
597
- self,
598
- start_predictions,
599
- start_labels,
600
- model_features,
601
- prediction_mask,
602
- batch_size,
603
- ) -> Optional[torch.Tensor]:
604
- # todo: maybe when constraining on the spans,
605
- # we should not use a prediction_mask for the end tokens.
606
- # at least we should not during training imo
607
- start_positions = start_labels if self.training else start_predictions
608
- start_positions_indices = (
609
- torch.arange(start_positions.size(1), device=start_positions.device)
610
- .unsqueeze(0)
611
- .expand(batch_size, -1)[start_positions > 0]
612
- ).to(start_positions.device)
613
-
614
- if len(start_positions_indices) > 0:
615
- expanded_features = torch.cat(
616
- [
617
- model_features[i].unsqueeze(0).expand(x, -1, -1)
618
- for i, x in enumerate(torch.sum(start_positions > 0, dim=-1))
619
- if x > 0
620
- ],
621
- dim=0,
622
- ).to(start_positions_indices.device)
623
-
624
- expanded_prediction_mask = torch.cat(
625
- [
626
- prediction_mask[i].unsqueeze(0).expand(x, -1)
627
- for i, x in enumerate(torch.sum(start_positions > 0, dim=-1))
628
- if x > 0
629
- ],
630
- dim=0,
631
- ).to(expanded_features.device)
632
-
633
- # mask all tokens before start_positions_indices ie, mask all tokens with
634
- # indices < start_positions_indices with 1, ie. [range(x) for x in start_positions_indices]
635
- expanded_prediction_mask = torch.stack(
636
- [
637
- torch.cat(
638
- [
639
- torch.ones(x, device=expanded_features.device),
640
- expanded_prediction_mask[i, x:],
641
- ]
642
- )
643
- for i, x in enumerate(start_positions_indices)
644
- if x > 0
645
- ],
646
- dim=0,
647
- ).to(expanded_features.device)
648
-
649
- end_logits = self.ned_end_classifier(
650
- hidden_states=expanded_features,
651
- start_positions=start_positions_indices,
652
- p_mask=expanded_prediction_mask,
653
- )
654
-
655
- return end_logits
656
-
657
- return None
658
-
659
- def compute_relation_logits(
660
- self,
661
- model_entity_features,
662
- special_symbols_features,
663
- ) -> torch.Tensor:
664
- model_subject_features = self.re_subject_projector(model_entity_features)
665
- model_object_features = self.re_object_projector(model_entity_features)
666
- special_symbols_start_representation = self.re_relation_projector(
667
- special_symbols_features
668
- )
669
- re_logits = torch.einsum(
670
- "bse,bde,bfe->bsdfe",
671
- model_subject_features,
672
- model_object_features,
673
- special_symbols_start_representation,
674
- )
675
- re_logits = self.re_classifier(re_logits)
676
-
677
- return re_logits
678
-
679
- def compute_entity_logits(
680
- self,
681
- model_entity_features,
682
- special_symbols_features,
683
- ) -> torch.Tensor:
684
- model_ed_features = self.re_entities_projector(model_entity_features)
685
- special_symbols_ed_representation = self.re_definition_projector(
686
- special_symbols_features
687
- )
688
- logits = torch.einsum(
689
- "bce,bde->bcde",
690
- model_ed_features,
691
- special_symbols_ed_representation,
692
- )
693
- logits = self.re_ed_classifier(logits)
694
- start_logits = self._mask_logits(
695
- logits,
696
- (model_entity_features == -100)
697
- .all(2)
698
- .long()
699
- .unsqueeze(2)
700
- .repeat(1, 1, torch.sum(model_entity_features, dim=1)[0].item()),
701
- )
702
-
703
- return logits
704
-
705
- def compute_loss(self, logits, labels, mask=None):
706
- logits = logits.view(-1, logits.shape[-1])
707
- labels = labels.view(-1).long()
708
- if mask is not None:
709
- return self.criterion(logits[mask], labels[mask])
710
- return self.criterion(logits, labels)
711
-
712
- def compute_ned_end_loss(self, ned_end_logits, end_labels):
713
- if ned_end_logits is None:
714
- return 0
715
- ned_end_labels = torch.zeros_like(end_labels)
716
- ned_end_labels[end_labels == -100] = -100
717
- ned_end_labels[end_labels > 0] = 1
718
- return self.compute_loss(ned_end_logits, ned_end_labels)
719
-
720
- def compute_ned_type_loss(
721
- self,
722
- disambiguation_labels,
723
- re_ned_entities_logits,
724
- ned_type_logits,
725
- re_entities_logits,
726
- entity_types,
727
- ):
728
- if self.entity_type_loss and self.relation_disambiguation_loss:
729
- return self.compute_loss(disambiguation_labels, re_ned_entities_logits)
730
- if self.entity_type_loss:
731
- return self.compute_loss(
732
- disambiguation_labels[:, :, :entity_types], ned_type_logits
733
- )
734
- if self.relation_disambiguation_loss:
735
- return self.compute_loss(disambiguation_labels, re_entities_logits)
736
- return 0
737
-
738
- def compute_relation_loss(self, relation_labels, re_logits):
739
- return self.compute_loss(
740
- re_logits, relation_labels, relation_labels.view(-1) != -100
741
- )
742
-
743
- def forward(
744
- self,
745
- input_ids: torch.Tensor,
746
- attention_mask: torch.Tensor,
747
- token_type_ids: torch.Tensor,
748
- prediction_mask: Optional[torch.Tensor] = None,
749
- special_symbols_mask: Optional[torch.Tensor] = None,
750
- special_symbols_mask_entities: Optional[torch.Tensor] = None,
751
- start_labels: Optional[torch.Tensor] = None,
752
- end_labels: Optional[torch.Tensor] = None,
753
- disambiguation_labels: Optional[torch.Tensor] = None,
754
- relation_labels: Optional[torch.Tensor] = None,
755
- is_validation: bool = False,
756
- is_prediction: bool = False,
757
- *args,
758
- **kwargs,
759
- ) -> Dict[str, Any]:
760
- batch_size = input_ids.shape[0]
761
-
762
- model_features = self._get_model_features(
763
- input_ids, attention_mask, token_type_ids
764
- )
765
-
766
- # named entity detection
767
- if is_prediction and start_labels is not None:
768
- ned_start_logits, ned_start_probabilities, ned_start_predictions = (
769
- None,
770
- None,
771
- torch.zeros_like(start_labels),
772
- )
773
- ned_end_logits, ned_end_probabilities, ned_end_predictions = (
774
- None,
775
- None,
776
- torch.zeros_like(end_labels),
777
- )
778
-
779
- ned_start_predictions[start_labels > 0] = 1
780
- ned_end_predictions[end_labels > 0] = 1
781
- ned_end_predictions = ned_end_predictions[~(end_labels == -100).all(2)]
782
- else:
783
- # start boundary prediction
784
- ned_start_logits = self.ned_start_classifier(model_features)
785
- ned_start_logits = self._mask_logits(
786
- ned_start_logits, prediction_mask
787
- ) # why?
788
- ned_start_probabilities = torch.softmax(ned_start_logits, dim=-1)
789
- ned_start_predictions = ned_start_probabilities.argmax(dim=-1)
790
-
791
- # end boundary prediction
792
- ned_start_labels = (
793
- torch.zeros_like(start_labels) if start_labels is not None else None
794
- )
795
-
796
- # start_labels contain entity id at their position, we just need 1 for start of entity
797
- if ned_start_labels is not None:
798
- ned_start_labels[start_labels > 0] = 1
799
-
800
- # compute end logits only if there are any start predictions.
801
- # For each start prediction, n end predictions are made
802
- ned_end_logits = self.compute_ned_end_logits(
803
- ned_start_predictions,
804
- ned_start_labels,
805
- model_features,
806
- prediction_mask,
807
- batch_size,
808
- )
809
- # For each start prediction, n end predictions are made based on
810
- # binary classification ie. argmax at each position.
811
- ned_end_probabilities = torch.softmax(ned_end_logits, dim=-1)
812
- ned_end_predictions = ned_end_probabilities.argmax(dim=-1)
813
- if is_prediction or is_validation:
814
- end_preds_count = ned_end_predictions.sum(1)
815
- # If there are no end predictions for a start prediction, remove the start prediction
816
- ned_start_predictions[ned_start_predictions == 1] = (
817
- end_preds_count != 0
818
- ).long()
819
- ned_end_predictions = ned_end_predictions[end_preds_count != 0]
820
-
821
- if end_labels is not None:
822
- end_labels = end_labels[~(end_labels == -100).all(2)]
823
-
824
- start_position, end_position = (
825
- (start_labels, end_labels)
826
- if (not is_prediction and not is_validation)
827
- else (ned_start_predictions, ned_end_predictions)
828
- )
829
-
830
- start_counts = (start_position > 0).sum(1)
831
- ned_end_predictions = ned_end_predictions.split(start_counts.tolist())
832
-
833
- # We can only predict relations if we have start and end predictions
834
- if (end_position > 0).sum() > 0:
835
- ends_count = (end_position > 0).sum(1)
836
- model_subject_features = torch.cat(
837
- [
838
- torch.repeat_interleave(
839
- model_features[start_position > 0], ends_count, dim=0
840
- ), # start position features
841
- torch.repeat_interleave(model_features, start_counts, dim=0)[
842
- end_position > 0
843
- ], # end position features
844
- ],
845
- dim=-1,
846
- )
847
- ents_count = torch.nn.utils.rnn.pad_sequence(
848
- torch.split(ends_count, start_counts.tolist()),
849
- batch_first=True,
850
- padding_value=0,
851
- ).sum(1)
852
- model_subject_features = torch.nn.utils.rnn.pad_sequence(
853
- torch.split(model_subject_features, ents_count.tolist()),
854
- batch_first=True,
855
- padding_value=-100,
856
- )
857
-
858
- if is_validation or is_prediction:
859
- model_subject_features = model_subject_features[:, :30, :]
860
-
861
- # entity disambiguation. Here relation_disambiguation_loss would only be useful to
862
- # reduce the number of candidate relations for the next step, but currently unused.
863
- if self.entity_type_loss or self.relation_disambiguation_loss:
864
- (re_ned_entities_logits) = self.compute_entity_logits(
865
- model_subject_features,
866
- model_features[
867
- special_symbols_mask | special_symbols_mask_entities
868
- ].view(batch_size, -1, model_features.shape[-1]),
869
- )
870
- entity_types = torch.sum(special_symbols_mask_entities, dim=1)[0].item()
871
- ned_type_logits = re_ned_entities_logits[:, :, :entity_types]
872
- re_entities_logits = re_ned_entities_logits[:, :, entity_types:]
873
-
874
- if self.entity_type_loss:
875
- ned_type_probabilities = torch.softmax(ned_type_logits, dim=-1)
876
- ned_type_predictions = ned_type_probabilities.argmax(dim=-1)
877
- ned_type_predictions = ned_type_predictions.argmax(dim=-1)
878
-
879
- re_entities_probabilities = torch.softmax(re_entities_logits, dim=-1)
880
- re_entities_predictions = re_entities_probabilities.argmax(dim=-1)
881
- else:
882
- (
883
- ned_type_logits,
884
- ned_type_probabilities,
885
- re_entities_logits,
886
- re_entities_probabilities,
887
- ) = (None, None, None, None)
888
- ned_type_predictions, re_entities_predictions = (
889
- torch.zeros([batch_size, 1], dtype=torch.long).to(input_ids.device),
890
- torch.zeros([batch_size, 1], dtype=torch.long).to(input_ids.device),
891
- )
892
-
893
- # Compute relation logits
894
- re_logits = self.compute_relation_logits(
895
- model_subject_features,
896
- model_features[special_symbols_mask].view(
897
- batch_size, -1, model_features.shape[-1]
898
- ),
899
- )
900
-
901
- re_probabilities = torch.softmax(re_logits, dim=-1)
902
- # we set a threshold instead of argmax in case it needs to be tweaked
903
- re_predictions = re_probabilities[:, :, :, :, 1] > 0.5
904
- # re_predictions = re_probabilities.argmax(dim=-1)
905
- re_probabilities = re_probabilities[:, :, :, :, 1]
906
-
907
- else:
908
- (
909
- ned_type_logits,
910
- ned_type_probabilities,
911
- re_entities_logits,
912
- re_entities_probabilities,
913
- ) = (None, None, None, None)
914
- ned_type_predictions, re_entities_predictions = (
915
- torch.zeros([batch_size, 1], dtype=torch.long).to(input_ids.device),
916
- torch.zeros([batch_size, 1], dtype=torch.long).to(input_ids.device),
917
- )
918
- re_logits, re_probabilities, re_predictions = (
919
- torch.zeros(
920
- [batch_size, 1, 1, special_symbols_mask.sum(1)[0]], dtype=torch.long
921
- ).to(input_ids.device),
922
- torch.zeros(
923
- [batch_size, 1, 1, special_symbols_mask.sum(1)[0]], dtype=torch.long
924
- ).to(input_ids.device),
925
- torch.zeros(
926
- [batch_size, 1, 1, special_symbols_mask.sum(1)[0]], dtype=torch.long
927
- ).to(input_ids.device),
928
- )
929
-
930
- # output build
931
- output_dict = dict(
932
- batch_size=batch_size,
933
- ned_start_logits=ned_start_logits,
934
- ned_start_probabilities=ned_start_probabilities,
935
- ned_start_predictions=ned_start_predictions,
936
- ned_end_logits=ned_end_logits,
937
- ned_end_probabilities=ned_end_probabilities,
938
- ned_end_predictions=ned_end_predictions,
939
- ned_type_logits=ned_type_logits,
940
- ned_type_probabilities=ned_type_probabilities,
941
- ned_type_predictions=ned_type_predictions,
942
- re_entities_logits=re_entities_logits,
943
- re_entities_probabilities=re_entities_probabilities,
944
- re_entities_predictions=re_entities_predictions,
945
- re_logits=re_logits,
946
- re_probabilities=re_probabilities,
947
- re_predictions=re_predictions,
948
- )
949
-
950
- if (
951
- start_labels is not None
952
- and end_labels is not None
953
- and relation_labels is not None
954
- ):
955
- ned_start_loss = self.compute_loss(ned_start_logits, ned_start_labels)
956
- ned_end_loss = self.compute_ned_end_loss(ned_end_logits, end_labels)
957
- if self.entity_type_loss or self.relation_disambiguation_loss:
958
- ned_type_loss = self.compute_ned_type_loss(
959
- disambiguation_labels,
960
- re_ned_entities_logits,
961
- ned_type_logits,
962
- re_entities_logits,
963
- entity_types,
964
- )
965
- relation_loss = self.compute_relation_loss(relation_labels, re_logits)
966
- # compute loss. We can skip the relation loss if we are in the first epochs (optional)
967
- if self.entity_type_loss or self.relation_disambiguation_loss:
968
- output_dict["loss"] = (
969
- ned_start_loss + ned_end_loss + relation_loss + ned_type_loss
970
- ) / 4
971
- output_dict["ned_type_loss"] = ned_type_loss
972
- else:
973
- output_dict["loss"] = (
974
- ned_start_loss + ned_end_loss + relation_loss
975
- ) / 3
976
-
977
- output_dict["ned_start_loss"] = ned_start_loss
978
- output_dict["ned_end_loss"] = ned_end_loss
979
- output_dict["re_loss"] = relation_loss
980
-
981
- return output_dict
 
relik/reader/pytorch_modules/optim/__init__.py DELETED
@@ -1,6 +0,0 @@
1
- from relik.reader.pytorch_modules.optim.adamw_with_warmup import (
2
- AdamWWithWarmupOptimizer,
3
- )
4
- from relik.reader.pytorch_modules.optim.layer_wise_lr_decay import (
5
- LayerWiseLRDecayOptimizer,
6
- )
 
relik/reader/pytorch_modules/optim/adamw_with_warmup.py DELETED
@@ -1,66 +0,0 @@
1
- from typing import List
2
-
3
- import torch
4
- import transformers
5
- from torch.optim import AdamW
6
-
7
-
8
- class AdamWWithWarmupOptimizer:
9
- def __init__(
10
- self,
11
- lr: float,
12
- warmup_steps: int,
13
- total_steps: int,
14
- weight_decay: float,
15
- no_decay_params: List[str],
16
- ):
17
- self.lr = lr
18
- self.warmup_steps = warmup_steps
19
- self.total_steps = total_steps
20
- self.weight_decay = weight_decay
21
- self.no_decay_params = no_decay_params
22
-
23
- def group_params(self, module: torch.nn.Module) -> list:
24
- if self.no_decay_params is not None:
25
- optimizer_grouped_parameters = [
26
- {
27
- "params": [
28
- p
29
- for n, p in module.named_parameters()
30
- if not any(nd in n for nd in self.no_decay_params)
31
- ],
32
- "weight_decay": self.weight_decay,
33
- },
34
- {
35
- "params": [
36
- p
37
- for n, p in module.named_parameters()
38
- if any(nd in n for nd in self.no_decay_params)
39
- ],
40
- "weight_decay": 0.0,
41
- },
42
- ]
43
-
44
- else:
45
- optimizer_grouped_parameters = [
46
- {"params": module.parameters(), "weight_decay": self.weight_decay}
47
- ]
48
-
49
- return optimizer_grouped_parameters
50
-
51
- def __call__(self, module: torch.nn.Module):
52
- optimizer_grouped_parameters = self.group_params(module)
53
- optimizer = AdamW(
54
- optimizer_grouped_parameters, lr=self.lr, weight_decay=self.weight_decay
55
- )
56
- scheduler = transformers.get_linear_schedule_with_warmup(
57
- optimizer, self.warmup_steps, self.total_steps
58
- )
59
- return {
60
- "optimizer": optimizer,
61
- "lr_scheduler": {
62
- "scheduler": scheduler,
63
- "interval": "step",
64
- "frequency": 1,
65
- },
66
- }
 
relik/reader/pytorch_modules/optim/layer_wise_lr_decay.py DELETED
@@ -1,104 +0,0 @@
1
- import collections
2
- from typing import List
3
-
4
- import torch
5
- import transformers
6
- from torch.optim import AdamW
7
-
8
-
9
- class LayerWiseLRDecayOptimizer:
10
- def __init__(
11
- self,
12
- lr: float,
13
- warmup_steps: int,
14
- total_steps: int,
15
- weight_decay: float,
16
- lr_decay: float,
17
- no_decay_params: List[str],
18
- total_reset: int,
19
- ):
20
- self.lr = lr
21
- self.warmup_steps = warmup_steps
22
- self.total_steps = total_steps
23
- self.weight_decay = weight_decay
24
- self.lr_decay = lr_decay
25
- self.no_decay_params = no_decay_params
26
- self.total_reset = total_reset
27
-
28
- def group_layers(self, module) -> dict:
29
- grouped_layers = collections.defaultdict(list)
30
- module_named_parameters = list(module.named_parameters())
31
- for ln, lp in module_named_parameters:
32
- if "embeddings" in ln:
33
- grouped_layers["embeddings"].append((ln, lp))
34
- elif "encoder.layer" in ln:
35
- layer_num = ln.split("transformer_model.encoder.layer.")[-1]
36
- layer_num = layer_num[0 : layer_num.index(".")]
37
- grouped_layers[layer_num].append((ln, lp))
38
- else:
39
- grouped_layers["head"].append((ln, lp))
40
-
41
- depth = len(grouped_layers) - 1
42
- final_dict = dict()
43
- for key, value in grouped_layers.items():
44
- if key == "head":
45
- final_dict[0] = value
46
- elif key == "embeddings":
47
- final_dict[depth] = value
48
- else:
49
- # -1 because layer number starts from zero
50
- final_dict[depth - int(key) - 1] = value
51
-
52
- assert len(module_named_parameters) == sum(
53
- len(v) for _, v in final_dict.items()
54
- )
55
-
56
- return final_dict
57
-
58
- def group_params(self, module) -> list:
59
- optimizer_grouped_params = []
60
- for inverse_depth, layer in self.group_layers(module).items():
61
- layer_lr = self.lr * (self.lr_decay**inverse_depth)
62
- layer_wd_params = {
63
- "params": [
64
- lp
65
- for ln, lp in layer
66
- if not any(nd in ln for nd in self.no_decay_params)
67
- ],
68
- "weight_decay": self.weight_decay,
69
- "lr": layer_lr,
70
- }
71
- layer_no_wd_params = {
72
- "params": [
73
- lp
74
- for ln, lp in layer
75
- if any(nd in ln for nd in self.no_decay_params)
76
- ],
77
- "weight_decay": 0,
78
- "lr": layer_lr,
79
- }
80
-
81
- if len(layer_wd_params) != 0:
82
- optimizer_grouped_params.append(layer_wd_params)
83
- if len(layer_no_wd_params) != 0:
84
- optimizer_grouped_params.append(layer_no_wd_params)
85
-
86
- return optimizer_grouped_params
87
-
88
- def __call__(self, module: torch.nn.Module):
89
- optimizer_grouped_parameters = self.group_params(module)
90
- optimizer = AdamW(optimizer_grouped_parameters, lr=self.lr)
91
- scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
92
- optimizer,
93
- self.warmup_steps,
94
- self.total_steps,
95
- num_cycles=self.total_reset,
96
- )
97
- return {
98
- "optimizer": optimizer,
99
- "lr_scheduler": {
100
- "scheduler": scheduler,
101
- "interval": "step",
102
- "frequency": 1,
103
- },
104
- }
 
relik/reader/pytorch_modules/span.py DELETED
@@ -1,367 +0,0 @@
1
- import collections
2
- import contextlib
3
- import logging
4
- from typing import Any, Dict, Iterator, List
5
-
6
- import torch
7
- import transformers as tr
8
- from lightning_fabric.utilities import move_data_to_device
9
- from torch.utils.data import DataLoader, IterableDataset
10
- from tqdm import tqdm
11
-
12
- from relik.common.log import get_console_logger, get_logger
13
- from relik.common.utils import get_callable_from_string
14
- from relik.reader.data.relik_reader_sample import RelikReaderSample
15
- from relik.reader.pytorch_modules.base import RelikReaderBase
16
- from relik.reader.utils.special_symbols import get_special_symbols
17
- from relik.retriever.pytorch_modules import PRECISION_MAP
18
-
19
- console_logger = get_console_logger()
20
- logger = get_logger(__name__, level=logging.INFO)
21
-
22
-
23
- class RelikReaderForSpanExtraction(RelikReaderBase):
24
- """
25
- A class for the RelikReader model for span extraction.
26
-
27
- Args:
28
- transformer_model (:obj:`str` or :obj:`transformers.PreTrainedModel` or :obj:`None`, `optional`):
29
- The transformer model to use. If `None`, the default model is used.
30
- additional_special_symbols (:obj:`int`, `optional`, defaults to 0):
31
- The number of additional special symbols to add to the tokenizer.
32
- num_layers (:obj:`int`, `optional`):
33
- The number of layers to use. If `None`, all layers are used.
34
- activation (:obj:`str`, `optional`, defaults to "gelu"):
35
- The activation function to use.
36
- linears_hidden_size (:obj:`int`, `optional`, defaults to 512):
37
- The hidden size of the linears.
38
- use_last_k_layers (:obj:`int`, `optional`, defaults to 1):
39
- The number of last layers to use.
40
- training (:obj:`bool`, `optional`, defaults to False):
41
- Whether the model is in training mode.
42
- device (:obj:`str` or :obj:`torch.device` or :obj:`None`, `optional`):
43
- The device to use. If `None`, the default device is used.
44
- tokenizer (:obj:`str` or :obj:`transformers.PreTrainedTokenizer` or :obj:`None`, `optional`):
45
- The tokenizer to use. If `None`, the default tokenizer is used.
46
- dataset (:obj:`IterableDataset` or :obj:`str` or :obj:`None`, `optional`):
47
- The dataset to use. If `None`, the default dataset is used.
48
- dataset_kwargs (:obj:`Dict[str, Any]` or :obj:`None`, `optional`):
49
- The keyword arguments to pass to the dataset class.
50
- default_reader_class (:obj:`str` or :obj:`transformers.PreTrainedModel` or :obj:`None`, `optional`):
51
- The default reader class to use. If `None`, the default reader class is used.
52
- **kwargs:
53
- Keyword arguments.
54
- """
55
-
56
- default_reader_class: str = (
57
- "relik.reader.pytorch_modules.hf.modeling_relik.RelikReaderSpanModel"
58
- )
59
- default_data_class: str = "relik.reader.data.relik_reader_data.RelikDataset"
60
-
61
- def __init__(
62
- self,
63
- transformer_model: str | tr.PreTrainedModel | None = None,
64
- additional_special_symbols: int = 0,
65
- num_layers: int | None = None,
66
- activation: str = "gelu",
67
- linears_hidden_size: int | None = 512,
68
- use_last_k_layers: int = 1,
69
- training: bool = False,
70
- device: str | torch.device | None = None,
71
- tokenizer: str | tr.PreTrainedTokenizer | None = None,
72
- dataset: IterableDataset | str | None = None,
73
- dataset_kwargs: Dict[str, Any] | None = None,
74
- default_reader_class: tr.PreTrainedModel | str | None = None,
75
- **kwargs,
76
- ):
77
- super().__init__(
78
- transformer_model=transformer_model,
79
- additional_special_symbols=additional_special_symbols,
80
- num_layers=num_layers,
81
- activation=activation,
82
- linears_hidden_size=linears_hidden_size,
83
- use_last_k_layers=use_last_k_layers,
84
- training=training,
85
- device=device,
86
- tokenizer=tokenizer,
87
- dataset=dataset,
88
- default_reader_class=default_reader_class,
89
- **kwargs,
90
- )
91
- # and instantiate the dataset class
92
- self.dataset = dataset
93
- if self.dataset is None:
94
- default_data_kwargs = dict(
95
- dataset_path=None,
96
- materialize_samples=False,
97
- transformer_model=self.tokenizer,
98
- special_symbols=get_special_symbols(
99
- self.relik_reader_model.config.additional_special_symbols
100
- ),
101
- for_inference=True,
102
- )
103
- # merge the default data kwargs with the ones passed to the model
104
- default_data_kwargs.update(dataset_kwargs or {})
105
- self.dataset = get_callable_from_string(self.default_data_class)(
106
- **default_data_kwargs
107
- )
108
-
109
- @torch.no_grad()
110
- @torch.inference_mode()
111
- def _read(
112
- self,
113
- samples: List[RelikReaderSample] | None = None,
114
- input_ids: torch.Tensor | None = None,
115
- attention_mask: torch.Tensor | None = None,
116
- token_type_ids: torch.Tensor | None = None,
117
- prediction_mask: torch.Tensor | None = None,
118
- special_symbols_mask: torch.Tensor | None = None,
119
- max_length: int = 1000,
120
- max_batch_size: int = 128,
121
- token_batch_size: int = 2048,
122
- precision: str = 32,
123
- annotation_type: str = "char",
124
- progress_bar: bool = False,
125
- *args: object,
126
- **kwargs: object,
127
- ) -> List[RelikReaderSample] | List[List[RelikReaderSample]]:
128
- """
129
- A wrapper around the forward method that returns the predicted labels for each sample.
130
-
131
- Args:
132
- samples (:obj:`List[RelikReaderSample]`, `optional`):
133
- The samples to read. If provided, `text` and `candidates` are ignored.
134
- input_ids (:obj:`torch.Tensor`, `optional`):
135
- The input ids of the text. If `samples` is provided, this is ignored.
136
- attention_mask (:obj:`torch.Tensor`, `optional`):
137
- The attention mask of the text. If `samples` is provided, this is ignored.
138
- token_type_ids (:obj:`torch.Tensor`, `optional`):
139
- The token type ids of the text. If `samples` is provided, this is ignored.
140
- prediction_mask (:obj:`torch.Tensor`, `optional`):
141
- The prediction mask of the text. If `samples` is provided, this is ignored.
142
- special_symbols_mask (:obj:`torch.Tensor`, `optional`):
143
- The special symbols mask of the text. If `samples` is provided, this is ignored.
144
- max_length (:obj:`int`, `optional`, defaults to 1000):
145
- The maximum length of the text.
146
- max_batch_size (:obj:`int`, `optional`, defaults to 128):
147
- The maximum batch size.
148
- token_batch_size (:obj:`int`, `optional`):
149
- The token batch size.
150
- progress_bar (:obj:`bool`, `optional`, defaults to False):
151
- Whether to show a progress bar.
152
- precision (:obj:`str`, `optional`, defaults to 32):
153
- The precision to use for the model.
154
- annotation_type (:obj:`str`, `optional`, defaults to "char"):
155
- The annotation type to use. It can be either "char", "token" or "word".
156
- *args:
157
- Positional arguments.
158
- **kwargs:
159
- Keyword arguments.
160
-
161
- Returns:
162
- :obj:`List[RelikReaderSample]` or :obj:`List[List[RelikReaderSample]]`:
163
- The predicted labels for each sample.
164
- """
165
-
166
- precision = precision or self.precision
167
- if samples is not None:
168
-
169
- def _read_iterator():
170
- def samples_it():
171
- for i, sample in enumerate(samples):
172
- assert sample._mixin_prediction_position is None
173
- sample._mixin_prediction_position = i
174
- yield sample
175
-
176
- next_prediction_position = 0
177
- position2predicted_sample = {}
178
-
179
- # instantiate dataset
180
- if self.dataset is None:
181
- raise ValueError(
182
- "You need to pass a dataset to the model in order to predict"
183
- )
184
- self.dataset.samples = samples_it()
185
- self.dataset.model_max_length = max_length
186
- self.dataset.tokens_per_batch = token_batch_size
187
- self.dataset.max_batch_size = max_batch_size
188
-
189
- # instantiate dataloader
190
- iterator = DataLoader(
191
- self.dataset, batch_size=None, num_workers=0, shuffle=False
192
- )
193
- if progress_bar:
194
- iterator = tqdm(iterator, desc="Predicting with RelikReader")
195
-
196
- # fucking autocast only wants pure strings like 'cpu' or 'cuda'
197
- # we need to convert the model device to that
198
- device_type_for_autocast = str(self.device).split(":")[0]
199
- # autocast doesn't work with CPU and stuff different from bfloat16
200
- autocast_mngr = (
201
- contextlib.nullcontext()
202
- if device_type_for_autocast == "cpu"
203
- else (
204
- torch.autocast(
205
- device_type=device_type_for_autocast,
206
- dtype=PRECISION_MAP[precision],
207
- )
208
- )
209
- )
210
-
211
- with autocast_mngr:
212
- for batch in iterator:
213
- batch = move_data_to_device(batch, self.device)
214
- batch_out = self._batch_predict(**batch)
215
-
216
- for sample in batch_out:
217
- if (
218
- sample._mixin_prediction_position
219
- >= next_prediction_position
220
- ):
221
- position2predicted_sample[
222
- sample._mixin_prediction_position
223
- ] = sample
224
-
225
- # yield
226
- while next_prediction_position in position2predicted_sample:
227
- yield position2predicted_sample[next_prediction_position]
228
- del position2predicted_sample[next_prediction_position]
229
- next_prediction_position += 1
230
-
231
- outputs = list(_read_iterator())
232
- for sample in outputs:
233
- self.dataset.merge_patches_predictions(sample)
234
- self.dataset.convert_tokens_to_char_annotations(sample)
235
-
236
- else:
237
- outputs = list(
238
- self._batch_predict(
239
- input_ids,
240
- attention_mask,
241
- token_type_ids,
242
- prediction_mask,
243
- special_symbols_mask,
244
- *args,
245
- **kwargs,
246
- )
247
- )
248
- return outputs
249
-
250
- def _batch_predict(
251
- self,
252
- input_ids: torch.Tensor,
253
- attention_mask: torch.Tensor,
254
- token_type_ids: torch.Tensor | None = None,
255
- prediction_mask: torch.Tensor | None = None,
256
- special_symbols_mask: torch.Tensor | None = None,
257
- sample: List[RelikReaderSample] | None = None,
258
- top_k: int = 5, # the amount of top-k most probable entities to predict
259
- *args,
260
- **kwargs,
261
- ) -> Iterator[RelikReaderSample]:
262
- """
263
- A wrapper around the forward method that returns the predicted labels for each sample.
264
- It also adds the predicted labels to the samples.
265
-
266
- Args:
267
- input_ids (:obj:`torch.Tensor`):
268
- The input ids of the text.
269
- attention_mask (:obj:`torch.Tensor`):
270
- The attention mask of the text.
271
- token_type_ids (:obj:`torch.Tensor`, `optional`):
272
- The token type ids of the text.
273
- prediction_mask (:obj:`torch.Tensor`, `optional`):
274
- The prediction mask of the text.
275
- special_symbols_mask (:obj:`torch.Tensor`, `optional`):
276
- The special symbols mask of the text.
277
- sample (:obj:`List[RelikReaderSample]`, `optional`):
278
- The samples to read. If provided, `text` and `candidates` are ignored.
279
- top_k (:obj:`int`, `optional`, defaults to 5):
280
- The amount of top-k most probable entities to predict.
281
- *args:
282
- Positional arguments.
283
- **kwargs:
284
- Keyword arguments.
285
-
286
- Returns:
287
- The predicted labels for each sample.
288
- """
289
- forward_output = self.forward(
290
- input_ids=input_ids,
291
- attention_mask=attention_mask,
292
- token_type_ids=token_type_ids,
293
- prediction_mask=prediction_mask,
294
- special_symbols_mask=special_symbols_mask,
295
- )
296
-
297
- ned_start_predictions = forward_output["ned_start_predictions"].cpu().numpy()
298
- ned_end_predictions = forward_output["ned_end_predictions"].cpu().numpy()
299
- ed_predictions = forward_output["ed_predictions"].cpu().numpy()
300
- ed_probabilities = forward_output["ed_probabilities"].cpu().numpy()
301
-
302
- batch_predictable_candidates = kwargs["predictable_candidates"]
303
- patch_offset = kwargs["patch_offset"]
304
- for ts, ne_sp, ne_ep, edp, edpr, pred_cands, po in zip(
305
- sample,
306
- ned_start_predictions,
307
- ned_end_predictions,
308
- ed_predictions,
309
- ed_probabilities,
310
- batch_predictable_candidates,
311
- patch_offset,
312
- ):
313
- ne_start_indices = [ti for ti, c in enumerate(ne_sp[1:]) if c > 0]
314
- ne_end_indices = [ti for ti, c in enumerate(ne_ep[1:]) if c > 0]
315
-
316
- final_class2predicted_spans = collections.defaultdict(list)
317
- spans2predicted_probabilities = dict()
318
- for start_token_index, end_token_index in zip(
319
- ne_start_indices, ne_end_indices
320
- ):
321
- # predicted candidate
322
- token_class = edp[start_token_index + 1] - 1
323
- predicted_candidate_title = pred_cands[token_class]
324
- final_class2predicted_spans[predicted_candidate_title].append(
325
- [start_token_index, end_token_index]
326
- )
327
-
328
- # candidates probabilities
329
- classes_probabilities = edpr[start_token_index + 1]
330
- classes_probabilities_best_indices = classes_probabilities.argsort()[
331
- ::-1
332
- ]
333
- titles_2_probs = []
334
- top_k = (
335
- min(
336
- top_k,
337
- len(classes_probabilities_best_indices),
338
- )
339
- if top_k != -1
340
- else len(classes_probabilities_best_indices)
341
- )
342
- for i in range(top_k):
343
- titles_2_probs.append(
344
- (
345
- pred_cands[classes_probabilities_best_indices[i] - 1],
346
- classes_probabilities[
347
- classes_probabilities_best_indices[i]
348
- ].item(),
349
- )
350
- )
351
- spans2predicted_probabilities[
352
- (start_token_index, end_token_index)
353
- ] = titles_2_probs
354
-
355
- if "patches" not in ts._d:
356
- ts._d["patches"] = dict()
357
-
358
- ts._d["patches"][po] = dict()
359
- sample_patch = ts._d["patches"][po]
360
-
361
- sample_patch["predicted_window_labels"] = final_class2predicted_spans
362
- sample_patch["span_title_probabilities"] = spans2predicted_probabilities
363
-
364
- # additional info
365
- sample_patch["predictable_candidates"] = pred_cands
366
-
367
- yield ts
 
relik/reader/relik_reader.py DELETED
@@ -1,629 +0,0 @@
1
- import collections
2
- import logging
3
- from pathlib import Path
4
- from typing import Any, Callable, Dict, Iterator, List, Union
5
-
6
- import torch
7
- import transformers as tr
8
- from tqdm import tqdm
9
- from transformers import AutoConfig
10
-
11
- from relik.common.log import get_console_logger, get_logger
12
- from relik.reader.data.relik_reader_data_utils import batchify, flatten
13
- from relik.reader.data.relik_reader_sample import RelikReaderSample
14
- from relik.reader.pytorch_modules.hf.modeling_relik import (
15
- RelikReaderConfig,
16
- RelikReaderSpanModel,
17
- )
18
- from relik.reader.relik_reader_predictor import RelikReaderPredictor
19
- from relik.reader.utils.save_load_utilities import load_model_and_conf
20
- from relik.reader.utils.special_symbols import NME_SYMBOL, get_special_symbols
21
-
22
- console_logger = get_console_logger()
23
- logger = get_logger(__name__, level=logging.INFO)
24
-
25
-
26
- class RelikReaderForSpanExtraction(torch.nn.Module):
27
- def __init__(
28
- self,
29
- transformer_model: str | tr.PreTrainedModel | None = None,
30
- additional_special_symbols: int = 0,
31
- num_layers: int | None = None,
32
- activation: str = "gelu",
33
- linears_hidden_size: int | None = 512,
34
- use_last_k_layers: int = 1,
35
- training: bool = False,
36
- device: str | torch.device | None = None,
37
- tokenizer: str | tr.PreTrainedTokenizer | None = None,
38
- **kwargs,
39
- ) -> None:
40
- super().__init__()
41
-
42
- if isinstance(transformer_model, str):
43
- config = AutoConfig.from_pretrained(
44
- transformer_model, trust_remote_code=True
45
- )
46
- if "relik-reader" in config.model_type:
47
- transformer_model = RelikReaderSpanModel.from_pretrained(
48
- transformer_model, **kwargs
49
- )
50
- else:
51
- reader_config = RelikReaderConfig(
52
- transformer_model=transformer_model,
53
- additional_special_symbols=additional_special_symbols,
54
- num_layers=num_layers,
55
- activation=activation,
56
- linears_hidden_size=linears_hidden_size,
57
- use_last_k_layers=use_last_k_layers,
58
- training=training,
59
- )
60
- transformer_model = RelikReaderSpanModel(reader_config)
61
-
62
- self.relik_reader_model = transformer_model
63
-
64
- self._tokenizer = tokenizer
65
-
66
- # move the model to the device
67
- self.to(device or torch.device("cpu"))
68
-
69
- def forward(
70
- self,
71
- input_ids: torch.Tensor,
72
- attention_mask: torch.Tensor,
73
- token_type_ids: torch.Tensor,
74
- prediction_mask: torch.Tensor | None = None,
75
- special_symbols_mask: torch.Tensor | None = None,
76
- special_symbols_mask_entities: torch.Tensor | None = None,
77
- start_labels: torch.Tensor | None = None,
78
- end_labels: torch.Tensor | None = None,
79
- disambiguation_labels: torch.Tensor | None = None,
80
- relation_labels: torch.Tensor | None = None,
81
- is_validation: bool = False,
82
- is_prediction: bool = False,
83
- *args,
84
- **kwargs,
85
- ) -> Dict[str, Any]:
86
- return self.relik_reader_model(
87
- input_ids,
88
- attention_mask,
89
- token_type_ids,
90
- prediction_mask,
91
- special_symbols_mask,
92
- special_symbols_mask_entities,
93
- start_labels,
94
- end_labels,
95
- disambiguation_labels,
96
- relation_labels,
97
- is_validation,
98
- is_prediction,
99
- *args,
100
- **kwargs,
101
- )
102
-
103
- def batch_predict(
104
- self,
105
- input_ids: torch.Tensor,
106
- attention_mask: torch.Tensor,
107
- token_type_ids: torch.Tensor | None = None,
108
- prediction_mask: torch.Tensor | None = None,
109
- special_symbols_mask: torch.Tensor | None = None,
110
- sample: List[RelikReaderSample] | None = None,
111
- top_k: int = 5, # the amount of top-k most probable entities to predict
112
- *args,
113
- **kwargs,
114
- ) -> Iterator[RelikReaderSample]:
115
- """
116
-
117
-
118
- Args:
119
- input_ids:
120
- attention_mask:
121
- token_type_ids:
122
- prediction_mask:
123
- special_symbols_mask:
124
- sample:
125
- top_k:
126
- *args:
127
- **kwargs:
128
-
129
- Returns:
130
-
131
- """
132
- forward_output = self.forward(
133
- input_ids,
134
- attention_mask,
135
- token_type_ids,
136
- prediction_mask,
137
- special_symbols_mask,
138
- )
139
-
140
- ned_start_predictions = forward_output["ned_start_predictions"].cpu().numpy()
141
- ned_end_predictions = forward_output["ned_end_predictions"].cpu().numpy()
142
- ed_predictions = forward_output["ed_predictions"].cpu().numpy()
143
- ed_probabilities = forward_output["ed_probabilities"].cpu().numpy()
144
-
145
- batch_predictable_candidates = kwargs["predictable_candidates"]
146
- patch_offset = kwargs["patch_offset"]
147
- for ts, ne_sp, ne_ep, edp, edpr, pred_cands, po in zip(
148
- sample,
149
- ned_start_predictions,
150
- ned_end_predictions,
151
- ed_predictions,
152
- ed_probabilities,
153
- batch_predictable_candidates,
154
- patch_offset,
155
- ):
156
- ne_start_indices = [ti for ti, c in enumerate(ne_sp[1:]) if c > 0]
157
- ne_end_indices = [ti for ti, c in enumerate(ne_ep[1:]) if c > 0]
158
-
159
- final_class2predicted_spans = collections.defaultdict(list)
160
- spans2predicted_probabilities = dict()
161
- for start_token_index, end_token_index in zip(
162
- ne_start_indices, ne_end_indices
163
- ):
164
- # predicted candidate
165
- token_class = edp[start_token_index + 1] - 1
166
- predicted_candidate_title = pred_cands[token_class]
167
- final_class2predicted_spans[predicted_candidate_title].append(
168
- [start_token_index, end_token_index]
169
- )
170
-
171
- # candidates probabilities
172
- classes_probabilities = edpr[start_token_index + 1]
173
- classes_probabilities_best_indices = classes_probabilities.argsort()[
174
- ::-1
175
- ]
176
- titles_2_probs = []
177
- top_k = (
178
- min(
179
- top_k,
180
- len(classes_probabilities_best_indices),
181
- )
182
- if top_k != -1
183
- else len(classes_probabilities_best_indices)
184
- )
185
- for i in range(top_k):
186
- titles_2_probs.append(
187
- (
188
- pred_cands[classes_probabilities_best_indices[i] - 1],
189
- classes_probabilities[
190
- classes_probabilities_best_indices[i]
191
- ].item(),
192
- )
193
- )
194
- spans2predicted_probabilities[
195
- (start_token_index, end_token_index)
196
- ] = titles_2_probs
197
-
198
- if "patches" not in ts._d:
199
- ts._d["patches"] = dict()
200
-
201
- ts._d["patches"][po] = dict()
202
- sample_patch = ts._d["patches"][po]
203
-
204
- sample_patch["predicted_window_labels"] = final_class2predicted_spans
205
- sample_patch["span_title_probabilities"] = spans2predicted_probabilities
206
-
207
- # additional info
208
- sample_patch["predictable_candidates"] = pred_cands
209
-
210
- yield ts
211
-
212
- def _build_input(self, text: List[str], candidates: List[List[str]]) -> list[str]:
213
- candidates_symbols = get_special_symbols(len(candidates))
214
- candidates = [
215
- [cs, ct] if ct != NME_SYMBOL else [NME_SYMBOL]
216
- for cs, ct in zip(candidates_symbols, candidates)
217
- ]
218
- return (
219
- [self.tokenizer.cls_token]
220
- + text
221
- + [self.tokenizer.sep_token]
222
- + flatten(candidates)
223
- + [self.tokenizer.sep_token]
224
- )
225
-
226
- @staticmethod
227
- def _compute_offsets(offsets_mapping):
228
- offsets_mapping = offsets_mapping.numpy()
229
- token2word = []
230
- word2token = {}
231
- count = 0
232
- for i, offset in enumerate(offsets_mapping):
233
- if offset[0] == 0:
234
- token2word.append(i - count)
235
- word2token[i - count] = [i]
236
- else:
237
- token2word.append(token2word[-1])
238
- word2token[token2word[-1]].append(i)
239
- count += 1
240
- return token2word, word2token
241
-
242
- @staticmethod
243
- def _convert_tokens_to_word_annotations(sample: RelikReaderSample):
244
- triplets = []
245
- entities = []
246
- for entity in sample.predicted_entities:
247
- if sample.entity_candidates:
248
- entities.append(
249
- (
250
- sample.token2word[entity[0] - 1],
251
- sample.token2word[entity[1] - 1] + 1,
252
- sample.entity_candidates[entity[2]],
253
- )
254
- )
255
- else:
256
- entities.append(
257
- (
258
- sample.token2word[entity[0] - 1],
259
- sample.token2word[entity[1] - 1] + 1,
260
- -1,
261
- )
262
- )
263
- for predicted_triplet, predicted_triplet_probabilities in zip(
264
- sample.predicted_relations, sample.predicted_relations_probabilities
265
- ):
266
- subject, object_, relation = predicted_triplet
267
- subject = entities[subject]
268
- object_ = entities[object_]
269
- relation = sample.candidates[relation]
270
- triplets.append(
271
- {
272
- "subject": {
273
- "start": subject[0],
274
- "end": subject[1],
275
- "type": subject[2],
276
- "name": " ".join(sample.tokens[subject[0] : subject[1]]),
277
- },
278
- "relation": {
279
- "name": relation,
280
- "probability": float(predicted_triplet_probabilities.round(2)),
281
- },
282
- "object": {
283
- "start": object_[0],
284
- "end": object_[1],
285
- "type": object_[2],
286
- "name": " ".join(sample.tokens[object_[0] : object_[1]]),
287
- },
288
- }
289
- )
290
- sample.predicted_entities = entities
291
- sample.predicted_relations = triplets
292
- sample.predicted_relations_probabilities = None
293
-
294
- @torch.no_grad()
295
- @torch.inference_mode()
296
- def read(
297
- self,
298
- text: List[str] | List[List[str]] | None = None,
299
- samples: List[RelikReaderSample] | None = None,
300
- input_ids: torch.Tensor | None = None,
301
- attention_mask: torch.Tensor | None = None,
302
- token_type_ids: torch.Tensor | None = None,
303
- prediction_mask: torch.Tensor | None = None,
304
- special_symbols_mask: torch.Tensor | None = None,
305
- special_symbols_mask_entities: torch.Tensor | None = None,
306
- candidates: List[List[str]] | None = None,
307
- max_length: int | None = 1024,
308
- max_batch_size: int | None = 64,
309
- token_batch_size: int | None = None,
310
- progress_bar: bool = False,
311
- *args,
312
- **kwargs,
313
- ) -> List[List[RelikReaderSample]]:
314
- """
315
- Reads the given text.
316
- Args:
317
- text: The text to read in tokens.
318
- samples:
319
- input_ids: The input ids of the text.
320
- attention_mask: The attention mask of the text.
321
- token_type_ids: The token type ids of the text.
322
- prediction_mask: The prediction mask of the text.
323
- special_symbols_mask: The special symbols mask of the text.
324
- special_symbols_mask_entities: The special symbols mask entities of the text.
325
- candidates: The candidates of the text.
326
- max_length: The maximum length of the text.
327
- max_batch_size: The maximum batch size.
328
- token_batch_size: The maximum number of tokens per batch.
329
- progress_bar:
330
- Returns:
331
- The predicted labels for each sample.
332
- """
333
- if text is None and input_ids is None and samples is None:
334
- raise ValueError(
335
- "Either `text` or `input_ids` or `samples` must be provided."
336
- )
337
- if (input_ids is None and samples is None) and (
338
- text is None or candidates is None
339
- ):
340
- raise ValueError(
341
- "`text` and `candidates` must be provided to return the predictions when "
342
- "`input_ids` and `samples` is not provided."
343
- )
344
- if text is not None and samples is None:
345
- if len(text) != len(candidates):
346
- raise ValueError("`text` and `candidates` must have the same length.")
347
- if isinstance(text[0], str): # change to list of text
348
- text = [text]
349
- candidates = [candidates]
350
-
351
- samples = [
352
- RelikReaderSample(tokens=t, candidates=c)
353
- for t, c in zip(text, candidates)
354
- ]
355
-
356
- if samples is not None:
357
- # function that creates a batch from the 'current_batch' list
358
- def output_batch() -> Dict[str, Any]:
359
- assert (
360
- len(
361
- set(
362
- [
363
- len(elem["predictable_candidates"])
364
- for elem in current_batch
365
- ]
366
- )
367
- )
368
- == 1
369
- ), " ".join(
370
- map(
371
- str,
372
- [len(elem["predictable_candidates"]) for elem in current_batch],
373
- )
374
- )
375
-
376
- batch_dict = dict()
377
-
378
- de_values_by_field = {
379
- fn: [de[fn] for de in current_batch if fn in de]
380
- for fn in self.fields_batcher
381
- }
382
-
383
- # in case you provide fields batchers but in the batch
384
- # there are no elements for that field
385
- de_values_by_field = {
386
- fn: fvs for fn, fvs in de_values_by_field.items() if len(fvs) > 0
387
- }
388
-
389
- assert len(set([len(v) for v in de_values_by_field.values()]))
390
-
391
- # todo: maybe we should report the user about possible
392
- # fields filtering due to "None" instances
393
- de_values_by_field = {
394
- fn: fvs
395
- for fn, fvs in de_values_by_field.items()
396
- if all([fv is not None for fv in fvs])
397
- }
398
-
399
- for field_name, field_values in de_values_by_field.items():
400
- field_batch = (
401
- self.fields_batcher[field_name]([fv[0] for fv in field_values])
402
- if self.fields_batcher[field_name] is not None
403
- else field_values
404
- )
405
-
406
- batch_dict[field_name] = field_batch
407
-
408
- batch_dict = {
409
- k: v.to(self.device) if isinstance(v, torch.Tensor) else v
410
- for k, v in batch_dict.items()
411
- }
412
- return batch_dict
413
-
414
- current_batch = []
415
- predictions = []
416
- current_cand_len = -1
417
-
418
- for sample in tqdm(samples, disable=not progress_bar):
419
- sample.candidates = [NME_SYMBOL] + sample.candidates
420
- inputs_text = self._build_input(sample.tokens, sample.candidates)
421
- model_inputs = self.tokenizer(
422
- inputs_text,
423
- is_split_into_words=True,
424
- add_special_tokens=False,
425
- padding=False,
426
- truncation=True,
427
- max_length=max_length or self.tokenizer.model_max_length,
428
- return_offsets_mapping=True,
429
- return_tensors="pt",
430
- )
431
- model_inputs["special_symbols_mask"] = (
432
- model_inputs["input_ids"] > self.tokenizer.vocab_size
433
- )
434
- # prediction mask is 0 until the first special symbol
435
- model_inputs["token_type_ids"] = (
436
- torch.cumsum(model_inputs["special_symbols_mask"], dim=1) > 0
437
- ).long()
438
- # shift prediction_mask to the left
439
- model_inputs["prediction_mask"] = model_inputs["token_type_ids"].roll(
440
- shifts=-1, dims=1
441
- )
442
- model_inputs["prediction_mask"][:, -1] = 1
443
- model_inputs["prediction_mask"][:, 0] = 1
444
-
445
- assert (
446
- len(model_inputs["special_symbols_mask"])
447
- == len(model_inputs["prediction_mask"])
448
- == len(model_inputs["input_ids"])
449
- )
450
-
451
- model_inputs["sample"] = sample
452
-
453
- # compute cand_len using special_symbols_mask
454
- model_inputs["predictable_candidates"] = sample.candidates[
455
- : model_inputs["special_symbols_mask"].sum().item()
456
- ]
457
- # cand_len = sum([id_ > self.tokenizer.vocab_size for id_ in model_inputs["input_ids"]])
458
- offsets = model_inputs.pop("offset_mapping")
459
- offsets = offsets[model_inputs["prediction_mask"] == 0]
460
- sample.token2word, sample.word2token = self._compute_offsets(offsets)
461
- future_max_len = max(
462
- len(model_inputs["input_ids"]),
463
- max([len(b["input_ids"]) for b in current_batch], default=0),
464
- )
465
- future_tokens_per_batch = future_max_len * (len(current_batch) + 1)
466
-
467
- if len(current_batch) > 0 and (
468
- (
469
- len(model_inputs["predictable_candidates"]) != current_cand_len
470
- and current_cand_len != -1
471
- )
472
- or (
473
- isinstance(token_batch_size, int)
474
- and future_tokens_per_batch >= token_batch_size
475
- )
476
- or len(current_batch) == max_batch_size
477
- ):
478
- batch_inputs = output_batch()
479
- current_batch = []
480
- predictions.extend(list(self.batch_predict(**batch_inputs)))
481
- current_cand_len = len(model_inputs["predictable_candidates"])
482
- current_batch.append(model_inputs)
483
-
484
- if current_batch:
485
- batch_inputs = output_batch()
486
- predictions.extend(list(self.batch_predict(**batch_inputs)))
487
- else:
488
- predictions = list(
489
- self.batch_predict(
490
- input_ids,
491
- attention_mask,
492
- token_type_ids,
493
- prediction_mask,
494
- special_symbols_mask,
495
- special_symbols_mask_entities,
496
- *args,
497
- **kwargs,
498
- )
499
- )
500
- return predictions
501
-
502
- @property
503
- def device(self) -> torch.device:
504
- """
505
- The device of the model.
506
- """
507
- return next(self.parameters()).device
508
-
509
- @property
510
- def tokenizer(self) -> tr.PreTrainedTokenizer:
511
- """
512
- The tokenizer.
513
- """
514
- if self._tokenizer:
515
- return self._tokenizer
516
-
517
- self._tokenizer = tr.AutoTokenizer.from_pretrained(
518
- self.relik_reader_model.config.name_or_path
519
- )
520
- return self._tokenizer
521
-
522
- @property
523
- def fields_batcher(self) -> Dict[str, Union[None, Callable[[list], Any]]]:
524
- fields_batchers = {
525
- "input_ids": lambda x: batchify(
526
- x, padding_value=self.tokenizer.pad_token_id
527
- ),
528
- "attention_mask": lambda x: batchify(x, padding_value=0),
529
- "token_type_ids": lambda x: batchify(x, padding_value=0),
530
- "prediction_mask": lambda x: batchify(x, padding_value=1),
531
- "global_attention": lambda x: batchify(x, padding_value=0),
532
- "token2word": None,
533
- "sample": None,
534
- "special_symbols_mask": lambda x: batchify(x, padding_value=False),
535
- "special_symbols_mask_entities": lambda x: batchify(x, padding_value=False),
536
- }
537
- if "roberta" in self.relik_reader_model.config.model_type:
538
- del fields_batchers["token_type_ids"]
539
-
540
- return fields_batchers
541
-
542
- def save_pretrained(
543
- self,
544
- output_dir: str,
545
- model_name: str | None = None,
546
- push_to_hub: bool = False,
547
- **kwargs,
548
- ) -> None:
549
- """
550
- Saves the model to the given path.
551
- Args:
552
- output_dir: The path to save the model to.
553
- model_name: The name of the model.
554
- push_to_hub: Whether to push the model to the hub.
555
- """
556
- # create the output directory
557
- output_dir = Path(output_dir)
558
- output_dir.mkdir(parents=True, exist_ok=True)
559
-
560
- model_name = model_name or "relik-reader-for-span-extraction"
561
-
562
- logger.info(f"Saving reader to {output_dir / model_name}")
563
-
564
- # save the model
565
- self.relik_reader_model.register_for_auto_class()
566
- self.relik_reader_model.save_pretrained(
567
- output_dir / model_name, push_to_hub=push_to_hub, **kwargs
568
- )
569
-
570
- logger.info("Saving reader to disk done.")
571
-
572
- if self.tokenizer:
573
- self.tokenizer.save_pretrained(
574
- output_dir / model_name, push_to_hub=push_to_hub, **kwargs
575
- )
576
- logger.info("Saving tokenizer to disk done.")
577
-
578
-
579
- class RelikReader:
580
- def __init__(self, model_path: str, predict_nmes: bool = False):
581
- model, model_conf = load_model_and_conf(model_path)
582
- model.training = False
583
- model.eval()
584
-
585
- val_dataset_conf = model_conf.data.val_dataset
586
- val_dataset_conf.special_symbols = get_special_symbols(
587
- model_conf.model.entities_per_forward
588
- )
589
- val_dataset_conf.transformer_model = model_conf.model.model.transformer_model
590
-
591
- self.predictor = RelikReaderPredictor(
592
- model,
593
- dataset_conf=model_conf.data.val_dataset,
594
- predict_nmes=predict_nmes,
595
- )
596
- self.model_path = model_path
597
-
598
- def link_entities(
599
- self,
600
- dataset_path_or_samples: str | Iterator[RelikReaderSample],
601
- token_batch_size: int = 2048,
602
- progress_bar: bool = False,
603
- ) -> List[RelikReaderSample]:
604
- data_input = (
605
- (dataset_path_or_samples, None)
606
- if isinstance(dataset_path_or_samples, str)
607
- else (None, dataset_path_or_samples)
608
- )
609
- return self.predictor.predict(
610
- *data_input,
611
- dataset_conf=None,
612
- token_batch_size=token_batch_size,
613
- progress_bar=progress_bar,
614
- )
615
-
616
- # def save_pretrained(self, path: Union[str, Path]):
617
- # self.predictor.save(path)
618
-
619
-
620
- def main():
621
- rr = RelikReader("riccorl/relik-reader-aida-deberta-small-old", predict_nmes=True)
622
- predictions = rr.link_entities(
623
- "/Users/ric/Documents/PhD/Projects/relik/data/reader/aida/testa.jsonl"
624
- )
625
- print(predictions)
626
-
627
-
628
- if __name__ == "__main__":
629
- main()