CarlosMalaga
committed on
Commit
•
2f044c1
1
Parent(s):
3376207
Upload 201 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- relik/__init__.py +8 -0
- relik/common/__init__.py +0 -0
- relik/common/__pycache__/__init__.cpython-310.pyc +0 -0
- relik/common/__pycache__/log.cpython-310.pyc +0 -0
- relik/common/__pycache__/torch_utils.cpython-310.pyc +0 -0
- relik/common/__pycache__/upload.cpython-310.pyc +0 -0
- relik/common/__pycache__/utils.cpython-310.pyc +0 -0
- relik/common/log.py +174 -0
- relik/common/torch_utils.py +82 -0
- relik/common/upload.py +144 -0
- relik/common/utils.py +610 -0
- relik/inference/__init__.py +0 -0
- relik/inference/__pycache__/__init__.cpython-310.pyc +0 -0
- relik/inference/__pycache__/annotator.cpython-310.pyc +0 -0
- relik/inference/annotator.py +840 -0
- relik/inference/data/__init__.py +0 -0
- relik/inference/data/__pycache__/__init__.cpython-310.pyc +0 -0
- relik/inference/data/__pycache__/objects.cpython-310.pyc +0 -0
- relik/inference/data/objects.py +88 -0
- relik/inference/data/splitters/__init__.py +0 -0
- relik/inference/data/splitters/__pycache__/__init__.cpython-310.pyc +0 -0
- relik/inference/data/splitters/__pycache__/base_sentence_splitter.cpython-310.pyc +0 -0
- relik/inference/data/splitters/__pycache__/blank_sentence_splitter.cpython-310.pyc +0 -0
- relik/inference/data/splitters/__pycache__/spacy_sentence_splitter.cpython-310.pyc +0 -0
- relik/inference/data/splitters/__pycache__/window_based_splitter.cpython-310.pyc +0 -0
- relik/inference/data/splitters/base_sentence_splitter.py +55 -0
- relik/inference/data/splitters/blank_sentence_splitter.py +29 -0
- relik/inference/data/splitters/spacy_sentence_splitter.py +153 -0
- relik/inference/data/splitters/window_based_splitter.py +62 -0
- relik/inference/data/tokenizers/__init__.py +87 -0
- relik/inference/data/tokenizers/__pycache__/__init__.cpython-310.pyc +0 -0
- relik/inference/data/tokenizers/__pycache__/base_tokenizer.cpython-310.pyc +0 -0
- relik/inference/data/tokenizers/__pycache__/spacy_tokenizer.cpython-310.pyc +0 -0
- relik/inference/data/tokenizers/base_tokenizer.py +84 -0
- relik/inference/data/tokenizers/spacy_tokenizer.py +194 -0
- relik/inference/data/window/__init__.py +0 -0
- relik/inference/data/window/__pycache__/__init__.cpython-310.pyc +0 -0
- relik/inference/data/window/__pycache__/manager.cpython-310.pyc +0 -0
- relik/inference/data/window/manager.py +431 -0
- relik/inference/gerbil.py +269 -0
- relik/inference/serve/__init__.py +0 -0
- relik/inference/serve/backend/__init__.py +0 -0
- relik/inference/serve/backend/fastapi.py +122 -0
- relik/inference/serve/backend/ray.py +165 -0
- relik/inference/serve/backend/utils.py +38 -0
- relik/inference/serve/frontend/__init__.py +0 -0
- relik/inference/serve/frontend/relik_front.py +229 -0
- relik/inference/serve/frontend/relik_re_front.py +251 -0
- relik/inference/serve/frontend/style.css +33 -0
- relik/inference/serve/frontend/utils.py +132 -0
relik/__init__.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from relik.inference.annotator import Relik
from pathlib import Path

# Read the package version out of the sibling version.py file and re-export
# it as `__version__` (standard packaging convention).
VERSION = {}  # type: ignore
with open(Path(__file__).parent / "version.py", "r") as version_file:
    # version.py is expected to define a VERSION variable; exec fills the
    # VERSION dict used as its globals.
    exec(version_file.read(), VERSION)

__version__ = VERSION["VERSION"]
|
relik/common/__init__.py
ADDED
File without changes
|
relik/common/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (180 Bytes). View file
|
|
relik/common/__pycache__/log.cpython-310.pyc
ADDED
Binary file (4.39 kB). View file
|
|
relik/common/__pycache__/torch_utils.cpython-310.pyc
ADDED
Binary file (1.11 kB). View file
|
|
relik/common/__pycache__/upload.cpython-310.pyc
ADDED
Binary file (4.04 kB). View file
|
|
relik/common/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (14.8 kB). View file
|
|
relik/common/log.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
import threading
|
5 |
+
from logging.config import dictConfig
|
6 |
+
from typing import Any, Dict, Optional
|
7 |
+
|
8 |
+
from art import text2art, tprint
|
9 |
+
from colorama import Fore, Style, init
|
10 |
+
from rich import get_console
|
11 |
+
|
12 |
+
_lock = threading.Lock()
|
13 |
+
_default_handler: Optional[logging.Handler] = None
|
14 |
+
|
15 |
+
_default_log_level = logging.WARNING
|
16 |
+
|
17 |
+
# fancy logger
|
18 |
+
_console = get_console()
|
19 |
+
|
20 |
+
|
21 |
+
class ColorfulFormatter(logging.Formatter):
    """
    Formatter that colors log messages by level name.

    Levels without an entry in ``COLORS`` (e.g. INFO) are emitted without
    coloring. Also injects the distributed-training rank into each record so
    ``%(rank)d`` can be used in format strings.
    """

    # Map from level name to the colorama color prefix for that level.
    COLORS = {
        "WARNING": Fore.YELLOW,
        "ERROR": Fore.RED,
        "CRITICAL": Fore.RED + Style.BRIGHT,
        "DEBUG": Fore.CYAN,
        # "INFO": Fore.GREEN,
    }

    def format(self, record):
        # LOCAL_RANK is set by torch distributed launchers; default to 0
        # for single-process runs.
        record.rank = int(os.getenv("LOCAL_RANK", "0"))
        log_message = super().format(record)
        # Prepend the level color (empty string for unmapped levels) and
        # always reset the foreground color afterwards.
        return self.COLORS.get(record.levelname, "") + log_message + Fore.RESET
|
38 |
+
|
39 |
+
|
40 |
+
# Default logging configuration in `logging.config.dictConfig` format.
# Two console handlers are defined: a plain one attached to the root logger
# and a colored one (ColorfulFormatter) reserved for the "relik" logger.
DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
    "version": 1,
    "formatters": {
        "simple": {
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
        },
        "colorful": {
            # "()" is the dictConfig key for a custom formatter factory.
            "()": ColorfulFormatter,
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] [RANK:%(rank)d] %(message)s",
        },
    },
    "filters": {},
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "simple",
            "filters": [],
            "stream": sys.stdout,
        },
        "color_console": {
            "class": "logging.StreamHandler",
            "formatter": "colorful",
            "filters": [],
            "stream": sys.stdout,
        },
    },
    # Root level is overridable through the LOG_LEVEL environment variable.
    "root": {"handlers": ["console"], "level": os.getenv("LOG_LEVEL", "INFO")},
    "loggers": {
        "relik": {
            "handlers": ["color_console"],
            "level": "DEBUG",
            "propagate": False,
        },
    },
}
|
75 |
+
|
76 |
+
|
77 |
+
def configure_logging(**kwargs):
    """
    Configure the logging module with the library defaults.

    Keyword arguments shallow-override the corresponding top-level keys of
    ``DEFAULT_LOGGING_CONFIG`` before the configuration is applied through
    :func:`logging.config.dictConfig`.
    """
    init()  # Initialize colorama so ANSI colors work cross-platform.
    # Bug fix: merge into a *copy* of the defaults. The original code did
    # `DEFAULT_LOGGING_CONFIG.update(kwargs)` through an alias, permanently
    # mutating the shared module-level dict on every call with kwargs.
    logger_config = {**DEFAULT_LOGGING_CONFIG, **kwargs}
    dictConfig(logger_config)
|
85 |
+
|
86 |
+
|
87 |
+
def _get_library_name() -> str:
    """Return the top-level package name this module belongs to."""
    top_level, _, _ = __name__.partition(".")
    return top_level
|
89 |
+
|
90 |
+
|
91 |
+
def _get_library_root_logger() -> logging.Logger:
    """Return the logger rooted at the library's top-level package."""
    root_name = _get_library_name()
    return logging.getLogger(root_name)
|
93 |
+
|
94 |
+
|
95 |
+
def _configure_library_root_logger() -> None:
    """
    Attach a default stderr StreamHandler to the library root logger,
    exactly once per process.

    Thread-safe: the module-level ``_lock`` guards the check-and-set of
    ``_default_handler``. Subsequent calls are no-ops.
    """
    global _default_handler

    with _lock:
        if _default_handler:
            # This library has already configured the library root logger.
            return
        _default_handler = logging.StreamHandler()  # Set sys.stderr as stream.
        # Flush through to stderr immediately so records are not buffered.
        _default_handler.flush = sys.stderr.flush

        # Apply our default configuration to the library root logger.
        library_root_logger = _get_library_root_logger()
        library_root_logger.addHandler(_default_handler)
        library_root_logger.setLevel(_default_log_level)
        # Don't forward records to the (possibly differently configured)
        # root logger.
        library_root_logger.propagate = False
|
110 |
+
|
111 |
+
|
112 |
+
def _reset_library_root_logger() -> None:
    """
    Undo :func:`_configure_library_root_logger`: detach the default handler
    and reset the library root logger level to ``NOTSET``.

    No-op when the library logger was never configured.
    """
    global _default_handler

    with _lock:
        if not _default_handler:
            return

        library_root_logger = _get_library_root_logger()
        library_root_logger.removeHandler(_default_handler)
        library_root_logger.setLevel(logging.NOTSET)
        _default_handler = None
|
123 |
+
|
124 |
+
|
125 |
+
def set_log_level(level: int, logger: Optional[logging.Logger] = None) -> None:
    """
    Set the log level.

    Args:
        level (:obj:`int`):
            Logging level.
        logger (:obj:`logging.Logger`, optional):
            Logger to set the log level on. Defaults to the library root
            logger, configuring it first if necessary.
    """
    if not logger:
        # Make sure the library logger has its default handler attached
        # before changing its level.
        _configure_library_root_logger()
        logger = _get_library_root_logger()
    logger.setLevel(level)
|
138 |
+
|
139 |
+
|
140 |
+
def get_logger(
    name: Optional[str] = None,
    level: Optional[int] = None,
    formatter: Optional[str] = None,
    **kwargs,
) -> logging.Logger:
    """
    Return a logger with the specified name, configuring the library root
    logger and its default handler on first use.

    Args:
        name (:obj:`str`, optional):
            Logger name; defaults to the library's top-level package name.
        level (:obj:`int`, optional):
            If given, set the library root logger to this level.
        formatter (:obj:`str`, optional):
            Format string for the default handler; a standard
            "asctime - levelname - name - message" format is used when
            omitted.
        **kwargs:
            Forwarded to :func:`configure_logging`.

    Returns:
        :obj:`logging.Logger`: The requested logger.
    """
    configure_logging(**kwargs)

    if name is None:
        name = _get_library_name()

    _configure_library_root_logger()

    if level is not None:
        set_log_level(level)

    if formatter is None:
        formatter = logging.Formatter(
            "%(asctime)s - %(levelname)s - %(name)s - %(message)s"
        )
    elif isinstance(formatter, str):
        # Bug fix: the original passed a caller-supplied format *string*
        # straight to `setFormatter`, which expects a Formatter instance
        # and would fail at emit time. Wrap strings into a Formatter.
        formatter = logging.Formatter(formatter)
    _default_handler.setFormatter(formatter)

    return logging.getLogger(name)
|
167 |
+
|
168 |
+
|
169 |
+
def get_console_logger():
    """Return the shared rich console used for fancy terminal output."""
    return _console
|
171 |
+
|
172 |
+
|
173 |
+
def print_relik_text_art(text: str = "relik", font: str = "larry3d", **kwargs):
    """
    Print `text` as ASCII art using the `art` package's `tprint`.

    Args:
        text: Text to render. Defaults to "relik".
        font: art font name. Defaults to "larry3d".
        **kwargs: Forwarded to `art.tprint`.
    """
    tprint(text, font=font, **kwargs)
|
relik/common/torch_utils.py
ADDED
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import contextlib
|
2 |
+
import tempfile
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import transformers as tr
|
6 |
+
|
7 |
+
from relik.common.utils import is_package_available
|
8 |
+
|
9 |
+
# check if ORT is available
|
10 |
+
if is_package_available("onnxruntime"):
|
11 |
+
from optimum.onnxruntime import (
|
12 |
+
ORTModel,
|
13 |
+
ORTModelForCustomTasks,
|
14 |
+
ORTModelForSequenceClassification,
|
15 |
+
ORTOptimizer,
|
16 |
+
)
|
17 |
+
from optimum.onnxruntime.configuration import AutoOptimizationConfig
|
18 |
+
|
19 |
+
# from relik.retriever.pytorch_modules import PRECISION_MAP
|
20 |
+
|
21 |
+
|
22 |
+
def get_autocast_context(
    device: str | torch.device, precision: str
) -> contextlib.AbstractContextManager:
    """
    Build an autocast (mixed-precision) context manager for `device`.

    Args:
        device: Target device ("cuda", "cuda:0", "cpu", "mps", or a
            torch.device).
        precision: Key into relik's PRECISION_MAP (project-defined mapping
            from precision names to torch dtypes — not visible here).

    Returns:
        A `torch.autocast` context, or a no-op `nullcontext` where autocast
        is not supported for the device/dtype combination.
    """
    # torch.autocast only accepts bare device-type strings like "cpu" or
    # "cuda", so strip any device index suffix such as ":0".
    device_type_for_autocast = str(device).split(":")[0]

    # Imported lazily, presumably to avoid a circular import with
    # relik.retriever — TODO confirm.
    from relik.retriever.pytorch_modules import PRECISION_MAP

    # Autocast on CPU/MPS only supports bfloat16; for any other dtype on
    # those devices fall back to a do-nothing context.
    autocast_manager = (
        contextlib.nullcontext()
        if device_type_for_autocast in ["cpu", "mps"]
        and PRECISION_MAP[precision] != torch.bfloat16
        else (
            torch.autocast(
                device_type=device_type_for_autocast,
                dtype=PRECISION_MAP[precision],
            )
        )
    )
    return autocast_manager
|
44 |
+
|
45 |
+
|
46 |
+
# def load_ort_optimized_hf_model(
|
47 |
+
# hf_model: tr.PreTrainedModel,
|
48 |
+
# provider: str = "CPUExecutionProvider",
|
49 |
+
# ort_model_type: callable = "ORTModelForCustomTasks",
|
50 |
+
# ) -> ORTModel:
|
51 |
+
# """
|
52 |
+
# Load an optimized ONNX Runtime HF model.
|
53 |
+
#
|
54 |
+
# Args:
|
55 |
+
# hf_model (`tr.PreTrainedModel`):
|
56 |
+
# The HF model to optimize.
|
57 |
+
# provider (`str`, optional):
|
58 |
+
# The ONNX Runtime provider to use. Defaults to "CPUExecutionProvider".
|
59 |
+
#
|
60 |
+
# Returns:
|
61 |
+
# `ORTModel`: The optimized HF model.
|
62 |
+
# """
|
63 |
+
# if isinstance(hf_model, ORTModel):
|
64 |
+
# return hf_model
|
65 |
+
# temp_dir = tempfile.mkdtemp()
|
66 |
+
# hf_model.save_pretrained(temp_dir)
|
67 |
+
# ort_model = ort_model_type.from_pretrained(
|
68 |
+
# temp_dir, export=True, provider=provider, use_io_binding=True
|
69 |
+
# )
|
70 |
+
# if is_package_available("onnxruntime"):
|
71 |
+
# optimizer = ORTOptimizer.from_pretrained(ort_model)
|
72 |
+
# optimization_config = AutoOptimizationConfig.O4()
|
73 |
+
# optimizer.optimize(save_dir=temp_dir, optimization_config=optimization_config)
|
74 |
+
# ort_model = ort_model_type.from_pretrained(
|
75 |
+
# temp_dir,
|
76 |
+
# export=True,
|
77 |
+
# provider=provider,
|
78 |
+
# use_io_binding=bool(provider == "CUDAExecutionProvider"),
|
79 |
+
# )
|
80 |
+
# return ort_model
|
81 |
+
# else:
|
82 |
+
# raise ValueError("onnxruntime is not installed. Please install Ray with `pip install relik[serve]`.")
|
relik/common/upload.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import logging
|
4 |
+
import os
|
5 |
+
import tempfile
|
6 |
+
import zipfile
|
7 |
+
from datetime import datetime
|
8 |
+
from pathlib import Path
|
9 |
+
from typing import Optional, Union
|
10 |
+
|
11 |
+
import huggingface_hub
|
12 |
+
|
13 |
+
from relik.common.log import get_logger
|
14 |
+
from relik.common.utils import SAPIENZANLP_DATE_FORMAT, get_md5
|
15 |
+
|
16 |
+
logger = get_logger(__name__, level=logging.DEBUG)
|
17 |
+
|
18 |
+
|
19 |
+
def create_info_file(tmpdir: Path):
    """
    Write an `info.json` file (md5 of model.zip plus upload timestamp)
    into `tmpdir`.

    Assumes `tmpdir / "model.zip"` already exists (see `zip_run`).
    """
    logger.debug("Computing md5 of model.zip")
    md5 = get_md5(tmpdir / "model.zip")
    date = datetime.now().strftime(SAPIENZANLP_DATE_FORMAT)

    logger.debug("Dumping info.json file")
    with (tmpdir / "info.json").open("w") as f:
        json.dump(dict(md5=md5, upload_date=date), f, indent=2)
|
27 |
+
|
28 |
+
|
29 |
+
def zip_run(
    dir_path: Union[str, os.PathLike],
    tmpdir: Union[str, os.PathLike],
    zip_name: str = "model.zip",
) -> Path:
    """
    Zip the contents of `dir_path` into `tmpdir / zip_name`, preserving the
    directory structure inside the archive.

    Args:
        dir_path: Directory to archive.
        tmpdir: Destination directory for the zip file.
        zip_name: Name of the archive. Defaults to "model.zip".

    Returns:
        Path: Path of the created zip file.
    """
    logger.debug(f"zipping {dir_path} to {tmpdir}")
    # creates a zip version of the provided dir_path
    run_dir = Path(dir_path)
    # Bug fix: the signature allows a plain `str` for `tmpdir`, but
    # `str / str` is undefined — coerce to Path before joining.
    zip_path = Path(tmpdir) / zip_name

    with zipfile.ZipFile(zip_path, "w") as zip_file:
        # Fully zip the run directory, maintaining its structure.
        # NOTE(review): the "*.*" glob skips files without an extension —
        # confirm that is intended before widening it to rglob("*").
        for file in run_dir.rglob("*.*"):
            if file.is_dir():
                continue
            zip_file.write(file, arcname=file.relative_to(run_dir))

    return zip_path
|
48 |
+
|
49 |
+
|
50 |
+
def get_logged_in_username():
    """
    Return the username of the currently logged-in HuggingFace user.

    Raises:
        ValueError: If no HuggingFace token is stored locally.
    """
    # NOTE(review): HfFolder.get_token is deprecated in recent
    # huggingface_hub releases in favor of huggingface_hub.get_token —
    # confirm the pinned version still supports it.
    token = huggingface_hub.HfFolder.get_token()
    if token is None:
        raise ValueError(
            "No HuggingFace token found. You need to execute `huggingface-cli login` first!"
        )
    api = huggingface_hub.HfApi()
    user = api.whoami(token=token)
    return user["name"]
|
59 |
+
|
60 |
+
|
61 |
+
def upload(
    model_dir: Union[str, os.PathLike],
    model_name: str,
    filenames: Optional[list[str]] = None,
    organization: Optional[str] = None,
    repo_name: Optional[str] = None,
    commit: Optional[str] = None,
    archive: bool = False,
):
    """
    Upload a model directory to the HuggingFace Hub.

    Args:
        model_dir: Local directory containing the model files.
        model_name: Model name; used as the repo id unless `repo_name` is
            given.
        filenames: Optional subset of files to upload; when None the whole
            directory is copied.
        organization: Optional org namespace to upload under.
        repo_name: Optional repo name overriding `model_name`.
        commit: Optional commit message.
        archive: When True, zip the directory and upload the archive plus
            an info.json metadata file.

    Raises:
        ValueError: If no HuggingFace token is stored locally.
    """
    token = huggingface_hub.HfFolder.get_token()
    if token is None:
        raise ValueError(
            "No HuggingFace token found. You need to execute `huggingface-cli login` first!"
        )

    repo_id = repo_name or model_name
    if organization is not None:
        repo_id = f"{organization}/{repo_id}"
    with tempfile.TemporaryDirectory() as tmpdir:
        api = huggingface_hub.HfApi()
        repo_url = api.create_repo(
            token=token,
            repo_id=repo_id,
            exist_ok=True,
        )
        # NOTE(review): huggingface_hub.Repository is deprecated in newer
        # hub versions — confirm the pinned version still ships it.
        repo = huggingface_hub.Repository(
            str(tmpdir), clone_from=repo_url, use_auth_token=token
        )

        tmp_path = Path(tmpdir)
        if archive:
            # Zip the model_dir and add the metadata file next to it.
            logger.debug(f"Zipping {model_dir} to {tmp_path}")
            zip_run(model_dir, tmp_path)
            create_info_file(tmp_path)
        else:
            # For a transformers-style model we don't need to zip it;
            # just copy the files to the tmpdir.
            logger.debug(f"Copying {model_dir} to {tmpdir}")
            # copy only the files that are needed
            if filenames is not None:
                for filename in filenames:
                    # Bug fix: the loop variable was not interpolated in the
                    # original command, so a literal bogus path was copied.
                    # NOTE(review): os.system with interpolated paths is
                    # shell-injection prone; consider shutil.copy instead.
                    os.system(f"cp {model_dir}/{filename} {tmpdir}")
            else:
                os.system(f"cp -r {model_dir}/* {tmpdir}")

        # push_to_hub automatically puts large files (>10MB) into git lfs
        repo.push_to_hub(commit_message=commit or "Automatic push from sapienzanlp")
|
109 |
+
|
110 |
+
|
111 |
+
def parse_args() -> argparse.Namespace:
    """Build and parse the command-line arguments for the upload script."""
    arg_parser = argparse.ArgumentParser()
    # Positional arguments: where the model lives and what to call it.
    arg_parser.add_argument(
        "model_dir", help="The directory of the model you want to upload"
    )
    arg_parser.add_argument("model_name", help="The model you want to upload")
    # Optional repository-naming knobs.
    arg_parser.add_argument(
        "--organization",
        help="the name of the organization where you want to upload the model",
    )
    arg_parser.add_argument(
        "--repo_name",
        help="Optional name to use when uploading to the HuggingFace repository",
    )
    arg_parser.add_argument(
        "--commit", help="Commit message to use when pushing to the HuggingFace Hub"
    )
    arg_parser.add_argument(
        "--archive",
        action="store_true",
        help="""
        Whether to compress the model directory before uploading it.
        If True, the model directory will be zipped and the zip file will be uploaded.
        If False, the model directory will be uploaded as is.""",
    )
    return arg_parser.parse_args()
|
137 |
+
|
138 |
+
|
139 |
+
def main():
    """Console entry point: parse CLI arguments and forward them to `upload`."""
    namespace = parse_args()
    upload(**vars(namespace))


if __name__ == "__main__":
    main()
|
relik/common/utils.py
ADDED
@@ -0,0 +1,610 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import importlib.util
|
2 |
+
import json
|
3 |
+
import logging
|
4 |
+
import os
|
5 |
+
import shutil
|
6 |
+
import tarfile
|
7 |
+
import tempfile
|
8 |
+
from functools import partial
|
9 |
+
from hashlib import sha256
|
10 |
+
from pathlib import Path
|
11 |
+
from typing import Any, BinaryIO, Dict, List, Optional, Union
|
12 |
+
from urllib.parse import urlparse
|
13 |
+
from zipfile import ZipFile, is_zipfile
|
14 |
+
|
15 |
+
import huggingface_hub
|
16 |
+
import requests
|
17 |
+
import tqdm
|
18 |
+
from filelock import FileLock
|
19 |
+
from transformers.utils.hub import cached_file as hf_cached_file
|
20 |
+
|
21 |
+
from relik.common.log import get_logger
|
22 |
+
|
23 |
+
# Canonical file names used when saving/loading relik models.
WEIGHTS_NAME = "weights.pt"
ONNX_WEIGHTS_NAME = "weights.onnx"
CONFIG_NAME = "config.yaml"
LABELS_NAME = "labels.json"

# SAPIENZANLP_USER_NAME = "sapienzanlp"
# HuggingFace namespace the models are currently published under.
SAPIENZANLP_USER_NAME = "riccorl"
SAPIENZANLP_HF_MODEL_REPO_URL = "riccorl/{model_id}"
# URL template for the zipped-model artifact inside a repo.
SAPIENZANLP_HF_MODEL_REPO_ARCHIVE_URL = (
    f"{SAPIENZANLP_HF_MODEL_REPO_URL}/resolve/main/model.zip"
)
# path constants — HF_HOME overrides the default HuggingFace cache location.
HF_CACHE_DIR = Path(os.getenv("HF_HOME", Path.home() / ".cache/huggingface/hub"))
# NOTE: this is a str when the env var is set, a Path otherwise; callers
# must normalize (see download_and_cache).
SAPIENZANLP_CACHE_DIR = os.getenv("SAPIENZANLP_CACHE_DIR", HF_CACHE_DIR)
# Timestamp format used in upload metadata (see relik.common.upload).
SAPIENZANLP_DATE_FORMAT = "%Y-%m-%d %H-%M-%S"

logger = get_logger(__name__)
|
41 |
+
|
42 |
+
|
43 |
+
def sapienzanlp_model_urls(model_id: str) -> str:
    """
    Return the HuggingFace repo identifier for a SapienzaNLP model.

    Args:
        model_id (:obj:`str`):
            A SapienzaNLP model id.

    Returns:
        :obj:`str`: The url for the model id.
    """
    # An id that already carries a user/org namespace is returned untouched.
    already_namespaced = "/" in model_id
    if already_namespaced:
        return model_id
    return SAPIENZANLP_HF_MODEL_REPO_URL.format(model_id=model_id)
|
58 |
+
|
59 |
+
|
60 |
+
def is_package_available(package_name: str) -> bool:
    """
    Tell whether `package_name` is importable in the current environment.

    Args:
        package_name (`str`): The name of the package to check.

    Returns:
        `bool`: True when an import spec for the package can be found.
    """
    spec = importlib.util.find_spec(package_name)
    return spec is not None
|
68 |
+
|
69 |
+
|
70 |
+
def load_json(path: Union[str, Path]) -> Any:
    """
    Read and deserialize the JSON document stored at `path`.

    Args:
        path (`Union[str, Path]`): The path to the json file to load.

    Returns:
        `Any`: The deserialized content.
    """
    with open(path, encoding="utf8") as json_file:
        parsed = json.load(json_file)
    return parsed
|
82 |
+
|
83 |
+
|
84 |
+
def dump_json(document: Any, path: Union[str, Path], indent: Optional[int] = None):
    """
    Serialize `document` as JSON and write it to `path`.

    Args:
        document (`Any`): The document to dump.
        path (`Union[str, Path]`): Destination file.
        indent (`Optional[int]`): Indentation for pretty-printing; compact
            single-line output when None.
    """
    with open(path, "w", encoding="utf8") as out_handle:
        json.dump(document, out_handle, indent=indent)
|
96 |
+
|
97 |
+
|
98 |
+
def get_md5(path: Path):
    """
    Compute the hex-encoded MD5 digest of the file at `path`.
    """
    import hashlib

    digest = hashlib.md5(path.read_bytes())
    return digest.hexdigest()
|
107 |
+
|
108 |
+
|
109 |
+
def file_exists(path: Union[str, os.PathLike]) -> bool:
    """
    Check whether a filesystem entry exists at :obj:`path`.

    Args:
        path (:obj:`str`, :obj:`os.PathLike`):
            Path to check.

    Returns:
        :obj:`bool`: :obj:`True` if the path exists on disk (mirrors
        :meth:`pathlib.Path.exists`, so directories also count).
    """
    candidate = Path(path)
    return candidate.exists()
|
121 |
+
|
122 |
+
|
123 |
+
def dir_exists(path: Union[str, os.PathLike]) -> bool:
    """
    Check whether :obj:`path` points to an existing directory.

    Args:
        path (:obj:`str`, :obj:`os.PathLike`):
            Path to check.

    Returns:
        :obj:`bool`: :obj:`True` if the directory exists.
    """
    candidate = Path(path)
    return candidate.is_dir()
|
135 |
+
|
136 |
+
|
137 |
+
def is_remote_url(url_or_filename: Union[str, Path]):
    """
    Returns :obj:`True` if the input path is an http(s) URL.

    Args:
        url_or_filename (:obj:`str`, :obj:`Path`):
            path to check.

    Returns:
        :obj:`bool`: :obj:`True` if the input path is a remote URL,
        :obj:`False` otherwise.
    """
    # Paths are stringified first so urlparse sees a plain string.
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    scheme = urlparse(url_or_filename).scheme
    return scheme in ("http", "https")
|
153 |
+
|
154 |
+
|
155 |
+
def url_to_filename(resource: str, etag: str = None) -> str:
    """
    Convert a `resource` into a hashed filename in a repeatable way.
    If `etag` is specified, its hash is appended, delimited by a period.
    """
    parts = [sha256(resource.encode("utf-8")).hexdigest()]
    if etag:
        parts.append(sha256(etag.encode("utf-8")).hexdigest())
    return ".".join(parts)
|
171 |
+
|
172 |
+
|
173 |
+
def download_resource(
    url: str,
    temp_file: BinaryIO,
    headers=None,
):
    """
    Stream the remote file at `url` into the open binary file `temp_file`,
    showing a byte-level progress bar.

    Args:
        url (`str`): URL of the resource to download.
        temp_file (`BinaryIO`): Writable binary file object.
        headers: Optional HTTP headers (e.g. an Authorization bearer token).

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if headers is None:
        headers = {}

    r = requests.get(url, stream=True, headers=headers)
    r.raise_for_status()
    content_length = r.headers.get("Content-Length")
    total = int(content_length) if content_length is not None else None
    # Bug fix: this module does `import tqdm`, so the progress-bar class
    # must be referenced as `tqdm.tqdm` — calling the module itself raised
    # "TypeError: 'module' object is not callable".
    progress = tqdm.tqdm(
        unit="B",
        unit_scale=True,
        total=total,
        desc="Downloading",
        # NOTE(review): this disables the bar only when the logger level is
        # exactly NOTSET — confirm that is the intended condition.
        disable=logger.level in [logging.NOTSET],
    )
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()
|
201 |
+
|
202 |
+
|
203 |
+
def download_and_cache(
    url: Union[str, Path],
    cache_dir: Union[str, Path] = None,
    force_download: bool = False,
):
    """
    Download the resource at `url` into `cache_dir` and return the cached
    file path, reusing any previous download unless `force_download` is set.

    The cache key is a hash of the URL plus (when available) the server
    ETag, so a changed remote file gets a fresh cache entry.

    Args:
        url (`Union[str, Path]`): Remote resource to fetch.
        cache_dir (`Union[str, Path]`, optional): Cache directory; defaults
            to ``SAPIENZANLP_CACHE_DIR``.
        force_download (`bool`): Re-download even when a cached copy exists.

    Returns:
        The path of the cached file (a `Path` on a cache hit, a `str` after
        a fresh download — NOTE(review): consider unifying the return type).

    Raises:
        OSError: If the remote resource exposes no ETag.
        ValueError: If the resource is private and no HF token is available.
    """
    if cache_dir is None:
        cache_dir = SAPIENZANLP_CACHE_DIR
    if isinstance(url, Path):
        url = str(url)
    # Bug fix: `cache_dir` may arrive as a plain string (e.g. the
    # SAPIENZANLP_CACHE_DIR env-var default); `str / str` is invalid, so
    # normalize to Path once, up front, before any `/` joins below.
    cache_dir = Path(cache_dir)

    # check if cache dir exists
    cache_dir.mkdir(parents=True, exist_ok=True)

    # check if file is private: a 401 on a bare HEAD means we must retry
    # with an Authorization header built from the local HF token.
    headers = {}
    try:
        r = requests.head(url, allow_redirects=False, timeout=10)
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        if r.status_code == 401:
            hf_token = huggingface_hub.HfFolder.get_token()
            if hf_token is None:
                raise ValueError(
                    "You need to login to HuggingFace to download this model "
                    "(use the `huggingface-cli login` command)"
                )
            headers["Authorization"] = f"Bearer {hf_token}"

    etag = None
    try:
        r = requests.head(url, allow_redirects=True, timeout=10, headers=headers)
        r.raise_for_status()
        etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag")
        # We favor a custom header indicating the etag of the linked
        # resource, and we fallback to the regular etag header.
        # If we don't have any of those, raise an error.
        if etag is None:
            raise OSError(
                "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
            )
        # In case of a redirect, save an extra redirect on the request.get
        # call, and ensure we download the exact atomic version even if it
        # changed between the HEAD and the GET (unlikely, but hey).
        if 300 <= r.status_code <= 399:
            url = r.headers["Location"]
    except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
        # Actually raise for those subclasses of ConnectionError
        raise
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # Otherwise, our Internet connection is down; keep etag = None and
        # fall back to any cached copy below.
        pass

    # get filename from the url
    filename = url_to_filename(url, etag)
    # get cache path to put the file
    cache_path = cache_dir / filename

    # the file is already here, return it
    if file_exists(cache_path) and not force_download:
        logger.info(
            f"{url} found in cache, set `force_download=True` to force the download"
        )
        return cache_path

    cache_path = str(cache_path)
    # Prevent parallel downloads of the same file with a lock.
    lock_path = cache_path + ".lock"
    with FileLock(lock_path):
        # If the download just completed while the lock was activated.
        if file_exists(cache_path) and not force_download:
            # Even if returning early like here, the lock will be released.
            return cache_path

        temp_file_manager = partial(
            tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
        )

        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise, you get corrupt cache entries if the download gets interrupted.
        with temp_file_manager() as temp_file:
            logger.info(
                f"{url} not found in cache or `force_download` set to `True`, downloading to {temp_file.name}"
            )
            download_resource(url, temp_file, headers)

        logger.info(f"storing {url} in cache at {cache_path}")
        os.replace(temp_file.name, cache_path)

        # NamedTemporaryFile creates a file with hardwired 0600 perms
        # (ignoring umask), so re-apply the process umask to the cache file.
        umask = os.umask(0o666)
        os.umask(umask)
        os.chmod(cache_path, 0o666 & ~umask)

        logger.info(f"creating metadata file for {cache_path}")
        meta = {"url": url}  # , "etag": etag}
        meta_path = cache_path + ".json"
        with open(meta_path, "w") as meta_file:
            json.dump(meta, meta_file)

    return cache_path
|
305 |
+
|
306 |
+
|
307 |
+
def download_from_hf(
    path_or_repo_id: Union[str, Path],
    filenames: List[str],
    cache_dir: Union[str, Path] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    repo_type: str = "model",
):
    """
    Download ``filenames`` from a Hugging Face Hub repository and return the
    local folder they were cached in.

    Args:
        path_or_repo_id (:obj:`str` or :obj:`Path`):
            Repository id (or local path) to download the files from.
        filenames (:obj:`List[str]`):
            Names of the files to fetch. Must be non-empty.
        cache_dir (:obj:`str` or :obj:`Path`, `optional`):
            Directory in which the downloaded files are cached.
        force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to re-download the files even if they are already cached.
        resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether to resume a partially downloaded file.
        proxies (:obj:`Dict[str, str]`, `optional`):
            Proxy servers to use, passed through to the download helper.
        use_auth_token (:obj:`Union[bool, str]`, `optional`):
            Bearer token (or :obj:`True` to read the stored token) for private repos.
        revision (:obj:`str`, `optional`):
            Branch name, tag, or commit id to download from.
        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            If :obj:`True`, only look at local cached files.
        subfolder (:obj:`str`, `optional`):
            Subfolder of the repository where the files live.
        repo_type (:obj:`str`, `optional`, defaults to ``"model"``):
            Accepted for API compatibility; currently not forwarded to the
            download helper.

    Returns:
        :obj:`Path`: The parent folder of the first downloaded file — the best
        guess for the directory all files were downloaded to.

    Raises:
        ValueError: If ``filenames`` is empty (the return value indexes the
            first downloaded file, so an empty list cannot be satisfied).
    """
    if not filenames:
        # Guard: without this, `downloaded_paths[0]` below would raise an
        # opaque IndexError instead of a clear error message.
        raise ValueError("`filenames` must contain at least one file to download.")

    if isinstance(path_or_repo_id, Path):
        path_or_repo_id = str(path_or_repo_id)

    downloaded_paths = [
        hf_cached_file(
            path_or_repo_id,
            filename,
            cache_dir=cache_dir,
            force_download=force_download,
            proxies=proxies,
            resume_download=resume_download,
            use_auth_token=use_auth_token,
            revision=revision,
            local_files_only=local_files_only,
            subfolder=subfolder,
        )
        for filename in filenames
    ]

    # we want the folder where the files are downloaded;
    # the best guess is the parent folder of the first file
    return Path(downloaded_paths[0]).parent
|
343 |
+
|
344 |
+
|
345 |
+
def model_name_or_path_resolver(model_name_or_dir: Union[str, os.PathLike]) -> str:
    """
    Resolve a model name or directory to a model archive name or directory.

    Args:
        model_name_or_dir (:obj:`str` or :obj:`os.PathLike`):
            A model name or directory.

    Returns:
        :obj:`str`: The model archive name or directory.
    """
    # A URL is returned untouched: the caller downloads it later.
    if is_remote_url(model_name_or_dir):
        return model_name_or_dir

    # A local directory or archive file is also used as-is.
    local_candidate = Path(model_name_or_dir)
    if local_candidate.is_dir() or local_candidate.is_file():
        return model_name_or_dir

    # Otherwise assume it is a sapienzanlp model id and map it to its URL.
    return sapienzanlp_model_urls(model_name_or_dir)
|
372 |
+
|
373 |
+
|
374 |
+
def from_cache(
    url_or_filename: Union[str, Path],
    cache_dir: Union[str, Path] = None,
    force_download: bool = False,
    resume_download: bool = False,
    proxies: Optional[Dict[str, str]] = None,
    use_auth_token: Optional[Union[bool, str]] = None,
    revision: Optional[str] = None,
    local_files_only: bool = False,
    subfolder: str = "",
    filenames: Optional[List[str]] = None,
) -> Path:
    """
    Given something that could be either a local path or a URL (or a SapienzaNLP model id),
    determine which one and return a path to the corresponding file.

    Args:
        url_or_filename (:obj:`str` or :obj:`Path`):
            A path to a local file or a URL (or a SapienzaNLP model id).
        cache_dir (:obj:`str` or :obj:`Path`, `optional`):
            Path to a directory in which a downloaded file will be cached.
        force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to re-download the file even if it already exists.
        resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to delete incompletely received files. Attempts to resume the download if such a file
            exists.
        proxies (:obj:`Dict[str, str]`, `optional`):
            A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
        use_auth_token (:obj:`Union[bool, str]`, `optional`):
            Optional string or boolean to use as Bearer token for remote files. If :obj:`True`, will get token from
            :obj:`~transformers.hf_api.HfApi`. If :obj:`str`, will use that string as token.
        revision (:obj:`str`, `optional`):
            The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
            identifier allowed by git.
        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to raise an error if the file to be downloaded is local.
        subfolder (:obj:`str`, `optional`):
            In case the relevant file is in a subfolder of the URL, specify it here.
        filenames (:obj:`List[str]`, `optional`):
            List of filenames to look for in the directory structure.

    Returns:
        :obj:`Path`: Path to the cached file.
    """

    # Map model ids / URLs / local paths to a concrete archive name or path.
    url_or_filename = model_name_or_path_resolver(url_or_filename)

    if cache_dir is None:
        cache_dir = SAPIENZANLP_CACHE_DIR

    if file_exists(url_or_filename):
        # Local path: use it directly.
        logger.info(f"{url_or_filename} is a local path or file")
        output_path = url_or_filename
    elif is_remote_url(url_or_filename):
        # URL, so get it from the cache (downloading if necessary)
        output_path = download_and_cache(
            url_or_filename,
            cache_dir=cache_dir,
            force_download=force_download,
        )
    else:
        # Fall back to the Hugging Face Hub with a default set of model files.
        if filenames is None:
            filenames = [WEIGHTS_NAME, CONFIG_NAME, LABELS_NAME]
        output_path = download_from_hf(
            url_or_filename,
            filenames,
            cache_dir,
            force_download,
            resume_download,
            proxies,
            use_auth_token,
            revision,
            local_files_only,
            subfolder,
        )

    # A directory, or a file that is neither a zip nor a tar archive,
    # needs no extraction step.
    if dir_exists(output_path) or (
        not is_zipfile(output_path) and not tarfile.is_tarfile(output_path)
    ):
        return Path(output_path)

    # Path where we extract compressed archives
    # for now it will extract it in the same folder
    # maybe implement extraction in the sapienzanlp folder
    # when using local archive path?
    logger.info("Extracting compressed archive")
    output_dir, output_file = os.path.split(output_path)
    output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
    output_path_extracted = os.path.join(output_dir, output_extract_dir_name)

    # already extracted, do not extract
    if (
        os.path.isdir(output_path_extracted)
        and os.listdir(output_path_extracted)
        and not force_download
    ):
        return Path(output_path_extracted)

    # Prevent parallel extractions
    lock_path = output_path + ".lock"
    with FileLock(lock_path):
        # Start from a clean directory so partial extractions never survive.
        shutil.rmtree(output_path_extracted, ignore_errors=True)
        os.makedirs(output_path_extracted)
        if is_zipfile(output_path):
            with ZipFile(output_path, "r") as zip_file:
                zip_file.extractall(output_path_extracted)
                zip_file.close()
        elif tarfile.is_tarfile(output_path):
            # NOTE(review): `extractall` on an untrusted tar archive can write
            # outside the target directory (path traversal) — consider the
            # `filter="data"` argument on Python 3.12+; verify threat model.
            tar_file = tarfile.open(output_path)
            tar_file.extractall(output_path_extracted)
            tar_file.close()
        else:
            raise EnvironmentError(
                f"Archive format of {output_path} could not be identified"
            )

    # remove lock file, is it safe?
    os.remove(lock_path)

    return Path(output_path_extracted)
|
519 |
+
|
520 |
+
|
521 |
+
def is_str_a_path(maybe_path: str) -> bool:
    """
    Check if a string is a path.

    Args:
        maybe_path (`str`): The string to check.

    Returns:
        `bool`: `True` if the string is a path, `False` otherwise.
    """
    # A string counts as a path if it exists either as given
    # or when resolved relative to the current working directory.
    candidates = (maybe_path, os.path.join(os.getcwd(), maybe_path))
    return any(Path(candidate).exists() for candidate in candidates)
|
539 |
+
|
540 |
+
|
541 |
+
def relative_to_absolute_path(path: str) -> os.PathLike:
    """
    Convert a relative path to an absolute path.

    Args:
        path (`str`): The relative path to convert.

    Returns:
        `os.PathLike`: The absolute path.
    """
    # Reject anything that does not resolve to an existing location.
    if not is_str_a_path(path):
        raise ValueError(f"{path} is not a path")
    direct = Path(path)
    if direct.exists():
        return direct.absolute()
    cwd_relative = Path(os.path.join(os.getcwd(), path))
    if cwd_relative.exists():
        return cwd_relative.absolute()
    # Unreachable if is_str_a_path and the checks above agree; kept as a
    # defensive fallback for race conditions between the two checks.
    raise ValueError(f"{path} is not a path")
|
558 |
+
|
559 |
+
|
560 |
+
def to_config(object_to_save: Any) -> Dict[str, Any]:
    """
    Convert an object to a dictionary.

    Returns:
        `Dict[str, Any]`: The dictionary representation of the object.
    """

    def _serialize(item):
        # Mappings are converted key by key.
        if isinstance(item, dict):
            return {key: _serialize(value) for key, value in item.items()}

        # Sequences become plain lists (tuples are flattened to lists too).
        if isinstance(item, (list, tuple)):
            return [_serialize(element) for element in item]

        # Any object carrying a __dict__ is turned into a hydra-style config:
        # a `_target_` pointing at its class plus its public attributes.
        if hasattr(item, "__dict__"):
            config = {
                "_target_": f"{item.__class__.__module__}.{item.__class__.__name__}",
            }
            for key, value in item.__dict__.items():
                if not key.startswith("_"):
                    config[key] = _serialize(value)
            return config

        # Primitives (int, str, None, ...) pass through unchanged.
        return item

    return _serialize(object_to_save)
|
592 |
+
|
593 |
+
|
594 |
+
def get_callable_from_string(callable_fn: str) -> Any:
    """
    Get a callable from a string.

    Args:
        callable_fn (`str`):
            The string representation of the callable.

    Returns:
        `Any`: The callable.
    """
    # Everything before the last dot names the module,
    # the remainder names the attribute to fetch from it.
    module_name, function_name = callable_fn.rsplit(".", 1)
    return getattr(importlib.import_module(module_name), function_name)
|
relik/inference/__init__.py
ADDED
File without changes
|
relik/inference/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (183 Bytes). View file
|
|
relik/inference/__pycache__/annotator.cpython-310.pyc
ADDED
Binary file (22.7 kB). View file
|
|
relik/inference/annotator.py
ADDED
@@ -0,0 +1,840 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
from pathlib import Path
|
5 |
+
from typing import Any, Dict, List, Optional, Union
|
6 |
+
|
7 |
+
import hydra
|
8 |
+
import torch
|
9 |
+
from omegaconf import DictConfig, OmegaConf
|
10 |
+
from pprintpp import pformat
|
11 |
+
|
12 |
+
from relik.inference.data.splitters.blank_sentence_splitter import BlankSentenceSplitter
|
13 |
+
from relik.common.log import get_logger
|
14 |
+
from relik.common.upload import get_logged_in_username, upload
|
15 |
+
from relik.common.utils import CONFIG_NAME, from_cache
|
16 |
+
from relik.inference.data.objects import (
|
17 |
+
AnnotationType,
|
18 |
+
RelikOutput,
|
19 |
+
Span,
|
20 |
+
TaskType,
|
21 |
+
Triples,
|
22 |
+
)
|
23 |
+
from relik.inference.data.splitters.base_sentence_splitter import BaseSentenceSplitter
|
24 |
+
from relik.inference.data.splitters.spacy_sentence_splitter import SpacySentenceSplitter
|
25 |
+
from relik.inference.data.splitters.window_based_splitter import WindowSentenceSplitter
|
26 |
+
from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
|
27 |
+
from relik.inference.data.window.manager import WindowManager
|
28 |
+
from relik.reader.data.relik_reader_sample import RelikReaderSample
|
29 |
+
from relik.reader.pytorch_modules.base import RelikReaderBase
|
30 |
+
from relik.reader.pytorch_modules.span import RelikReaderForSpanExtraction
|
31 |
+
from relik.reader.pytorch_modules.triplet import RelikReaderForTripletExtraction
|
32 |
+
from relik.retriever.indexers.base import BaseDocumentIndex
|
33 |
+
from relik.retriever.indexers.document import Document
|
34 |
+
from relik.retriever.pytorch_modules import PRECISION_MAP
|
35 |
+
from relik.retriever.pytorch_modules.model import GoldenRetriever
|
36 |
+
|
37 |
+
# set tokenizers parallelism to False

# Disable HF tokenizers' parallelism unless the user explicitly set it:
# avoids the fork-after-parallelism deadlock warning.
os.environ["TOKENIZERS_PARALLELISM"] = os.getenv("TOKENIZERS_PARALLELISM", "false")

# Opt-in flag: when true, annotation queries are also logged to a file.
LOG_QUERY = os.getenv("RELIK_LOG_QUERY_ON_FILE", "false").lower() == "true"

logger = get_logger(__name__, level=logging.INFO)
# File logger is only created when query logging is enabled.
file_logger = None
if LOG_QUERY:
    # Log file lives at the repository root (three levels above this module).
    RELIK_LOG_PATH = Path(__file__).parent.parent.parent / "relik.log"
    # create file handler which logs even debug messages
    fh = logging.FileHandler(RELIK_LOG_PATH)
    fh.setLevel(logging.INFO)
    file_logger = get_logger("relik", level=logging.INFO)
    file_logger.addHandler(fh)
|
52 |
+
|
53 |
+
|
54 |
+
class Relik:
|
55 |
+
"""
|
56 |
+
Relik main class. It is a wrapper around a retriever and a reader.
|
57 |
+
|
58 |
+
Args:
|
59 |
+
retriever (:obj:`GoldenRetriever`):
|
60 |
+
The retriever to use.
|
61 |
+
reader (:obj:`RelikReaderBase`):
|
62 |
+
The reader to use.
|
63 |
+
document_index (:obj:`BaseDocumentIndex`, `optional`):
|
64 |
+
The document index to use. If `None`, the retriever's document index will be used.
|
65 |
+
device (`str`, `optional`, defaults to `cpu`):
|
66 |
+
The device to use for both the retriever and the reader.
|
67 |
+
retriever_device (`str`, `optional`, defaults to `None`):
|
68 |
+
The device to use for the retriever. If `None`, the `device` argument will be used.
|
69 |
+
document_index_device (`str`, `optional`, defaults to `None`):
|
70 |
+
The device to use for the document index. If `None`, the `device` argument will be used.
|
71 |
+
reader_device (`str`, `optional`, defaults to `None`):
|
72 |
+
The device to use for the reader. If `None`, the `device` argument will be used.
|
73 |
+
precision (`int`, `str` or `torch.dtype`, `optional`, defaults to `32`):
|
74 |
+
The precision to use for both the retriever and the reader.
|
75 |
+
retriever_precision (`int`, `str` or `torch.dtype`, `optional`, defaults to `None`):
|
76 |
+
The precision to use for the retriever. If `None`, the `precision` argument will be used.
|
77 |
+
document_index_precision (`int`, `str` or `torch.dtype`, `optional`, defaults to `None`):
|
78 |
+
The precision to use for the document index. If `None`, the `precision` argument will be used.
|
79 |
+
reader_precision (`int`, `str` or `torch.dtype`, `optional`, defaults to `None`):
|
80 |
+
The precision to use for the reader. If `None`, the `precision` argument will be used.
|
81 |
+
metadata_fields (`list[str]`, `optional`, defaults to `None`):
|
82 |
+
The fields to add to the candidates for the reader.
|
83 |
+
top_k (`int`, `optional`, defaults to `None`):
|
84 |
+
The number of candidates to retrieve for each window.
|
85 |
+
window_size (`int`, `optional`, defaults to `None`):
|
86 |
+
The size of the window. If `None`, the whole text will be annotated.
|
87 |
+
window_stride (`int`, `optional`, defaults to `None`):
|
88 |
+
The stride of the window. If `None`, there will be no overlap between windows.
|
89 |
+
**kwargs:
|
90 |
+
Additional keyword arguments to pass to the retriever and the reader.
|
91 |
+
"""
|
92 |
+
|
93 |
+
    def __init__(
        self,
        retriever: GoldenRetriever | DictConfig | Dict | None = None,
        reader: RelikReaderBase | DictConfig | None = None,
        device: str | None = None,
        retriever_device: str | None = None,
        document_index_device: str | None = None,
        reader_device: str | None = None,
        precision: int | str | torch.dtype | None = None,
        retriever_precision: int | str | torch.dtype | None = None,
        document_index_precision: int | str | torch.dtype | None = None,
        reader_precision: int | str | torch.dtype | None = None,
        task: TaskType | str = TaskType.SPAN,
        metadata_fields: list[str] | None = None,
        top_k: int | None = None,
        window_size: int | str | None = None,
        window_stride: int | None = None,
        retriever_kwargs: Dict[str, Any] | None = None,
        reader_kwargs: Dict[str, Any] | None = None,
        **kwargs,
    ) -> None:
        """Build a Relik pipeline from retriever/reader instances or configs.

        See the class docstring for the meaning of each argument.
        `retriever_kwargs`, `reader_kwargs` and `**kwargs` are accepted but
        not used in this constructor body.
        """
        # parse task into a TaskType
        if isinstance(task, str):
            try:
                task = TaskType(task.lower())
            except ValueError:
                raise ValueError(
                    f"Task `{task}` not recognized. "
                    f"Please choose one of {list(TaskType)}."
                )
        self.task = task

        # organize devices: the generic `device` only fills in the
        # component devices that were not set explicitly
        if device is not None:
            if retriever_device is None:
                retriever_device = device
            if document_index_device is None:
                document_index_device = device
            if reader_device is None:
                reader_device = device

        # organize precision: same fallback scheme as for devices
        if precision is not None:
            if retriever_precision is None:
                retriever_precision = precision
            if document_index_precision is None:
                document_index_precision = precision
            if reader_precision is None:
                reader_precision = precision

        # retriever: one (optional) retriever per task type
        self.retriever: Dict[TaskType, GoldenRetriever] = {
            TaskType.SPAN: None,
            TaskType.TRIPLET: None,
        }

        if retriever:
            # check retriever type, it can be a GoldenRetriever, a DictConfig or a Dict
            if not isinstance(retriever, (GoldenRetriever, DictConfig, Dict)):
                raise ValueError(
                    f"`retriever` must be a `GoldenRetriever`, a `DictConfig` or "
                    f"a `Dict`, got `{type(retriever)}`."
                )

            # we need to check weather the DictConfig is a DictConfig for an instance of GoldenRetriever
            # or a primitive Dict
            if isinstance(retriever, DictConfig):
                # then it is probably a primitive Dict
                # (no `_target_` key means it maps task names to configs)
                if "_target_" not in retriever:
                    retriever = OmegaConf.to_container(retriever, resolve=True)
                    # convert the key to TaskType
                    try:
                        retriever = {
                            TaskType(k.lower()): v for k, v in retriever.items()
                        }
                    except ValueError as e:
                        raise ValueError(
                            f"Please choose a valid task type (one of {list(TaskType)}) for each retriever."
                        ) from e

            if isinstance(retriever, Dict):
                # convert the key to TaskType
                # (TaskType(k) is a no-op when k is already a TaskType)
                retriever = {TaskType(k): v for k, v in retriever.items()}
            else:
                # single retriever: assign it to the configured task
                # NOTE(review): with task == BOTH this produces only one entry,
                # so the per-task lookups below would KeyError — confirm callers
                # always pass a dict when task is BOTH.
                retriever = {task: retriever}

            # instantiate each retriever
            if self.task in [TaskType.SPAN, TaskType.BOTH]:
                self.retriever[TaskType.SPAN] = self._instantiate_retriever(
                    retriever[TaskType.SPAN],
                    retriever_device,
                    retriever_precision,
                    None,
                    document_index_device,
                    document_index_precision,
                )
            if self.task in [TaskType.TRIPLET, TaskType.BOTH]:
                self.retriever[TaskType.TRIPLET] = self._instantiate_retriever(
                    retriever[TaskType.TRIPLET],
                    retriever_device,
                    retriever_precision,
                    None,
                    document_index_device,
                    document_index_precision,
                )

            # clean up None retrievers from the dictionary
            self.retriever = {
                task_type: r for task_type, r in self.retriever.items() if r is not None
            }
            # torch compile
            # self.retriever = {task_type: torch.compile(r, backend="onnxrt") for task_type, r in self.retriever.items()}

        # reader: instantiate from config if needed, then freeze for inference
        self.reader: RelikReaderBase | None = None
        if reader:
            reader = (
                hydra.utils.instantiate(
                    reader,
                    device=reader_device,
                    precision=reader_precision,
                )
                if isinstance(reader, DictConfig)
                else reader
            )
            reader.training = False
            reader.eval()
            if reader_device is not None:
                logger.info(f"Moving reader to `{reader_device}`.")
                reader.to(reader_device)
            # only cast when the requested precision differs from the current one
            if reader_precision is not None and reader.precision != PRECISION_MAP[reader_precision]:
                logger.info(
                    f"Setting precision of reader to `{PRECISION_MAP[reader_precision]}`."
                )
                reader.to(PRECISION_MAP[reader_precision])
            self.reader = reader
            # self.reader = torch.compile(self.reader, backend="tvm")

        # windowization stuff: splitter/manager are created lazily elsewhere
        self.tokenizer = SpacyTokenizer(language="en")  # TODO: parametrize?
        self.sentence_splitter: BaseSentenceSplitter | None = None
        self.window_manager: WindowManager | None = None

        if metadata_fields is None:
            metadata_fields = []
        self.metadata_fields = metadata_fields

        # inference params (defaults for __call__ when not overridden)
        self.top_k = top_k
        self.window_size = window_size
        self.window_stride = window_stride
|
244 |
+
|
245 |
+
    @staticmethod
    def _instantiate_retriever(
        retriever,
        retriever_device,
        retriever_precision,
        document_index,
        document_index_device,
        document_index_precision,
    ):
        """Instantiate (if given as a config) and prepare a retriever for inference.

        Optionally replaces its document index, then moves retriever and index
        to the requested devices/precisions. Returns the ready retriever.
        """
        if not isinstance(retriever, GoldenRetriever):
            # convert to DictConfig and let hydra build the GoldenRetriever
            retriever = hydra.utils.instantiate(
                OmegaConf.create(retriever),
                device=retriever_device,
                precision=retriever_precision,
                index_device=document_index_device,
                index_precision=document_index_precision,
            )
        # inference mode: disable training flag and dropout/batch-norm updates
        retriever.training = False
        retriever.eval()
        if document_index is not None:
            if retriever.document_index is not None:
                logger.info(
                    "The Retriever already has a document index, replacing it with the provided one."
                    "If you want to keep using the old one, please do not provide a document index."
                )
            retriever.document_index = document_index
        # we override the device and the precision of the document index if provided
        # NOTE(review): assumes retriever.document_index is not None here when a
        # device/precision override is requested — verify against callers.
        if document_index_device is not None:
            logger.info(f"Moving document index to `{document_index_device}`.")
            retriever.document_index.to(document_index_device)
        if document_index_precision is not None:
            logger.info(
                f"Setting precision of document index to `{PRECISION_MAP[document_index_precision]}`."
            )
            retriever.document_index.to(PRECISION_MAP[document_index_precision])
        # retriever.document_index = document_index
        # now we can move the retriever to the right device and set the precision
        if retriever_device is not None:
            logger.info(f"Moving retriever to `{retriever_device}`.")
            retriever.to(retriever_device)
        if retriever_precision is not None:
            logger.info(
                f"Setting precision of retriever to `{PRECISION_MAP[retriever_precision]}`."
            )
            retriever.to(PRECISION_MAP[retriever_precision])
        return retriever
|
292 |
+
|
293 |
+
    def __call__(
        self,
        text: str | List[str] | None = None,
        windows: List[RelikReaderSample] | None = None,
        candidates: List[str]
        | List[Document]
        | Dict[TaskType, List[Document]]
        | None = None,
        mentions: List[List[int]] | List[List[List[int]]] | None = None,
        top_k: int | None = None,
        window_size: int | None = None,
        window_stride: int | None = None,
        is_split_into_words: bool = False,
        retriever_batch_size: int | None = 32,
        reader_batch_size: int | None = 32,
        return_also_windows: bool = False,
        annotation_type: str | AnnotationType = AnnotationType.CHAR,
        progress_bar: bool = False,
        **kwargs,
    ) -> Union[RelikOutput, list[RelikOutput]]:
        """
        Annotate a text with entities.

        Args:
            text (`str` or `list`):
                The text to annotate. If a list is provided, each element of the list
                will be annotated separately.
            windows (`list[RelikReaderSample]`, `optional`, defaults to `None`):
                Pre-built windows to annotate instead of `text`. At least one of
                `text` and `windows` must be provided.
            candidates (`list[str]`, `list[Document]`, `optional`, defaults to `None`):
                The candidates to use for the reader. If `None`, the candidates will be
                retrieved from the retriever.
            mentions (`list[list[int]]` or `list[list[list[int]]]`, `optional`, defaults to `None`):
                The mentions to use for the reader. If `None`, the mentions will be
                predicted by the reader.
            top_k (`int`, `optional`, defaults to `None`):
                The number of candidates to retrieve for each window.
                Falls back to `self.top_k`, then to 100.
            window_size (`int`, `optional`, defaults to `None`):
                The size of the window. If `None`, the whole text will be annotated.
            window_stride (`int`, `optional`, defaults to `None`):
                The stride of the window. If `None`, there will be no overlap between windows.
            is_split_into_words (`bool`, `optional`, defaults to `False`):
                Whether the input text is already split into words.
            retriever_batch_size (`int`, `optional`, defaults to `None`):
                The batch size to use for the retriever. The whole input is the batch for the retriever.
            reader_batch_size (`int`, `optional`, defaults to `None`):
                The batch size to use for the reader. The whole input is the batch for the reader.
            return_also_windows (`bool`, `optional`, defaults to `False`):
                Whether to return the windows in the output.
            annotation_type (`str` or `AnnotationType`, `optional`, defaults to `char`):
                The type of annotation to return. If `char`, the spans will be in terms of
                character offsets. If `word`, the spans will be in terms of word offsets.
            progress_bar (`bool`, `optional`, defaults to `False`):
                Whether to show progress bars during retrieval and reading.
            **kwargs:
                Additional keyword arguments to pass to the retriever and the reader.

        Returns:
            `RelikOutput` or `list[RelikOutput]`:
                The annotated text. If a list was provided as input, a list of
                `RelikOutput` objects will be returned.
        """

        if text is None and windows is None:
            raise ValueError(
                "Either `text` or `windows` must be provided. Both are `None`."
            )

        # normalize the annotation type to the enum
        if isinstance(annotation_type, str):
            try:
                annotation_type = AnnotationType(annotation_type)
            except ValueError:
                raise ValueError(
                    f"Annotation type {annotation_type} not recognized. "
                    f"Please choose one of {list(AnnotationType)}."
                )

        # fall back to the instance-level defaults when not provided per call
        if top_k is None:
            top_k = self.top_k or 100
        if window_size is None:
            window_size = self.window_size
        if window_stride is None:
            window_stride = self.window_stride

        if text:
            # normalize a single text (and its mentions) to a batch of one
            if isinstance(text, str):
                text = [text]
                if mentions is not None:
                    mentions = [mentions]
            if file_logger is not None:
                file_logger.info("Annotating the following text:")
                for t in text:
                    file_logger.info(f"  {t}")

        # lazily build the window manager with a splitter matching `window_size`
        if self.window_manager is None:
            if window_size == "none":
                self.sentence_splitter = BlankSentenceSplitter()
            elif window_size == "sentence":
                self.sentence_splitter = SpacySentenceSplitter()
            else:
                self.sentence_splitter = WindowSentenceSplitter(
                    window_size=window_size, window_stride=window_stride
                )
            self.window_manager = WindowManager(
                self.tokenizer, self.sentence_splitter
            )

        # sanity check: a numeric window must not be smaller than its stride
        if (
            window_size not in ["sentence", "none"]
            and window_stride is not None
            and window_size < window_stride
        ):
            raise ValueError(
                f"Window size ({window_size}) must be greater than window stride ({window_stride})"
            )

        if windows is None:
            # no windows provided: split the input text into windows
            windows, blank_windows = self.window_manager.create_windows(
                text,
                window_size,
                window_stride,
                is_split_into_words=is_split_into_words,
                mentions=mentions
            )
        else:
            # windows were provided: recover the per-document text from them
            blank_windows = []
            text = {w.doc_id: w.text for w in windows}

        if candidates is not None and any(
            r is not None for r in self.retriever.values()
        ):
            logger.info(
                "Both candidates and a retriever were provided. "
                "Retriever will be ignored."
            )

        windows_candidates = {TaskType.SPAN: None, TaskType.TRIPLET: None}
        if candidates is not None:
            # again, check if candidates is a dict; if not, key it by the current task
            if isinstance(candidates, Dict):
                if self.task not in candidates:
                    raise ValueError(
                        f"Task `{self.task}` not found in `candidates`."
                        f"Please choose one of {list(TaskType)}."
                    )
            else:
                candidates = {self.task: candidates}

            for task_type, _candidates in candidates.items():
                if isinstance(_candidates, list):
                    # wrap plain strings into Document objects, one list per window
                    _candidates = [
                        [
                            c if isinstance(c, Document) else Document(c)
                            for c in _candidates[w.doc_id]
                        ]
                        for w in windows
                    ]
                windows_candidates[task_type] = _candidates

        else:
            # retrieve candidates first
            if self.retriever is None:
                raise ValueError(
                    "No retriever was provided, please provide a retriever or candidates."
                )
            start_retr = time.time()
            for task_type, retriever in self.retriever.items():
                retriever_out = retriever.retrieve(
                    [w.text for w in windows],
                    text_pair=[w.doc_topic.text if w.doc_topic is not None else None for w in windows],
                    k=top_k,
                    batch_size=retriever_batch_size,
                    progress_bar=progress_bar,
                    **kwargs,
                )
                windows_candidates[task_type] = [
                    [p.document for p in predictions] for predictions in retriever_out
                ]
            end_retr = time.time()
            logger.info(f"Retrieval took {end_retr - start_retr} seconds.")

        # clean up None's
        windows_candidates = {
            t: c for t, c in windows_candidates.items() if c is not None
        }

        # add passage to the windows
        # NOTE(review): the inner loop variable shadows the `candidates` parameter,
        # which is no longer needed at this point but makes the code harder to follow
        for task_type, task_candidates in windows_candidates.items():
            for window, candidates in zip(windows, task_candidates):
                # construct the candidates for the reader
                formatted_candidates = []
                for candidate in candidates:
                    # candidate text plus the configured metadata fields, concatenated
                    window_candidate_text = candidate.text
                    for field in self.metadata_fields:
                        window_candidate_text += f"{candidate.metadata.get(field, '')}"
                    formatted_candidates.append(window_candidate_text)
                # create a member for the windows that is named like the task
                setattr(window, f"{task_type.value}_candidates", formatted_candidates)

        # blank windows carry empty candidates and empty predictions
        for task_type, task_candidates in windows_candidates.items():
            for window in blank_windows:
                setattr(window, f"{task_type.value}_candidates", [])
                setattr(window, "predicted_spans", [])
                setattr(window, "predicted_triples", [])
        if self.reader is not None:
            start_read = time.time()
            windows = self.reader.read(
                samples=windows,
                max_batch_size=reader_batch_size,
                annotation_type=annotation_type,
                progress_bar=progress_bar,
                **kwargs,
            )
            end_read = time.time()
            logger.info(f"Reading took {end_read - start_read} seconds.")
            # TODO: check merging behavior without a reader
            # do we want to merge windows if there is no reader?

            if self.window_size is not None and self.window_size not in ["sentence", "none"]:
                start_w = time.time()
                windows = windows + blank_windows
                windows.sort(key=lambda x: (x.doc_id, x.offset))
                merged_windows = self.window_manager.merge_windows(windows)
                end_w = time.time()
                logger.info(f"Merging took {end_w - start_w} seconds.")
            else:
                merged_windows = windows
        else:
            windows = windows + blank_windows
            windows.sort(key=lambda x: (x.doc_id, x.offset))
            merged_windows = windows

        # transform predictions into RelikOutput objects
        output = []
        for w in merged_windows:
            span_labels = []
            triples_labels = []
            # span extraction should always be present
            if getattr(w, "predicted_spans", None) is not None:
                span_labels = sorted(
                    [
                        Span(start=ss, end=se, label=sl, text=text[w.doc_id][ss:se])
                        if annotation_type == AnnotationType.CHAR
                        else Span(start=ss, end=se, label=sl, text=w.words[ss:se])
                        for ss, se, sl in w.predicted_spans
                    ],
                    key=lambda x: x.start,
                )
            # triple extraction is optional, if here add it
            # (subject/object are indices into the sorted `span_labels` list)
            if getattr(w, "predicted_triples", None) is not None:
                triples_labels = [
                    Triples(
                        subject=span_labels[subj],
                        label=label,
                        object=span_labels[obj],
                        confidence=conf,
                    )
                    for subj, label, obj, conf in w.predicted_triples
                ]
            # create the output
            # NOTE(review): candidates are looked up back in the index by their
            # formatted text — presumably texts are unique per index; confirm
            sample_output = RelikOutput(
                text=text[w.doc_id],
                tokens=w.words,
                spans=span_labels,
                triples=triples_labels,
                candidates={
                    task_type: [
                        r.document_index.documents.get_document_from_text(c)
                        for c in getattr(w, f"{task_type.value}_candidates", [])
                        if r.document_index.documents.get_document_from_text(c) is not None
                    ]
                    for task_type, r in self.retriever.items()
                },
            )
            output.append(sample_output)

        # add windows to the output if requested
        # do we want to force windows to be returned if there is no reader?
        if return_also_windows:
            for i, sample_output in enumerate(output):
                sample_output.windows = [w for w in windows if w.doc_id == i]

        # if only one text was provided, return a single RelikOutput object
        if len(output) == 1:
            return output[0]

        return output
|
575 |
+
|
576 |
+
@classmethod
|
577 |
+
def from_pretrained(
|
578 |
+
cls,
|
579 |
+
model_name_or_dir: Union[str, os.PathLike],
|
580 |
+
config_file_name: str = CONFIG_NAME,
|
581 |
+
*args,
|
582 |
+
**kwargs,
|
583 |
+
) -> "Relik":
|
584 |
+
"""
|
585 |
+
Instantiate a `Relik` from a pretrained model.
|
586 |
+
|
587 |
+
Args:
|
588 |
+
model_name_or_dir (`str` or `os.PathLike`):
|
589 |
+
The name or path of the model to load.
|
590 |
+
config_file_name (`str`, `optional`, defaults to `config.yaml`):
|
591 |
+
The name of the configuration file to load.
|
592 |
+
*args:
|
593 |
+
Additional positional arguments to pass to `OmegaConf.merge`.
|
594 |
+
**kwargs:
|
595 |
+
Additional keyword arguments to pass to `OmegaConf.merge`.
|
596 |
+
|
597 |
+
Returns:
|
598 |
+
`Relik`:
|
599 |
+
The instantiated `Relik`.
|
600 |
+
|
601 |
+
"""
|
602 |
+
cache_dir = kwargs.pop("cache_dir", None)
|
603 |
+
force_download = kwargs.pop("force_download", False)
|
604 |
+
|
605 |
+
model_dir = from_cache(
|
606 |
+
model_name_or_dir,
|
607 |
+
filenames=[config_file_name],
|
608 |
+
cache_dir=cache_dir,
|
609 |
+
force_download=force_download,
|
610 |
+
)
|
611 |
+
|
612 |
+
config_path = model_dir / config_file_name
|
613 |
+
if not config_path.exists():
|
614 |
+
raise FileNotFoundError(
|
615 |
+
f"Model configuration file not found at {config_path}."
|
616 |
+
)
|
617 |
+
|
618 |
+
# overwrite config with config_kwargs
|
619 |
+
config = OmegaConf.load(config_path)
|
620 |
+
# if kwargs is not None:
|
621 |
+
config = OmegaConf.merge(config, OmegaConf.create(kwargs))
|
622 |
+
# do we want to print the config? I like it
|
623 |
+
logger.info(f"Loading Relik from {model_name_or_dir}")
|
624 |
+
logger.info(pformat(OmegaConf.to_container(config)))
|
625 |
+
|
626 |
+
# load relik from config
|
627 |
+
relik = hydra.utils.instantiate(config, _recursive_=False, *args)
|
628 |
+
|
629 |
+
return relik
|
630 |
+
|
631 |
+
    def save_pretrained(
        self,
        output_dir: Union[str, os.PathLike],
        config: Optional[Dict[str, Any]] = None,
        config_file_name: Optional[str] = None,
        save_weights: bool = False,
        push_to_hub: bool = False,
        model_id: Optional[str] = None,
        organization: Optional[str] = None,
        repo_name: Optional[str] = None,
        retriever_model_id: Optional[str] = None,
        reader_model_id: Optional[str] = None,
        **kwargs,
    ):
        """
        Save the configuration of Relik to the specified directory as a YAML file.

        Args:
            output_dir (`str`):
                The directory to save the configuration file to.
            config (`Optional[Dict[str, Any]]`, `optional`):
                The configuration to save. If `None`, the current configuration will be
                saved. Defaults to `None`.
            config_file_name (`Optional[str]`, `optional`):
                The name of the configuration file. Defaults to `config.yaml`.
            save_weights (`bool`, `optional`):
                Whether to save the weights of the model. Defaults to `False`.
            push_to_hub (`bool`, `optional`):
                Whether to push the saved model to the hub. Defaults to `False`.
            model_id (`Optional[str]`, `optional`):
                The id of the model to push to the hub. If `None`, the name of the
                directory will be used. Defaults to `None`.
            organization (`Optional[str]`, `optional`):
                The organization to push the model to. Defaults to `None`.
            repo_name (`Optional[str]`, `optional`):
                The name of the repository to push the model to. Defaults to `None`.
            retriever_model_id (`Optional[str]`, `optional`):
                The id of the retriever model to push to the hub. If `None`, the name of the
                directory will be used. Defaults to `None`.
            reader_model_id (`Optional[str]`, `optional`):
                The id of the reader model to push to the hub. If `None`, the name of the
                directory will be used. Defaults to `None`.
            **kwargs:
                Additional keyword arguments to pass to `OmegaConf.save`.
        """
        # create the output directory
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        # per-task bookkeeping of where each retriever component is saved,
        # used below when assembling the default config
        retrievers_names: Dict[TaskType, Dict | None] = {
            TaskType.SPAN: {
                "question_encoder_name": None,
                "passage_encoder_name": None,
                "document_index_name": None,
            },
            TaskType.TRIPLET: {
                "question_encoder_name": None,
                "passage_encoder_name": None,
                "document_index_name": None,
            },
        }

        if save_weights:
            # save weights
            # retriever
            model_id = model_id or output_dir.name
            retriever_model_id = retriever_model_id or f"retriever-{model_id}"
            for task_type, retriever in self.retriever.items():
                if retriever is None:
                    continue
                # component names are derived from the (task-specific) retriever id
                task_retriever_model_id = f"{retriever_model_id}-{task_type.value}"
                question_encoder_name = f"{task_retriever_model_id}-question-encoder"
                passage_encoder_name = f"{task_retriever_model_id}-passage-encoder"
                document_index_name = f"{task_retriever_model_id}-index"
                logger.info(
                    f"Saving retriever to {output_dir / task_retriever_model_id}"
                )
                retriever.save_pretrained(
                    output_dir / task_retriever_model_id,
                    question_encoder_name=question_encoder_name,
                    passage_encoder_name=passage_encoder_name,
                    document_index_name=document_index_name,
                    push_to_hub=push_to_hub,
                    organization=organization,
                    **kwargs,
                )
                retrievers_names[task_type] = {
                    "reader_model_id": task_retriever_model_id,
                    "question_encoder_name": question_encoder_name,
                    "passage_encoder_name": passage_encoder_name,
                    "document_index_name": document_index_name,
                }

            # reader
            reader_model_id = reader_model_id or f"reader-{model_id}"
            logger.info(f"Saving reader to {output_dir / reader_model_id}")
            self.reader.save_pretrained(
                output_dir / reader_model_id,
                push_to_hub=push_to_hub,
                organization=organization,
                **kwargs,
            )

            if push_to_hub:
                user = organization or get_logged_in_username()
                # we need to update the config with the model ids that will
                # result from the push to hub
                for task_type, retriever_names in retrievers_names.items():
                    retriever_names[
                        "question_encoder_name"
                    ] = f"{user}/{retriever_names['question_encoder_name']}"
                    retriever_names[
                        "passage_encoder_name"
                    ] = f"{user}/{retriever_names['passage_encoder_name']}"
                    retriever_names[
                        "document_index_name"
                    ] = f"{user}/{retriever_names['document_index_name']}"
                    # question_encoder_name = f"{user}/{question_encoder_name}"
                    # passage_encoder_name = f"{user}/{passage_encoder_name}"
                    # document_index_name = f"{user}/{document_index_name}"
                reader_model_id = f"{user}/{reader_model_id}"
            else:
                # local save: point the config entries at the on-disk paths
                for task_type, retriever_names in retrievers_names.items():
                    retriever_names["question_encoder_name"] = (
                        output_dir / retriever_names["question_encoder_name"]
                    )
                    retriever_names["passage_encoder_name"] = (
                        output_dir / retriever_names["passage_encoder_name"]
                    )
                    retriever_names["document_index_name"] = (
                        output_dir / retriever_names["document_index_name"]
                    )
                reader_model_id = output_dir / reader_model_id
        else:
            # save config only: reference the components by their original names
            for task_type, retriever_names in retrievers_names.items():
                retriever = self.retriever.get(task_type, None)
                if retriever is None:
                    continue
                retriever_names[
                    "question_encoder_name"
                ] = retriever.question_encoder.name_or_path
                retriever_names[
                    "passage_encoder_name"
                ] = retriever.passage_encoder.name_or_path
                retriever_names[
                    "document_index_name"
                ] = retriever.document_index.name_or_path

            reader_model_id = self.reader.name_or_path

        if config is None:
            # create a default config
            config = {
                "_target_": f"{self.__class__.__module__}.{self.__class__.__name__}"
            }
            if self.retriever is not None:
                config["retriever"] = {}
                for task_type, retriever in self.retriever.items():
                    if retriever is None:
                        continue
                    config["retriever"][task_type.value] = {
                        "_target_": f"{retriever.__class__.__module__}.{retriever.__class__.__name__}",
                    }
                    if retriever.question_encoder is not None:
                        config["retriever"][task_type.value][
                            "question_encoder"
                        ] = retrievers_names[task_type]["question_encoder_name"]
                    # the passage encoder is only written out when it is a
                    # distinct model from the question encoder
                    if (
                        retriever.passage_encoder is not None
                        and not retriever.passage_encoder_is_question_encoder
                    ):
                        config["retriever"][task_type.value][
                            "passage_encoder"
                        ] = retrievers_names[task_type]["passage_encoder_name"]
                    if retriever.document_index is not None:
                        config["retriever"][task_type.value][
                            "document_index"
                        ] = retrievers_names[task_type]["document_index_name"]
            if self.reader is not None:
                config["reader"] = {
                    "_target_": f"{self.reader.__class__.__module__}.{self.reader.__class__.__name__}",
                    "transformer_model": reader_model_id,
                }

            # these are model-specific and should be saved
            config["task"] = self.task
            config["metadata_fields"] = self.metadata_fields
            config["top_k"] = self.top_k
            config["window_size"] = self.window_size
            config["window_stride"] = self.window_stride

        config_file_name = config_file_name or CONFIG_NAME

        logger.info(f"Saving relik config to {output_dir / config_file_name}")
        # pretty print the config
        logger.info(pformat(config))
        OmegaConf.save(config, output_dir / config_file_name)

        if push_to_hub:
            # push to hub
            logger.info("Pushing to hub")
            model_id = model_id or output_dir.name
            upload(
                output_dir,
                model_id,
                filenames=[config_file_name],
                organization=organization,
                repo_name=repo_name,
            )
|
relik/inference/data/__init__.py
ADDED
File without changes
|
relik/inference/data/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (188 Bytes). View file
|
|
relik/inference/data/__pycache__/objects.cpython-310.pyc
ADDED
Binary file (3.24 kB). View file
|
|
relik/inference/data/objects.py
ADDED
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import annotations
|
2 |
+
|
3 |
+
from dataclasses import dataclass
|
4 |
+
from typing import Dict, List, NamedTuple, Optional
|
5 |
+
|
6 |
+
from relik.reader.pytorch_modules.hf.modeling_relik import RelikReaderSample
|
7 |
+
from relik.retriever.indexers.document import Document
|
8 |
+
|
9 |
+
|
10 |
+
@dataclass
|
11 |
+
class Word:
|
12 |
+
"""
|
13 |
+
A word representation that includes text, index in the sentence, POS tag, lemma,
|
14 |
+
dependency relation, and similar information.
|
15 |
+
|
16 |
+
# Parameters
|
17 |
+
text : `str`, optional
|
18 |
+
The text representation.
|
19 |
+
index : `int`, optional
|
20 |
+
The word offset in the sentence.
|
21 |
+
lemma : `str`, optional
|
22 |
+
The lemma of this word.
|
23 |
+
pos : `str`, optional
|
24 |
+
The coarse-grained part of speech of this word.
|
25 |
+
dep : `str`, optional
|
26 |
+
The dependency relation for this word.
|
27 |
+
|
28 |
+
input_id : `int`, optional
|
29 |
+
Integer representation of the word, used to pass it to a model.
|
30 |
+
token_type_id : `int`, optional
|
31 |
+
Token type id used by some transformers.
|
32 |
+
attention_mask: `int`, optional
|
33 |
+
Attention mask used by transformers, indicates to the model which tokens should
|
34 |
+
be attended to, and which should not.
|
35 |
+
"""
|
36 |
+
|
37 |
+
text: str
|
38 |
+
i: int
|
39 |
+
idx: Optional[int] = None
|
40 |
+
idx_end: Optional[int] = None
|
41 |
+
# preprocessing fields
|
42 |
+
lemma: Optional[str] = None
|
43 |
+
pos: Optional[str] = None
|
44 |
+
dep: Optional[str] = None
|
45 |
+
head: Optional[int] = None
|
46 |
+
|
47 |
+
def __str__(self):
|
48 |
+
return self.text
|
49 |
+
|
50 |
+
def __repr__(self):
|
51 |
+
return self.__str__()
|
52 |
+
|
53 |
+
|
54 |
+
class Span(NamedTuple):
|
55 |
+
start: int
|
56 |
+
end: int
|
57 |
+
label: str
|
58 |
+
text: str
|
59 |
+
|
60 |
+
|
61 |
+
class Triples(NamedTuple):
|
62 |
+
subject: Span
|
63 |
+
label: str
|
64 |
+
object: Span
|
65 |
+
confidence: float
|
66 |
+
|
67 |
+
@dataclass
|
68 |
+
class RelikOutput:
|
69 |
+
text: str
|
70 |
+
tokens: List[str]
|
71 |
+
spans: List[Span]
|
72 |
+
triples: List[Triples]
|
73 |
+
candidates: Dict[TaskType, List[Document]]
|
74 |
+
windows: Optional[List[RelikReaderSample]] = None
|
75 |
+
|
76 |
+
|
77 |
+
from enum import Enum
|
78 |
+
|
79 |
+
|
80 |
+
class AnnotationType(Enum):
|
81 |
+
CHAR = "char"
|
82 |
+
WORD = "word"
|
83 |
+
|
84 |
+
|
85 |
+
class TaskType(Enum):
|
86 |
+
SPAN = "span"
|
87 |
+
TRIPLET = "triplet"
|
88 |
+
BOTH = "both"
|
relik/inference/data/splitters/__init__.py
ADDED
File without changes
|
relik/inference/data/splitters/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (198 Bytes). View file
|
|
relik/inference/data/splitters/__pycache__/base_sentence_splitter.cpython-310.pyc
ADDED
Binary file (2.38 kB). View file
|
|
relik/inference/data/splitters/__pycache__/blank_sentence_splitter.cpython-310.pyc
ADDED
Binary file (1.6 kB). View file
|
|
relik/inference/data/splitters/__pycache__/spacy_sentence_splitter.cpython-310.pyc
ADDED
Binary file (5.31 kB). View file
|
|
relik/inference/data/splitters/__pycache__/window_based_splitter.cpython-310.pyc
ADDED
Binary file (2.49 kB). View file
|
|
relik/inference/data/splitters/base_sentence_splitter.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union
|
2 |
+
|
3 |
+
|
4 |
+
class BaseSentenceSplitter:
    """
    Abstract interface for objects that split raw text into sentences.
    """

    def __call__(self, *args, **kwargs):
        """
        Delegates to :meth:`split_sentences`.
        """
        return self.split_sentences(*args, **kwargs)

    def split_sentences(
        self, text: str, max_len: int = 0, *args, **kwargs
    ) -> List[str]:
        """
        Split a ``text`` paragraph into a list of sentence strings.
        Subclasses must override this method.
        """
        raise NotImplementedError

    def split_sentences_batch(
        self, texts: List[str], *args, **kwargs
    ) -> List[List[str]]:
        """
        Split every text in ``texts`` by calling :meth:`split_sentences` on each.
        """
        return [self.split_sentences(single_text) for single_text in texts]

    @staticmethod
    def check_is_batched(
        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
    ):
        """
        Check if input is batched or a single sample.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to check.
            is_split_into_words (:obj:`bool`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
        """
        # a non-sequence can never be a batch
        if not isinstance(texts, (list, tuple)):
            return False
        # untokenized input: any sequence of texts counts as a batch
        if not is_split_into_words:
            return True
        # pre-tokenized input: batched only if the first element is itself a sequence
        return bool(texts and isinstance(texts[0], (list, tuple)))
|
relik/inference/data/splitters/blank_sentence_splitter.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union
|
2 |
+
|
3 |
+
|
4 |
+
class BlankSentenceSplitter:
    """
    A trivial splitter that treats each input text as one single sentence.
    """

    def __call__(self, *args, **kwargs):
        """
        Delegates to :meth:`split_sentences`.
        """
        return self.split_sentences(*args, **kwargs)

    def split_sentences(
        self, text: str, max_len: int = 0, *args, **kwargs
    ) -> List[str]:
        """
        Return ``text`` unchanged, wrapped in a single-element list.
        ``max_len`` is accepted for interface compatibility and ignored.
        """
        return [text]

    def split_sentences_batch(
        self, texts: List[str], *args, **kwargs
    ) -> List[List[str]]:
        """
        Wrap every text in ``texts`` in its own single-sentence list.
        """
        return [self.split_sentences(single_text) for single_text in texts]
|
relik/inference/data/splitters/spacy_sentence_splitter.py
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Any, Iterable, List, Optional, Union
|
2 |
+
|
3 |
+
import spacy
|
4 |
+
|
5 |
+
from relik.inference.data.objects import Word
|
6 |
+
from relik.inference.data.splitters.base_sentence_splitter import BaseSentenceSplitter
|
7 |
+
from relik.inference.data.tokenizers.spacy_tokenizer import load_spacy
|
8 |
+
|
9 |
+
# Maps ISO 639-1 language codes to the spaCy pipeline used for sentence
# splitting. Most languages share the multilingual `xx_sent_ud_sm` model;
# a few (el, ja, pl, zh) use a dedicated small pipeline. Languages not
# listed here fall back to `spacy.blank` in the splitter.
SPACY_LANGUAGE_MAPPER = {
    "cs": "xx_sent_ud_sm",
    "da": "xx_sent_ud_sm",
    "de": "xx_sent_ud_sm",
    "fa": "xx_sent_ud_sm",
    "fi": "xx_sent_ud_sm",
    "fr": "xx_sent_ud_sm",
    "el": "el_core_news_sm",
    "en": "xx_sent_ud_sm",
    "es": "xx_sent_ud_sm",
    "ga": "xx_sent_ud_sm",
    "hr": "xx_sent_ud_sm",
    "id": "xx_sent_ud_sm",
    "it": "xx_sent_ud_sm",
    "ja": "ja_core_news_sm",
    "lv": "xx_sent_ud_sm",
    "lt": "xx_sent_ud_sm",
    "mr": "xx_sent_ud_sm",
    "nb": "xx_sent_ud_sm",
    "nl": "xx_sent_ud_sm",
    "no": "xx_sent_ud_sm",
    "pl": "pl_core_news_sm",
    "pt": "xx_sent_ud_sm",
    "ro": "xx_sent_ud_sm",
    "ru": "xx_sent_ud_sm",
    "sk": "xx_sent_ud_sm",
    "sr": "xx_sent_ud_sm",
    "sv": "xx_sent_ud_sm",
    "te": "xx_sent_ud_sm",
    "vi": "xx_sent_ud_sm",
    "zh": "zh_core_web_sm",
}
|
41 |
+
|
42 |
+
|
43 |
+
class SpacySentenceSplitter(BaseSentenceSplitter):
    """
    A :obj:`SentenceSplitter` that uses spaCy's built-in sentence boundary detection.

    Args:
        language (:obj:`str`, optional, defaults to :obj:`en`):
            Language of the text to tokenize.
        model_type (:obj:`str`, optional, defaults to :obj:`statistical`):
            Three different type of sentence splitter:
            - ``dependency``: sentence splitter uses a dependency parse to detect sentence boundaries,
                slow, but accurate.
            - ``statistical``: uses spaCy's lightweight ``senter`` pipe.
            - ``rule_based``: It's fast and has a small memory footprint, since it uses punctuation to detect
                sentence boundaries.
    """

    def __init__(self, language: str = "en", model_type: str = "statistical") -> None:
        # The dependency parser is only required for `dependency` splitting.
        needs_parser = model_type == "dependency"
        if language in SPACY_LANGUAGE_MAPPER:
            self.spacy = load_spacy(SPACY_LANGUAGE_MAPPER[language], parse=needs_parser)
        else:
            # No pre-trained model for this language: fall back to a blank
            # pipeline, which only supports rule-based splitting.
            self.spacy = spacy.blank(language)
            model_type = "rule_based"
        if model_type == "dependency":
            # Nothing to do: the parser was requested at model load time above.
            pass
        elif model_type == "statistical":
            if not self.spacy.has_pipe("senter"):
                self.spacy.enable_pipe("senter")
        elif model_type == "rule_based":
            # `sentencizer` is spaCy's built-in rule-based (punctuation-driven)
            # sentence boundary detector.
            if not self.spacy.has_pipe("sentencizer"):
                self.spacy.add_pipe("sentencizer")
        else:
            raise ValueError(
                f"type {model_type} not supported. Choose between `dependency`, `statistical` or `rule_based`"
            )

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        max_length: Optional[int] = None,
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[str], List[List[str]]]:
        """
        Split the input into sentences using SpaCy models.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to split. It can be a single string, a batch of string and pre-tokenized strings.
            max_length (:obj:`int`, optional, defaults to :obj:`None`):
                Maximum length of a single sentence. Longer sentences are chunked.

        Returns:
            :obj:`List[List[str]]`: The input doc split into sentences.
        """
        if self.check_is_batched(texts, is_split_into_words):
            return self.split_sentences_batch(texts)
        return self.split_sentences(texts, max_length)

    @staticmethod
    def chunked(iterable, n: int) -> Iterable[List[Any]]:
        """
        Chunk a sequence into pieces of at most ``n`` items.

        Args:
            iterable (:obj:`List[Any]`):
                Sequence to chunk.
            n (:obj:`int`):
                Size of each chunk.

        Returns:
            :obj:`Iterable[List[Any]]`: The input chunked into n sized pieces.
        """
        chunks = []
        start = 0
        while start < len(iterable):
            chunks.append(iterable[start : start + n])
            start += n
        return chunks

    def split_sentences(
        self, text: str | List[Word], max_length: Optional[int] = None, *args, **kwargs
    ) -> List[str]:
        """
        Split `text` into smaller sentences.

        Args:
            text (:obj:`str`):
                Text to split.
            max_length (:obj:`int`, optional, defaults to :obj:`None`):
                Maximum length of a single sentence. Sentences longer than
                `max_length` are chunked into multiple pieces.

        Returns:
            :obj:`List[str]`: The input text split into sentences.
        """
        sentences = list(self.spacy(text).sents)
        if max_length is not None and max_length > 0:
            rechunked = []
            for sentence in sentences:
                rechunked.extend(self.chunked(sentence, max_length))
            sentences = rechunked
        return sentences
|
relik/inference/data/splitters/window_based_splitter.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union
|
2 |
+
|
3 |
+
from relik.inference.data.splitters.base_sentence_splitter import BaseSentenceSplitter
|
4 |
+
|
5 |
+
|
6 |
+
class WindowSentenceSplitter(BaseSentenceSplitter):
    """
    A :obj:`WindowSentenceSplitter` that splits a text into windows of a given size.
    """

    def __init__(self, window_size: int, window_stride: int, *args, **kwargs) -> None:
        super().__init__()
        self.window_size = window_size
        self.window_stride = window_stride

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[str], List[List[str]]]:
        """
        Split the input into fixed-size, possibly overlapping, token windows.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to split. It can be a single string, a batch of string and pre-tokenized strings.

        Returns:
            :obj:`List[List[str]]`: The input doc split into windows.
        """
        return self.split_sentences(texts)

    def split_sentences(self, text: str | List, *args, **kwargs) -> List[List]:
        """
        Split `text` into windows of ``window_size`` tokens, advancing by
        ``window_stride`` tokens at each step.

        Args:
            text (:obj:`str`):
                Text to split.

        Returns:
            :obj:`List[str]`: The input text split into windows.
        """
        if isinstance(text, str):
            text = text.split()
        windows = []
        for start in range(0, len(text), self.window_stride):
            # If the last stride is smaller than the window size, extend the
            # final window backwards with tokens from the previous window.
            if start != 0 and start + self.window_size > len(text):
                overflow = start + self.window_size - len(text)
                if overflow >= self.window_stride:
                    # this window would add nothing beyond the previous one
                    break
                start -= overflow
            end = min(start + self.window_size, len(text))
            windows.append([text[idx] for idx in range(start, end)])
        return windows
|
relik/inference/data/tokenizers/__init__.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Maps both short ISO 639-1 language codes and full spaCy pipeline names to
# the spaCy model name to load for tokenization. Full pipeline names map to
# themselves so callers can pass either form.
SPACY_LANGUAGE_MAPPER = {
    "ca": "ca_core_news_sm",
    "da": "da_core_news_sm",
    "de": "de_core_news_sm",
    "el": "el_core_news_sm",
    "en": "en_core_web_sm",
    "es": "es_core_news_sm",
    "fr": "fr_core_news_sm",
    "it": "it_core_news_sm",
    "ja": "ja_core_news_sm",
    "lt": "lt_core_news_sm",
    "mk": "mk_core_news_sm",
    "nb": "nb_core_news_sm",
    "nl": "nl_core_news_sm",
    "pl": "pl_core_news_sm",
    "pt": "pt_core_news_sm",
    "ro": "ro_core_news_sm",
    "ru": "ru_core_news_sm",
    "xx": "xx_sent_ud_sm",
    "zh": "zh_core_web_sm",
    "ca_core_news_sm": "ca_core_news_sm",
    "ca_core_news_md": "ca_core_news_md",
    "ca_core_news_lg": "ca_core_news_lg",
    "ca_core_news_trf": "ca_core_news_trf",
    "da_core_news_sm": "da_core_news_sm",
    "da_core_news_md": "da_core_news_md",
    "da_core_news_lg": "da_core_news_lg",
    "da_core_news_trf": "da_core_news_trf",
    "de_core_news_sm": "de_core_news_sm",
    "de_core_news_md": "de_core_news_md",
    "de_core_news_lg": "de_core_news_lg",
    "de_dep_news_trf": "de_dep_news_trf",
    "el_core_news_sm": "el_core_news_sm",
    "el_core_news_md": "el_core_news_md",
    "el_core_news_lg": "el_core_news_lg",
    "en_core_web_sm": "en_core_web_sm",
    "en_core_web_md": "en_core_web_md",
    "en_core_web_lg": "en_core_web_lg",
    "en_core_web_trf": "en_core_web_trf",
    "es_core_news_sm": "es_core_news_sm",
    "es_core_news_md": "es_core_news_md",
    "es_core_news_lg": "es_core_news_lg",
    "es_dep_news_trf": "es_dep_news_trf",
    "fr_core_news_sm": "fr_core_news_sm",
    "fr_core_news_md": "fr_core_news_md",
    "fr_core_news_lg": "fr_core_news_lg",
    "fr_dep_news_trf": "fr_dep_news_trf",
    "it_core_news_sm": "it_core_news_sm",
    "it_core_news_md": "it_core_news_md",
    "it_core_news_lg": "it_core_news_lg",
    "ja_core_news_sm": "ja_core_news_sm",
    "ja_core_news_md": "ja_core_news_md",
    "ja_core_news_lg": "ja_core_news_lg",
    "ja_dep_news_trf": "ja_dep_news_trf",
    "lt_core_news_sm": "lt_core_news_sm",
    "lt_core_news_md": "lt_core_news_md",
    "lt_core_news_lg": "lt_core_news_lg",
    "mk_core_news_sm": "mk_core_news_sm",
    "mk_core_news_md": "mk_core_news_md",
    "mk_core_news_lg": "mk_core_news_lg",
    "nb_core_news_sm": "nb_core_news_sm",
    "nb_core_news_md": "nb_core_news_md",
    "nb_core_news_lg": "nb_core_news_lg",
    "nl_core_news_sm": "nl_core_news_sm",
    "nl_core_news_md": "nl_core_news_md",
    "nl_core_news_lg": "nl_core_news_lg",
    "pl_core_news_sm": "pl_core_news_sm",
    "pl_core_news_md": "pl_core_news_md",
    "pl_core_news_lg": "pl_core_news_lg",
    "pt_core_news_sm": "pt_core_news_sm",
    "pt_core_news_md": "pt_core_news_md",
    "pt_core_news_lg": "pt_core_news_lg",
    "ro_core_news_sm": "ro_core_news_sm",
    "ro_core_news_md": "ro_core_news_md",
    "ro_core_news_lg": "ro_core_news_lg",
    "ru_core_news_sm": "ru_core_news_sm",
    "ru_core_news_md": "ru_core_news_md",
    "ru_core_news_lg": "ru_core_news_lg",
    "xx_ent_wiki_sm": "xx_ent_wiki_sm",
    "xx_sent_ud_sm": "xx_sent_ud_sm",
    "zh_core_web_sm": "zh_core_web_sm",
    "zh_core_web_md": "zh_core_web_md",
    "zh_core_web_lg": "zh_core_web_lg",
    "zh_core_web_trf": "zh_core_web_trf",
}

# NOTE(review): this import is deliberately placed after the mapper —
# `spacy_tokenizer` imports SPACY_LANGUAGE_MAPPER from this package, so
# moving it to the top of the file would create a circular import.
from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
|
relik/inference/data/tokenizers/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (2.31 kB). View file
|
|
relik/inference/data/tokenizers/__pycache__/base_tokenizer.cpython-310.pyc
ADDED
Binary file (3.13 kB). View file
|
|
relik/inference/data/tokenizers/__pycache__/spacy_tokenizer.cpython-310.pyc
ADDED
Binary file (6.55 kB). View file
|
|
relik/inference/data/tokenizers/base_tokenizer.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import List, Union
|
2 |
+
|
3 |
+
from relik.inference.data.objects import Word
|
4 |
+
|
5 |
+
|
6 |
+
class BaseTokenizer:
    """
    A :obj:`Tokenizer` splits strings of text into single words, optionally adds
    pos tags and perform lemmatization.
    """

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of string and pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.
        """
        raise NotImplementedError

    def tokenize(self, text: str) -> List[Word]:
        """
        Implements splitting words into tokens.

        Args:
            text (:obj:`str`):
                Text to tokenize.

        Returns:
            :obj:`List[Word]`: The input text tokenized in single words.
        """
        raise NotImplementedError

    def tokenize_batch(self, texts: List[str]) -> List[List[Word]]:
        """
        Implements batch splitting words into tokens.

        Args:
            texts (:obj:`List[str]`):
                Batch of text to tokenize.

        Returns:
            :obj:`List[List[Word]]`: The input batch tokenized in single words.
        """
        return list(map(self.tokenize, texts))

    @staticmethod
    def check_is_batched(
        texts: Union[str, List[str], List[List[str]]], is_split_into_words: bool
    ):
        """
        Check if input is batched or a single sample.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to check.
            is_split_into_words (:obj:`bool`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`bool`: ``True`` if ``texts`` is batched, ``False`` otherwise.
        """
        if not isinstance(texts, (list, tuple)):
            return False
        if not is_split_into_words:
            # a list/tuple of raw strings is already a batch
            return True
        # pre-tokenized input is batched only when it is a non-empty
        # list/tuple of lists/tuples
        return bool(texts and isinstance(texts[0], (list, tuple)))
|
relik/inference/data/tokenizers/spacy_tokenizer.py
ADDED
@@ -0,0 +1,194 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
from copy import deepcopy
|
3 |
+
from typing import Dict, List, Tuple, Union, Any
|
4 |
+
|
5 |
+
import spacy
|
6 |
+
|
7 |
+
# from ipa.common.utils import load_spacy
|
8 |
+
from spacy.cli.download import download as spacy_download
|
9 |
+
from spacy.tokens import Doc
|
10 |
+
|
11 |
+
from relik.common.log import get_logger
|
12 |
+
from relik.inference.data.objects import Word
|
13 |
+
from relik.inference.data.tokenizers import SPACY_LANGUAGE_MAPPER
|
14 |
+
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
|
15 |
+
|
16 |
+
# Module-level logger for this tokenizer module.
logger = get_logger(level=logging.DEBUG)

# Spacy and Stanza stuff

# Cache of already-loaded spaCy pipelines, keyed by
# (language, pos_tags, lemma, parse, split_on_spaces) so each
# configuration is loaded at most once per process (see ``load_spacy``).
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool], spacy.Language] = {}
|
21 |
+
|
22 |
+
|
23 |
+
def load_spacy(
    language: str,
    pos_tags: bool = False,
    lemma: bool = False,
    parse: bool = False,
    split_on_spaces: bool = False,
) -> spacy.Language:
    """
    Download and load spacy model.

    Args:
        language (:obj:`str`, defaults to :obj:`en`):
            Language of the text to tokenize.
        pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with spacy model.
        lemma (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with spacy model.
        parse (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with spacy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will split by spaces without performing tokenization.

    Returns:
        :obj:`spacy.Language`: The spacy model loaded.
    """
    # Disable every pipeline component the caller did not ask for.
    disabled = ["vectors", "textcat", "ner"]
    if not pos_tags:
        disabled.append("tagger")
    if not lemma:
        disabled.append("lemmatizer")
    if not parse:
        disabled.append("parser")

    # Reuse an already-loaded pipeline for this exact configuration, if any.
    cache_key = (language, pos_tags, lemma, parse, split_on_spaces)
    if cache_key not in LOADED_SPACY_MODELS:
        try:
            model = spacy.load(language, exclude=disabled)
        except OSError:
            # Model not installed yet: fetch it, then retry the load.
            logger.warning(
                "Spacy model '%s' not found. Downloading and installing.", language
            )
            spacy_download(language)
            model = spacy.load(language, exclude=disabled)

        # When every component is disabled and the caller only wants
        # whitespace splitting, keep just the bare tokenizer.
        # TODO: is it really faster?
        # TODO: check split_on_spaces behaviour if we don't do this if
        if len(disabled) >= 6 and split_on_spaces:
            model = model.tokenizer
        LOADED_SPACY_MODELS[cache_key] = model

    return LOADED_SPACY_MODELS[cache_key]
|
78 |
+
|
79 |
+
|
80 |
+
class SpacyTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that uses SpaCy to tokenize and preprocess the text. It returns :obj:`Word` objects.

    Args:
        language (:obj:`str`, optional, defaults to :obj:`en`):
            Language of the text to tokenize.
        return_pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with spacy model.
        return_lemmas (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with spacy model.
        return_deps (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with spacy model.
        use_gpu (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will load the spaCy model on GPU.
    """

    def __init__(
        self,
        language: str = "en",
        return_pos_tags: bool = False,
        return_lemmas: bool = False,
        return_deps: bool = False,
        use_gpu: bool = False,
    ):
        super().__init__()
        if language not in SPACY_LANGUAGE_MAPPER:
            raise ValueError(
                f"`{language}` language not supported. The supported "
                f"languages are: {list(SPACY_LANGUAGE_MAPPER.keys())}."
            )
        if use_gpu:
            # Load the model on GPU; raises if the GPU is not available
            # or not correctly configured.
            spacy.require_gpu()
        self.spacy = load_spacy(
            SPACY_LANGUAGE_MAPPER[language],
            pos_tags=return_pos_tags,
            lemma=return_lemmas,
            parse=return_deps,
        )

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[Word], List[List[Word]]]:
        """
        Tokenize the input into single words using SpaCy models.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of string and pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.

        Example::

            >>> from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer

            >>> spacy_tokenizer = SpacyTokenizer(language="en", pos_tags=True, lemma=True)
            >>> spacy_tokenizer("Mary sold the car to John.")

        """
        if self.check_is_batched(texts, is_split_into_words):
            return self.tokenize_batch(texts, is_split_into_words)
        return self.tokenize(texts, is_split_into_words)

    def tokenize(self, text: Union[str, List[str]], is_split_into_words: bool) -> Doc:
        # Plain text goes straight through the spaCy pipeline.
        if not is_split_into_words:
            return self.spacy(text)
        # Pre-tokenized input: build a Doc from the given words, then run
        # the pipeline on it so downstream components still apply.
        if isinstance(text, str):
            words = text.split(" ")
        elif isinstance(text, list):
            words = text
        else:
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )
        doc = Doc(self.spacy.vocab, words=words, spaces=[True] * len(words))
        return self.spacy(doc)

    def tokenize_batch(
        self, texts: Union[List[str], List[List[str]]], is_split_into_words: bool
    ) -> list[Any] | list[Doc]:
        try:
            if is_split_into_words:
                if isinstance(texts[0], str):
                    word_lists = [text.split(" ") for text in texts]
                elif isinstance(texts[0], list):
                    word_lists = texts
                else:
                    raise ValueError(
                        f"text must be either `str` or `list`, found: `{type(texts[0])}`"
                    )
                texts = [
                    Doc(self.spacy.vocab, words=words, spaces=[True] * len(words))
                    for words in word_lists
                ]
            return list(self.spacy.pipe(texts))
        except AttributeError:
            # a WhitespaceSpacyTokenizer has no `pipe()` method, we use simple for loop
            return [self.spacy(tokens) for tokens in texts]
|
relik/inference/data/window/__init__.py
ADDED
File without changes
|
relik/inference/data/window/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (195 Bytes). View file
|
|
relik/inference/data/window/__pycache__/manager.cpython-310.pyc
ADDED
Binary file (11.2 kB). View file
|
|
relik/inference/data/window/manager.py
ADDED
@@ -0,0 +1,431 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import collections
|
2 |
+
import itertools
|
3 |
+
from typing import Dict, List, Optional, Set, Tuple
|
4 |
+
|
5 |
+
from relik.inference.data.splitters.blank_sentence_splitter import BlankSentenceSplitter
|
6 |
+
from relik.inference.data.splitters.base_sentence_splitter import BaseSentenceSplitter
|
7 |
+
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
|
8 |
+
from relik.reader.data.relik_reader_sample import RelikReaderSample
|
9 |
+
|
10 |
+
|
11 |
+
class WindowManager:
|
12 |
+
def __init__(
|
13 |
+
self, tokenizer: BaseTokenizer, splitter: BaseSentenceSplitter | None = None
|
14 |
+
) -> None:
|
15 |
+
self.tokenizer = tokenizer
|
16 |
+
self.splitter = splitter or BlankSentenceSplitter()
|
17 |
+
|
18 |
+
def create_windows(
|
19 |
+
self,
|
20 |
+
documents: str | List[str],
|
21 |
+
window_size: int | None = None,
|
22 |
+
stride: int | None = None,
|
23 |
+
max_length: int | None = None,
|
24 |
+
doc_id: str | int | None = None,
|
25 |
+
doc_topic: str | None = None,
|
26 |
+
is_split_into_words: bool = False,
|
27 |
+
mentions: List[List[List[int]]] = None,
|
28 |
+
) -> Tuple[List[RelikReaderSample], List[RelikReaderSample]]:
|
29 |
+
"""
|
30 |
+
Create windows from a list of documents.
|
31 |
+
|
32 |
+
Args:
|
33 |
+
documents (:obj:`str` or :obj:`List[str]`):
|
34 |
+
The document(s) to split in windows.
|
35 |
+
window_size (:obj:`int`):
|
36 |
+
The size of the window.
|
37 |
+
stride (:obj:`int`):
|
38 |
+
The stride between two windows.
|
39 |
+
max_length (:obj:`int`, `optional`):
|
40 |
+
The maximum length of a window.
|
41 |
+
doc_id (:obj:`str` or :obj:`int`, `optional`):
|
42 |
+
The id of the document(s).
|
43 |
+
doc_topic (:obj:`str`, `optional`):
|
44 |
+
The topic of the document(s).
|
45 |
+
is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
46 |
+
Whether the input is already pre-tokenized (e.g., split into words). If :obj:`False`, the
|
47 |
+
input will first be tokenized using the tokenizer, then the tokens will be split into words.
|
48 |
+
mentions (:obj:`List[List[List[int]]]`, `optional`):
|
49 |
+
The mentions of the document(s).
|
50 |
+
|
51 |
+
Returns:
|
52 |
+
:obj:`List[RelikReaderSample]`: The windows created from the documents.
|
53 |
+
"""
|
54 |
+
# normalize input
|
55 |
+
if isinstance(documents, str) or is_split_into_words:
|
56 |
+
documents = [documents]
|
57 |
+
|
58 |
+
# batch tokenize
|
59 |
+
documents_tokens = self.tokenizer(
|
60 |
+
documents, is_split_into_words=is_split_into_words
|
61 |
+
)
|
62 |
+
|
63 |
+
# set splitter params
|
64 |
+
if hasattr(self.splitter, "window_size"):
|
65 |
+
self.splitter.window_size = window_size or self.splitter.window_size
|
66 |
+
if hasattr(self.splitter, "window_stride"):
|
67 |
+
self.splitter.window_stride = stride or self.splitter.window_stride
|
68 |
+
|
69 |
+
windowed_documents, windowed_blank_documents = [], []
|
70 |
+
|
71 |
+
if mentions is not None:
|
72 |
+
assert len(documents) == len(
|
73 |
+
mentions
|
74 |
+
), f"documents and mentions should have the same length, got {len(documents)} and {len(mentions)}"
|
75 |
+
doc_iter = zip(documents, documents_tokens, mentions)
|
76 |
+
else:
|
77 |
+
doc_iter = zip(documents, documents_tokens, itertools.repeat([]))
|
78 |
+
|
79 |
+
for infered_doc_id, (document, document_tokens, document_mentions) in enumerate(
|
80 |
+
doc_iter
|
81 |
+
):
|
82 |
+
if doc_topic is None:
|
83 |
+
doc_topic = document_tokens[0] if len(document_tokens) > 0 else ""
|
84 |
+
|
85 |
+
if doc_id is None:
|
86 |
+
doc_id = infered_doc_id
|
87 |
+
|
88 |
+
splitted_document = self.splitter(document_tokens, max_length=max_length)
|
89 |
+
|
90 |
+
document_windows = []
|
91 |
+
for window_id, window in enumerate(splitted_document):
|
92 |
+
window_text_start = window[0].idx
|
93 |
+
window_text_end = window[-1].idx + len(window[-1].text)
|
94 |
+
if isinstance(document, str):
|
95 |
+
text = document[window_text_start:window_text_end]
|
96 |
+
else:
|
97 |
+
# window_text_start = window[0].idx
|
98 |
+
# window_text_end = window[-1].i
|
99 |
+
text = " ".join([w.text for w in window])
|
100 |
+
sample = RelikReaderSample(
|
101 |
+
doc_id=doc_id,
|
102 |
+
window_id=window_id,
|
103 |
+
text=text,
|
104 |
+
tokens=[w.text for w in window],
|
105 |
+
words=[w.text for w in window],
|
106 |
+
doc_topic=doc_topic,
|
107 |
+
offset=window_text_start,
|
108 |
+
spans=[
|
109 |
+
[m[0], m[1]] for m in document_mentions
|
110 |
+
if window_text_end > m[0] >= window_text_start and window_text_end >= m[1] >= window_text_start
|
111 |
+
],
|
112 |
+
token2char_start={str(i): w.idx for i, w in enumerate(window)},
|
113 |
+
token2char_end={
|
114 |
+
str(i): w.idx + len(w.text) for i, w in enumerate(window)
|
115 |
+
},
|
116 |
+
char2token_start={
|
117 |
+
str(w.idx): w.i for i, w in enumerate(window)
|
118 |
+
},
|
119 |
+
char2token_end={
|
120 |
+
str(w.idx + len(w.text)): w.i for i, w in enumerate(window)
|
121 |
+
},
|
122 |
+
)
|
123 |
+
if mentions is not None and len(sample.spans) == 0:
|
124 |
+
windowed_blank_documents.append(sample)
|
125 |
+
else:
|
126 |
+
document_windows.append(sample)
|
127 |
+
|
128 |
+
windowed_documents.extend(document_windows)
|
129 |
+
if mentions is not None:
|
130 |
+
return windowed_documents, windowed_blank_documents
|
131 |
+
else:
|
132 |
+
return windowed_documents, windowed_blank_documents
|
133 |
+
|
134 |
+
def merge_windows(
|
135 |
+
self, windows: List[RelikReaderSample]
|
136 |
+
) -> List[RelikReaderSample]:
|
137 |
+
windows_by_doc_id = collections.defaultdict(list)
|
138 |
+
for window in windows:
|
139 |
+
windows_by_doc_id[window.doc_id].append(window)
|
140 |
+
|
141 |
+
merged_window_by_doc = {
|
142 |
+
doc_id: self._merge_doc_windows(doc_windows)
|
143 |
+
for doc_id, doc_windows in windows_by_doc_id.items()
|
144 |
+
}
|
145 |
+
|
146 |
+
return list(merged_window_by_doc.values())
|
147 |
+
|
148 |
+
def _merge_doc_windows(self, windows: List[RelikReaderSample]) -> RelikReaderSample:
|
149 |
+
if len(windows) == 1:
|
150 |
+
return windows[0]
|
151 |
+
|
152 |
+
if len(windows) > 0 and getattr(windows[0], "offset", None) is not None:
|
153 |
+
windows = sorted(windows, key=(lambda x: x.offset))
|
154 |
+
|
155 |
+
window_accumulator = windows[0]
|
156 |
+
|
157 |
+
for next_window in windows[1:]:
|
158 |
+
window_accumulator = self._merge_window_pair(
|
159 |
+
window_accumulator, next_window
|
160 |
+
)
|
161 |
+
|
162 |
+
return window_accumulator
|
163 |
+
|
164 |
+
@staticmethod
def _merge_tokens(
    window1: RelikReaderSample, window2: RelikReaderSample
) -> Tuple[list, dict, dict]:
    """Merge the token sequences of two consecutive windows.

    Strips the first and last token of each window (treated as CLS/SEP —
    see the inline markers below), detects the longest suffix of window1
    that equals a prefix of window2, and concatenates without duplicating
    that overlap. Token->char mappings (keyed by stringified token index)
    are re-keyed for the second window accordingly.

    Returns:
        (merged_tokens, merged_token2char_start, merged_token2char_end)
    """
    # drop the special boundary tokens of each window
    w1_tokens = window1.tokens[1:-1]
    w2_tokens = window2.tokens[1:-1]

    # find intersection if any: longest k such that the last k tokens of
    # window1 equal the first k tokens of window2 (largest k tried first)
    tokens_intersection = 0
    for k in reversed(range(1, len(w1_tokens))):
        if w1_tokens[-k:] == w2_tokens[:k]:
            tokens_intersection = k
            break

    final_tokens = (
        [window1.tokens[0]]  # CLS
        + w1_tokens
        + w2_tokens[tokens_intersection:]
        + [window1.tokens[-1]]  # SEP
    )

    # how far window2's token indices shift in the merged sequence
    w2_starting_offset = len(w1_tokens) - tokens_intersection

    def merge_char_mapping(t2c1: dict, t2c2: dict) -> dict:
        """Union of two token->char maps; window2 keys below the overlap are
        dropped, the rest are shifted by ``w2_starting_offset``. Keys are
        strings holding integer token indices."""
        final_t2c = dict()
        final_t2c.update(t2c1)
        for t, c in t2c2.items():
            t = int(t)
            if t < tokens_intersection:
                continue
            final_t2c[str(t + w2_starting_offset)] = c
        return final_t2c

    return (
        final_tokens,
        merge_char_mapping(window1.token2char_start, window2.token2char_start),
        merge_char_mapping(window1.token2char_end, window2.token2char_end),
    )
|
202 |
+
|
203 |
+
@staticmethod
def _merge_words(
    window1: RelikReaderSample, window2: RelikReaderSample
) -> Tuple[list, dict, dict]:
    """Merge the word sequences of two consecutive windows.

    Same overlap strategy as ``_merge_tokens`` but on whole words and
    without special boundary tokens. Token->word maps may be ``None`` on
    either side and are then treated as empty.

    Returns:
        (merged_words, merged_token2word_start, merged_token2word_end)
    """
    w1_words = window1.words
    w2_words = window2.words

    # find intersection if any: longest suffix of window1's words that is
    # also a prefix of window2's words (largest k tried first)
    words_intersection = 0
    for k in reversed(range(1, len(w1_words))):
        if w1_words[-k:] == w2_words[:k]:
            words_intersection = k
            break

    # concatenate, skipping the overlapping prefix of window2
    final_words = w1_words + w2_words[words_intersection:]

    # how far window2's word indices shift in the merged sequence
    w2_starting_offset = len(w1_words) - words_intersection

    def merge_word_mapping(t2c1: dict, t2c2: dict) -> dict:
        """Union of two token->word maps (``None`` treated as empty);
        window2 keys below the overlap are dropped, the rest shifted by
        ``w2_starting_offset``. Keys are stringified integer indices."""
        final_t2c = dict()
        if t2c1 is None:
            t2c1 = dict()
        if t2c2 is None:
            t2c2 = dict()
        final_t2c.update(t2c1)
        for t, c in t2c2.items():
            t = int(t)
            if t < words_intersection:
                continue
            final_t2c[str(t + w2_starting_offset)] = c
        return final_t2c

    return (
        final_words,
        merge_word_mapping(window1.token2word_start, window2.token2word_start),
        merge_word_mapping(window1.token2word_end, window2.token2word_end),
    )
|
240 |
+
|
241 |
+
@staticmethod
def _merge_span_annotation(
    span_annotation1: List[list], span_annotation2: List[list]
) -> List[list]:
    """Concatenate two span-annotation lists without duplicates.

    Duplicates are detected by the tuple of the full annotation; the
    result is sorted by span start (first element of each annotation).
    """
    seen = set()
    deduped = []
    for annotation in itertools.chain(span_annotation1, span_annotation2):
        key = tuple(annotation)
        if key in seen:
            continue
        seen.add(key)
        deduped.append(annotation)
    deduped.sort(key=lambda ann: ann[0])
    return deduped
|
253 |
+
|
254 |
+
@staticmethod
def _merge_predictions(
    window1: RelikReaderSample, window2: RelikReaderSample
) -> Tuple[list, dict, list, Dict]:
    """Merge span- and triple-level predictions of two windows.

    Returns:
        (merged_span_predictions, merged_span_probabilities,
         merged_triplet_predictions, merged_triplet_probs)

    Spans are merged only when BOTH windows expose ``predicted_spans``;
    likewise triples require ``predicted_triples`` on both. When neither
    applies, the corresponding results stay empty.
    """
    # a RelikReaderSample should have a field called `predicted_spans`
    # that stores the span-level predictions, or a field called
    # `predicted_triples` that stores the triple-level predictions

    # span predictions
    merged_span_predictions: Set = set()
    merged_span_probabilities = dict()
    # triple predictions
    merged_triplet_predictions: Set = set()
    merged_triplet_probs: Dict = dict()

    if (
        getattr(window1, "predicted_spans", None) is not None
        and getattr(window2, "predicted_spans", None) is not None
    ):
        # set union deduplicates spans shared by the overlapping region,
        # then sorting turns the set into a deterministic list
        merged_span_predictions = set(window1.predicted_spans).union(
            set(window2.predicted_spans)
        )
        merged_span_predictions = sorted(merged_span_predictions)
        # probabilities: first window seen wins for a given span
        for span_prediction, predicted_probs in itertools.chain(
            window1.probs_window_labels_chars.items()
            if window1.probs_window_labels_chars is not None
            else [],
            window2.probs_window_labels_chars.items()
            if window2.probs_window_labels_chars is not None
            else [],
        ):
            if span_prediction not in merged_span_probabilities:
                merged_span_probabilities[span_prediction] = predicted_probs

    if (
        getattr(window1, "predicted_triples", None) is not None
        and getattr(window2, "predicted_triples", None) is not None
    ):
        # try to merge the triples predictions
        # add offset to the second window: each triple stores subject/object
        # as INDEXES into its window's predicted_spans, so re-index both
        # windows' triples against the merged, sorted span list
        window1_triplets = [
            (
                merged_span_predictions.index(window1.predicted_spans[t[0]]),
                t[1],
                merged_span_predictions.index(window1.predicted_spans[t[2]]),
                t[3]
            )
            for t in window1.predicted_triples
        ]
        window2_triplets = [
            (
                merged_span_predictions.index(window2.predicted_spans[t[0]]),
                t[1],
                merged_span_predictions.index(window2.predicted_spans[t[2]]),
                t[3]
            )
            for t in window2.predicted_triples
        ]
        merged_triplet_predictions = set(window1_triplets).union(
            set(window2_triplets)
        )
        merged_triplet_predictions = sorted(merged_triplet_predictions)
        # for now no triplet probs, we don't need them for the moment

    return (
        merged_span_predictions,
        merged_span_probabilities,
        merged_triplet_predictions,
        merged_triplet_probs,
    )
|
325 |
+
|
326 |
+
@staticmethod
def _merge_candidates(window1: RelikReaderSample, window2: RelikReaderSample):
    """Merge the candidate lists of two windows.

    Handles the four (partly legacy) candidate attributes a sample may
    carry; missing attributes are treated as empty lists.

    Returns:
        tuple: ``(candidates, windows_candidates, span_candidates,
        triplet_candidates)``, each deduplicated via ``set`` (element
        order is therefore arbitrary, as in the original contract).
    """

    def _collect(attr: str) -> list:
        # Copy window1's list before extending so the merge can never
        # mutate the input sample in place (the previous implementation
        # aliased window1's list and `+=` extended it in place).
        merged = list(getattr(window1, attr, None) or [])
        merged += getattr(window2, attr, None) or []
        return merged

    # TODO: retro-compatibility
    candidates = _collect("candidates")
    windows_candidates = _collect("windows_candidates")
    # TODO: add programmatically
    span_candidates = _collect("span_candidates")
    triplet_candidates = _collect("triplet_candidates")

    # make them unique
    return (
        list(set(candidates)),
        list(set(windows_candidates)),
        list(set(span_candidates)),
        list(set(triplet_candidates)),
    )
|
364 |
+
|
365 |
+
def _merge_window_pair(
    self,
    window1: RelikReaderSample,
    window2: RelikReaderSample,
) -> RelikReaderSample:
    """Merge two adjacent windows of the same document into a new sample.

    ``window1`` must precede ``window2`` (asserted when offsets exist).
    Delegates tokens/words/candidates/labels/predictions merging to the
    dedicated helpers and builds a fresh :class:`RelikReaderSample`.
    """
    merging_output = dict()

    # both windows must come from the same document
    if getattr(window1, "doc_id", None) is not None:
        assert window1.doc_id == window2.doc_id

    # enforce left-to-right merge order
    if getattr(window1, "offset", None) is not None:
        assert (
            window1.offset < window2.offset
        ), f"window 2 offset ({window2.offset}) is smaller that window 1 offset({window1.offset})"

    # merged sample keeps window2's offset (the rightmost one)
    merging_output["doc_id"] = window1.doc_id
    merging_output["offset"] = window2.offset

    m_tokens, m_token2char_start, m_token2char_end = self._merge_tokens(
        window1, window2
    )

    m_words, m_token2word_start, m_token2word_end = self._merge_words(
        window1, window2
    )

    (
        m_candidates,
        m_windows_candidates,
        m_span_candidates,
        m_triplet_candidates,
    ) = self._merge_candidates(window1, window2)

    # gold span labels are merged only when present on window1
    window_labels = None
    if getattr(window1, "window_labels", None) is not None:
        window_labels = self._merge_span_annotation(
            window1.window_labels, window2.window_labels
        )

    (
        predicted_spans,
        predicted_spans_probs,
        predicted_triples,
        predicted_triples_probs,
    ) = self._merge_predictions(window1, window2)

    merging_output.update(
        dict(
            tokens=m_tokens,
            words=m_words,
            token2char_start=m_token2char_start,
            token2char_end=m_token2char_end,
            token2word_start=m_token2word_start,
            token2word_end=m_token2word_end,
            window_labels=window_labels,
            candidates=m_candidates,
            span_candidates=m_span_candidates,
            triplet_candidates=m_triplet_candidates,
            windows_candidates=m_windows_candidates,
            predicted_spans=predicted_spans,
            predicted_spans_probs=predicted_spans_probs,
            predicted_triples=predicted_triples,
            predicted_triples_probs=predicted_triples_probs,
        )
    )

    return RelikReaderSample(**merging_output)
|
relik/inference/gerbil.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import logging
|
4 |
+
import os
|
5 |
+
from pathlib import Path
|
6 |
+
import re
|
7 |
+
import sys
|
8 |
+
from http.server import BaseHTTPRequestHandler, HTTPServer
|
9 |
+
from typing import Iterator, List, Optional, Tuple
|
10 |
+
from urllib import parse
|
11 |
+
|
12 |
+
from relik.inference.annotator import Relik
|
13 |
+
from relik.inference.data.objects import RelikOutput
|
14 |
+
|
15 |
+
# sys.path += ['../']
# Make the parent package importable when this module is run as a script.
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../")))


# Module-level logger; main() attaches a file handler to it at startup.
logger = logging.getLogger(__name__)
|
20 |
+
|
21 |
+
|
22 |
+
class GerbilAlbyManager:
    """Holds the ReLiK annotator and optional response logging for GERBIL runs."""

    def __init__(
        self,
        annotator: Optional[Relik] = None,
        response_logger_dir: Optional[str] = None,
    ) -> None:
        # annotator: the ReLiK pipeline; may be set later (see main()).
        self.annotator = annotator
        # directory where response bundles are dumped, or None to disable.
        self.response_logger_dir = response_logger_dir
        # monotonically increasing counter used as the dump file name.
        self.predictions_counter = 0
        # optional inverse label mapping loaded via set_mapping_file().
        self.labels_mapping = None

    def annotate(self, document: str):
        """Run the annotator on *document* and return (start, end, label) triples.

        Labels are remapped through ``self.labels_mapping`` when it is set.
        """
        relik_output: RelikOutput = self.annotator(
            document, retriever_batch_size=2, reader_batch_size=1
        )
        # each span is (start, end, label, extra) — extra is discarded here
        annotations = [(ss, se, l) for ss, se, l, _ in relik_output.spans]
        if self.labels_mapping is not None:
            return [
                (ss, se, self.labels_mapping.get(l, l)) for ss, se, l in annotations
            ]
        return annotations

    def set_mapping_file(self, mapping_file_path: str):
        """Load a JSON label mapping and store its INVERSE (value -> key)."""
        with open(mapping_file_path) as f:
            labels_mapping = json.load(f)
        self.labels_mapping = {v: k for k, v in labels_mapping.items()}

    def write_response_bundle(
        self,
        document: str,
        new_document: str,
        annotations: list,
        mapped_annotations: list,
    ) -> None:
        """Dump one prediction bundle as JSON for offline inspection.

        ``annotations`` spans index into *document*; ``mapped_annotations``
        spans index into *new_document*. No-op when logging is disabled.
        """
        if self.response_logger_dir is None:
            return

        if not os.path.isdir(self.response_logger_dir):
            os.mkdir(self.response_logger_dir)

        with open(
            f"{self.response_logger_dir}/{self.predictions_counter}.json", "w"
        ) as f:
            out_json_obj = dict(
                document=document,
                new_document=new_document,
                annotations=annotations,
                mapped_annotations=mapped_annotations,
            )

            # enrich spans with the surface text they cover, for readability
            out_json_obj["span_annotations"] = [
                (ss, se, document[ss:se], label) for (ss, se, label) in annotations
            ]

            out_json_obj["span_mapped_annotations"] = [
                (ss, se, new_document[ss:se], label)
                for (ss, se, label) in mapped_annotations
            ]

            json.dump(out_json_obj, f, indent=2)

        self.predictions_counter += 1
|
84 |
+
|
85 |
+
|
86 |
+
# Module-level singleton: configured by main() and used by the HTTP handler.
manager = GerbilAlbyManager()
|
87 |
+
|
88 |
+
|
89 |
+
def preprocess_document(document: str) -> Tuple[str, List[Tuple[int, int]]]:
    """Normalize bracket/newline artifacts in *document*.

    Replaces patterns such as ``-LRB-``/``-RRB-`` with their literal
    characters and collapses blank lines.

    Returns:
        A pair ``(normalized_document, char2offset)`` where ``char2offset``
        is a list of ``(index_in_normalized, cumulative_offset)`` entries:
        characters at or beyond ``index`` must be shifted by ``offset`` to
        map back into the original document.
    """
    replacements = {
        "-LPR- ": " (",
        "-RPR-": ")",
        "\n\n": "\n",
        "-LRB-": "(",
        "-RRB-": ")",
        '","': ",",
    }

    normalized = document
    shift = 0  # cumulative shrinkage accumulated so far
    char2offset: List[Tuple[int, int]] = []

    pattern = "({})".format("|".join(replacements))
    for match in sorted(re.finditer(pattern, document), key=lambda m: m.start()):
        # match positions refer to the ORIGINAL document; re-base them
        # onto the partially rewritten string
        start = match.start() - shift
        end = match.end() - shift

        matched_text = normalized[start:end]
        replacement = replacements[matched_text]
        normalized = normalized[:start] + replacement + normalized[end:]

        shift += len(matched_text) - len(replacement)
        # record from which index the new cumulative shift applies
        char2offset.append((start + len(replacement), shift))

    return normalized, char2offset
|
119 |
+
|
120 |
+
|
121 |
+
def map_back_annotations(
    annotations: List[Tuple[int, int, str]], char_mapping: List[Tuple[int, int]]
) -> Iterator[Tuple[int, int, str]]:
    """Translate annotation char indices from the preprocessed document
    back to the original one.

    ``char_mapping`` is the ``(threshold, cumulative_offset)`` list produced
    by :func:`preprocess_document`, ordered by threshold.
    """

    def to_original(position: int) -> int:
        # keep the last cumulative offset whose threshold is <= position
        applicable = 0
        for threshold, cumulative_offset in char_mapping:
            if position < threshold:
                break
            applicable = cumulative_offset
        return position + applicable

    return (
        (to_original(start), to_original(end), label)
        for start, end, label in annotations
    )
|
135 |
+
|
136 |
+
|
137 |
+
def annotate(document: str) -> List[Tuple[int, int, str]]:
    """Annotate *document* for GERBIL.

    Preprocesses the text, runs the shared ``manager`` annotator, maps span
    offsets back into the original document and verifies the mapping.

    Returns:
        A list of ``(start, length, label)`` triples, or ``None`` when the
        back-mapped spans do not select the same surface text as the
        spans in the preprocessed document.
    """
    new_document, mapping = preprocess_document(document)
    logger.info("Mapping: " + str(mapping))
    logger.info("Document: " + str(document))
    annotations = [
        (cs, ce, label.replace(" ", "_"))
        for cs, ce, label in manager.annotate(new_document)
    ]
    logger.info("New document: " + str(new_document))
    mapped_annotations = (
        list(map_back_annotations(annotations, mapping))
        if len(mapping) > 0
        else annotations
    )

    logger.info(
        "Annotations: "
        + str([(ss, se, document[ss:se], ann) for ss, se, ann in mapped_annotations])
    )

    manager.write_response_bundle(
        document, new_document, mapped_annotations, annotations
    )

    # Sanity check: every back-mapped span must select the same surface text
    # in the original document as its source span does in the preprocessed
    # one. (The previous version also computed an unused `diff_mappings`
    # list and carried a dead `assert` duplicating this same condition after
    # the unconditional `return None` — both removed; the failure is now
    # logged instead of being silent.)
    mismatches = [
        (new_document[ss:se], document[mss:mse])
        for (mss, mse, _), (ss, se, _) in zip(mapped_annotations, annotations)
        if new_document[ss:se] != document[mss:mse]
    ]
    if mismatches:
        logger.warning("Span mismatches after back-mapping: %s", mismatches)
        return None

    # GERBIL expects (start, length, label) triples
    return [(cs, ce - cs, label) for cs, ce, label in mapped_annotations]
|
180 |
+
|
181 |
+
|
182 |
+
class GetHandler(BaseHTTPRequestHandler):
    """Minimal HTTP handler exposing the annotator to GERBIL via POST."""

    def do_POST(self):
        # Read exactly Content-Length bytes of the request body.
        content_length = int(self.headers["Content-Length"])
        post_data = self.rfile.read(content_length)
        # Status and headers are sent before annotation runs; the JSON
        # result (possibly `null` on failure) follows in the body.
        self.send_response(200)
        self.end_headers()
        doc_text = read_json(post_data)
        # try:
        response = annotate(doc_text)

        self.wfile.write(bytes(json.dumps(response), "utf-8"))
        return
|
194 |
+
|
195 |
+
|
196 |
+
def read_json(post_data):
    """Decode a UTF-8 JSON request body and return its "text" field."""
    payload = json.loads(post_data.decode("utf-8"))
    # logger.info("received data:", payload)
    # spans = [(int(j["start"]), int(j["length"])) for j in payload["spans"]]
    return payload["text"]
|
202 |
+
|
203 |
+
|
204 |
+
def parse_args() -> argparse.Namespace:
    """Build and parse the command line arguments for the GERBIL server."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--relik-model-name", required=True)
    arg_parser.add_argument("--responses-log-dir")
    arg_parser.add_argument("--log-file", default="experiments/logging.txt")
    arg_parser.add_argument("--mapping-file")
    return arg_parser.parse_args()
|
211 |
+
|
212 |
+
|
213 |
+
def main():
    """Entry point: load the ReLiK model and serve GERBIL requests over HTTP."""
    args = parse_args()

    # --responses-log-dir is optional: only create the directory when given.
    # (The previous version called Path(args.responses_log_dir)
    # unconditionally, raising TypeError when the flag was omitted.)
    if args.responses_log_dir is not None:
        responses_log_dir = Path(args.responses_log_dir)
        responses_log_dir.mkdir(parents=True, exist_ok=True)

    # init manager
    manager.response_logger_dir = args.responses_log_dir
    manager.annotator = Relik.from_pretrained(
        args.relik_model_name,
        device="cuda",
        # document_index_device="cpu",
        # document_index_precision="fp32",
        # reader_device="cpu",
        precision="fp16",  # , reader_device="cpu", reader_precision="fp32"
        dataset_kwargs={"use_nme": True},
    )

    if args.mapping_file is not None:
        manager.set_mapping_file(args.mapping_file)

    # port = 6654
    port = 5555
    server = HTTPServer(("localhost", port), GetHandler)
    logger.info(f"Starting server at http://localhost:{port}")

    # Log everything to the requested file as well.
    file_handler = logging.FileHandler(args.log_file)
    file_handler.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        exit(0)
|
266 |
+
|
267 |
+
|
268 |
+
# Script entry point.
if __name__ == "__main__":
    main()
|
relik/inference/serve/__init__.py
ADDED
File without changes
|
relik/inference/serve/backend/__init__.py
ADDED
File without changes
|
relik/inference/serve/backend/fastapi.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import List, Union
|
5 |
+
import psutil
|
6 |
+
|
7 |
+
import torch
|
8 |
+
|
9 |
+
from relik.common.utils import is_package_available
|
10 |
+
from relik.inference.annotator import Relik
|
11 |
+
|
12 |
+
if not is_package_available("fastapi"):
|
13 |
+
raise ImportError(
|
14 |
+
"FastAPI is not installed. Please install FastAPI with `pip install relik[serve]`."
|
15 |
+
)
|
16 |
+
from fastapi import FastAPI, HTTPException, APIRouter
|
17 |
+
|
18 |
+
|
19 |
+
from relik.common.log import get_logger
|
20 |
+
from relik.inference.serve.backend.utils import (
|
21 |
+
RayParameterManager,
|
22 |
+
ServerParameterManager,
|
23 |
+
)
|
24 |
+
|
25 |
+
logger = get_logger(__name__, level=logging.INFO)

# Load the package version by executing version.py four directories up
# (presumably the package root); VERSION["VERSION"] feeds the FastAPI metadata.
VERSION = {}  # type: ignore
with open(
    Path(__file__).parent.parent.parent.parent / "version.py", "r"
) as version_file:
    exec(version_file.read(), VERSION)

# Env variables for server
SERVER_MANAGER = ServerParameterManager()
RAY_MANAGER = RayParameterManager()
|
36 |
+
|
37 |
+
|
38 |
+
class RelikServer:
    """FastAPI-facing wrapper around a ReLiK pipeline.

    Loads a pretrained :class:`Relik` model once at construction time and
    exposes it on the POST ``/api/relik`` route registered on ``self.router``.
    """

    def __init__(
        self,
        relik_pretrained: str | None = None,
        device: str = "cpu",
        retriever_device: str | None = None,
        document_index_device: str | None = None,
        reader_device: str | None = None,
        precision: str | int | torch.dtype = 32,
        retriever_precision: str | int | torch.dtype | None = None,
        document_index_precision: str | int | torch.dtype | None = None,
        reader_precision: str | int | torch.dtype | None = None,
        annotation_type: str = "char",
        **kwargs,
    ):
        # Cap torch threads at the physical core count unless overridden
        # through the TORCH_NUM_THREADS environment variable.
        num_threads = os.getenv("TORCH_NUM_THREADS", psutil.cpu_count(logical=False))
        torch.set_num_threads(num_threads)
        logger.info(f"Torch is running on {num_threads} threads.")
        # parameters
        logger.info(f"RELIK_PRETRAINED: {relik_pretrained}")
        self.relik_pretrained = relik_pretrained
        logger.info(f"DEVICE: {device}")
        self.device = device
        if retriever_device is not None:
            logger.info(f"RETRIEVER_DEVICE: {retriever_device}")
        # falls back to the global device when unset
        self.retriever_device = retriever_device or device
        if document_index_device is not None:
            logger.info(f"INDEX_DEVICE: {document_index_device}")
        # NOTE(review): falls back to retriever_device (which may be None),
        # not to `device` — confirm this asymmetry is intended.
        self.document_index_device = document_index_device or retriever_device
        if reader_device is not None:
            logger.info(f"READER_DEVICE: {reader_device}")
        # NOTE(review): no fallback here — may remain None when unset.
        self.reader_device = reader_device
        logger.info(f"PRECISION: {precision}")
        self.precision = precision
        if retriever_precision is not None:
            logger.info(f"RETRIEVER_PRECISION: {retriever_precision}")
        self.retriever_precision = retriever_precision or precision
        if document_index_precision is not None:
            logger.info(f"INDEX_PRECISION: {document_index_precision}")
        self.document_index_precision = document_index_precision or precision
        if reader_precision is not None:
            logger.info(f"READER_PRECISION: {reader_precision}")
        self.reader_precision = reader_precision or precision
        logger.info(f"ANNOTATION_TYPE: {annotation_type}")
        self.annotation_type = annotation_type

        self.relik = Relik.from_pretrained(
            self.relik_pretrained,
            device=self.device,
            retriever_device=self.retriever_device,
            document_index_device=self.document_index_device,
            reader_device=self.reader_device,
            precision=self.precision,
            retriever_precision=self.retriever_precision,
            document_index_precision=self.document_index_precision,
            reader_precision=self.reader_precision,
        )

        self.router = APIRouter()
        self.router.add_api_route("/api/relik", self.relik_endpoint, methods=["POST"])

        logger.info("RelikServer initialized.")

    # @serve.batch()
    async def __call__(self, text: List[str]) -> List:
        # Run the pipeline; annotation granularity was fixed at init time.
        return self.relik(text, annotation_type=self.annotation_type)

    # @app.post("/api/relik")
    async def relik_endpoint(self, text: Union[str, List[str]]):
        try:
            # get predictions for the retriever
            return await self(text)
        except Exception as e:
            # log the entire stack trace
            logger.exception(e)
            raise HTTPException(status_code=500, detail=f"Server Error: {e}")
|
114 |
+
|
115 |
+
|
116 |
+
# FastAPI application exposing the ReLiK REST API; the server instance is
# built from environment-driven parameters and its router mounted on the app.
app = FastAPI(
    title="ReLiK",
    version=VERSION["VERSION"],
    description="ReLiK REST API",
)
server = RelikServer(**vars(SERVER_MANAGER))
app.include_router(server.router)
|
relik/inference/serve/backend/ray.py
ADDED
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
from pathlib import Path
|
4 |
+
from typing import List, Union
|
5 |
+
import psutil
|
6 |
+
|
7 |
+
import torch
|
8 |
+
|
9 |
+
from relik.common.utils import is_package_available
|
10 |
+
from relik.inference.annotator import Relik
|
11 |
+
|
12 |
+
if not is_package_available("fastapi"):
|
13 |
+
raise ImportError(
|
14 |
+
"FastAPI is not installed. Please install FastAPI with `pip install relik[serve]`."
|
15 |
+
)
|
16 |
+
from fastapi import FastAPI, HTTPException
|
17 |
+
|
18 |
+
if not is_package_available("ray"):
|
19 |
+
raise ImportError(
|
20 |
+
"Ray is not installed. Please install Ray with `pip install relik[serve]`."
|
21 |
+
)
|
22 |
+
from ray import serve
|
23 |
+
|
24 |
+
from relik.common.log import get_logger
|
25 |
+
from relik.inference.serve.backend.utils import (
|
26 |
+
RayParameterManager,
|
27 |
+
ServerParameterManager,
|
28 |
+
)
|
29 |
+
|
30 |
+
logger = get_logger(__name__, level=logging.INFO)

# Load the package version by executing version.py four directories up
# (presumably the package root); VERSION["VERSION"] feeds the FastAPI metadata.
VERSION = {}  # type: ignore
with open(
    Path(__file__).parent.parent.parent.parent / "version.py", "r"
) as version_file:
    exec(version_file.read(), VERSION)

# Env variables for server
SERVER_MANAGER = ServerParameterManager()
RAY_MANAGER = RayParameterManager()

# FastAPI app ingressed by the Ray Serve deployment below.
app = FastAPI(
    title="ReLiK",
    version=VERSION["VERSION"],
    description="ReLiK REST API",
)
|
47 |
+
|
48 |
+
|
49 |
+
@serve.deployment(
    ray_actor_options={
        # Request GPUs only when at least one component targets CUDA.
        "num_gpus": RAY_MANAGER.num_gpus
        if (
            SERVER_MANAGER.device == "cuda"
            or SERVER_MANAGER.retriever_device == "cuda"
            or SERVER_MANAGER.reader_device == "cuda"
        )
        else 0
    },
    autoscaling_config={
        "min_replicas": RAY_MANAGER.min_replicas,
        "max_replicas": RAY_MANAGER.max_replicas,
    },
)
@serve.ingress(app)
class RelikServer:
    """Ray Serve deployment wrapping a ReLiK pipeline behind ``/api/relik``."""

    def __init__(
        self,
        relik_pretrained: str | None = None,
        device: str = "cpu",
        retriever_device: str | None = None,
        document_index_device: str | None = None,
        reader_device: str | None = None,
        precision: str | int | torch.dtype = 32,
        retriever_precision: str | int | torch.dtype | None = None,
        document_index_precision: str | int | torch.dtype | None = None,
        reader_precision: str | int | torch.dtype | None = None,
        annotation_type: str = "char",
        retriever_batch_size: int = 32,
        reader_batch_size: int = 32,
        relik_config_override: dict | None = None,
        **kwargs,
    ):
        # Cap torch threads at the physical core count unless overridden
        # through the TORCH_NUM_THREADS environment variable.
        num_threads = os.getenv("TORCH_NUM_THREADS", psutil.cpu_count(logical=False))
        torch.set_num_threads(num_threads)
        logger.info(f"Torch is running on {num_threads} threads.")

        # parameters
        logger.info(f"RELIK_PRETRAINED: {relik_pretrained}")
        self.relik_pretrained = relik_pretrained

        if relik_config_override is None:
            relik_config_override = {}
        logger.info(f"RELIK_CONFIG_OVERRIDE: {relik_config_override}")
        self.relik_config_override = relik_config_override

        logger.info(f"DEVICE: {device}")
        self.device = device

        if retriever_device is not None:
            logger.info(f"RETRIEVER_DEVICE: {retriever_device}")
        # falls back to the global device when unset
        self.retriever_device = retriever_device or device

        if document_index_device is not None:
            logger.info(f"INDEX_DEVICE: {document_index_device}")
        # NOTE(review): falls back to retriever_device (which may be None),
        # not to `device` — confirm this asymmetry is intended.
        self.document_index_device = document_index_device or retriever_device

        if reader_device is not None:
            logger.info(f"READER_DEVICE: {reader_device}")
        # NOTE(review): no fallback here — may remain None when unset.
        self.reader_device = reader_device

        logger.info(f"PRECISION: {precision}")
        self.precision = precision

        if retriever_precision is not None:
            logger.info(f"RETRIEVER_PRECISION: {retriever_precision}")
        self.retriever_precision = retriever_precision or precision

        if document_index_precision is not None:
            logger.info(f"INDEX_PRECISION: {document_index_precision}")
        self.document_index_precision = document_index_precision or precision

        if reader_precision is not None:
            logger.info(f"READER_PRECISION: {reader_precision}")
        self.reader_precision = reader_precision or precision

        logger.info(f"ANNOTATION_TYPE: {annotation_type}")
        self.annotation_type = annotation_type

        self.relik = Relik.from_pretrained(
            self.relik_pretrained,
            device=self.device,
            retriever_device=self.retriever_device,
            document_index_device=self.document_index_device,
            reader_device=self.reader_device,
            precision=self.precision,
            retriever_precision=self.retriever_precision,
            document_index_precision=self.document_index_precision,
            reader_precision=self.reader_precision,
            **self.relik_config_override,
        )

        self.retriever_batch_size = retriever_batch_size
        self.reader_batch_size = reader_batch_size

    # @serve.batch()
    async def handle_batch(self, text: List[str]) -> List:
        # Run the pipeline with the batch sizes fixed at init time.
        return self.relik(
            text,
            annotation_type=self.annotation_type,
            retriever_batch_size=self.retriever_batch_size,
            reader_batch_size=self.reader_batch_size,
        )

    @app.post("/api/relik")
    async def relik_endpoint(self, text: Union[str, List[str]]):
        try:
            # get predictions for the retriever
            return await self.handle_batch(text)
        except Exception as e:
            # log the entire stack trace
            logger.exception(e)
            raise HTTPException(status_code=500, detail=f"Server Error: {e}")
|
163 |
+
|
164 |
+
|
165 |
+
# Build the Ray Serve deployment, forwarding every field of the parameter
# manager dataclass as a keyword argument to RelikServer.__init__.
server = RelikServer.bind(**vars(SERVER_MANAGER))
|
relik/inference/serve/backend/utils.py
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
import os
from dataclasses import dataclass, field
|
4 |
+
|
5 |
+
|
6 |
+
@dataclass
|
7 |
+
class ServerParameterManager:
|
8 |
+
relik_pretrained: str = os.environ.get("RELIK_PRETRAINED", None)
|
9 |
+
device: str = os.environ.get("DEVICE", "cpu")
|
10 |
+
retriever_device: str | None = os.environ.get("RETRIEVER_DEVICE", None)
|
11 |
+
document_index_device: str | None = os.environ.get("INDEX_DEVICE", None)
|
12 |
+
reader_device: str | None = os.environ.get("READER_DEVICE", None)
|
13 |
+
precision: int | str | None = os.environ.get("PRECISION", "fp32")
|
14 |
+
retriever_precision: int | str | None = os.environ.get("RETRIEVER_PRECISION", None)
|
15 |
+
document_index_precision: int | str | None = os.environ.get("INDEX_PRECISION", None)
|
16 |
+
reader_precision: int | str | None = os.environ.get("READER_PRECISION", None)
|
17 |
+
annotation_type: str = os.environ.get("ANNOTATION_TYPE", "char")
|
18 |
+
question_encoder: str = os.environ.get("QUESTION_ENCODER", None)
|
19 |
+
passage_encoder: str = os.environ.get("PASSAGE_ENCODER", None)
|
20 |
+
document_index: str = os.environ.get("DOCUMENT_INDEX", None)
|
21 |
+
reader_encoder: str = os.environ.get("READER_ENCODER", None)
|
22 |
+
top_k: int = int(os.environ.get("TOP_K", 100))
|
23 |
+
use_faiss: bool = os.environ.get("USE_FAISS", False)
|
24 |
+
retriever_batch_size: int = int(os.environ.get("RETRIEVER_BATCH_SIZE", 32))
|
25 |
+
reader_batch_size: int = int(os.environ.get("READER_BATCH_SIZE", 32))
|
26 |
+
window_size: int = int(os.environ.get("WINDOW_SIZE", 32))
|
27 |
+
window_stride: int = int(os.environ.get("WINDOW_SIZE", 16))
|
28 |
+
split_on_spaces: bool = os.environ.get("SPLIT_ON_SPACES", False)
|
29 |
+
# relik_config_override: dict = ast.literal_eval(
|
30 |
+
# os.environ.get("RELIK_CONFIG_OVERRIDE", None)
|
31 |
+
# )
|
32 |
+
|
33 |
+
|
34 |
+
class RayParameterManager:
    """Ray Serve deployment parameters, read from environment variables."""

    def __init__(self) -> None:
        read = os.environ.get
        self.num_gpus = int(read("NUM_GPUS", 1))
        self.min_replicas = int(read("MIN_REPLICAS", 1))
        self.max_replicas = int(read("MAX_REPLICAS", 1))
|
relik/inference/serve/frontend/__init__.py
ADDED
File without changes
|
relik/inference/serve/frontend/relik_front.py
ADDED
@@ -0,0 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
import requests
|
5 |
+
import streamlit as st
|
6 |
+
from spacy import displacy
|
7 |
+
from streamlit_extras.badges import badge
|
8 |
+
from streamlit_extras.stylable_container import stylable_container
|
9 |
+
|
10 |
+
# bug fix: the default URL must include the scheme — `requests.post` raises
# requests.exceptions.MissingSchema for "localhost:8000/...". The RELIK env
# var can still override the full URL.
RELIK = os.getenv("RELIK", "http://localhost:8000/api/entities")
|
11 |
+
|
12 |
+
import random
|
13 |
+
|
14 |
+
|
15 |
+
def get_random_color(ents):
    """Assign each entity label a distinct pastel color, picked at random."""
    palette = generate_pastel_colors(len(ents))
    return {
        ent: palette.pop(random.randint(0, len(palette) - 1)) for ent in ents
    }
|
21 |
+
|
22 |
+
|
23 |
+
def floatrange(start, stop, steps):
    """Return `steps` evenly spaced floats from start to stop, inclusive."""
    if int(steps) == 1:
        return [stop]
    span = stop - start
    denom = float(steps) - 1
    return [start + float(i) * span / denom for i in range(steps)]
|
29 |
+
|
30 |
+
|
31 |
+
def hsl_to_rgb(h, s, l):
    """Convert an HSL color (components in [0, 1]) to an (r, g, b) byte tuple."""

    def channel(p, q, t):
        # Wrap the hue offset into [0, 1], then interpolate this channel.
        while t < 0.0:
            t += 1.0
        while t > 1.0:
            t -= 1.0
        if 6 * t < 1.0:
            return p + (q - p) * 6.0 * t
        if 2 * t < 1.0:
            return q
        if 3 * t < 2.0:
            return p + (q - p) * ((2.0 / 3.0) - t) * 6.0
        return p

    # if not (0 <= s <= 1): raise ValueError, "s (saturation) parameter must be between 0 and 1."
    # if not (0 <= l <= 1): raise ValueError, "l (lightness) parameter must be between 0 and 1."

    # Achromatic colors (s == 0) are pure grey.
    r = g = b = l * 255
    if s != 0.0:
        q = l * (1.0 + s) if l < 0.5 else (l + s) - (s * l)
        p = 2.0 * l - q
        r = 255 * channel(p, q, h + (1.0 / 3.0))
        g = 255 * channel(p, q, h)
        b = 255 * channel(p, q, h - (1.0 / 3.0))

    return int(round(r)), int(round(g)), int(round(b))
|
60 |
+
|
61 |
+
|
62 |
+
def generate_pastel_colors(n):
    """Return different pastel colours.

    Input:
        n (integer) : The number of colors to return

    Output:
        A list of colors in HTML notation (eg.['#cce0ff', '#ffcccc', '#ccffe0', '#f5ccff', '#f5ffcc'])

    Example:
        >>> print generate_pastel_colors(5)
        ['#cce0ff', '#f5ccff', '#ffcccc', '#f5ffcc', '#ccffe0']
    """
    if n == 0:
        return []

    # Sample points around the chromatic circle in HSL space
    # (see http://en.wikipedia.org/wiki/HSL_color_space).
    # n + 1 hues are generated and the last dropped, since hue 0 == hue 1.
    start_hue = 0.6  # 0=red 1/3=0.333=green 2/3=0.666=blue
    saturation = 1.0
    lightness = 0.8
    hues = floatrange(start_hue, start_hue + 1, n + 1)[:-1]
    return ["#%02x%02x%02x" % hsl_to_rgb(hue, saturation, lightness) for hue in hues]
|
89 |
+
|
90 |
+
|
91 |
+
def set_sidebar(css):
    """Render the Streamlit sidebar: inject the page CSS, show the
    SapienzaNLP logo and list the project and research-group links."""
    # Font-Awesome stylesheet plus an anchor template, filled in per link below.
    white_link_wrapper = "<link rel='stylesheet' href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css'><a href='{}'>{}</a>"
    with st.sidebar:
        st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
        st.image(
            "http://nlp.uniroma1.it/static/website/sapienza-nlp-logo-wh.svg",
            use_column_width=True,
        )
        st.markdown("## ReLiK")
        st.write(
            f"""
            - {white_link_wrapper.format("#", "<i class='fa-solid fa-file'></i> Paper")}
            - {white_link_wrapper.format("https://github.com/SapienzaNLP/relik", "<i class='fa-brands fa-github'></i> GitHub")}
            - {white_link_wrapper.format("https://hub.docker.com/repository/docker/sapienzanlp/relik", "<i class='fa-brands fa-docker'></i> Docker Hub")}
            """,
            unsafe_allow_html=True,
        )
        st.markdown("## Sapienza NLP")
        st.write(
            f"""
            - {white_link_wrapper.format("https://nlp.uniroma1.it", "<i class='fa-solid fa-globe'></i> Webpage")}
            - {white_link_wrapper.format("https://github.com/SapienzaNLP", "<i class='fa-brands fa-github'></i> GitHub")}
            - {white_link_wrapper.format("https://twitter.com/SapienzaNLP", "<i class='fa-brands fa-twitter'></i> Twitter")}
            - {white_link_wrapper.format("https://www.linkedin.com/company/79434450", "<i class='fa-brands fa-linkedin'></i> LinkedIn")}
            """,
            unsafe_allow_html=True,
        )
|
118 |
+
|
119 |
+
|
120 |
+
def get_el_annotations(response):
    """Prepare an API response for displacy manual rendering.

    Renames the "labels" key to "ents" (what displacy expects) and builds a
    render-options dict mapping each label to a pastel color.
    """
    # swap labels key with ents
    response["ents"] = response.pop("labels")
    seen_labels = {ent["label"] for ent in response["ents"]}
    render_options = {"ents": seen_labels, "colors": get_random_color(seen_labels)}
    return response, render_options
|
126 |
+
|
127 |
+
|
128 |
+
def set_intro(css):
    """Render the page header: title, paper subtitle and repo/PyPI badges."""
    # intro
    st.markdown("# ReLik")
    st.markdown(
        "### Retrieve, Read and LinK: Fast and Accurate Entity Linking and Relation Extraction on an Academic Budget"
    )
    # st.markdown(
    #     "This is a front-end for the paper [Universal Semantic Annotator: the First Unified API "
    #     "for WSD, SRL and Semantic Parsing](https://www.researchgate.net/publication/360671045_Universal_Semantic_Annotator_the_First_Unified_API_for_WSD_SRL_and_Semantic_Parsing), which will be presented at LREC 2022 by "
    #     "[Riccardo Orlando](https://riccorl.github.io), [Simone Conia](https://c-simone.github.io/), "
    #     "[Stefano Faralli](https://corsidilaurea.uniroma1.it/it/users/stefanofaralliuniroma1it), and [Roberto Navigli](https://www.diag.uniroma1.it/navigli/)."
    # )
    badge(type="github", name="sapienzanlp/relik")
    badge(type="pypi", name="relik")
|
142 |
+
|
143 |
+
|
144 |
+
def run_client():
    """Entry point for the entity-linking front-end.

    Renders the page, collects user text and sends it to the ReLiK HTTP API
    (the URL in the module-level RELIK constant), then displays the linked
    entities with displacy.
    """
    # page stylesheet shipped next to this module
    with open(Path(__file__).parent / "style.css") as f:
        css = f.read()

    st.set_page_config(
        page_title="ReLik",
        page_icon="🦮",
        layout="wide",
    )
    set_sidebar(css)
    set_intro(css)

    # text input
    text = st.text_area(
        "Enter Text Below:",
        value="Obama went to Rome for a quick vacation.",
        height=200,
        max_chars=500,
    )

    # brand-styled "Annotate" button
    with stylable_container(
        key="annotate_button",
        css_styles="""
            button {
                background-color: #802433;
                color: white;
                border-radius: 25px;
            }
            """,
    ):
        submit = st.button("Annotate")
    # submit = st.button("Run")

    # ReLik API call
    if submit:
        text = text.strip()
        if text:
            st.markdown("####")
            st.markdown("#### Entity Linking")
            with st.spinner(text="In progress"):
                # NOTE(review): the raw string is posted as the JSON body;
                # confirm the backend accepts a bare string payload.
                response = requests.post(RELIK, json=text)
                if response.status_code != 200:
                    st.error("Error: {}".format(response.status_code))
                else:
                    response = response.json()

                    # Entity Linking
                    # with stylable_container(
                    #     key="container_with_border",
                    #     css_styles="""
                    #         {
                    #             border: 1px solid rgba(49, 51, 63, 0.2);
                    #             border-radius: 0.5rem;
                    #             padding: 0.5rem;
                    #             padding-bottom: 2rem;
                    #         }
                    #         """,
                    # ):
                    # st.markdown("##")
                    dict_of_ents, options = get_el_annotations(response=response)
                    display = displacy.render(
                        dict_of_ents, manual=True, style="ent", options=options
                    )
                    # newlines break the embedded HTML rendering
                    display = display.replace("\n", " ")
                    # wsd_display = re.sub(
                    #     r"(wiki::\d+\w)",
                    #     r"<a href='https://babelnet.org/synset?id=\g<1>&orig=\g<1>&lang={}'>\g<1></a>".format(
                    #         language.upper()
                    #     ),
                    #     wsd_display,
                    # )
                    with st.container():
                        st.write(display, unsafe_allow_html=True)

                    st.markdown("####")
                    st.markdown("#### Relation Extraction")

                    # relation extraction not wired into this page yet
                    with st.container():
                        st.write("Coming :)", unsafe_allow_html=True)

        else:
            st.error("Please enter some text.")
|
226 |
+
|
227 |
+
|
228 |
+
# Script entry point: launch the Streamlit client.
if __name__ == "__main__":
    run_client()
|
relik/inference/serve/frontend/relik_re_front.py
ADDED
@@ -0,0 +1,251 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
from datetime import datetime as dt
|
3 |
+
from pathlib import Path
|
4 |
+
|
5 |
+
import requests
|
6 |
+
import spacy
|
7 |
+
import streamlit as st
|
8 |
+
import streamlit.components.v1 as components
|
9 |
+
from pyvis.network import Network
|
10 |
+
from spacy import displacy
|
11 |
+
from spacy.tokens import Doc
|
12 |
+
from streamlit_extras.badges import badge
|
13 |
+
from streamlit_extras.stylable_container import stylable_container
|
14 |
+
from utils import get_random_color, visualize_parser
|
15 |
+
|
16 |
+
from relik import Relik
|
17 |
+
|
18 |
+
# RELIK = os.getenv("RELIK", "localhost:8000/api/relik")
|
19 |
+
|
20 |
+
# Defaults for the Streamlit session-state keys managed by this page.
state_variables = {"has_run_free": False, "html_free": ""}
|
21 |
+
|
22 |
+
|
23 |
+
def init_state_variables():
    """Seed st.session_state with defaults for any managed key not yet set."""
    for key, default in state_variables.items():
        if key not in st.session_state:
            st.session_state[key] = default
|
27 |
+
|
28 |
+
|
29 |
+
def free_reset_session():
    """Remove every managed key from st.session_state."""
    for key in state_variables:
        del st.session_state[key]
|
32 |
+
|
33 |
+
|
34 |
+
def generate_graph(dict_ents, response, filename, options):
    """Render the extracted triples as an interactive pyvis network.

    Args:
        dict_ents: Mapping (start, end) -> (node id, surface text).
        response: ReLiK output whose `.triples` carry subject/object spans.
        filename: Path of the HTML file pyvis writes the graph to.
        options: displacy options; `options["colors"]` maps labels to colors.
    """
    g = Network(
        width="720px",
        height="600px",
        directed=True,
        notebook=False,
        bgcolor="#222222",
        font_color="white",
    )
    # physics settings tuned for small entity graphs
    g.barnes_hut(
        gravity=-3000,
        central_gravity=0.3,
        spring_length=50,
        spring_strength=0.001,
        damping=0.09,
        overlap=0,
    )
    for ent in dict_ents:
        # NOTE(review): node colors are looked up by dict_ents[ent][0]; verify
        # every node id actually appears in options["colors"] (the label keys
        # built in get_span_annotations differ for non-NME spans).
        g.add_node(
            dict_ents[ent][0],
            label=dict_ents[ent][1],
            color=options["colors"][dict_ents[ent][0]],
            title=dict_ents[ent][0],
            size=15,
            labelHighlightBold=True,
        )

    # one directed edge per extracted (subject, relation, object) triple
    for rel in response.triples:
        g.add_edge(
            dict_ents[(rel.subject.start, rel.subject.end)][0],
            dict_ents[(rel.object.start, rel.object.end)][0],
            label=rel.label,
            title=rel.label,
        )
    g.show(filename, notebook=False)
|
69 |
+
|
70 |
+
|
71 |
+
def set_sidebar(css):
    """Render the Streamlit sidebar: inject the page CSS, show the
    SapienzaNLP logo and list the project and research-group links."""
    # Font-Awesome stylesheet plus an anchor template, filled in per link below.
    white_link_wrapper = (
        "<link rel='stylesheet' "
        "href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css'><a href='{}'>{}</a>"
    )
    with st.sidebar:
        st.markdown(f"<style>{css}</style>", unsafe_allow_html=True)
        st.image(
            "http://nlp.uniroma1.it/static/website/sapienza-nlp-logo-wh.svg",
            use_column_width=True,
        )
        st.markdown("## ReLiK")
        st.write(
            f"""
            - {white_link_wrapper.format("#", "<i class='fa-solid fa-file'></i> Paper")}
            - {white_link_wrapper.format("https://github.com/SapienzaNLP/relik", "<i class='fa-brands fa-github'></i> GitHub")}
            - {white_link_wrapper.format("https://hub.docker.com/repository/docker/sapienzanlp/relik", "<i class='fa-brands fa-docker'></i> Docker Hub")}
            """,
            unsafe_allow_html=True,
        )
        st.markdown("## Sapienza NLP")
        st.write(
            f"""
            - {white_link_wrapper.format("https://nlp.uniroma1.it", "<i class='fa-solid fa-globe'></i> Webpage")}
            - {white_link_wrapper.format("https://github.com/SapienzaNLP", "<i class='fa-brands fa-github'></i> GitHub")}
            - {white_link_wrapper.format("https://twitter.com/SapienzaNLP", "<i class='fa-brands fa-twitter'></i> Twitter")}
            - {white_link_wrapper.format("https://www.linkedin.com/company/79434450", "<i class='fa-brands fa-linkedin'></i> LinkedIn")}
            """,
            unsafe_allow_html=True,
        )
|
101 |
+
|
102 |
+
|
103 |
+
def get_span_annotations(response):
    """Turn a ReLiK response into spacy-ready annotations.

    NME ("no matching entity") spans keep a bare ``label + index`` tag; linked
    spans embed an HTML Wikipedia anchor in the label so displacy renders a
    clickable link.

    Args:
        response: ReLiK output exposing `.tokens` and `.spans`
                  (each span has `start`, `end`, `label`).

    Returns:
        A tuple ``(tokens, labels, options, dict_ents)``: the token list, the
        per-token BIO labels, displacy render options, and a mapping
        (start, end) -> (unique node id, surface text) used for the graph.
    """
    el_link_wrapper = (
        "<link rel='stylesheet' "
        "href='https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.2/css/all.min.css'>"
        "<a href='https://en.wikipedia.org/wiki/{}' style='color: #414141'><i class='fa-brands"
        " fa-wikipedia-w fa-xs'></i> <span style='font-size: 1.0em; font-family: monospace'> "
        "{}</span></a>"
    )
    tokens = response.tokens
    labels = ["O"] * len(tokens)
    dict_ents = {}
    # make BIO labels
    for idx, span in enumerate(response.spans):
        labels[span.start] = (
            "B-" + span.label + str(idx)
            if span.label == "NME"
            else "B-" + el_link_wrapper.format(span.label.replace(" ", "_"), span.label)
        )
        # continuation tokens of the span get the matching I- tag
        for i in range(span.start + 1, span.end):
            labels[i] = (
                "I-" + span.label + str(idx)
                if span.label == "NME"
                else "I-"
                + el_link_wrapper.format(span.label.replace(" ", "_"), span.label)
            )
        # graph node: (unique id, surface text) keyed by the span's token range
        dict_ents[(span.start, span.end)] = (
            span.label + str(idx),
            " ".join(tokens[span.start : span.end]),
        )
    # strip the B-/I- prefix to get the set of distinct display labels
    unique_labels = set(w[2:] for w in labels if w != "O")
    options = {"ents": unique_labels, "colors": get_random_color(unique_labels)}
    return tokens, labels, options, dict_ents
|
135 |
+
|
136 |
+
|
137 |
+
@st.cache_resource()
def load_model():
    """Load the pretrained ReLiK relation-extraction model.

    `st.cache_resource` memoizes the model so Streamlit reruns and new
    sessions reuse the same instance instead of reloading it.
    """
    return Relik.from_pretrained("riccorl/relik-relation-extraction-nyt-small")
|
140 |
+
|
141 |
+
|
142 |
+
def set_intro(css):
    """Render the page header: title, paper subtitle and repo/PyPI badges."""
    # intro
    st.markdown("# ReLik")
    st.markdown(
        "### Retrieve, Read and LinK: Fast and Accurate Entity Linking "
        "and Relation Extraction on an Academic Budget"
    )
    # st.markdown(
    #     "This is a front-end for the paper [Universal Semantic Annotator: the First Unified API "
    #     "for WSD, SRL and Semantic Parsing](https://www.researchgate.net/publication/360671045_Universal
    #     _Semantic_Annotator_the_First_Unified_API_for_WSD_SRL_and_Semantic_Parsing),
    #     which will be presented at LREC 2022 by "
    #     "[Riccardo Orlando](https://riccorl.github.io), [Simone Conia](https://c-simone.github.io/), "
    #     "[Stefano Faralli](https://corsidilaurea.uniroma1.it/it/users/stefanofaralliuniroma1it),
    #     and [Roberto Navigli](https://www.diag.uniroma1.it/navigli/)."
    # )
    badge(type="github", name="sapienzanlp/relik")
    badge(type="pypi", name="relik")
|
159 |
+
badge(type="pypi", name="relik")
|
160 |
+
|
161 |
+
|
162 |
+
def run_client():
    """Entry point for the relation-extraction front-end.

    Renders the page, runs the locally loaded ReLiK model on the user's text,
    then shows the linked entities (displacy) and the relation graph (pyvis,
    embedded from a temporary HTML file).
    """
    # page stylesheet shipped next to this module
    with open(Path(__file__).parent / "style.css") as f:
        css = f.read()

    st.set_page_config(
        page_title="ReLik",
        page_icon="🦮",
        layout="wide",
    )
    set_sidebar(css)
    set_intro(css)

    # text input
    text = st.text_area(
        "Enter Text Below:",
        value="Michael Jordan was one of the best players in the NBA.",
        height=200,
        max_chars=1500,
    )

    # brand-styled "Annotate" button
    with stylable_container(
        key="annotate_button",
        css_styles="""
            button {
                background-color: #802433;
                color: white;
                border-radius: 25px;
            }
            """,
    ):
        submit = st.button("Annotate")

    # cache the model in the session so Streamlit reruns don't reload it
    if "relik_model" not in st.session_state.keys():
        st.session_state["relik_model"] = load_model()
    relik_model = st.session_state["relik_model"]
    init_state_variables()
    # ReLik API call

    # spacy for span visualization
    nlp = spacy.blank("xx")

    if submit:
        text = text.strip()
        if text:
            # unique temp-file name so concurrent sessions don't clash
            st.session_state["filename"] = str(dt.now().timestamp() * 1000) + ".html"

            with st.spinner(text="In progress"):
                response = relik_model(text, annotation_type="word", num_workers=0)

                # EL
                st.markdown("####")
                st.markdown("#### Entities")
                tokens, labels, options, dict_ents = get_span_annotations(
                    response=response
                )
                doc = Doc(nlp.vocab, words=tokens, ents=labels)
                display_el = displacy.render(doc, style="ent", options=options)
                display_el = display_el.replace("\n", " ")
                # heuristic, prevents split of annotation decorations
                display_el = display_el.replace(
                    "border-radius: 0.35em;",
                    "border-radius: 0.35em; white-space: nowrap;",
                )
                with st.container():
                    st.write(display_el, unsafe_allow_html=True)

                # RE
                generate_graph(
                    dict_ents, response, st.session_state["filename"], options
                )
                # bug fix: the pyvis HTML was read through a bare open() that
                # was never closed; use a context manager so the handle is
                # released before the file is deleted.
                with open(
                    st.session_state["filename"], "r", encoding="utf-8"
                ) as html_file:
                    st.session_state["html_free"] = html_file.read()
                os.remove(st.session_state["filename"])
                st.session_state["has_run_free"] = True
        else:
            st.error("Please enter some text.")

    # render the last computed graph even after unrelated reruns
    if st.session_state["has_run_free"]:
        st.markdown("#### Relations")
        components.html(st.session_state["html_free"], width=720, height=600)
|
248 |
+
|
249 |
+
|
250 |
+
# Script entry point: launch the Streamlit client.
if __name__ == "__main__":
    run_client()
|
relik/inference/serve/frontend/style.css
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/* Sidebar */
/* NOTE(review): class names like .eczjsme11 / .st-emotion-cache-* are
   Streamlit-generated and can change between Streamlit releases — confirm
   against the deployed version. */
.eczjsme11 {
    /* SapienzaNLP brand red for the sidebar background */
    background-color: #802433;
}

.st-emotion-cache-10oheav h2 {
    color: white;
}

.st-emotion-cache-10oheav li {
    color: white;
}

/* Main */
/* Keep links white and undecorated in every state. */
a:link {
    text-decoration: none;
    color: white;
}

a:visited {
    text-decoration: none;
    color: white;
}

a:hover {
    text-decoration: none;
    color: rgba(255, 255, 255, 0.871);
}

a:active {
    text-decoration: none;
    color: white;
}
|
relik/inference/serve/frontend/utils.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
import random
|
3 |
+
from typing import Dict, List, Optional, Union
|
4 |
+
|
5 |
+
import spacy
|
6 |
+
import streamlit as st
|
7 |
+
from spacy import displacy
|
8 |
+
|
9 |
+
|
10 |
+
def get_html(html: str):
    """Wrap raw HTML in a scrollable, bordered container for rendering."""
    WRAPPER = """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>"""
    # Newlines seem to mess with the rendering
    return WRAPPER.format(html.replace("\n", " "))
|
16 |
+
|
17 |
+
|
18 |
+
def get_svg(svg: str, style: str = "", wrap: bool = True):
    """Embed an SVG string as a base64-encoded <img> tag, optionally wrapped."""
    encoded = base64.b64encode(svg.encode("utf-8")).decode("utf-8")
    img = f'<img src="data:image/svg+xml;base64,{encoded}" style="{style}"/>'
    if wrap:
        return get_html(img)
    return img
|
23 |
+
|
24 |
+
|
25 |
+
def visualize_parser(
    doc: Union[spacy.tokens.Doc, List[Dict[str, str]]],
    *,
    title: Optional[str] = None,
    key: Optional[str] = None,
    manual: bool = False,
    displacy_options: Optional[Dict] = None,
) -> None:
    """Visualizer for dependency parses.

    doc (Doc, List): The document to visualize.
    key (str): Key used for the streamlit component for selecting labels.
    title (str): The title displayed at the top of the parser visualization.
    manual (bool): Flag signifying whether the doc argument is a Doc object or a List of Dicts containing parse information.
    displacy_options (Dict): Dictionary of options to be passed to the displacy render method for generating the HTML to be rendered.
        See: https://spacy.io/api/top-level#options-dep
    """
    # NOTE(review): `key` is currently unused in this body — confirm intent.
    if displacy_options is None:
        displacy_options = dict()
    if title:
        st.header(title)
    # render the single document (kept as a list for the loop below)
    docs = [doc]
    # add selected options to options provided by user
    # `options` from `displacy_options` are overwritten by user provided
    # options from the checkboxes
    for sent in docs:
        html = displacy.render(
            sent, options=displacy_options, style="dep", manual=manual
        )
        # Double newlines seem to mess with the rendering
        html = html.replace("\n\n", "\n")
        st.write(get_svg(html), unsafe_allow_html=True)
|
57 |
+
|
58 |
+
|
59 |
+
def get_random_color(ents):
    """Assign each entity label a distinct pastel color, picked at random."""
    palette = generate_pastel_colors(len(ents))
    return {
        ent: palette.pop(random.randint(0, len(palette) - 1)) for ent in ents
    }
|
65 |
+
|
66 |
+
|
67 |
+
def floatrange(start, stop, steps):
    """Return `steps` evenly spaced floats from start to stop, inclusive."""
    if int(steps) == 1:
        return [stop]
    span = stop - start
    denom = float(steps) - 1
    return [start + float(i) * span / denom for i in range(steps)]
|
73 |
+
|
74 |
+
|
75 |
+
def hsl_to_rgb(h, s, l):
    """Convert an HSL color (components in [0, 1]) to an (r, g, b) byte tuple."""

    def channel(p, q, t):
        # Wrap the hue offset into [0, 1], then interpolate this channel.
        while t < 0.0:
            t += 1.0
        while t > 1.0:
            t -= 1.0
        if 6 * t < 1.0:
            return p + (q - p) * 6.0 * t
        if 2 * t < 1.0:
            return q
        if 3 * t < 2.0:
            return p + (q - p) * ((2.0 / 3.0) - t) * 6.0
        return p

    # if not (0 <= s <= 1): raise ValueError, "s (saturation) parameter must be between 0 and 1."
    # if not (0 <= l <= 1): raise ValueError, "l (lightness) parameter must be between 0 and 1."

    # Achromatic colors (s == 0) are pure grey.
    r = g = b = l * 255
    if s != 0.0:
        q = l * (1.0 + s) if l < 0.5 else (l + s) - (s * l)
        p = 2.0 * l - q
        r = 255 * channel(p, q, h + (1.0 / 3.0))
        g = 255 * channel(p, q, h)
        b = 255 * channel(p, q, h - (1.0 / 3.0))

    return int(round(r)), int(round(g)), int(round(b))
|
104 |
+
|
105 |
+
|
106 |
+
def generate_pastel_colors(n):
    """Return different pastel colours.

    Input:
        n (integer) : The number of colors to return

    Output:
        A list of colors in HTML notation (eg.['#cce0ff', '#ffcccc', '#ccffe0', '#f5ccff', '#f5ffcc'])

    Example:
        >>> print generate_pastel_colors(5)
        ['#cce0ff', '#f5ccff', '#ffcccc', '#f5ffcc', '#ccffe0']
    """
    if n == 0:
        return []

    # Sample points around the chromatic circle in HSL space
    # (see http://en.wikipedia.org/wiki/HSL_color_space).
    # n + 1 hues are generated and the last dropped, since hue 0 == hue 1.
    start_hue = 0.0  # 0=red 1/3=0.333=green 2/3=0.666=blue
    saturation = 1.0
    lightness = 0.9
    hues = floatrange(start_hue, start_hue + 1, n + 1)[:-1]
    return ["#%02x%02x%02x" % hsl_to_rgb(hue, saturation, lightness) for hue in hues]
|