Upload folder using huggingface_hub
- .gitattributes +4 -34
- .gitignore +6 -0
- LICENSE +21 -0
- README.md +123 -3
- doc/Polaris.png +3 -0
- doc/logo.png +3 -0
- doc/world_logo.jpg +0 -0
- example/APA/APA_example.ipynb +69 -0
- example/APA/GM12878_250M_chr151617_loops.pileup.png +0 -0
- example/CLI_walkthrough.ipynb +171 -0
- example/README.md +42 -0
- example/loop_annotation/GM12878_250M.bcool +3 -0
- example/loop_annotation/GM12878_250M_chr151617_loop_score.bedpe +0 -0
- example/loop_annotation/GM12878_250M_chr151617_loops.bedpe +0 -0
- example/loop_annotation/GM12878_250M_chr151617_loops_method2.bedpe +0 -0
- example/loop_annotation/loop_annotation.ipynb +262 -0
- polaris/loop.py +306 -0
- polaris/loopDev.py +200 -0
- polaris/loopPool.py +178 -0
- polaris/loopPool.py.bak +178 -0
- polaris/loopPool_proof_wang_duplicate.py.bak +192 -0
- polaris/loopScore.py +129 -0
- polaris/model/polarisnet.py +526 -0
- polaris/model/sft_loop.pt +3 -0
- polaris/polaris.py +54 -0
- polaris/utils/util_bcooler.py +347 -0
- polaris/utils/util_cool2bcool.py +88 -0
- polaris/utils/util_data.py +159 -0
- polaris/utils/util_depth.py +57 -0
- polaris/utils/util_loop.py +12 -0
- polaris/utils/util_pileup.py +95 -0
- polaris/version.py +1 -0
- setup.py +56 -0
.gitattributes
CHANGED
@@ -1,35 +1,5 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/*.pb filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.bcool filter=lfs diff=lfs merge=lfs -text
+*.mcool filter=lfs diff=lfs merge=lfs -text
+doc/Polaris.png filter=lfs diff=lfs merge=lfs -text
+doc/logo.png filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,6 @@
build/*
dist/*
docs/*
polaris.egg-info/*
polaris/**/__pycache__/
requirements.txt
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 ai4nucleome

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md
CHANGED
@@ -1,3 +1,123 @@
<img src="./doc/logo.png" alt="Polaris" title="Polaris" width="400">

# A Versatile Framework for Chromatin Loop Annotation in Bulk and Single-cell Hi-C Data

<a href="https://github.com/ai4nucleome/Polaris/releases/latest">
    <img src="https://img.shields.io/badge/Polaris-v1.0.0-green">
    <img src="https://img.shields.io/badge/platform-Linux%20%7C%20Mac%20-green">
    <img src="https://img.shields.io/badge/Language-python3-green">
    <!-- <img src="https://img.shields.io/badge/dependencies-tested-green"> -->
</a>


🌟 **Polaris** is a versatile and efficient command-line tool tailored for rapid and accurate chromatin loop detection from contact maps generated by various assays, including bulk Hi-C, scHi-C, Micro-C, and DNA SPRITE. Polaris is particularly well suited for analyzing **sparse scHi-C data and low-coverage datasets**.

<div style="text-align: center;">
    <img src="./doc/Polaris.png" alt="Polaris Model" title="Polaris Model" width="600">
</div>


- Usage examples for single-cell and bulk Hi-C loop annotation are in the [**example folder**](https://github.com/ai4nucleome/Polaris/tree/master/example).
- The scripts and data to **reproduce our analysis** can be found at: [**Polaris Reproducibility**](https://zenodo.org/records/14294273).

> ❗️<b>NOTE❗️:</b> We suggest users run Polaris on <b>GPU</b>.
> You can run Polaris on CPU for loop annotation, but it is much slower than on GPU.

> ❗️**NOTE❗️:** If you encounter a `CUDA OUT OF MEMORY` error, please:
> - Check your GPU's status and available memory.
> - Reduce the `--batchsize` parameter. (The default value of 128 requires approximately 36GB of CUDA memory; setting it to 24 reduces the requirement to less than 10GB.)

## Documentation
📝 **Extensive documentation** can be found at: [Polaris Doc](https://nucleome-polaris.readthedocs.io/en/latest/).

## Installation
Polaris is developed and tested on Linux machines with Python 3.9 and relies on several libraries, including PyTorch and SciPy.
We **strongly recommend** that you install Polaris in a virtual environment.

We suggest using [conda](https://anaconda.org/) to create a virtual environment for it (it should also work without conda, i.e. with pip alone). You can run the command snippets below to install Polaris:

```bash
git clone https://github.com/ai4nucleome/Polaris.git
cd Polaris
conda create -n polaris python=3.9
conda activate polaris
```
-------
### ❗️Important Note❗️: Downloading Polaris Network Weights

The Polaris repository uses Git Large File Storage (Git-LFS) to host its pre-trained model weight files. A standard `git clone` **will not** automatically download these large files unless Git-LFS is installed and configured.

To resolve this, please follow one of the methods below:

#### Method 1: Manual Download via Browser

1. Directly download the pre-trained model weights (`sft_loop.pt`) from the [Polaris model directory](https://github.com/ai4nucleome/Polaris/blob/master/polaris/model/sft_loop.pt).
2. Save the file to the directory:
```bash
Polaris/polaris/model/
```
#### Method 2: Install Git-LFS
1. Install Git-LFS by following the official instructions: [Git-LFS Installation Guide](https://git-lfs.com/).

2. After installation, either:

   Re-clone the repository:

   ```bash
   git clone https://github.com/ai4nucleome/Polaris.git
   ```
   OR, if the repository is already cloned, run:

   ```bash
   git lfs pull
   ```
   This ensures all large files, including model weights, are retrieved.
----------
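One extra step that is easy to miss: after installing the Git-LFS binary itself, it usually needs a one-time `git lfs install` to register its Git filters before `git lfs pull` will check out the large files. A minimal sketch:

```bash
git lfs install   # one-time setup that registers the Git-LFS filters
git lfs pull      # fetch the LFS-tracked files, including sft_loop.pt
```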

Install [PyTorch](https://pytorch.org/get-started/locally/) as described on their website. Depending on your CUDA version, the command might be:

```bash
pip install torch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 --index-url https://download.pytorch.org/whl/cu121
```
Install Polaris:
```bash
pip install --use-pep517 --editable .
```
If this fails, please try `python setup.py build` and `python setup.py install` first.

The installation requires network access to download libraries. It usually finishes within 5 minutes; it takes longer if network access is slow and/or unstable.

## Quick Start for Loop Annotation
```bash
polaris loop pred -i [input mcool file] -o [output path of annotated loops]
```
It outputs predicted loops from the input contact map at 5kb resolution.
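A fuller invocation, sketched with illustrative file names (the options used here, `--chrom`, `--batchsize`, and `--resol`, are the ones declared by `polaris loop pred` in `polaris/loop.py` below):

```bash
# Restrict the run to selected chromosomes and lower the batch size for a
# smaller GPU; sample.mcool and sample_loops.bedpe are placeholder names.
polaris loop pred --chrom chr1,chr2 --batchsize 24 --resol 5000 \
    -i sample.mcool -o sample_loops.bedpe
```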
### Output format
It contains tab-separated fields as follows:
```
Chr1    Start1    End1    Chr2    Start2    End2    Score
```
| Field         |                                  Detail                                  |
|:-------------:|:-------------------------------------------------------------------------:|
| Chr1/Chr2     | chromosome names                                                           |
| Start1/Start2 | start genomic coordinates                                                  |
| End1/End2     | end genomic coordinates (i.e. End1 = Start1 + resol)                       |
| Score         | Polaris's loop score [0~1]                                                 |

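As a minimal sketch of working with this output (the `.bedpe` file has no header row, so the column names below are our own labels taken from the field table; `sample_loops.bedpe` is a placeholder name):

```python
import pandas as pd

# The .bedpe output has no header; assign the seven documented fields ourselves.
cols = ["Chr1", "Start1", "End1", "Chr2", "Start2", "End2", "Score"]
loops = pd.read_csv("sample_loops.bedpe", sep="\t", header=None, names=cols)

# Example: keep only high-confidence loops and compute anchor separation
confident = loops[loops["Score"] > 0.9]
confident = confident.assign(distance=confident["Start2"] - confident["Start1"])
print(confident.head())
```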
## Citation
Yusen Hou, Audrey Baguette, Mathieu Blanchette*, & Yanlin Zhang*. __A versatile tool for chromatin loop annotation in bulk and single-cell Hi-C data__. _bioRxiv_, 2024. [Paper](https://doi.org/10.1101/2024.12.24.630215)
<br>
```
@article{Hou2024Polaris,
    title   = {A versatile tool for chromatin loop annotation in bulk and single-cell Hi-C data},
    author  = {Yusen Hou and Audrey Baguette and Mathieu Blanchette and Yanlin Zhang},
    journal = {bioRxiv},
    year    = {2024},
}
```

## 📩 Contact
A GitHub issue is preferable for all problems related to using Polaris.

For other concerns, please email Yusen Hou or Yanlin Zhang ([email protected], [email protected]).
doc/Polaris.png
ADDED
Git LFS Details

doc/logo.png
ADDED
Git LFS Details

doc/world_logo.jpg
ADDED
example/APA/APA_example.ipynb
ADDED
@@ -0,0 +1,69 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# APA Analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After detecting chromatin loops using Polaris, we can use Aggregated Peak Analysis (APA) to visualize the results and assess their quality. This approach allows us to aggregate the detected loops across different genomic regions and observe their behavior in a visually intuitive way."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can run the following command to get a quick check of loops detected by Polaris."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash \n",
    "\n",
    "polaris util pileup --savefig \"./GM12878_250M_chr151617_loops.pileup.png\" --p2ll True \"../loop_annotation/GM12878_250M_chr151617_loops.bedpe\" \"../loop_annotation/GM12878_250M.bcool\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The result will be saved at `\"./GM12878_250M_chr151617_loops.pileup.png\"`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "polaris",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
example/APA/GM12878_250M_chr151617_loops.pileup.png
ADDED
example/CLI_walkthrough.ipynb
ADDED
@@ -0,0 +1,171 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Polaris command line interface"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If you type `polaris` at the command line with no arguments or with `--help`, you'll get the following quick reference of available subcommands."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Usage: polaris [OPTIONS] COMMAND [ARGS]...\n",
      "\n",
      "  Polaris\n",
      "\n",
      "  A Versatile Tool for Chromatin Loop Annotation in Bulk and Single-cell Hi-C\n",
      "  Data\n",
      "\n",
      "Options:\n",
      "  --help  Show this message and exit.\n",
      "\n",
      "Commands:\n",
      "  loop  Loop annotation.\n",
      "  util  Utilities.\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "polaris --help"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## polaris subcommands\n",
    "\n",
    "For more information about a specific subcommand, type `polaris <subcommand> --help` to display the help text."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Polaris loop"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Usage: polaris loop [OPTIONS] COMMAND [ARGS]...\n",
      "\n",
      "  Loop annotation.\n",
      "\n",
      "  Annotate loops from chromosomal contact maps.\n",
      "\n",
      "Options:\n",
      "  --help  Show this message and exit.\n",
      "\n",
      "Commands:\n",
      "  dev    *development function* Coming soon...\n",
      "  pool   Call loops from loop candidates by clustering\n",
      "  pred   Predict loops from input contact map directly\n",
      "  score  Predict loop score for each pixel in the input contact map\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "polaris loop --help"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Polaris util"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Usage: polaris util [OPTIONS] COMMAND [ARGS]...\n",
      "\n",
      "  Utilities.\n",
      "\n",
      "  Utilities for analysis and visualization.\n",
      "\n",
      "Options:\n",
      "  --help  Show this message and exit.\n",
      "\n",
      "Commands:\n",
      "  cool2bcool  covert a .mcool file to a .bcool file\n",
      "  pileup      2D pileup contact maps around given foci\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "polaris util --help"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Detailed Instructions\n",
    "\n",
    "For detailed instructions for each subcommand, please refer to [Polaris Doc](https://nucleome-polaris.readthedocs.io/en/latest/) and these tutorials:\n",
    "- [Loop Annotation tutorial](https://github.com/ai4nucleome/Polaris/blob/master/example/loop_annotation/loop_annotation.ipynb)\n",
    "- [Aggregated Peak Analysis tutorial](https://github.com/ai4nucleome/Polaris/blob/master/example/APA/APA_example.ipynb)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "polaris",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
example/README.md
ADDED
@@ -0,0 +1,42 @@
# Example Use of Loop Annotation and APA

This folder contains two subfolders that showcase example results of **Polaris** on loop prediction and aggregated peak analysis.

You can re-run **Polaris** to reproduce these results by following the commands provided in the sections below.

> **Note:** If you encounter a `CUDA OUT OF MEMORY` error, please:
> - Check your GPU's status and available memory.
> - Reduce the `--batchsize` parameter. (The default value of 128 requires approximately 36GB of CUDA memory; setting it to 24 reduces the requirement to less than 10GB.)

## Loop Prediction on GM12878 (250M Valid Read Pairs)

```bash
polaris loop pred --chrom chr15,chr16,chr17 -i ./loop_annotation/GM12878_250M.bcool -o ./loop_annotation/GM12878_250M_chr151617_loops.bedpe
```

The [loop_annotation](https://github.com/compbiodsa/Polaris/tree/master/example/loop_annotation) sub-folder contains the results on bulk Hi-C data of GM12878 (250M valid read pairs).


## APA of Loops Detected by Polaris

```bash
polaris util pileup --savefig ./APA/GM12878_250M_chr151617_loops.pileup.png --p2ll True ./loop_annotation/GM12878_250M_chr151617_loops.bedpe ./loop_annotation/GM12878_250M.bcool
```

The [APA](https://github.com/compbiodsa/Polaris/tree/master/example/APA) sub-folder contains the Aggregate Peak Analysis result of loops detected on GM12878 (250M valid read pairs) by Polaris.

<div style="text-align: center;">
<figure>
<img src="./APA/GM12878_250M_chr151617_loops.pileup.png"
     alt="GM12878_250M_chr151617_loops"
     title="GM12878_250M_chr151617_loops"
     width="150">
<figcaption>APA of loops on GM12878 (250M Valid Read Pairs)</figcaption>
</figure>
</div>

---
- **Extensive documentation** can be found at: [Polaris Documentation](https://nucleome-polaris.readthedocs.io/en/latest/).
- You can find more detailed tutorials in the **Jupyter Notebooks located within the respective subfolders**.
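For context (our gloss, not from the repository's own docs): `--p2ll True` requests the standard APA "peak to lower-left" statistic, i.e. the center pixel of the aggregated matrix divided by the mean of a small block in its lower-left corner, so values well above 1 indicate focal enrichment at the loop anchors. A toy sketch of that ratio, assuming a square pileup matrix `apa` (the corner orientation depends on plotting convention):

```python
import numpy as np

def p2ll(apa: np.ndarray, corner: int = 3) -> float:
    """Peak-to-lower-left ratio: center pixel over the mean of a
    corner x corner block in the lower-left of the APA matrix."""
    c = apa.shape[0] // 2
    lower_left = apa[-corner:, :corner]  # bottom rows, left columns
    return float(apa[c, c] / lower_left.mean())

apa = np.ones((21, 21))
apa[10, 10] = 5.0          # an enriched center pixel
print(p2ll(apa))           # -> 5.0
```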
example/loop_annotation/GM12878_250M.bcool
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:962c8dbb130eb024d9d931cf50ace2f1adff8a84bdbf023f6d3770d27842212d
size 70088396
example/loop_annotation/GM12878_250M_chr151617_loop_score.bedpe
ADDED
The diff for this file is too large to render.
See raw diff
example/loop_annotation/GM12878_250M_chr151617_loops.bedpe
ADDED
The diff for this file is too large to render.
See raw diff
example/loop_annotation/GM12878_250M_chr151617_loops_method2.bedpe
ADDED
The diff for this file is too large to render.
See raw diff
example/loop_annotation/loop_annotation.ipynb
ADDED
@@ -0,0 +1,262 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loop Annotation"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Input files"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Polaris requires a `.mcool` file as input. You can obtain `.mcool` files in the following ways:"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Download from the 4DN Database\n",
    "\n",
    "- Visit the [4DN Data Portal](https://data.4dnucleome.org/).\n",
    "- Search for and download `.mcool` files suitable for your study."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Convert Files Using cooler\n",
    "\n",
    "If you have data in formats such as `.pairs` or `.cool`, you can convert them to `.mcool` format using the Python library [cooler](https://cooler.readthedocs.io/en/latest/index.html). Follow these steps:\n",
    "\n",
    "- **Install cooler**\n",
    "\n",
    "  Ensure you have installed cooler using the following command:\n",
    "  ```bash\n",
    "  pip install cooler\n",
    "  ```\n",
    "- **Convert .pairs to .cool**\n",
    "\n",
    "  If you are starting with a .pairs file (e.g., normalized contact data with columns for chrom1, pos1, chrom2, pos2), use this command to create a .cool file:\n",
    "  ```bash\n",
    "  cooler cload pairs --assembly <genome_version> -c1 chrom1 -p1 pos1 -c2 chrom2 -p2 pos2 <pairs_file> <resolution>.cool\n",
    "  ```\n",
    "  Replace `<genome_version>` with the appropriate genome assembly (e.g., hg38) and `<resolution>` with the desired bin size in base pairs.\n",
    "- **Generate a Multiresolution .mcool File**\n",
    "\n",
    "  To convert a single-resolution .cool file into a multiresolution .mcool file, use the following command:\n",
    "\n",
    "  ```bash\n",
    "  cooler zoomify <input.cool>\n",
    "  ```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The resulting `.mcool` file can be directly used as input for Polaris."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loop Annotation by Polaris"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Polaris provides two methods to generate loop annotations for an input `.mcool` file. Both methods ultimately yield consistent loop results."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Method 1: polaris loop pred\n",
    "\n",
    "This is the simplest approach, allowing you to directly predict loops in a single step.\n",
    "The command below will take approximately 30 seconds, depending on your device, to identify loops in GM12878 data (250M valid read pairs)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "use gping cuda:0\n",
      "\n",
      "Analysing chroms: ['chr15', 'chr16', 'chr17']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[analyzing chr17]: 100%|██████████| 3/3 [00:24<00:00,  8.31s/it]\n",
      "[Runing clustering on chr15]: 100%|██████████| 3/3 [00:01<00:00,  1.87it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1830 loops saved to GM12878_250M_chr151617_loops.bedpe\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "polaris loop pred --chrom chr15,chr16,chr17 -i GM12878_250M.bcool -o GM12878_250M_chr151617_loops.bedpe "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> **Note:** If you encounter a `CUDA OUT OF MEMORY` error, please:\n",
    "> - Check your GPU's status and available memory.\n",
    "> - Reduce the `--batchsize` parameter. (The default value of 128 requires approximately 36GB of CUDA memory; setting it to 24 reduces the requirement to less than 10GB.)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Method 2: polaris loop score and polaris loop pool"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This method involves two steps: generating loop scores for each pixel in the contact map and clustering these scores to call loops.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Step 1: Generate Loop Scores**\n",
    "\n",
    "Run the following command to calculate the loop score for each pixel in the input contact map and save the result in `GM12878_250M_chr151617_loop_score.bedpe`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "use gping cuda:0\n",
      "\n",
      "Analysing chroms: ['chr15', 'chr16', 'chr17']\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[analyzing chr17]: 100%|██████████| 3/3 [00:34<00:00, 11.37s/it]\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "polaris loop score --chrom chr15,chr16,chr17 -i GM12878_250M.bcool -o GM12878_250M_chr151617_loop_score.bedpe "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Step 2: Call Loops from Loop Candidates**\n",
    "\n",
    "Use the following command to identify loops by clustering from the generated loop score file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Runing clustering on chr16]: 100%|██████████| 3/3 [00:01<00:00,  1.72it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1830 loops saved to GM12878_250M_chr151617_loops_method2.bedpe\n"
     ]
    }
   ],
   "source": [
    "%%bash\n",
    "\n",
    "polaris loop pool -i GM12878_250M_chr151617_loop_score.bedpe -o GM12878_250M_chr151617_loops_method2.bedpe "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see both methods ultimately yield the same number of loops.\n",
    "\n",
    "Then we can perform [Aggregate Peak Analysis](https://github.com/ai4nucleome/Polaris/blob/master/example/APA/APA_example.ipynb) to visualize these results."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "polaris",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.20"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
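A side note on the conversion steps in the notebook above: in cooler's documented CLI, `cooler cload pairs` also takes a bins specification (`chrom.sizes:binsize`) before the pairs file, and its `-c1/-p1/-c2/-p2` options are one-based column numbers. A concrete end-to-end sketch with hypothetical file names:

```bash
# Hypothetical inputs: sample.pairs.gz aligned to hg38; column numbers assume
# a four-column pairs file (chrom1 pos1 chrom2 pos2); bins are 5 kb.
cooler cload pairs --assembly hg38 -c1 1 -p1 2 -c2 3 -p2 4 \
    hg38.chrom.sizes:5000 sample.pairs.gz sample.5000.cool

# Build a multiresolution .mcool; -r lists the resolutions to generate.
cooler zoomify -r 5000,10000,25000 -o sample.mcool sample.5000.cool
```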
polaris/loop.py
ADDED
@@ -0,0 +1,306 @@
import sys
import torch
import cooler
import click
import numpy as np
import pandas as pd
from importlib_resources import files

from torch import nn
from tqdm import tqdm
from torch.cuda.amp import autocast
from torch.utils.data import DataLoader

from sklearn.neighbors import KDTree
from polaris.model.polarisnet import polarisnet
from polaris.utils.util_data import centerPredCoolDataset

def rhoDelta(data, resol, dc, radius):
    # Remove isolated candidates: keep pixels with more than 5 neighbors within `radius` bins
    pos = data[[1, 4]].to_numpy() // resol
    posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
    NNindexes, NNdists = posTree.query_radius(pos, r=radius, return_distance=True)
    _l = []
    for v in NNindexes:
        _l.append(len(v))
    _l = np.asarray(_l)
    data = data[_l > 5].reset_index(drop=True)

    if data.shape[0] != 0:
        pos = data[[1, 4]].to_numpy() // resol
        val = data[6].to_numpy()

        try:
            posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
            NNindexes, NNdists = posTree.query_radius(pos, r=dc, return_distance=True)
        except ValueError as e:
            if "Found array with 0 sample(s)" in str(e):
                print("#"*88, '\n#')
                print("#\033[91m Error!!! The data is too sparse. Please increase the value of: [t]\033[0m\n#")
                print("#"*88, '\n')
                sys.exit(1)
            else:
                raise

        # Local density rho: Gaussian-kernel-weighted sum of loop scores within dc bins
        rhos = []
        for i in range(len(NNindexes)):
            rhos.append(np.dot(np.exp(-(NNdists[i] / dc) ** 2), val[NNindexes[i]]))
        rhos = np.asarray(rhos)

        # Delta: distance to the nearest candidate of strictly higher density,
        # growing the search radius until every point finds one
        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        deltas = rhos * 0
        LargerNei = rhos * 0 - 1
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                deltas[i] = _dists[i][-1] + 1
            else:
                LargerNei[i] = _indexes[i][idx[0]]
                deltas[i] = _dists[i][idx[0]]
        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] == 0:
                    deltas[failed[i]] = _dists[i][-1] + 1
                else:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
                    deltas[failed[i]] = _dists[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()

        data['rhos'] = rhos
        data['deltas'] = deltas
    else:
        data['rhos'] = []
        data['deltas'] = []

    return data

def pool(data, dc, resol, mindelta, t, output, radius, refine=True):
    ccs = set(data.iloc[:, 0])

    if data.shape[0] == 0:
        print("#"*88, '\n#')
        print("#\033[91m Error!!! The file is empty. Please check your file.\033[0m\n#")
        print("#"*88, '\n')
        sys.exit(1)
    # Keep candidates above the score threshold and more than 11 bins from the diagonal
    data = data[data[6] > t].reset_index(drop=True)
    data = data[data[4] - data[1] > 11 * resol].reset_index(drop=True)
    if data.shape[0] == 0:
        print("#"*88, '\n#')
        print("#\033[91m Error!!! The data is too sparse. Please decrease: [threshold] (minimum: 0.5).\033[0m\n#")
        print("#"*88, '\n')
        sys.exit(1)
    data[['rhos', 'deltas']] = 0
    data = data.groupby([0]).apply(rhoDelta, resol=resol, dc=dc, radius=radius).reset_index(drop=True)
    minrho = 0
    targetData = data.reset_index(drop=True)

    loopPds = []
    chroms = tqdm(set(targetData[0]), dynamic_ncols=True)
    for chrom in chroms:
        chroms.desc = f"[Running clustering on {chrom}]"
        data = targetData[targetData[0] == chrom].reset_index(drop=True)

        pos = data[[1, 4]].to_numpy() // resol
        posTree = KDTree(pos, leaf_size=30, metric='chebyshev')

        rhos = data['rhos'].to_numpy()
        deltas = data['deltas'].to_numpy()
        # Cluster centroids: high density and sufficiently far from any denser point
        centroid = np.argwhere((rhos > minrho) & (deltas > mindelta)).flatten()

        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        LargerNei = rhos * 0 - 1
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                pass
            else:
                LargerNei[i] = _indexes[i][idx[0]]

        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] == 0:
                    pass
                else:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()

        # Assign every point the label of its nearest higher-density neighbor,
        # descending through densities so parents are labeled first
        LargerNei = LargerNei.astype(int)
        label = LargerNei * 0 - 1
        for i in range(len(centroid)):
            label[centroid[i]] = i
        decreasingsortedIdxRhos = np.argsort(-rhos)
        for i in decreasingsortedIdxRhos:
            if label[i] == -1:
                label[i] = label[LargerNei[i]]

        # Refine each cluster to its highest-scoring pixel
        val = data[6].to_numpy()
        refinedLoop = []
        label = label.flatten()
        for l in set(label):
            idx = np.argwhere(label == l).flatten()
            if len(idx) > 0:
                refinedLoop.append(idx[np.argmax(val[idx])])
        if refine:
            loopPds.append(data.loc[refinedLoop])
        else:
            loopPds.append(data.loc[centroid])

    loopPd = pd.concat(loopPds).sort_values(6, ascending=False)
    loopPd[[1, 2, 4, 5]] = loopPd[[1, 2, 4, 5]].astype(int)
    loopPd[[0, 1, 2, 3, 4, 5, 6]].to_csv(output, sep='\t', header=False, index=False)

    ccs_ = set(loopPd.iloc[:, 0])
    badc = ccs.difference(ccs_)

    return len(loopPd), badc, ccs


@click.command()
@click.option('-b', '--batchsize', type=int, default=128, help='Batch size [128]')
@click.option('-C', '--cpu', type=bool, default=False, help='Use CPU [False]')
@click.option('-G', '--gpu', type=str, default=None, help='Comma-separated GPU indices [auto select]')
@click.option('-c', '--chrom', type=str, default=None, help='Comma separated chroms [all autosomes]')
@click.option('-nw', '--workers', type=int, default=16, help='Number of cpu threads [16]')
@click.option('-t', '--threshold', type=float, default=0.6, help='Loop Score Threshold [0.6]')
@click.option('-s', '--sparsity', type=float, default=0.9, help='Allowed sparsity of submatrices [0.9]')
@click.option('-md', '--max_distance', type=int, default=3000000, help='Max distance (bp) between contact pairs [3000000]')
@click.option('-r', '--resol', type=int, default=5000, help='Resolution [5000]')
@click.option('-dc', '--distance_cutoff', type=int, default=5, help='Distance cutoff for local density calculation in terms of bin. [5]')
@click.option('-R', '--radius', type=int, default=2, help='Radius threshold to remove outliers. [2]')
@click.option('-d', '--mindelta', type=float, default=5, help='Min distance allowed between two loops [5]')
@click.option('--raw', type=bool, default=False, help='Raw matrix or balanced matrix')
@click.option('-i', '--input', type=str, required=True, help='Hi-C contact map path')
@click.option('-o', '--output', type=str, required=True, help='.bedpe file path to save loops')
def pred(batchsize, cpu, gpu, chrom, threshold, sparsity, workers, max_distance, resol, distance_cutoff, radius, mindelta, input, output, raw, image=224):
    """Predict loops from input contact map directly
    """
    print('\npolaris loop pred START :)')

    # Only the central half of each window is scored; coordinates are sliced the same way
    center_size = image // 2
    start_idx = (image - center_size) // 2
    end_idx = (image + center_size) // 2
    slice_obj_pred = (slice(None), slice(None), slice(start_idx, end_idx), slice(start_idx, end_idx))
    slice_obj_coord = (slice(None), slice(start_idx, end_idx), slice(start_idx, end_idx))

    results = []

    if cpu:
        assert gpu is None, "\033[91m QAQ The CPU and GPU modes cannot be used simultaneously. Please check the command. \033[0m\n"
        gpu = ['None']
        device = torch.device("cpu")
        print('Using CPU mode... (This may take significantly longer than using GPU mode.)')
    else:
        if torch.cuda.is_available():
            if gpu is not None:
                print("Using the specified GPU: " + gpu)
                gpu = [int(i) for i in gpu.split(',')]
                device = torch.device(f"cuda:{gpu[0]}")
            else:
                gpuIdx = torch.cuda.current_device()
                device = torch.device(gpuIdx)
                print("Automatically selected GPU: " + str(gpuIdx))
                gpu = [gpu]
        else:
            device = torch.device("cpu")
            gpu = ['None']
            cpu = True
            print('GPU is not available!')
            print('Using CPU mode... (This may take significantly longer than using GPU mode.)')


    coolfile = cooler.Cooler(input + '::/resolutions/' + str(resol))
    modelstate = str(files('polaris').joinpath('model/sft_loop.pt'))
    _modelstate = torch.load(modelstate, map_location=device.type)
    parameters = _modelstate['parameters']

    if chrom is None:
        chrom = coolfile.chromnames
    else:
        chrom = chrom.split(',')

    # for rmchr in ['chrMT','MT','chrM','M','Y','chrY','X','chrX','chrW','W','chrZ','Z']: # 'Y','chrY','X','chrX'
    #     if rmchr in chrom:
    #         chrom.remove(rmchr)

    print(f"Analysing chroms: {chrom}")

    model = polarisnet(
        image_size=parameters['image_size'],
        in_channels=parameters['in_channels'],
        out_channels=parameters['out_channels'],
        embed_dim=parameters['embed_dim'],
        depths=parameters['depths'],
        channels=parameters['channels'],
        num_heads=parameters['num_heads'],
        drop=parameters['drop'],
        drop_path=parameters['drop_path'],
        pos_embed=parameters['pos_embed']
    ).to(device)
    model.load_state_dict(_modelstate['model_state_dict'])
    if not cpu and len(gpu) > 1:
        model = nn.DataParallel(model, device_ids=gpu)
    model.eval()

    print('\n********score START********')

    badc = []
    chrom_ = tqdm(chrom, dynamic_ncols=True)
    for _chrom in chrom_:
        test_data = centerPredCoolDataset(coolfile, _chrom, max_distance_bin=max_distance//resol, w=image, step=center_size, s=sparsity, raw=raw)
        test_dataloader = DataLoader(test_data, batch_size=batchsize, shuffle=False, num_workers=workers, prefetch_factor=4, pin_memory=(gpu is not None))

        chrom_.desc = f"[Analyzing {_chrom} with {len(test_data)} submatrices]"

        if len(test_data) == 0:
            badc.append(_chrom)

        with torch.no_grad():
            for X in test_dataloader:
                bin_i, bin_j, targetX = X
                bin_i = bin_i * resol
                bin_j = bin_j * resol
                with autocast():
                    pred = torch.sigmoid(model(targetX.float().to(device)))[slice_obj_pred].flatten()
                    loop = torch.nonzero(pred > threshold).flatten().cpu()
                    prob = pred[loop].cpu().numpy().flatten().tolist()
                    frag1 = bin_i[slice_obj_coord].flatten().cpu().numpy()[loop].flatten().tolist()
                    frag2 = bin_j[slice_obj_coord].flatten().cpu().numpy()[loop].flatten().tolist()

                for i in range(len(frag1)):
                    # if frag1[i] < frag2[i] and frag2[i]-frag1[i] > 11*resol and frag2[i]-frag1[i] < max_distance:
                    if frag1[i] < frag2[i] and frag2[i] - frag1[i] < max_distance:
                        results.append([_chrom, frag1[i], frag1[i] + resol,
                                        _chrom, frag2[i], frag2[i] + resol,
                                        prob[i]])
    if len(badc) == len(chrom):
        raise ValueError("score FAILED :(\nThe '-s' value needs to be increased for more sparse data.")
    else:
        print(f'********score FINISHED********')
        if len(badc) > 0:
            print(f"· But the size of {badc} are too small or their contact matrix are too sparse.\n· You may need to check the data or run these chr respectively by increasing -s.")
    print(f'********pool START********')

    df = pd.DataFrame(results)
    loopNum, badcp, ccs = pool(df, distance_cutoff, resol, mindelta, threshold, output, radius)
    if len(badcp) == len(ccs):
        raise ValueError("pool FAILED :(\nPlease check input and mcool file to yield scoreFile. Or use higher '-s' value for more sparse mcool data.")
    else:
        print(f'********pool FINISHED********')
        if len(badcp) > 0:
            print(f"· But the loop score of {badcp} are too sparse.\n· You may need to check the mcool data or re-run polaris loop score by increasing -s.")


    print(f'\npolaris loop pred FINISHED :)\n{loopNum} loops saved to {output}')

if __name__ == '__main__':
    pred()
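For readers unfamiliar with the clustering step: `rhoDelta` and `pool` above follow a density-peak style heuristic (in the spirit of Rodriguez and Laio's "clustering by density peaks") — each candidate pixel gets a local density `rho` (kernel-weighted sum of nearby loop scores) and a `delta` (distance to the nearest higher-density candidate), and points with both values high become cluster centers. A minimal, self-contained sketch of that idea on toy 2-D points (not the Polaris implementation itself):

```python
import numpy as np
from sklearn.neighbors import KDTree

# Toy stand-in for loop candidates: two dense blobs of 2-D points, score 1 each.
rng = np.random.default_rng(0)
pts = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(8, 1, (50, 2))])

dc = 2.0
tree = KDTree(pts, metric='chebyshev')
_, dists = tree.query_radius(pts, r=dc, return_distance=True)
# rho: Gaussian-kernel-weighted neighbor count within dc
rho = np.array([np.exp(-(d / dc) ** 2).sum() for d in dists])

# delta: Chebyshev distance to the nearest point of strictly higher density
pair_d = np.abs(pts[:, None, :] - pts[None, :, :]).max(-1)
delta = np.empty(len(pts))
for i in range(len(pts)):
    higher = np.flatnonzero(rho > rho[i])
    delta[i] = pair_d[i, higher].min() if higher.size else pair_d[i].max()

# Points with both high rho and high delta are the cluster centers
centers = np.argsort(-(rho * delta))[:2]
print("cluster centers:", pts[centers])
```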
polaris/loopDev.py
ADDED
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import click
|
3 |
+
import cooler
|
4 |
+
import warnings
|
5 |
+
import numpy as np
|
6 |
+
from torch import nn
|
7 |
+
from tqdm import tqdm
|
8 |
+
from torch.cuda.amp import autocast
|
9 |
+
from importlib_resources import files
|
10 |
+
from polaris.utils.util_loop import bedpewriter
|
11 |
+
from polaris.model.polarisnet import polarisnet
|
12 |
+
from scipy.sparse import coo_matrix
|
13 |
+
from scipy.sparse import SparseEfficiencyWarning
|
14 |
+
warnings.filterwarnings("ignore", category=SparseEfficiencyWarning)
|
15 |
+
|
16 |
+
def getLocal(mat, i, jj, w, N):
|
17 |
+
if i >= 0 and jj >= 0 and i+w <= N and jj+w <= N:
|
18 |
+
mat = mat[i:i+w,jj:jj+w].toarray()
|
19 |
+
# print(f"global: {mat.shape}")
|
20 |
+
return mat[None,...]
|
21 |
+
# pad_width = ((up, down), (left, right))
|
22 |
+
slice_pos = [[i, i+w], [jj, jj+w]]
|
23 |
+
pad_width = [[0, 0], [0, 0]]
|
24 |
+
if i < 0:
|
25 |
+
pad_width[0][0] = -i
|
26 |
+
slice_pos[0][0] = 0
|
27 |
+
if jj < 0:
|
28 |
+
pad_width[1][0] = -jj
|
29 |
+
slice_pos[1][0] = 0
|
30 |
+
if i+w > N:
|
31 |
+
pad_width[0][1] = i+w-N
|
32 |
+
slice_pos[0][1] = N
|
33 |
+
if jj+w > N:
|
34 |
+
pad_width[1][1] = jj+w-N
|
35 |
+
slice_pos[1][1] = N
|
36 |
+
_mat = mat[slice_pos[0][0]:slice_pos[0][1],slice_pos[1][0]:slice_pos[1][1]].toarray()
|
37 |
+
padded_mat = np.pad(_mat, pad_width, mode='constant', constant_values=0)
|
38 |
+
# print(f"global: {padded_mat.shape}",slice_pos, pad_width)
|
39 |
+
return padded_mat[None,...]
|
40 |
+
|
41 |
+
def upperCoo2symm(row,col,data,N=None):
|
42 |
+
# print(np.max(row),np.max(col),N)
|
43 |
+
if N:
|
44 |
+
shape=(N,N)
|
45 |
+
else:
|
46 |
+
shape=(row.max() + 1,col.max() + 1)
|
47 |
+
|
48 |
+
sparse_matrix = coo_matrix((data, (row, col)), shape=shape)
|
49 |
+
symm = sparse_matrix + sparse_matrix.T
|
50 |
+
diagVal = symm.diagonal(0)/2
|
51 |
+
symm = symm.tocsr()
|
52 |
+
symm.setdiag(diagVal)
|
53 |
+
return symm
|
54 |
+
|
55 |
+
def processCoolFile(coolfile, cchrom):
|
56 |
+
extent = coolfile.extent(cchrom)
|
57 |
+
N = extent[1] - extent[0]
|
58 |
+
ccdata = coolfile.matrix(balance=True, sparse=True, as_pixels=True).fetch(cchrom)
|
59 |
+
ccdata['balanced'] = ccdata['balanced'].fillna(0)
|
60 |
+
ccdata['bin1_id'] -= extent[0]
|
61 |
+
ccdata['bin2_id'] -= extent[0]
|
62 |
+
|
63 |
+
ccdata['distance'] = ccdata['bin2_id'] - ccdata['bin1_id']
|
64 |
+
d_means = ccdata.groupby('distance')['balanced'].transform('mean')
|
65 |
+
ccdata['oe'] = ccdata['balanced'] / d_means
|
66 |
+
ccdata['oe'] = ccdata['oe'].fillna(0)
|
67 |
+
ccdata['oe'] = ccdata['oe'] / ccdata['oe'].max()
|
68 |
+
oeMat = upperCoo2symm(ccdata['bin1_id'].ravel(), ccdata['bin2_id'].ravel(), ccdata['oe'].ravel(), N)
|
69 |
+
|
70 |
+
return oeMat, N
|
71 |
+
|
72 |
+
@click.command()
|
73 |
+
@click.option('--batchsize', type=int, default=16, help='Batch size [16]')
|
74 |
+
@click.option('--cpu', type=bool, default=False, help='Use CPU [False]')
|
75 |
+
@click.option('--gpu', type=str, default=None, help='Comma-separated GPU indices [auto select]')
|
76 |
+
@click.option('--chrom', type=str, default=None, help='Comma separated chroms')
|
77 |
+
@click.option('--max_distance', type=int, default=3000000, help='Max distance (bp) between contact pairs')
|
78 |
+
@click.option('--resol',type=int,default=500,help ='Resolution')
|
79 |
+
@click.option('--image',type=int,default=1024,help ='Resolution')
|
80 |
+
@click.option('--center_size',type=int,default=224,help ='Resolution')
|
81 |
+
@click.option('-i','--input', type=str,required=True,help='Hi-C contact map path')
|
82 |
+
@click.option('-o','--output', type=str,required=True,help='.bedpe file path to save loop candidates')
|
83 |
+
def dev(batchsize, cpu, gpu, chrom, max_distance, resol, input, output, image, center_size):
|
84 |
+
""" *development function* Coming soon...
|
85 |
+
"""
|
86 |
+
print('polaris loop dev START :) ')
|
87 |
+
|
88 |
+
# center_size = 224
|
89 |
+
# center_size = image // 2
|
90 |
+
start_idx = (image - center_size) // 2
|
91 |
+
end_idx = (image + center_size) // 2
|
92 |
+
slice_obj_pred = (slice(None), slice(None), slice(start_idx, end_idx), slice(start_idx, end_idx))
|
93 |
+
slice_obj_coord = (slice(None), slice(start_idx, end_idx), slice(start_idx, end_idx))
|
94 |
+
|
95 |
+
max_distance_bin=max_distance//resol
|
96 |
+
|
97 |
+
loopwriter = bedpewriter(output,resol,max_distance)
|
98 |
+
|
99 |
+
if cpu:
|
        assert gpu is None, "\033[91m QAQ The CPU and GPU modes cannot be used simultaneously. Please check the command. \033[0m\n"
        gpu = ['None']
        device = torch.device("cpu")
        print('Using CPU mode... (This may take significantly longer than using GPU mode.)')
    else:
        if torch.cuda.is_available():
            if gpu is not None:
                print("Using the specified GPU: " + gpu)
                gpu = [int(i) for i in gpu.split(',')]
                device = torch.device(f"cuda:{gpu[0]}")
            else:
                gpuIdx = torch.cuda.current_device()
                device = torch.device(gpuIdx)
                print("Automatically selected GPU: " + str(gpuIdx))
                gpu = [gpu]
        else:
            device = torch.device("cpu")
            gpu = ['None']
            cpu = True
            print('GPU is not available!')
            print('Using CPU mode... (This may take significantly longer than using GPU mode.)')

    coolfile = cooler.Cooler(input + '::/resolutions/' + str(resol))
    modelstate = str(files('polaris').joinpath('model/sft_loop.pt'))
    _modelstate = torch.load(modelstate, map_location=device.type)
    parameters = _modelstate['parameters']

    if chrom is None:
        chrom = coolfile.chromnames
    else:
        chrom = chrom.split(',')
    for rmchr in ['chrMT', 'MT', 'chrM', 'M', 'Y', 'chrY', 'X', 'chrX']:  # drop mitochondrial and sex chromosomes
        if rmchr in chrom:
            chrom.remove(rmchr)
    print(f"\nAnalysing chroms: {chrom}")

    model = polarisnet(
        image_size=parameters['image_size'],
        in_channels=parameters['in_channels'],
        out_channels=parameters['out_channels'],
        embed_dim=parameters['embed_dim'],
        depths=parameters['depths'],
        channels=parameters['channels'],
        num_heads=parameters['num_heads'],
        drop=parameters['drop'],
        drop_path=parameters['drop_path'],
        pos_embed=parameters['pos_embed']
    ).to(device)
    model.load_state_dict(_modelstate['model_state_dict'])
    if not cpu and len(gpu) > 1:
        model = nn.DataParallel(model, device_ids=gpu)
    model.eval()

    chrom = tqdm(chrom, dynamic_ncols=True)
    for _chrom in chrom:
        chrom.desc = f"[analyzing {_chrom}]"

        oeMat, N = processCoolFile(coolfile, _chrom)
        start_point = -(image - center_size) // 2
        joffset = np.repeat(np.linspace(0, image, image, endpoint=False, dtype=int)[np.newaxis, :], image, axis=0)
        ioffset = np.repeat(np.linspace(0, image, image, endpoint=False, dtype=int)[:, np.newaxis], image, axis=1)
        data, i_list, j_list = [], [], []

        for i in range(start_point, N - image - start_point, center_size):
            for j in range(0, max_distance_bin, center_size):
                jj = j + i
                # if jj + w <= N and i + w <= N:
                _oeMat = getLocal(oeMat, i, jj, image, N)
                if np.sum(_oeMat == 0) <= (image * image * 0.9):  # skip overly sparse windows
                    data.append(_oeMat)
                    i_list.append(i + ioffset)
                    j_list.append(jj + joffset)

                while len(data) >= batchsize or (i + center_size > N - image - start_point and len(data) > 0):
                    bin_i = torch.tensor(np.stack(i_list[:batchsize], axis=0)).to(device)
                    bin_j = torch.tensor(np.stack(j_list[:batchsize], axis=0)).to(device)
                    targetX = torch.tensor(np.stack(data[:batchsize], axis=0)).to(device)
                    bin_i = bin_i * resol
                    bin_j = bin_j * resol

                    data = data[batchsize:]
                    i_list = i_list[batchsize:]
                    j_list = j_list[batchsize:]

                    print(targetX.shape)
                    print(bin_i.shape)
                    print(bin_j.shape)

                    with torch.no_grad():
                        with autocast():
                            pred = torch.sigmoid(model(targetX.float().to(device)))[slice_obj_pred].flatten()
                    loop = torch.nonzero(pred > 0.5).flatten().cpu()
                    prob = pred[loop].cpu().numpy().flatten().tolist()
                    frag1 = bin_i[slice_obj_coord].flatten().cpu().numpy()[loop].flatten().tolist()
                    frag2 = bin_j[slice_obj_coord].flatten().cpu().numpy()[loop].flatten().tolist()

                    loopwriter.write(_chrom, frag1, frag2, prob)


if __name__ == '__main__':
    dev()
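The tiling arithmetic above is easy to misread, so here is a standalone sketch (illustrative numbers only, not part of the package) of how stepping windows of size `image` by `center_size` makes the retained center crops tile the diagonal band without gaps or overlap:

image = 8                            # window size (stands in for 224)
center_size = image // 2
start_point = -(image - center_size) // 2
N = 20                               # hypothetical matrix size in bins
for i in range(start_point, N - image - start_point, center_size):
    lo = i + (image - center_size) // 2
    print(f"window at {i}: kept center rows {lo}..{lo + center_size - 1}")
# prints 0..3, 4..7, 8..11, 12..15 -- consecutive, non-overlapping center crops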
polaris/loopPool.py
ADDED
@@ -0,0 +1,178 @@
import sys
import click
import numpy as np
from sklearn.neighbors import KDTree
import pandas as pd
from tqdm import tqdm

def rhoDelta(data, resol, dc, radius):
    # per-chromosome local density (rho) and distance to the nearest higher-density point (delta)
    pos = data[[1, 4]].to_numpy() // resol
    posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
    NNindexes, NNdists = posTree.query_radius(pos, r=radius, return_distance=True)
    _l = []
    for v in NNindexes:
        _l.append(len(v))
    _l = np.asarray(_l)
    data = data[_l > 5].reset_index(drop=True)  # drop isolated candidates (outliers)

    if data.shape[0] != 0:
        pos = data[[1, 4]].to_numpy() // resol
        val = data[6].to_numpy()

        try:
            posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
            NNindexes, NNdists = posTree.query_radius(pos, r=dc, return_distance=True)
        except ValueError as e:
            if "Found array with 0 sample(s)" in str(e):
                print("#" * 88, '\n#')
                print("#\033[91m Error!!! The data is too sparse. Please decrease the value of: [t]\033[0m\n#")
                print("#" * 88, '\n')
                sys.exit(1)
            else:
                raise

        rhos = []
        for i in range(len(NNindexes)):
            rhos.append(np.dot(np.exp(-(NNdists[i] / dc) ** 2), val[NNindexes[i]]))
        rhos = np.asarray(rhos)

        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        deltas = rhos * 0
        LargerNei = rhos * 0 - 1
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                deltas[i] = _dists[i][-1] + 1
            else:
                LargerNei[i] = _indexes[i][idx[0]]
                deltas[i] = _dists[i][idx[0]]
        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] == 0:
                    deltas[failed[i]] = _dists[i][-1] + 1
                else:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
                    deltas[failed[i]] = _dists[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()

        data['rhos'] = rhos
        data['deltas'] = deltas
    else:
        data['rhos'] = []
        data['deltas'] = []
    return data


@click.command()
@click.option('-dc', '--distance_cutoff', type=int, default=5, help='Distance cutoff for local density calculation, in bins. [5]')
@click.option('-t', '--threshold', type=float, default=0.6, help='Loop score threshold [0.6]')
@click.option('-r', '--resol', default=5000, help='Resolution [5000]')
@click.option('-R', '--radius', type=int, default=2, help='Radius threshold to remove outliers. [2]')
@click.option('-d', '--mindelta', type=float, default=5, help='Min distance allowed between two loops [5]')
@click.option('-i', '--candidates', type=str, required=True, help='Loop candidates file path')
@click.option('-o', '--output', type=str, required=True, help='.bedpe file path to save loops')
def pool(distance_cutoff, candidates, resol, mindelta, threshold, output, radius, refine=True):
    """Call loops from loop candidates by clustering"""
    print('\npolaris loop pool START :) ')

    data = pd.read_csv(candidates, sep='\t', header=None)

    ccs = set(data.iloc[:, 0])

    if data.shape[0] == 0:
        print("#" * 88, '\n#')
        print("#\033[91m Error!!! The file is empty. Please check your file.\033[0m\n#")
        print("#" * 88, '\n')
        sys.exit(1)
    data = data[data[6] > threshold].reset_index(drop=True)
    data = data[data[4] - data[1] > 11 * resol].reset_index(drop=True)  # keep anchor pairs more than 11 bins apart
    if data.shape[0] == 0:
        print("#" * 88, '\n#')
        print("#\033[91m Error!!! The data is too sparse. Please decrease: [threshold] (minimum: 0.5).\033[0m\n#")
        print("#" * 88, '\n')
        sys.exit(1)
    data[['rhos', 'deltas']] = 0
    data = data.groupby([0]).apply(rhoDelta, resol=resol, dc=distance_cutoff, radius=radius).reset_index(drop=True)
    minrho = 0
    targetData = data.reset_index(drop=True)

    loopPds = []
    chroms = tqdm(set(targetData[0]), dynamic_ncols=True)
    for chrom in chroms:
        chroms.desc = f"[Running clustering on {chrom}]"
        data = targetData[targetData[0] == chrom].reset_index(drop=True)

        pos = data[[1, 4]].to_numpy() // resol
        posTree = KDTree(pos, leaf_size=30, metric='chebyshev')

        rhos = data['rhos'].to_numpy()
        deltas = data['deltas'].to_numpy()
        centroid = np.argwhere((rhos > minrho) & (deltas > mindelta)).flatten()

        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        LargerNei = rhos * 0 - 1
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                pass
            else:
                LargerNei[i] = _indexes[i][idx[0]]

        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] == 0:
                    pass
                else:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()

        LargerNei = LargerNei.astype(int)
        label = LargerNei * 0 - 1
        for i in range(len(centroid)):
            label[centroid[i]] = i
        decreasingsortedIdxRhos = np.argsort(-rhos)
        for i in decreasingsortedIdxRhos:
            if label[i] == -1:
                label[i] = label[LargerNei[i]]

        val = data[6].to_numpy()
        refinedLoop = []
        label = label.flatten()
        for l in set(label):
            idx = np.argwhere(label == l).flatten()
            if len(idx) > 0:
                refinedLoop.append(idx[np.argmax(val[idx])])  # keep the highest-scoring member of each cluster
        if refine:
            loopPds.append(data.loc[refinedLoop])
        else:
            loopPds.append(data.loc[centroid])

    loopPd = pd.concat(loopPds).sort_values(6, ascending=False)
    loopPd[[1, 2, 4, 5]] = loopPd[[1, 2, 4, 5]].astype(int)
    loopPd[[0, 1, 2, 3, 4, 5, 6]].to_csv(output, sep='\t', header=False, index=False)

    ccs_ = set(loopPd.iloc[:, 0])
    badc = ccs.difference(ccs_)
    if len(badc) == len(ccs):
        raise ValueError("polaris loop pool FAILED :(\nPlease check the input and the mcool file used to produce the score file, or use a higher '-s' value for sparser mcool data.")
    else:
        print(f'\npolaris loop pool FINISHED :)\n{len(loopPd)} loops saved to {output}')
        if len(badc) > 0:
            print(f"But the loop scores of {badc} are too sparse.\nYou may need to check the mcool data or re-run polaris loop score with a larger -s.")


if __name__ == '__main__':
    pool()
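For readers unfamiliar with density-peak clustering, here is a toy, self-contained illustration (toy 2-D points, not Hi-C data) of the rho/delta quantities that rhoDelta computes above: rho is a Gaussian-weighted local density and delta is the Chebyshev distance to the nearest point of strictly higher density.

import numpy as np
from sklearn.neighbors import KDTree

pts = np.array([[0, 0], [1, 0], [0, 1], [10, 10], [11, 10], [30, 30]])
val = np.ones(len(pts))              # stand-in for loop scores (column 6)
dc = 5
tree = KDTree(pts, metric='chebyshev')
idxs, dists = tree.query_radius(pts, r=dc, return_distance=True)
rho = np.array([np.dot(np.exp(-(d / dc) ** 2), val[i]) for i, d in zip(idxs, dists)])
delta = np.full(len(pts), np.inf)    # distance to nearest higher-density point
for i in range(len(pts)):
    higher = np.where(rho > rho[i])[0]
    if len(higher):
        delta[i] = np.max(np.abs(pts[higher] - pts[i]), axis=1).min()
print(rho.round(2), delta)           # cluster centers combine high rho with large delta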
polaris/loopPool_proof_wang_duplicate.py.bak
ADDED
@@ -0,0 +1,192 @@
import sys
import click
import numpy as np
from sklearn.neighbors import KDTree
import pandas as pd
from tqdm import tqdm

def rhoDelta(data, resol, dc, radius):

    pos = data[[1, 4]].to_numpy() // resol
    val = data[6].to_numpy()

    try:
        posTree = KDTree(pos, leaf_size=30, metric='chebyshev')
        NNindexes, NNdists = posTree.query_radius(pos, r=dc, return_distance=True)
    except ValueError as e:
        if "Found array with 0 sample(s)" in str(e):
            print("#" * 88, '\n#')
            print("#\033[91m Error!!! The data is too sparse. Please decrease the value of: [t]\033[0m\n#")
            print("#" * 88, '\n')
            sys.exit(1)
        else:
            raise

    rhos = []
    for i in range(len(NNindexes)):
        rhos.append(np.dot(np.exp(-(NNdists[i] / dc) ** 2), val[NNindexes[i]]))
    rhos = np.asarray(rhos)

    _r = 100
    _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
    deltas = rhos * 0
    LargerNei = rhos * 0 - 1
    for i in range(len(_indexes)):
        idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
        if idx.shape[0] == 0:
            deltas[i] = _dists[i][-1] + 1
        else:
            LargerNei[i] = _indexes[i][idx[0]]
            deltas[i] = _dists[i][idx[0]]
    failed = np.argwhere(LargerNei == -1).flatten()
    while len(failed) > 1 and _r < 100000:
        _r = _r * 10
        _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                deltas[failed[i]] = _dists[i][-1] + 1
            else:
                LargerNei[failed[i]] = _indexes[i][idx[0]]
                deltas[failed[i]] = _dists[i][idx[0]]
        failed = np.argwhere(LargerNei == -1).flatten()

    data['rhos'] = rhos
    data['deltas'] = deltas

    return data


@click.command()
@click.option('-dc', '--distance_cutoff', type=int, default=5, help='Distance cutoff for local density calculation, in bins. [5]')
@click.option('-t', '--threshold', type=float, default=0.6, help='Loop score threshold [0.6]')
@click.option('-r', '--resol', default=5000, help='Resolution [5000]')
@click.option('-R', '--radius', type=int, default=2, help='Radius threshold to remove outliers. [2]')
@click.option('-d', '--mindelta', type=float, default=5, help='Min distance allowed between two loops [5]')
@click.option('-i', '--candidates', type=str, required=True, help='Loop candidates file path')
@click.option('-o', '--output', type=str, required=True, help='.bedpe file path to save loops')
def pool(distance_cutoff, candidates, resol, mindelta, threshold, output, radius, refine=True):
    """Call loops from loop candidates by clustering"""
    print('\npolaris loop pool START :) ')

    data = pd.read_csv(candidates, sep='\t', header=None, comment='#')

    print(data.head())
    data[6] = 1  # ignore scores: every candidate gets equal weight
    print(data.head())

    data[[1, 4]] = data[[1, 4]] // resol * resol  # snap anchors to bin boundaries

    print(data.head())

    data = data.drop_duplicates().reset_index(drop=True)

    ccs = set(data.iloc[:, 0])

    # if data.shape[0] == 0:
    #     print("#"*88,'\n#')
    #     print("#\033[91m Error!!! The file is empty. Please check your file.\033[0m\n#")
    #     print("#"*88,'\n')
    #     sys.exit(1)
    # data = data[data[6] > threshold].reset_index(drop=True)
    # data = data[data[4] - data[1] > 11*resol].reset_index(drop=True)
    # if data.shape[0] == 0:
    #     print("#"*88,'\n#')
    #     print("#\033[91m Error!!! The data is too sparse. Please decrease: [threshold] (minimum: 0.5).\033[0m\n#")
    #     print("#"*88,'\n')
    #     sys.exit(1)
    data[['rhos', 'deltas']] = 0

    print(data.shape)

    data = data.groupby([0]).apply(rhoDelta, resol=resol, dc=distance_cutoff, radius=radius).reset_index(drop=True)
    minrho = 0
    targetData = data.reset_index(drop=True)

    print(data.shape)

    loopPds = []
    # chroms = tqdm(set(targetData[0]), dynamic_ncols=True)

    rep = 0

    chroms = set(targetData[0])
    for chrom in chroms:
        print(f"[Running clustering on {chrom}]")
        # chroms.desc = f"[Running clustering on {chrom}]"
        data = targetData[targetData[0] == chrom].reset_index(drop=True)

        print(data.shape)

        pos = data[[1, 4]].to_numpy() // resol
        posTree = KDTree(pos, leaf_size=30, metric='chebyshev')

        rhos = data['rhos'].to_numpy()
        deltas = data['deltas'].to_numpy()
        # centroid = np.argwhere((rhos > minrho) & (deltas > mindelta)).flatten()
        centroid = np.argwhere((deltas > mindelta)).flatten()

        print(centroid.shape)
        rep += data.shape[0] - centroid.shape[0]

        _r = 100
        _indexes, _dists = posTree.query_radius(pos, r=_r, return_distance=True, sort_results=True)
        LargerNei = rhos * 0 - 1
        for i in range(len(_indexes)):
            idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
            if idx.shape[0] == 0:
                pass
            else:
                LargerNei[i] = _indexes[i][idx[0]]

        failed = np.argwhere(LargerNei == -1).flatten()
        while len(failed) > 1 and _r < 100000:
            _r = _r * 10
            _indexes, _dists = posTree.query_radius(pos[failed], r=_r, return_distance=True, sort_results=True)
            for i in range(len(_indexes)):
                idx = np.argwhere(rhos[_indexes[i]] > rhos[_indexes[i][0]])
                if idx.shape[0] == 0:
                    pass
                else:
                    LargerNei[failed[i]] = _indexes[i][idx[0]]
            failed = np.argwhere(LargerNei == -1).flatten()

        LargerNei = LargerNei.astype(int)
        label = LargerNei * 0 - 1
        for i in range(len(centroid)):
            label[centroid[i]] = i
        decreasingsortedIdxRhos = np.argsort(-rhos)
        for i in decreasingsortedIdxRhos:
            if label[i] == -1:
                label[i] = label[LargerNei[i]]

        val = data[6].to_numpy()
        refinedLoop = []
        label = label.flatten()
        for l in set(label):
            idx = np.argwhere(label == l).flatten()
            if len(idx) > 0:
                refinedLoop.append(idx[np.argmax(val[idx])])
        if refine:
            loopPds.append(data.loc[refinedLoop])
        else:
            loopPds.append(data.loc[centroid])

    loopPd = pd.concat(loopPds).sort_values(6, ascending=False)
    loopPd[[1, 2, 4, 5]] = loopPd[[1, 2, 4, 5]].astype(int)
    loopPd[[0, 1, 2, 3, 4, 5, 6]].to_csv(output, sep='\t', header=False, index=False)

    ccs_ = set(loopPd.iloc[:, 0])
    badc = ccs.difference(ccs_)
    if len(badc) == len(ccs):
        raise ValueError("polaris loop pool FAILED :(\nPlease check the input and the mcool file used to produce the score file, or use a higher '-s' value for sparser mcool data.")
    else:
        print(f'\npolaris loop pool FINISHED :)\n{len(loopPd)} loops saved to {output}')
        if len(badc) > 0:
            print(f"But the loop scores of {badc} are too sparse.\nYou may need to check the mcool data or re-run polaris loop score with a larger -s.")
        print(f"duplicate loops: {rep}")

if __name__ == '__main__':
    pool()
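A small illustration (a hypothetical toy frame, not real loop calls) of the deduplication step this script adds: snapping both anchors to bin starts makes near-identical calls collapse under drop_duplicates.

import pandas as pd

resol = 5000
df = pd.DataFrame({1: [10001, 12000, 40000], 4: [100001, 102000, 200000]})
df[[1, 4]] = df[[1, 4]] // resol * resol
print(df.drop_duplicates())  # the first two rows snap to the same bins and merge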
polaris/loopScore.py
ADDED
@@ -0,0 +1,129 @@
import torch
import cooler
import click
from torch import nn
from tqdm import tqdm
from torch.cuda.amp import autocast
from importlib_resources import files
from torch.utils.data import DataLoader
from polaris.utils.util_loop import bedpewriter
from polaris.model.polarisnet import polarisnet
from polaris.utils.util_data import centerPredCoolDataset

@click.command()
@click.option('-b', '--batchsize', type=int, default=128, help='Batch size [128]')
@click.option('-C', '--cpu', type=bool, default=False, help='Use CPU [False]')
@click.option('-G', '--gpu', type=str, default=None, help='Comma-separated GPU indices [auto select]')
@click.option('-c', '--chrom', type=str, default=None, help='Comma-separated chroms [all autosomes]')
@click.option('-nw', '--workers', type=int, default=16, help='Number of CPU threads [16]')
@click.option('-t', '--threshold', type=float, default=0.5, help='Loop score threshold [0.5]')
@click.option('-s', '--sparsity', type=float, default=0.9, help='Allowed sparsity of submatrices [0.9]')
@click.option('-md', '--max_distance', type=int, default=3000000, help='Max distance (bp) between contact pairs [3000000]')
@click.option('-r', '--resol', type=int, default=5000, help='Resolution [5000]')
@click.option('--raw', type=bool, default=False, help='Use the raw matrix instead of the balanced matrix [False]')
@click.option('-i', '--input', type=str, required=True, help='Hi-C contact map path')
@click.option('-o', '--output', type=str, required=True, help='.bedpe file path to save loop candidates')
def score(batchsize, cpu, gpu, chrom, workers, threshold, sparsity, max_distance, resol, input, output, raw, image=224):
    """Predict a loop score for each pixel in the input contact map"""
    print('\npolaris loop score START :) ')

    center_size = image // 2
    start_idx = (image - center_size) // 2
    end_idx = (image + center_size) // 2
    # keep only the central center_size x center_size crop of each prediction window
    slice_obj_pred = (slice(None), slice(None), slice(start_idx, end_idx), slice(start_idx, end_idx))
    slice_obj_coord = (slice(None), slice(start_idx, end_idx), slice(start_idx, end_idx))

    loopwriter = bedpewriter(output, resol, max_distance)

    if cpu:
        assert gpu is None, "\033[91m QAQ The CPU and GPU modes cannot be used simultaneously. Please check the command. \033[0m\n"
        gpu = ['None']
        device = torch.device("cpu")
        print('Using CPU mode... (This may take significantly longer than using GPU mode.)')
    else:
        if torch.cuda.is_available():
            if gpu is not None:
                print("Using the specified GPU: " + gpu)
                gpu = [int(i) for i in gpu.split(',')]
                device = torch.device(f"cuda:{gpu[0]}")
            else:
                gpuIdx = torch.cuda.current_device()
                device = torch.device(gpuIdx)
                print("Automatically selected GPU: " + str(gpuIdx))
                gpu = [gpu]
        else:
            device = torch.device("cpu")
            gpu = ['None']
            cpu = True
            print('GPU is not available!')
            print('Using CPU mode... (This may take significantly longer than using GPU mode.)')

    coolfile = cooler.Cooler(input + '::/resolutions/' + str(resol))
    modelstate = str(files('polaris').joinpath('model/sft_loop.pt'))
    _modelstate = torch.load(modelstate, map_location=device.type)
    parameters = _modelstate['parameters']

    if chrom is None:
        chrom = coolfile.chromnames
    else:
        chrom = chrom.split(',')

    # for rmchr in ['chrMT','MT','chrM','M','Y','chrY','X','chrX','chrW','W','chrZ','Z']:
    #     if rmchr in chrom:
    #         chrom.remove(rmchr)

    print(f"Analysing chroms: {chrom}")

    model = polarisnet(
        image_size=parameters['image_size'],
        in_channels=parameters['in_channels'],
        out_channels=parameters['out_channels'],
        embed_dim=parameters['embed_dim'],
        depths=parameters['depths'],
        channels=parameters['channels'],
        num_heads=parameters['num_heads'],
        drop=parameters['drop'],
        drop_path=parameters['drop_path'],
        pos_embed=parameters['pos_embed']
    ).to(device)
    model.load_state_dict(_modelstate['model_state_dict'])
    if not cpu and len(gpu) > 1:
        model = nn.DataParallel(model, device_ids=gpu)
    model.eval()

    badc = []
    chrom_ = tqdm(chrom, dynamic_ncols=True)
    for _chrom in chrom_:
        test_data = centerPredCoolDataset(coolfile, _chrom, max_distance_bin=max_distance // resol, w=image, step=center_size, s=sparsity, raw=raw)
        test_dataloader = DataLoader(test_data, batch_size=batchsize, shuffle=False, num_workers=workers, prefetch_factor=4, pin_memory=(gpu is not None))

        chrom_.desc = f"[Analyzing {_chrom} with {len(test_data)} submatrices]"

        if len(test_data) == 0:
            badc.append(_chrom)

        with torch.no_grad():
            for X in test_dataloader:
                bin_i, bin_j, targetX = X
                bin_i = bin_i * resol
                bin_j = bin_j * resol
                with autocast():
                    pred = torch.sigmoid(model(targetX.float().to(device)))[slice_obj_pred].flatten()
                loop = torch.nonzero(pred > threshold).flatten().cpu()
                prob = pred[loop].cpu().numpy().flatten().tolist()
                frag1 = bin_i[slice_obj_coord].flatten().cpu().numpy()[loop].flatten().tolist()
                frag2 = bin_j[slice_obj_coord].flatten().cpu().numpy()[loop].flatten().tolist()

                loopwriter.write(_chrom, frag1, frag2, prob)

    if len(badc) == len(chrom):
        raise ValueError("polaris loop score FAILED :( \nThe '-s' value needs to be increased for sparser data.")
    else:
        print(f'\npolaris loop score FINISHED :)\nLoop score file saved at {output}')
        if len(badc) > 0:
            print(f"But {badc} are too small or their contact matrices are too sparse.\nYou may need to check the data or re-run these chromosomes separately with a larger -s.")

if __name__ == '__main__':
    score()
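As a quick usage sketch (file names here are hypothetical; the flags are the click options defined above), the command can also be driven programmatically through click's test runner:

from click.testing import CliRunner
from polaris.loopScore import score

runner = CliRunner()
result = runner.invoke(score, ['-i', 'GM12878.mcool', '-o', 'candidates.bedpe', '-c', 'chr17'])
print(result.exit_code)
print(result.output[:200])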
polaris/model/polarisnet.py
ADDED
@@ -0,0 +1,526 @@
import torch
import torch.nn as nn
from operator import itemgetter

from typing import Type, Callable, Tuple, Optional, Set, List, Union
from timm.models.layers import drop_path, trunc_normal_, Mlp, DropPath
from timm.models.efficientnet_blocks import SqueezeExcite, DepthwiseSeparableConv

def exists(val):
    return val is not None

def map_el_ind(arr, ind):
    return list(map(itemgetter(ind), arr))

def sort_and_return_indices(arr):
    indices = [ind for ind in range(len(arr))]
    arr = zip(arr, indices)
    arr = sorted(arr)
    return map_el_ind(arr, 0), map_el_ind(arr, 1)

def calculate_permutations(num_dimensions, emb_dim):
    total_dimensions = num_dimensions + 2
    axial_dims = [ind for ind in range(1, total_dimensions) if ind != emb_dim]

    permutations = []

    for axial_dim in axial_dims:
        last_two_dims = [axial_dim, emb_dim]
        dims_rest = set(range(0, total_dimensions)) - set(last_two_dims)
        permutation = [*dims_rest, *last_two_dims]
        permutations.append(permutation)

    return permutations

class ChanLayerNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.g = nn.Parameter(torch.ones(1, dim, 1, 1))
        self.b = nn.Parameter(torch.zeros(1, dim, 1, 1))

    def forward(self, x):
        std = torch.var(x, dim=1, unbiased=False, keepdim=True).sqrt()
        mean = torch.mean(x, dim=1, keepdim=True)
        return (x - mean) / (std + self.eps) * self.g + self.b

class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.fn = fn
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        x = self.norm(x)
        return self.fn(x)

class PermuteToFrom(nn.Module):
    # moves one spatial axis next to the embedding axis so attention runs along it, then restores the layout
    def __init__(self, permutation, fn):
        super().__init__()
        self.fn = fn
        _, inv_permutation = sort_and_return_indices(permutation)
        self.permutation = permutation
        self.inv_permutation = inv_permutation

    def forward(self, x, **kwargs):
        axial = x.permute(*self.permutation).contiguous()
        shape = axial.shape
        *_, t, d = shape
        axial = axial.reshape(-1, t, d)
        axial = self.fn(axial, **kwargs)
        axial = axial.reshape(*shape)
        axial = axial.permute(*self.inv_permutation).contiguous()
        return axial

class AxialPositionalEmbedding(nn.Module):
    def __init__(self, dim, shape, emb_dim_index=1):
        super().__init__()
        parameters = []
        total_dimensions = len(shape) + 2
        ax_dim_indexes = [i for i in range(1, total_dimensions) if i != emb_dim_index]

        self.num_axials = len(shape)

        for i, (axial_dim, axial_dim_index) in enumerate(zip(shape, ax_dim_indexes)):
            shape = [1] * total_dimensions
            shape[emb_dim_index] = dim
            shape[axial_dim_index] = axial_dim
            parameter = nn.Parameter(torch.randn(*shape))
            setattr(self, f'param_{i}', parameter)

    def forward(self, x):
        for i in range(self.num_axials):
            x = x + getattr(self, f'param_{i}')
        return x

class SelfAttention(nn.Module):
    def __init__(self, dim, heads, dim_heads=None, drop=0):
        super().__init__()
        self.dim_heads = (dim // heads) if dim_heads is None else dim_heads
        dim_hidden = self.dim_heads * heads
        self.drop_rate = drop
        self.heads = heads
        self.to_q = nn.Linear(dim, dim_hidden, bias=False)
        self.to_kv = nn.Linear(dim, 2 * dim_hidden, bias=False)
        self.to_out = nn.Linear(dim_hidden, dim)
        self.proj_drop = DropPath(drop)

    def forward(self, x, kv=None):
        kv = x if kv is None else kv
        q, k, v = (self.to_q(x), *self.to_kv(kv).chunk(2, dim=-1))
        b, t, d, h, e = *q.shape, self.heads, self.dim_heads
        merge_heads = lambda x: x.reshape(b, -1, h, e).transpose(1, 2).reshape(b * h, -1, e)

        q, k, v = map(merge_heads, (q, k, v))
        dots = torch.einsum('bie,bje->bij', q, k) * (e ** -0.5)
        dots = dots.softmax(dim=-1)

        out = torch.einsum('bij,bje->bie', dots, v)
        out = out.reshape(b, h, -1, e).transpose(1, 2).reshape(b, -1, d)
        out = self.to_out(out)
        out = self.proj_drop(out)
        return out

class AxialTransformerBlock(nn.Module):
    def __init__(self,
                 dim,
                 axial_pos_emb_shape,
                 pos_embed,
                 heads=8,
                 dim_heads=None,
                 drop=0.,
                 drop_path_rate=0.,
                 ):
        super().__init__()

        dim_index = 1

        permutations = calculate_permutations(2, dim_index)

        self.pos_emb = AxialPositionalEmbedding(dim, axial_pos_emb_shape, dim_index) if pos_embed else nn.Identity()

        # one attention per spatial axis: rows (height), then columns (width)
        self.height_attn, self.width_attn = nn.ModuleList([PermuteToFrom(permutation, PreNorm(dim, SelfAttention(dim, heads, dim_heads, drop=drop))) for permutation in permutations])

        self.FFN = nn.Sequential(
            ChanLayerNorm(dim),
            nn.Conv2d(dim, dim * 4, 3, padding=1),
            nn.GELU(),
            DropPath(drop),
            nn.Conv2d(dim * 4, dim, 3, padding=1),
            DropPath(drop),

            ChanLayerNorm(dim),
            nn.Conv2d(dim, dim * 4, 3, padding=1),
            nn.GELU(),
            DropPath(drop),
            nn.Conv2d(dim * 4, dim, 3, padding=1),
            DropPath(drop),
        )

        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()

    def forward(self, x):
        x = self.pos_emb(x)
        x = x + self.drop_path(self.height_attn(x))
        x = x + self.drop_path(self.width_attn(x))
        x = x + self.drop_path(self.FFN(x))
        return x

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

def _gelu_ignore_parameters(*args, **kwargs) -> nn.Module:
    activation = nn.GELU()
    return activation

class DoubleConv(nn.Module):
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            downscale: bool = False,
            act_layer: Type[nn.Module] = nn.GELU,
            norm_layer: Type[nn.Module] = nn.BatchNorm2d,
            drop_path: float = 0.,
    ) -> None:
        super(DoubleConv, self).__init__()
        self.drop_path_rate: float = drop_path

        if act_layer == nn.GELU:
            act_layer = _gelu_ignore_parameters

        self.main_path = nn.Sequential(
            norm_layer(in_channels),
            nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=(1, 1)),
            DepthwiseSeparableConv(in_chs=in_channels, out_chs=out_channels, stride=2 if downscale else 1,
                                   act_layer=act_layer, norm_layer=norm_layer, drop_path_rate=drop_path),
            SqueezeExcite(in_chs=out_channels, rd_ratio=0.25),
            nn.Conv2d(in_channels=out_channels, out_channels=out_channels, kernel_size=(1, 1))
        )

        if downscale:
            self.skip_path = nn.Sequential(
                nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)),
                nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(1, 1))
            )
        else:
            self.skip_path = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=(1, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        output = self.main_path(x)
        if self.drop_path_rate > 0.:
            output = drop_path(output, self.drop_path_rate, self.training)
        x = output + self.skip_path(x)
        return x


class DeconvModule(nn.Module):
    def __init__(self,
                 in_channels,
                 out_channels,
                 norm_layer=nn.BatchNorm2d,
                 act_layer=nn.Mish,
                 kernel_size=4,
                 scale_factor=2):
        super(DeconvModule, self).__init__()

        assert (kernel_size - scale_factor >= 0) and \
               (kernel_size - scale_factor) % 2 == 0, \
            f'kernel_size should be greater than or equal to scale_factor ' \
            f'and (kernel_size - scale_factor) should be even numbers, ' \
            f'while the kernel size is {kernel_size} and scale_factor is ' \
            f'{scale_factor}.'

        stride = scale_factor
        padding = (kernel_size - scale_factor) // 2
        deconv = nn.ConvTranspose2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding)

        norm = norm_layer(out_channels)
        activate = act_layer()
        self.deconv_upsamping = nn.Sequential(deconv, norm, activate)

    def forward(self, x):
        out = self.deconv_upsamping(x)
        return out

class Stage(nn.Module):
    def __init__(self,
                 image_size: int,
                 depth: int,
                 in_channels: int,
                 out_channels: int,
                 type_name: str,
                 pos_embed: bool,
                 num_heads: int = 32,
                 drop: float = 0.,
                 drop_path: Union[List[float], float] = 0.,
                 act_layer: Type[nn.Module] = nn.GELU,
                 norm_layer: Type[nn.Module] = nn.BatchNorm2d,
                 ):
        super().__init__()
        self.type_name = type_name

        if self.type_name == "encoder":
            self.conv = DoubleConv(
                in_channels=in_channels,
                out_channels=out_channels,
                downscale=True,
                act_layer=act_layer,
                norm_layer=norm_layer,
                drop_path=drop_path[0],
            )

            self.blocks = nn.Sequential(*[
                AxialTransformerBlock(
                    dim=out_channels,
                    axial_pos_emb_shape=pair(image_size),
                    heads=num_heads,
                    drop=drop,
                    drop_path_rate=drop_path[index],
                    dim_heads=None,
                    pos_embed=pos_embed
                )
                for index in range(depth)
            ])

        elif self.type_name == "decoder":
            self.upsample = DeconvModule(
                in_channels=in_channels,
                out_channels=out_channels,
                norm_layer=norm_layer,
                act_layer=act_layer
            )

            self.conv = DoubleConv(
                in_channels=in_channels,
                out_channels=out_channels,
                downscale=False,
                act_layer=act_layer,
                norm_layer=norm_layer,
                drop_path=drop_path[0],
            )

            self.blocks = nn.Sequential(*[
                AxialTransformerBlock(
                    dim=out_channels,
                    axial_pos_emb_shape=pair(image_size),
                    heads=num_heads,
                    drop=drop,
                    drop_path_rate=drop_path[index],
                    dim_heads=None,
                    pos_embed=pos_embed
                )
                for index in range(depth)
            ])

    def forward(self, x, skip=None):
        if self.type_name == "encoder":
            x = self.conv(x)
            x = self.blocks(x)
        elif self.type_name == "decoder":
            x = self.upsample(x)
            x = torch.cat([skip, x], dim=1)
            x = self.conv(x)
            x = self.blocks(x)
        return x

class FinalExpand(nn.Module):
    def __init__(
            self,
            in_channels,
            embed_dim,
            out_channels,
            norm_layer,
            act_layer,
    ):
        super().__init__()
        self.upsample = DeconvModule(
            in_channels=in_channels,
            out_channels=embed_dim,
            norm_layer=norm_layer,
            act_layer=act_layer
        )

        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=embed_dim * 2, out_channels=embed_dim, kernel_size=3, stride=1, padding=1),
            act_layer(),
            nn.Conv2d(in_channels=embed_dim, out_channels=embed_dim, kernel_size=3, stride=1, padding=1),
            act_layer(),
        )

    def forward(self, skip, x):
        x = self.upsample(x)
        x = torch.cat([skip, x], dim=1)
        x = self.conv(x)
        return x

class polarisnet(nn.Module):
    def __init__(
            self,
            image_size=224,
            in_channels=1,
            out_channels=1,
            embed_dim=64,
            depths=[2, 2, 2, 2],
            channels=[64, 128, 256, 512],
            num_heads=16,
            drop=0.,
            drop_path=0.1,
            act_layer=nn.GELU,
            norm_layer=nn.BatchNorm2d,
            pos_embed=False
    ):
        super(polarisnet, self).__init__()
        self.num_stages = len(depths)
        self.num_features = channels[-1]
        self.embed_dim = channels[0]

        self.conv_first = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=embed_dim, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            act_layer(),
            nn.Conv2d(in_channels=embed_dim, out_channels=embed_dim, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
            act_layer(),
        )

        drop_path = torch.linspace(0.0, drop_path, sum(depths)).tolist()
        encoder_stages = []

        for index in range(self.num_stages):
            encoder_stages.append(
                Stage(
                    image_size=image_size // (pow(2, 1 + index)),
                    depth=depths[index],
                    in_channels=embed_dim if index == 0 else channels[index - 1],
                    out_channels=channels[index],
                    num_heads=num_heads,
                    drop=drop,
                    drop_path=drop_path[sum(depths[:index]):sum(depths[:index + 1])],
                    act_layer=act_layer,
                    norm_layer=norm_layer,
                    type_name="encoder",
                    pos_embed=pos_embed
                )
            )

        self.encoder_stages = nn.ModuleList(encoder_stages)

        decoder_stages = []

        for index in range(self.num_stages - 1):
            decoder_stages.append(
                Stage(
                    image_size=image_size // (pow(2, self.num_stages - index - 1)),
                    depth=depths[self.num_stages - index - 2],
                    in_channels=channels[self.num_stages - index - 1],
                    out_channels=channels[self.num_stages - index - 2],
                    num_heads=num_heads,
                    drop=drop,
                    drop_path=drop_path[sum(depths[:(self.num_stages - 2 - index)]):sum(depths[:(self.num_stages - 2 - index) + 1])],
                    act_layer=act_layer,
                    norm_layer=norm_layer,
                    type_name="decoder",
                    pos_embed=pos_embed
                )
            )

        self.decoder_stages = nn.ModuleList(decoder_stages)

        self.norm = norm_layer(self.num_features)
        self.norm_up = norm_layer(self.embed_dim)

        self.up = FinalExpand(
            in_channels=channels[0],
            embed_dim=embed_dim,
            out_channels=embed_dim,
            norm_layer=norm_layer,
            act_layer=act_layer
        )

        self.output = nn.Conv2d(embed_dim, out_channels, kernel_size=3, padding=1)

    def encoder_forward(self, x: torch.Tensor) -> torch.Tensor:
        outs = []
        x = self.conv_first(x)

        for stage in self.encoder_stages:
            outs.append(x)  # pre-stage features become the decoder skip connections
            x = stage(x)

        x = self.norm(x)
        return x, outs

    def decoder_forward(self, x: torch.Tensor, x_downsample: list) -> torch.Tensor:
        for inx, stage in enumerate(self.decoder_stages):
            x = stage(x, x_downsample[len(x_downsample) - 1 - inx])

        x = self.norm_up(x)
        return x

    def up_x4(self, x: torch.Tensor, x_downsample: list):
        x = self.up(x_downsample[0], x)
        x = self.output(x)
        return x

    def forward(self, x):
        x, x_downsample = self.encoder_forward(x)
        x = self.decoder_forward(x, x_downsample)
        x = self.up_x4(x, x_downsample)
        return x

if __name__ == '__main__':
    net = polarisnet(in_channels=1, embed_dim=64, pos_embed=True).cuda()
    X = torch.randn(5, 1, 224, 224).cuda()
    y = net(X)
    print(y.shape)
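The __main__ demo above needs a GPU and the full 224x224 configuration. A scaled-down CPU sketch (my own toy hyperparameters, assuming a timm version compatible with the imports at the top of this file) can be used to check the U-shaped shape flow:

import torch
from polaris.model.polarisnet import polarisnet

# two encoder stages, one decoder stage plus FinalExpand; channels must double
# per stage so the decoder's skip concatenation matches its input width
net = polarisnet(image_size=32, in_channels=1, out_channels=1, embed_dim=8,
                 depths=[1, 1], channels=[8, 16], num_heads=2, pos_embed=False)
x = torch.randn(2, 1, 32, 32)
with torch.no_grad():
    print(net(x).shape)  # expected: torch.Size([2, 1, 32, 32])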
polaris/model/sft_loop.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cae9e9a28e5c3ff0d328934c066d275371d5301db084a914431198134f66ada2
size 547572280
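For reference, a hedged sketch of how this LFS-tracked checkpoint is consumed (it mirrors loopScore.py; the 'parameters' and 'model_state_dict' keys come from that code):

import torch
from importlib_resources import files

path = str(files('polaris').joinpath('model/sft_loop.pt'))
state = torch.load(path, map_location='cpu')
print(sorted(state['parameters'].keys()))  # architecture hyperparameters
print(len(state['model_state_dict']))      # number of weight tensors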
polaris/polaris.py
ADDED
@@ -0,0 +1,54 @@
# My code references the following repositories:
# RefHiC: https://github.com/BlanchetteLab/RefHiC (Analysis code)
# Axial Attention: https://github.com/lucidrains/axial-attention (Model architecture)
# Peakachu: https://github.com/tariks/peakachu (Calculate intra reads)
# Thanks a lot for their implementations.
# --------------------------------------------------------

import click
from polaris.loopScore import score
from polaris.loopDev import dev
from polaris.loopPool import pool
from polaris.loop import pred
from polaris.utils.util_cool2bcool import cool2bcool
from polaris.utils.util_pileup import pileup
from polaris.utils.util_depth import depth

@click.group()
def cli():
    '''
    Polaris

    A Versatile Framework for Chromatin Loop Annotation in Bulk and Single-cell Hi-C Data
    '''
    pass

@cli.group()
def loop():
    '''Loop annotation.

    \b
    Annotate loops from chromosomal contact maps.
    '''
    pass

@cli.group()
def util():
    '''Utilities.

    \b
    Utilities for analysis and visualization.'''
    pass

loop.add_command(pred)
loop.add_command(score)
loop.add_command(dev)
loop.add_command(pool)

util.add_command(depth)
util.add_command(cool2bcool)
util.add_command(pileup)


if __name__ == '__main__':
    cli()
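A brief, hedged sketch of the resulting command tree (shell lines are illustrative and file names are hypothetical; the flags come from the subcommand modules above):

#   polaris loop score -i sample.mcool -r 5000 -o candidates.bedpe
#   polaris loop pool -i candidates.bedpe -o loops.bedpe
# The same tree is reachable programmatically through the `cli` group:
from click.testing import CliRunner
from polaris.polaris import cli

print(CliRunner().invoke(cli, ['loop', '--help']).output)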
polaris/utils/util_bcooler.py
ADDED
@@ -0,0 +1,347 @@
import cooler
import numpy as np
from types import SimpleNamespace
import random
import sys

def shuffleIFWithCount(df):
    shuf = df[['count', 'balanced']].sample(frac=1)
    df[['count', 'balanced']] = shuf[['count', 'balanced']].to_numpy()
    return df

def shuffleIF(df):
    if len(df) < 10:
        df = shuffleIFWithCount(df)
        return df
    min = np.min(df['bin1_id'])
    max = np.max(df['bin1_id'])
    distance = df['distance'].iloc[0]
    bin1_id = np.random.randint(min, high=max, size=int(len(df) * 1.5))
    bin2_id = bin1_id + distance
    pair_id = set(zip(bin1_id, bin2_id))
    if len(pair_id) < len(df) - 50:
        bin1_id = np.random.randint(min, high=max, size=len(df))
        bin2_id = bin1_id + distance
        extra_pair_id = set(zip(bin1_id, bin2_id))
        pair_id.update(extra_pair_id)
    if len(pair_id) < len(df):
        df = df.sample(len(pair_id))
    pair_id = list(pair_id)
    random.shuffle(pair_id)
    pair_id = np.asarray(pair_id[:len(df)])
    df['bin1_id'] = pair_id[:, 0]
    df['bin2_id'] = pair_id[:, 1]
    return df

class bandmatrix():
    def __init__(self, pixels, extent, max_distance_bins=None, bins=None, info=None):
        self.extent = extent
        self.max_distance_bins = max_distance_bins
        self.bmatrix = np.zeros((extent[1] - extent[0], max_distance_bins))
        self.offset = extent[0]
        self.bmatrix[pixels['bin1_id'] - self.offset, (pixels['bin2_id'] - pixels['bin1_id']).abs()] = pixels[
            'balanced']
        self.diag_mean = np.nanmean(self.bmatrix, axis=0)
        np.nan_to_num(self.bmatrix, copy=False)

        self.bins = bins
        self.bp2bin = \
            bins['start'].reset_index(drop=False).rename(columns={"start": "bp", "index": "bin"}).set_index(
                'bp').to_dict()[
                'bin']
        self.resol = self.bins.iloc[0]['end'] - self.bins.iloc[0]['start']
        self.info = info
        if 'full_sum' in self.info:
            self.totalRC = self.info['full_sum']
        elif 'sum' in info:
            self.totalRC = self.info['sum']
        else:
            self.totalRC = None
        self.bin2bias = np.zeros((extent[1] - extent[0]))
        for k, v in bins.to_dict()['weight'].items():
            self.bin2bias[k - self.offset] = v
        self.bin2bias = np.nan_to_num(self.bin2bias)

        self.continousRows = {'start_bp': np.inf, 'end_bp': -1, 'O_matrix': None, 'OE_matrix': None, 'bias': None,
                              'offset_bin': 0}
        self.continousRows = SimpleNamespace(**self.continousRows)

    def __bandedRows2fullRows(self, x):
        """
        convert rows of the banded matrix to an upper-triangle (+ necessary lower-triangle) full matrix
        x????        x???x000
        x@@xx        ?x@@xx00
        x#xxx  -->   ?@x#xxx0
        xxxxx        ?@#xxxxx
        """
        b, h, w = x.shape
        output = np.zeros((b, h, h + w))
        output[:b, :h, :w] = x
        output = output.reshape(b, -1)[:, :-h].reshape(b, h, -1)[:, :, :h + w]
        i_lower = np.tril_indices(h, -1)
        for i in range(b):
            output[i][i_lower] = output[i].swapaxes(-1, -2)[i_lower]
        return output

    def __relative_right_shift(self, x):
        """
        .........xxxxxx        xxxxxx0000000000
        ........xxxxxx.        xxxxxx.000000000
        .......xxxxxx..   -->  xxxxxx..00000000
        ......xxxxxx...        xxxxxx...0000000
        .....xxxxxx....        xxxxxx....000000
        """
        b, h, w = x.shape
        output = np.zeros((b, h, 2 * w))
        output[:b, :h, :w] = x
        return output.reshape(b, -1)[:, :-h].reshape(b, h, -1)[:, :, h - 1:]

    def __tril_block(self, top, left, bottom, right, type='o'):
        """
        fetch data in the lower triangular part, excluding the main diagonal
        Parameters:
            top, left, bottom, right : block coords; left/right < 0
            type : o [observed], oe [o/e], b [both]
        """
        if left >= 0 or right >= 0:
            raise Exception("Trying to access data outside lower triangular part with tril_block")

        height = bottom - top
        top, bottom = top + left, bottom + right
        left, right = -right, -left

        if top < 0 or bottom > self.bmatrix.shape[0] - 1:
            raise Exception("Accessing values outside the contact map ... valid region:" +
                            str(10 * self.resol) + '~' + str((self.extent[1] - self.extent[0] - 10) * self.resol))

        O = self.bmatrix[top:bottom + 1, left:right + 1]

        if type == 'o':
            out = self.__relative_right_shift(O[None].swapaxes(-1, 1)).swapaxes(-1, 1)[:, :height + 1, :]
        elif type == 'oe':
            OE = O / self.diag_mean[left:right + 1]
            out = self.__relative_right_shift(OE[None].swapaxes(-1, 1)).swapaxes(-1, 1)[:, :height + 1, :]
        else:
            OE = O / self.diag_mean[left:right + 1]
            out = np.concatenate((O[None], OE[None]))
            out = self.__relative_right_shift(out.swapaxes(-1, 1)).swapaxes(-1, 1)[:, :height + 1, :]

        return out[..., ::-1]

    def rows(self, firstRow, lastRow, type='o', returnBias=False):
        """
        fetch rows [firstRow, lastRow] of contacts
        Parameters
        ----------
        firstRow : inclusive first row in bp
        lastRow : inclusive last row in bp
        type : o [observed], oe [o/e], b [both]
        returnBias : if True, also return the bias in an array for bins [firstRow, lastRow + max_distance_bins)
        """
        firstRow = firstRow // self.resol * self.resol
        lastRow = lastRow // self.resol * self.resol
        ORows = None
        OERows = None
        if firstRow < 0 or lastRow < 0 or firstRow > (self.extent[1] - self.extent[0]) * self.resol or lastRow > (
                self.extent[1] - self.extent[0]) * self.resol:
            raise Exception("Accessing values outside the contact map ... valid region: 0 ~ "
                            + str((self.extent[1] - self.extent[0]) * self.resol))

        firstRowRelativeBin = self.bp2bin[firstRow] - self.offset
        lastRowRelativeBin = self.bp2bin[lastRow] - self.offset
        ORows = self.bmatrix[firstRowRelativeBin:lastRowRelativeBin + 1, :][None]

        if type == 'o':
            outRows = ORows
        elif type == 'oe':
            OERows = (ORows / self.diag_mean)
            outRows = OERows
        elif type == 'b':
            OERows = (ORows / self.diag_mean)
            outRows = np.concatenate((ORows, OERows), axis=0)

        outRows = self.__bandedRows2fullRows(outRows)

        if returnBias:
            bias = self.bin2bias[firstRowRelativeBin:lastRowRelativeBin + self.max_distance_bins]
            return outRows, bias

        return outRows

    def __squareFromContinousRows(self, xCenter, yCenter, w, type='o', meta=True):
        """
        fetch a (2w+1)*(2w+1) square of contacts centered at (xCenter, yCenter) from cached continuous rows efficiently
        Parameters
        ----------
        xCenter : xCenter in bp
        yCenter : yCenter in bp
        w : block width = 2w+1, in bins
        type : o [observed], oe [o/e], b [both]
        """
        if xCenter < self.continousRows.start_bp or xCenter > self.continousRows.end_bp:
            print('miss')
            rowStep = 1000
            startRow_bp = np.max([0, xCenter // (rowStep * self.resol) * (rowStep - 2 * w) * self.resol])
            endRow_bp = np.min(
                [startRow_bp + (rowStep + 2 * w) * self.resol, (self.extent[1] - self.offset - 1) * self.resol])
            mat, bias = self.rows(startRow_bp, endRow_bp, type='b', returnBias=True)

            self.continousRows.start_bp = startRow_bp
            self.continousRows.end_bp = endRow_bp
            self.continousRows.O_matrix = mat[0, :, :]
            self.continousRows.OE_matrix = mat[1, :, :]
            self.continousRows.bias = bias
        else:
            print('hit')

        xCenterRelativeBin = (xCenter - self.continousRows.start_bp) // self.resol
        yCenterRelativeBin = (yCenter - self.continousRows.start_bp) // self.resol

        if type == 'o':
            output = self.continousRows.O_matrix[xCenterRelativeBin - w:xCenterRelativeBin + w + 1,
                     yCenterRelativeBin - w:yCenterRelativeBin + w + 1][None]
        elif type == 'oe':
            output = self.continousRows.OE_matrix[xCenterRelativeBin - w:xCenterRelativeBin + w + 1,
                     yCenterRelativeBin - w:yCenterRelativeBin + w + 1][None]
        else:
            OEsquare = self.continousRows.OE_matrix[xCenterRelativeBin - w:xCenterRelativeBin + w + 1,
                       yCenterRelativeBin - w:yCenterRelativeBin + w + 1][None]
            Osquare = self.continousRows.O_matrix[xCenterRelativeBin - w:xCenterRelativeBin + w + 1,
                      yCenterRelativeBin - w:yCenterRelativeBin + w + 1][None]
            output = np.concatenate((Osquare, OEsquare))

        if meta:
            xBias = self.continousRows.bias[xCenterRelativeBin - w:xCenterRelativeBin + w + 1]
            yBias = self.continousRows.bias[yCenterRelativeBin - w:yCenterRelativeBin + w + 1]
            bias = np.concatenate((xBias, yBias))
            p2ll, crk = self.p2ll(output[-1, :, :], cw=3)  # prefer to use obs to compute p2ll
            return output, np.concatenate((bias, [self.totalRC, p2ll, yCenterRelativeBin, crk]))
        return output

    def p2ll(self, x, cw=3):
        """
        P2LL for a peak.
        Parameters:
            x : square matrix, the peak and its surroundings
            cw : lower-left corner width
        """
        c = x.shape[0] // 2
        llcorner = x[-cw:, :cw].flatten()
        if sum(llcorner) == 0:
            return 0, np.sum(x[c, c] > x[c - 1:c + 2, c - 1:c + 2])
        return x[c, c] / (sum(llcorner) / len(llcorner)), np.sum(x[c, c] > x[c - 1:c + 2, c - 1:c + 2])

    def square(self, xCenter, yCenter, w, type='o', meta=True, cache=False):
        """
        fetch a (2w+1)*(2w+1) square of contacts centered at (xCenter, yCenter)
        Parameters
        ----------
        xCenter : xCenter in bp
        yCenter : yCenter in bp
        w : block width = 2w+1, in bins
        type : o [observed], oe [o/e], b [both]
        """
        tril = None
        xCenter = xCenter // self.resol * self.resol
        yCenter = yCenter // self.resol * self.resol
        # if xCenter > yCenter:
        #     tmp = xCenter
        #     xCenter = yCenter
        #     yCenter = tmp

        # if xCenter - w * self.resol < 0 or yCenter - w * self.resol < 0 or \
        #         xCenter + w * self.resol > (
        #         self.extent[1] - self.extent[0] - 1) * self.resol or yCenter + w * self.resol > (
        #         self.extent[1] - self.extent[0] - 1) * self.resol:
        #     raise Exception("Accessing values outside the contact map ... valid region: 0 ~ "
        #                     + str((self.extent[1] - self.extent[0]) * self.resol))

        # if cache:
        #     return self.__squareFromContinousRows(xCenter, yCenter, w, type, meta)

        xCenterRelativeBin = self.bp2bin[xCenter] - self.offset
        yCenterRelativeBin = self.bp2bin[yCenter] - self.offset - xCenterRelativeBin

        # if yCenterRelativeBin + 2 * w >= self.max_distance_bins:
        #     raise Exception("max distance in this bcool file is ", self.max_distance_bins * self.resol)
        topleft = [xCenterRelativeBin - w, yCenterRelativeBin - 2 * w]
        bottomright = [xCenterRelativeBin + w, yCenterRelativeBin + 2 * w]

        if topleft[1] < 0:
            tril = (topleft[0], topleft[1], bottomright[0], -1)
            topleft[1] = 0
            tril_part = self.__tril_block(tril[0], tril[1], tril[2], tril[3], type)

        Osquare = self.bmatrix[topleft[0]:bottomright[0] + 1, topleft[1]:bottomright[1] + 1]

        if type == 'o':
            Osquare = Osquare[None]
            if tril is not None:
                Osquare = np.concatenate((tril_part, Osquare), axis=-1)
            output = self.__relative_right_shift(Osquare)[:, :, :2 * w + 1]
        elif type == 'oe':
            OEsquare = (Osquare / self.diag_mean[topleft[1]:bottomright[1] + 1])[None]
            if tril is not None:
                OEsquare = np.concatenate((tril_part, OEsquare), axis=-1)
            output = self.__relative_right_shift(OEsquare)[:, :, :2 * w + 1]
        else:
            OEsquare = Osquare / self.diag_mean[topleft[1]:bottomright[1] + 1]
            output = np.concatenate((Osquare[None], OEsquare[None]))
            if tril is not None:
                output = np.concatenate((tril_part, output), axis=-1)
            output = self.__relative_right_shift(output)[:, :, :2 * w + 1]
        if meta:
            xBias = self.bin2bias[self.bp2bin[xCenter] - self.offset - w:self.bp2bin[xCenter] - self.offset + w + 1]
            yBias = self.bin2bias[self.bp2bin[yCenter] - self.offset - w:self.bp2bin[yCenter] - self.offset + w + 1]
            bias = np.concatenate((xBias, yBias))

            p2ll, crk = self.p2ll(output[-1, :, :], cw=3)  # prefer to use obs to compute p2ll
            return output, np.concatenate((bias, [self.totalRC, p2ll, yCenterRelativeBin, crk]))
        return output

class bcool(cooler.Cooler):
    def __init__(self, store):
        super().__init__(store)

    def bchr(self, chrom, max_distance=None, annotate=True, decoy=False, restrictDecoy=False):
        '''
        get a banded matrix for a given chrom
        '''
        balance = True
        resol = self.info['bin-size']
        if max_distance is not None and 'max_distance' in self.info and max_distance > self.info['max_distance']:
            raise Exception("max distance in this bcool file is ", self.info['max_distance'])
        else:
            if 'max_distance' in self.info:
                max_distance = self.info['max_distance']
            else:
                max_distance = 3000000
        pixels = self.matrix(balance=balance, as_pixels=True).fetch(chrom)
        pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']).abs() < max_distance // resol].reset_index(drop=True)

        if decoy:
            np.random.seed(0)
            pixels['distance'] = (pixels['bin2_id'] - pixels['bin1_id']).abs()
            if restrictDecoy:
                pixels = pixels.groupby('distance').apply(shuffleIFWithCount)
            else:
                pixels = pixels.groupby('distance').apply(shuffleIF)

        if annotate:
            bins = self.bins().fetch(chrom)
            info = self.info
        else:
            bins = None
            info = None
        extent = self.extent(chrom)
        bmatrix = bandmatrix(pixels, extent, max_distance // resol, bins, info)
        return bmatrix
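For orientation, a minimal sketch of the bcool/bandmatrix API above, mirroring how util_pileup.py (further down) drives it; the .bcool path, chromosome, and coordinates are placeholders:

# A minimal sketch exercising bcool/bandmatrix as defined above.
# The file path, chromosome, and coordinates are placeholders.
from polaris.utils.util_bcooler import bcool

bcoolFile = bcool('GM12878_250M.bcool::/resolutions/5000')
bmat = bcoolFile.bchr('chr15')  # banded matrix for one chromosome

# (2w+1) x (2w+1) O/E square centered on a candidate loop, plus per-bin
# biases and summary metadata (total read count, P2LL, ...).
square, meta = bmat.square(20000000, 20250000, w=10, type='oe', meta=True)
print(square.shape, meta.shape)  # (1, 21, 21) and (46,)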
polaris/utils/util_cool2bcool.py
ADDED
@@ -0,0 +1,88 @@
# Modified from RefHiC: https://github.com/BlanchetteLab/RefHiC (Analysis code)
# --------------------------------------------------------------

import click
import cooler
import h5py
from cooler.create._create import write_pixels, write_indexes, index_bins, index_pixels, prepare_pixels, PIXEL_DTYPES, _set_h5opts, write_info
from cooler.util import get_meta
import posixpath

@click.command()
@click.option('-u', type=int, default=3000000, help='distance upperbound [bp] [default=3000000]')
@click.option('--resol', default=None, help='comma separated resolutions for output')
@click.argument('mcool', type=str, required=True)
@click.argument('bcool', type=str, required=True)
def cool2bcool(mcool, bcool, u, resol):
    '''convert a .mcool file to a .bcool file'''
    h5opts = _set_h5opts(None)
    copy = ['bins', 'chroms']
    Ofile = h5py.File(bcool, 'w')
    Ifile = h5py.File(mcool, 'r')

    if resol is None:
        resols = [r.split('/')[-1] for r in cooler.fileops.list_coolers(mcool)]
    else:
        resols = resol.split(',')
    # copy bins and chroms
    for grp in Ifile:
        Ofile.create_group(grp)
        for subgrp in Ifile[grp]:
            if subgrp in resols:
                Ofile[grp].create_group(subgrp)
                for ssubgrp in Ifile[grp][subgrp]:
                    if ssubgrp in copy:
                        Ofile.copy(Ifile[grp + '/' + subgrp + '/' + ssubgrp], grp + '/' + subgrp + '/' + ssubgrp)
    Ofile.flush()
    Ifile.close()

    for group_path in ['/resolutions/' + str(r) for r in resols]:
        c = cooler.Cooler(mcool + '::' + group_path)
        nnz_src = c.info['nnz']
        n_bins = c.info['nbins']
        n_chroms = c.info['nchroms']
        bins = c.bins()[:]
        pixels = []
        info = c.info
        info['subformat'] = 'bcool'
        info['max_distance'] = u
        info['full_nnz'] = info['nnz']
        info['full_sum'] = info['sum']

        # collect pixels
        for lo, hi in cooler.util.partition(0, nnz_src, nnz_src // 100):
            pixel = c.pixels(join=False)[lo:hi].reset_index(drop=True)
            bins1 = bins.iloc[pixel['bin1_id']][['chrom', 'start']].reset_index(drop=True)
            bins2 = bins.iloc[pixel['bin2_id']][['chrom', 'start']].reset_index(drop=True)
            pixel = pixel[
                (bins1['chrom'] == bins2['chrom']) & ((bins1['start'] - bins2['start']).abs() < u)].reset_index(
                drop=True)
            pixels.append(pixel)

        columns = list(pixels[0].columns.values)
        meta = get_meta(columns, dict(PIXEL_DTYPES), default_dtype=float)

        # write pixels
        with h5py.File(bcool, "r+") as f:
            h5 = f[group_path]
            grp = h5.create_group("pixels")
            max_size = n_bins * (n_bins - 1) // 2 + n_bins
            prepare_pixels(grp, n_bins, max_size, meta.columns, dict(meta.dtypes), h5opts)

        target = posixpath.join(group_path, 'pixels')
        nnz, ncontacts = write_pixels(bcool, target, columns, pixels, h5opts, lock=None)
        info['nnz'] = nnz
        info['sum'] = ncontacts

        # write indexes
        with h5py.File(bcool, "r+") as f:
            h5 = f[group_path]
            grp = h5.create_group("indexes")
            chrom_offset = index_bins(h5["bins"], n_chroms, n_bins)
            bin1_offset = index_pixels(h5["pixels"], n_bins, nnz)
            write_indexes(grp, chrom_offset, bin1_offset, h5opts)
            write_info(h5, info)


if __name__ == '__main__':
    cool2bcool()
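A minimal invocation sketch for the converter above, using click's test runner so it stays in Python; the .mcool and .bcool file names are placeholders, so this only runs against real files:

# A minimal sketch invoking cool2bcool as defined above; file names are placeholders.
from click.testing import CliRunner

from polaris.utils.util_cool2bcool import cool2bcool

runner = CliRunner()
# Keep only contacts within 3 Mb, and only the 5000 bp resolution.
result = runner.invoke(cool2bcool, ['-u', '3000000', '--resol', '5000',
                                    'GM12878.mcool', 'GM12878.bcool'])
print(result.exit_code, result.output)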
polaris/utils/util_data.py
ADDED
@@ -0,0 +1,159 @@
import random
import warnings
import numpy as np
from scipy.sparse import coo_matrix
from torch.utils.data import Dataset
from scipy.sparse import SparseEfficiencyWarning
warnings.filterwarnings("ignore", category=SparseEfficiencyWarning)

def getLocal(mat, i, jj, w, N):
    if i >= 0 and jj >= 0 and i + w <= N and jj + w <= N:
        mat = mat[i:i + w, jj:jj + w].toarray()
        return mat[None, ...]
    # pad_width = ((up, down), (left, right))
    slice_pos = [[i, i + w], [jj, jj + w]]
    pad_width = [[0, 0], [0, 0]]
    if i < 0:
        pad_width[0][0] = -i
        slice_pos[0][0] = 0
    if jj < 0:
        pad_width[1][0] = -jj
        slice_pos[1][0] = 0
    if i + w > N:
        pad_width[0][1] = i + w - N
        slice_pos[0][1] = N
    if jj + w > N:
        pad_width[1][1] = jj + w - N
        slice_pos[1][1] = N
    _mat = mat[slice_pos[0][0]:slice_pos[0][1], slice_pos[1][0]:slice_pos[1][1]].toarray()
    padded_mat = np.pad(_mat, pad_width, mode='constant', constant_values=0)
    return padded_mat[None, ...]

def upperCoo2symm(row, col, data, N=None):
    if N:
        shape = (N, N)
    else:
        shape = (row.max() + 1, col.max() + 1)

    sparse_matrix = coo_matrix((data, (row, col)), shape=shape)
    symm = sparse_matrix + sparse_matrix.T
    diagVal = symm.diagonal(0) / 2
    symm = symm.tocsr()
    symm.setdiag(diagVal)
    return symm

def shuffleIFWithCount(df):
    shuffled_df = df.copy()
    shuffled_df[['oe', 'balanced']] = df[['oe', 'balanced']].sample(frac=1).reset_index(drop=True)
    return shuffled_df

def shuffleIF(df):
    if len(df) < 10:
        df = shuffleIFWithCount(df)
        return df
    min = np.min(df['bin1_id'])
    max = np.max(df['bin1_id'])
    distance = df['distance'].iloc[0]
    bin1_id = np.random.randint(min, high=max, size=int(len(df) * 1.5))
    bin2_id = bin1_id + distance
    pair_id = set(zip(bin1_id, bin2_id))
    if len(pair_id) < len(df) - 50:
        bin1_id = np.random.randint(min, high=max, size=len(df))
        bin2_id = bin1_id + distance
        extra_pair_id = set(zip(bin1_id, bin2_id))
        pair_id.update(extra_pair_id)
    if len(pair_id) < len(df):
        df = df.sample(len(pair_id))
    pair_id = list(pair_id)
    random.shuffle(pair_id)
    pair_id = np.asarray(pair_id[:len(df)])
    df['bin1_id'] = pair_id[:, 0]
    df['bin2_id'] = pair_id[:, 1]
    return df

class centerPredCoolDataset(Dataset):
    def __init__(self, coolfile, cchrom, step=224, w=224, max_distance_bin=600, decoy=False, restrictDecoy=False, s=0.9, raw=False):
        '''
        Args:
            step (int): the step the sliding window moves by, which is also the size of the center crop to predict
        '''

        self.s = s
        oeMat, decoyOeMat, N = self._processCoolFile(coolfile, cchrom, decoy=decoy, restrictDecoy=restrictDecoy, raw=raw)
        self.data, self.i, self.j = self._prepare_data(oeMat, N, step, w, max_distance_bin, decoyOeMat)
        del oeMat, decoyOeMat

    def _prepare_data(self, oeMat, N, step, w, max_distance_bin, decoyOeMat=None):
        center_crop_size = step
        start_point = -(w - center_crop_size) // 2
        data, i_list, j_list = [], [], []
        joffset = np.repeat(np.linspace(0, w, w, endpoint=False, dtype=int)[np.newaxis, :], w, axis=0)
        ioffset = np.repeat(np.linspace(0, w, w, endpoint=False, dtype=int)[:, np.newaxis], w, axis=1)

        for i in range(start_point, N - w - start_point, step):
            _data, _i_list, _j_list = self._process_window(oeMat, i, step, w, N, joffset, ioffset, max_distance_bin, decoyOeMat)
            data.extend(_data)
            i_list.extend(_i_list)
            j_list.extend(_j_list)

        return data, i_list, j_list

    def _process_window(self, oeMat, i, step, w, N, joffset, ioffset, max_distance_bin, decoyOeMat=None):
        data, i_list, j_list = [], [], []
        for j in range(0, max_distance_bin, step):
            jj = j + i
            # if jj + w <= N and i + w <= N:
            _oeMat = getLocal(oeMat, i, jj, w, N)
            if np.sum(_oeMat == 0) <= (w * w * self.s):
                if decoyOeMat is not None:
                    _decoyOeMat = getLocal(decoyOeMat, i, jj, w, N)
                    data.append(np.vstack((_oeMat, _decoyOeMat)))
                else:
                    data.append(_oeMat)

                i_list.append(i + ioffset)
                j_list.append(jj + joffset)
        return data, i_list, j_list

    def _processCoolFile(self, coolfile, cchrom, decoy=False, restrictDecoy=False, raw=False):
        extent = coolfile.extent(cchrom)
        N = extent[1] - extent[0]
        if raw:
            ccdata = coolfile.matrix(balance=False, sparse=True, as_pixels=True).fetch(cchrom)
            v = 'count'
        else:
            ccdata = coolfile.matrix(balance=True, sparse=True, as_pixels=True).fetch(cchrom)
            v = 'balanced'
        ccdata['bin1_id'] -= extent[0]
        ccdata['bin2_id'] -= extent[0]

        ccdata['distance'] = ccdata['bin2_id'] - ccdata['bin1_id']
        d_means = ccdata.groupby('distance')[v].transform('mean')
        ccdata[v] = ccdata[v].fillna(0)

        ccdata['oe'] = ccdata[v] / d_means
        ccdata['oe'] = ccdata['oe'].fillna(0)
        ccdata['oe'] = ccdata['oe'] / ccdata['oe'].max()
        oeMat = upperCoo2symm(ccdata['bin1_id'].ravel(), ccdata['bin2_id'].ravel(), ccdata['oe'].ravel(), N)

        decoyMat = None
        if decoy:
            decoydata = ccdata.copy(deep=True)
            np.random.seed(0)
            if restrictDecoy:
                decoydata = decoydata.groupby('distance').apply(shuffleIF)
            else:
                decoydata = decoydata.groupby('distance').apply(shuffleIFWithCount)

            decoyMat = upperCoo2symm(decoydata['bin1_id'].ravel(), decoydata['bin2_id'].ravel(), decoydata['oe'].ravel(), N)

        return oeMat, decoyMat, N

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.i[idx], self.j[idx], self.data[idx]
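For orientation, a minimal sketch of feeding this dataset to a PyTorch DataLoader; the .mcool path, resolution, and chromosome are placeholders:

# A minimal sketch wiring centerPredCoolDataset into a DataLoader.
# The .mcool path, resolution, and chromosome below are placeholders.
import cooler
from torch.utils.data import DataLoader

from polaris.utils.util_data import centerPredCoolDataset

clr = cooler.Cooler('GM12878.mcool::/resolutions/5000')
ds = centerPredCoolDataset(clr, 'chr17', step=224, w=224, max_distance_bin=600)

loader = DataLoader(ds, batch_size=8, shuffle=False)
for i_idx, j_idx, patch in loader:
    # patch: (batch, 1, 224, 224) O/E tiles; i_idx/j_idx hold the genomic
    # bin coordinates of every pixel in each tile, for mapping scores back.
    print(patch.shape, i_idx.shape, j_idx.shape)
    break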
polaris/utils/util_depth.py
ADDED
@@ -0,0 +1,57 @@
import click
import cooler
import numpy as np
from tqdm import tqdm
from multiprocessing import Pool

np.seterr(divide='ignore', invalid='ignore')

def process_chrom(args):
    chrom_name, input_file, resol, mindis, exclude_self = args
    try:
        C = cooler.Cooler(f"{input_file}::resolutions/{resol}")
        pixels = C.matrix(
            balance=False, sparse=True, as_pixels=True).fetch(chrom_name)
        bin_diff = pixels['bin2_id'] - pixels['bin1_id']
        min_diff = max(mindis, 1) if exclude_self else mindis
        mask = bin_diff >= min_diff
        return pixels[mask]['count'].sum()
    except Exception as e:
        print(f"Error processing {chrom_name}: {e}")
        return 0

@click.command()
@click.option('-c', '--chrom', type=str, default=None, help='Comma separated chroms [default: all chromosomes]')
@click.option('-md', '--mindis', type=int, default=0, help='Min genomic distance in bins [0]')
@click.option('-r', '--resol', type=int, required=True, help='Resolution (bp)')
@click.option('-i', '--input', type=str, required=True, help='mcool file path')
@click.option('--exclude-self', is_flag=True, help='Exclude bin_diff=0 contacts')
def depth(input, resol, mindis, chrom, exclude_self):
    """Calculate intra-chromosomal contacts with bin distance >= mindis"""
    print(f'\n[polaris] Depth calculation START')

    try:
        C = cooler.Cooler(f"{input}::resolutions/{resol}")
    except ValueError:
        available_res = cooler.fileops.list_coolers(input)
        raise ValueError(f"Resolution {resol} not found. Available: {available_res}")

    chrom_list = chrom.split(',') if chrom else C.chromnames
    invalid_chroms = [c for c in chrom_list if c not in C.chromnames]
    if invalid_chroms:
        raise ValueError(f"Invalid chromosomes: {invalid_chroms}. Valid: {C.chromnames}")

    # Process chromosomes in parallel
    with Pool(processes=min(len(chrom_list), 4)) as pool:
        args_list = [(chrom, input, resol, mindis, exclude_self) for chrom in chrom_list]
        results = list(tqdm(pool.imap(process_chrom, args_list), total=len(chrom_list), dynamic_ncols=True))
    total_contacts = sum(results)

    print(f"\n[polaris] Depth calculation FINISHED")
    print(f"File: {input} (res={resol}bp)")
    print(f"Chromosomes: {chrom_list}")
    print(f"Minimum bin distance: {mindis}{', exclude self' if exclude_self else ''}")
    print(f"Total intra contacts: {total_contacts:,}")

if __name__ == '__main__':
    depth()
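To make the filtering rule in process_chrom concrete, a tiny synthetic sketch of the bin-distance mask; the pixel table here is made up, not read from a real .mcool file:

# A tiny synthetic illustration of the bin-distance filter used above.
import pandas as pd

pixels = pd.DataFrame({'bin1_id': [0, 0, 1, 2],
                       'bin2_id': [0, 2, 1, 5],
                       'count':   [4, 3, 7, 2]})

mindis, exclude_self = 0, True
bin_diff = pixels['bin2_id'] - pixels['bin1_id']
min_diff = max(mindis, 1) if exclude_self else mindis   # drop the diagonal
print(pixels[bin_diff >= min_diff]['count'].sum())      # 3 + 2 = 5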
polaris/utils/util_loop.py
ADDED
@@ -0,0 +1,12 @@
class bedpewriter():
    def __init__(self, file_path, resol, max_distance):
        self.f = open(file_path, 'w')
        self.resol = resol
        self.max_distance = max_distance

    def write(self, chrom, x, y, prob):
        for i in range(len(x)):
            # if x[i] < y[i] and y[i]-x[i] > 11*self.resol and y[i] - x[i] < self.max_distance:
            if x[i] < y[i] and y[i] - x[i] < self.max_distance:
                self.f.write(chrom + '\t' + str(x[i]) + '\t' + str(x[i] + self.resol)
                             + '\t' + chrom + '\t' + str(y[i]) + '\t' + str(y[i] + self.resol)
                             + '\t' + str(prob[i]) + '\n')
polaris/utils/util_pileup.py
ADDED
@@ -0,0 +1,95 @@
import numpy as np
import click
import pandas as pd
from polaris.utils.util_bcooler import bcool
from matplotlib import pylab as plt
from matplotlib.colors import LinearSegmentedColormap
cmap = LinearSegmentedColormap.from_list('wr', ["w", "r"], N=256)

def p2LL(x, cw=3):
    """
    P2LL for a peak.
    Parameters:
        x : square matrix, the peak and its surroundings
        cw : lower-left corner width
    """
    c = x.shape[0] // 2
    llcorner = x[-cw:, :cw].flatten()
    if sum(llcorner) == 0:
        return 0, np.sum(x[c, c] > x[c - 1:c + 2, c - 1:c + 2])
    return x[c, c] / (sum(llcorner) / len(llcorner)), np.sum(x[c, c] > x[c - 1:c + 2, c - 1:c + 2])

@click.command()
@click.option('-w', type=int, default=10, help="window size (bins): (2w+1)x(2w+1) [10]")
@click.option('--savefig', type=str, default=None, help="save pileup plot to file [FOCI_pileup.png]")
@click.option('--p2ll', type=bool, default=False, help="compute p2ll [False]")
@click.option('--mindistance', type=int, default=None, help="min distance (bins) to skip, only for bedpe foci [2w+1]")
@click.option('--maxdistance', type=int, default=1e9, help="max distance (bins) to skip, only for bedpe foci [1e9]")
@click.option('--resol', type=int, default=5000, help="resolution [5000]")
@click.option('--oe', type=bool, default=True, help="O/E normalized [True]")
@click.argument('foci', type=str, required=True)
@click.argument('mcool', type=str, required=True)
def pileup(w, savefig, p2ll, mindistance, resol, maxdistance, foci, mcool, oe):
    ''' 2D pileup contact maps around given foci

    \b
    FOCI format: bedpe file containing loops
    \f
    :param w:
    :param savefig:
    :param p2ll:
    :param mindistance:
    :param resol:
    :param maxdistance:
    :param foci:
    :param mcool:
    :param oe:
    :return:
    '''
    if mindistance is None:
        mindistance = 2 * w + 1
    if savefig is None:
        savefig = foci + '_pileup.png'
    bcoolFile = bcool(mcool + '::/resolutions/' + str(resol))
    pileup = np.zeros((2 * w + 1, 2 * w + 1))
    if '.bedpe' in foci:
        filetype = 'bedpe'
    else:
        filetype = 'bed'
    if oe:
        oeType = 'oe'
    else:
        oeType = 'o'

    foci = pd.read_csv(foci, sep='\t', header=None)

    if filetype == 'bedpe':
        foci = foci[foci[4] - foci[1] > mindistance * resol]
        foci = foci[foci[4] - foci[1] < maxdistance * resol]

    chroms = list(set(foci[0]))

    n = 0
    for chrom in chroms:
        fociChr = foci[foci[0] == chrom]
        X = list(fociChr[1])
        if filetype == 'bedpe':
            Y = list(fociChr[4])
        else:
            Y = X.copy()
        bmatrix = bcoolFile.bchr(chrom, decoy=False)

        for x, y in zip(X, Y):
            mat, meta = bmatrix.square(x, y, w, oeType)
            pileup += mat[0, :, :]
            n += 1
    pileup /= n
    plt.figure(figsize=(2, 2))
    plt.imshow(pileup, cmap=cmap)
    plt.xticks([])
    plt.yticks([])
    if p2ll:
        plt.title('P2LL=' + "{:.2f}".format(p2LL(pileup)[0]), fontsize=12)
    plt.savefig(savefig, dpi=600)
polaris/version.py
ADDED
@@ -0,0 +1 @@
__version__ = '1.0.0'
setup.py
ADDED
@@ -0,0 +1,56 @@
# My code has references to the following repositories:
# RefHiC: https://github.com/BlanchetteLab/RefHiC (Analysis code)
# Axial Attention: https://github.com/lucidrains/axial-attention (Model architecture)
# Peakachu: https://github.com/tariks/peakachu (Calculate intra reads)
# Thanks a lot for their implementations.

"""
Setup script for Polaris.

A Versatile Framework for Chromatin Loop Annotation in Bulk and Single-cell Hi-C Data.
"""

from setuptools import setup, find_packages

with open("README.md", "r") as readme:
    long_des = readme.read()

setup(
    name='polaris',
    version='1.0.1',
    author="Yusen HOU, Audrey Baguette, Mathieu Blanchette*, Yanlin Zhang*",
    author_email="[email protected]",
    description="A Versatile Framework for Chromatin Loop Annotation in Bulk and Single-cell Hi-C Data",
    long_description=long_des,
    long_description_content_type="text/markdown",
    url="https://github.com/ai4nucleome/Polaris",
    packages=['polaris'],
    include_package_data=True,
    install_requires=[
        'setuptools==75.1.0',
        'appdirs==1.4.4',
        'click==8.0.1',
        'cooler==0.8.11',
        'matplotlib==3.8.0',
        'numpy==1.22.4',
        'pandas==1.3.0',
        'scikit-learn==1.4.2',
        'scipy==1.7.3',
        'torch==2.2.2',
        'timm==0.6.12',
        'tqdm==4.65.0',
    ],
    entry_points={
        'console_scripts': [
            'polaris = polaris.polaris:cli',
        ],
    },
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Intended Audience :: Science/Research",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.9',
)