【原】单细胞ATAC实战05: 差异可及区域

生信探索 2023-04-14 发布于云南

展开全文

import warnings

import numpy as np
import pandas as pd
import scanpy as sc
import snapatac2 as snap
import polars as pl
warnings.filterwarnings(action="ignore")

对每个细胞call peaks

snap.tl.call_peaks这个函数需要anndata对象中.obsm['insertion']和.uns['reference_sequences']两个数据去call peaks，但是atac_annot对象中没有，因此需要加进去。

dataset = snap.read_dataset("pbmc.h5ads")
atac_annot=sc.read("atac_annot.h5ad")
atac_annot.obsm['insertion']=dataset.adatas.obsm['insertion']
pbmc_5k = sc.read("pbmc_5k.h5ad")
atac_annot.uns['reference_sequences']=pbmc_5k.uns['reference_sequences']

删除不需要的变量

del pbmc_5k
dataset.close()

callpeak

Use the callpeak command in MACS2 to identify regions enriched with TN5 insertions. The parameters passed to MACS2 are: “-shift -100 -extsize 200 -nomodel -callsummits -nolambda -keep-dup all”

snap.tl.call_peaks(atac_annot, groupby="CellType",out_dir="tmp",key_added="Peaks_CellType")

cell by peak count matrix

peak_mat = snap.pp.make_peak_matrix(atac_annot, file="peak_matrix.h5ad",use_rep="Peaks_CellType")
peak_mat.uns['Peaks_CellType']=atac_annot.uns["Peaks_CellType"]

删除不需要的变量

del atac_annot

每类细胞的marker peaks

identify marker regions for each cell type

tl.marker_regions function aggregates signal across cells and utilizes z-scores to identify specifically enriched peaks.

Aggregate values in adata.X in a row-wise fashion. This is used to compute RPKM or RPM values stratified by user-provided groupings.

tl.marker_regions这个函数对每类细胞进行summarize（R中的函数），aggregates（python中的聚合），再对的到的matrix归一化，画热图

marker_peaks = snap.tl.marker_regions(peak_mat, groupby="CellType", pvalue=0.01)
snap.pl.regions(peak_mat, groupby="CellType", peaks=marker_peaks, interactive=False,show=True)

每类细胞的转录因子富集

Identify enriched transcription factor motifs.

genome_fasta参数，必须是绝对路径，没有压缩的fasta格式的基因组文件

motifs = snap.tl.motif_enrichment(
    motifs=snap.datasets.cis_bp(unique=True),
    regions=marker_peaks,
    genome_fasta="/User/victor/DataHub/Genomics/GENCODE/GRCh38.primary_assembly.genome.fa",
)
snap.pl.motif_enrichment(enrichment=motifs,min_log_fc=1, max_fdr=0.0001, height=1200, interactive=False)

Naive B 和Memory B细胞中特异的peaks

group1 = "Naive B"
group2 = "Memory B"
naive_B = np.array(peak_mat.obs['CellType'] == group1)
memory_B = np.array(peak_mat.obs['CellType'] == group2)
peaks_selected = np.logical_or(
    peak_mat.uns["Peaks_CellType"][group1].to_numpy(),
    peak_mat.uns["Peaks_CellType"][group2].to_numpy(),
)

cell_group1和cell_group2这两个参数一定要是np.array否则报错。

diff_peaks = snap.tl.diff_test(
    data=peak_mat,
    cell_group1=naive_B,
    cell_group2=memory_B,
    features=peaks_selected,
    direction="both"
    min_log_fc=0.25,
    min_pct=0.05
)

根据p值过滤peaks，官网使用的是校准的p值，但是adjusted p-value全部大于0.05因此使用p-value

diff_peaks = diff_peaks.filter(pl.col('p-value') < 0.01)
diff_peaks

diff_peaks

shape: (490, 4)
┌───────────────────────────┬───────────────────┬──────────┬──────────────────┐
│ feature name              ┆ log2(fold_change) ┆ p-value  ┆ adjusted p-value │
│ ---                       ┆ ---               ┆ ---      ┆ ---              │
│ str                       ┆ f64               ┆ f64      ┆ f64              │
╞═══════════════════════════╪═══════════════════╪══════════╪══════════════════╡
│ chr1:40152971-40153472    ┆ -1.616357         ┆ 0.000054 ┆ 0.263484         │
│ chr13:110593401-110593902 ┆ 1.445044          ┆ 0.000052 ┆ 0.263484         │
│ chr16:11768789-11769290   ┆ -2.19686          ┆ 0.00004  ┆ 0.263484         │
│ chr11:59135818-59136319   ┆ 0.781031          ┆ 0.000163 ┆ 0.264818         │
│ …                         ┆ …                 ┆ …        ┆ …                │
│ chr9:35619345-35619846    ┆ 0.501729          ┆ 0.00957  ┆ 0.295918         │
│ chr9:94639596-94640097    ┆ 0.418962          ┆ 0.009847 ┆ 0.295918         │
│ chr9:99140650-99141151    ┆ 0.814029          ┆ 0.009244 ┆ 0.295918         │
│ chrX:10121779-10122280    ┆ 0.948257          ┆ 0.009888 ┆ 0.295918         │
└───────────────────────────┴───────────────────┴──────────┴──────────────────┘

snap.pl.regions(
    peak_mat,
    groupby = 'CellType',
    peaks = {
        group1: diff_peaks.filter(pl.col("log2(fold_change)") > 0)['feature name'].to_numpy(),
        group2: diff_peaks.filter(pl.col("log2(fold_change)") < 0)['feature name'].to_numpy(),
    },
    interactive = False,
)

memory_B细胞中特异的peaks

从非memory_B的细胞类型中各挑选100个细胞组成背景（background），通过差异分析找出memory_B相对于背景的特异的peaks

barcodes = np.array(peak_mat.obs_names)
background = []
for i in np.unique(peak_mat.obs['CellType']):
    if i != group2:
        cells = np.random.choice(
            barcodes[peak_mat.obs['CellType'] == i],
            size = 100,
            replace = False,
        )
        background.append(cells)
background = np.concatenate(background)

diff_peaks = snap.tl.diff_test(
    peak_mat,
    cell_group1 = memory_B,
    cell_group2 = background,
    features = peak_mat.uns["Peaks_CellType"][group2].to_numpy(),
    direction = "positive",
)

根据p值过滤peaks，官网使用的是校准的p值，但是adjusted p-value全部大于0.05因此使用p-value

diff_peaks = diff_peaks.filter(pl.col('p-value') < 0.01)
diff_peaks

diff_peaks

shape: (5572, 4)
┌──────────────────────────┬───────────────────┬──────────┬──────────────────┐
│ feature name             ┆ log2(fold_change) ┆ p-value  ┆ adjusted p-value │
│ ---                      ┆ ---               ┆ ---      ┆ ---              │
│ str                      ┆ f64               ┆ f64      ┆ f64              │
╞══════════════════════════╪═══════════════════╪══════════╪══════════════════╡
│ chr17:32252971-32253472  ┆ 1.046856          ┆ 0.000089 ┆ 0.138804         │
│ chr19:6459579-6460080    ┆ 0.994567          ┆ 0.000108 ┆ 0.138804         │
│ chr19:40529039-40529540  ┆ 0.48354           ┆ 0.000149 ┆ 0.138804         │
│ chr3:155870796-155871297 ┆ 0.606834          ┆ 0.000148 ┆ 0.138804         │
│ …                        ┆ …                 ┆ …        ┆ …                │
│ chr1:1629312-1629813     ┆ 0.283976          ┆ 0.986251 ┆ 0.986782         │
│ chr2:111898593-111899094 ┆ 0.392095          ┆ 0.987453 ┆ 0.987807         │
│ chr18:13562387-13562888  ┆ 0.252019          ┆ 0.996619 ┆ 0.996798         │
│ chr19:45667913-45668414  ┆ 0.31387           ┆ 0.999187 ┆ 0.999187         │
└──────────────────────────┴───────────────────┴──────────┴──────────────────┘

snap.pl.regions(
    peak_mat,
    groupby = 'CellType',
    peaks = {
        group2: diff_peaks['feature name'].to_numpy(),
    },
    interactive = False,
)

peak_mat.close()

├── 10x-Multiome-Pbmc10k-RNA.h5ad
├── ATAC.01.pp.ipynb
├── ATAC.02.annot.ipynb
├── ATAC.03.DAR.ipynb
├── atac_annot.h5ad
├── data
│   ├── pbmc_10k
│   │   ├── fragments.tsv.gz
│   │   └── web_summary.html
│   └── pbmc_5k
│       ├── fragments.tsv.gz
│       └── web_summary.html
├── pbmc.h5ads
├── pbmc_10k.h5ad
├── pbmc_5k.h5ad
├── peak_matrix.h5ad
├── quantify.sh.log
└── tmp
    ├── CD14 Mono.NarrowPeak.gz
    ├── CD14 Mono_insertion.bed.gz
    ├── CD4 Naive.NarrowPeak.gz
    ├── CD4 Naive_insertion.bed.gz
    ├── Intermediate B.NarrowPeak.gz
    ├── Intermediate B_insertion.bed.gz
    ├── MAIT.NarrowPeak.gz
    ├── MAIT_insertion.bed.gz
    ├── Memory B.NarrowPeak.gz
    ├── Memory B_insertion.bed.gz
    ├── NK.NarrowPeak.gz
    ├── NK_insertion.bed.gz
    ├── Naive B.NarrowPeak.gz
    ├── Naive B_insertion.bed.gz
    ├── Treg.NarrowPeak.gz
    ├── Treg_insertion.bed.gz
    ├── cDC.NarrowPeak.gz
    ├── cDC_insertion.bed.gz
    ├── pDC.NarrowPeak.gz
    └── pDC_insertion.bed.gz