分享

单细胞ATAC实战05: 差异可及区域

 生信探索 2023-04-14 发布于云南
import warnings

import numpy as np
import pandas as pd
import scanpy as sc
import snapatac2 as snap
import polars as pl
warnings.filterwarnings(action="ignore")

对每个细胞call peaks

snap.tl.call_peaks这个函数需要anndata对象中.obsm['insertion']和.uns['reference_sequences']两个数据去call peaks,但是atac_annot对象中没有,因此需要加进去。

dataset = snap.read_dataset("pbmc.h5ads")
atac_annot=sc.read("atac_annot.h5ad")
atac_annot.obsm['insertion']=dataset.adatas.obsm['insertion']
pbmc_5k = sc.read("pbmc_5k.h5ad")
atac_annot.uns['reference_sequences']=pbmc_5k.uns['reference_sequences']

  • 删除不需要的变量
del pbmc_5k
dataset.close()
  • callpeak

Use the callpeak command in MACS2 to identify regions enriched with TN5 insertions. The parameters passed to MACS2 are: “-shift -100 -extsize 200 -nomodel -callsummits -nolambda -keep-dup all”

snap.tl.call_peaks(atac_annot, groupby="CellType",out_dir="tmp",key_added="Peaks_CellType")

  • cell by peak count matrix
peak_mat = snap.pp.make_peak_matrix(atac_annot, file="peak_matrix.h5ad",use_rep="Peaks_CellType")
peak_mat.uns['Peaks_CellType']=atac_annot.uns["Peaks_CellType"]
  • 删除不需要的变量
del atac_annot

每类细胞的marker peaks

identify marker regions for each cell type

tl.marker_regions function aggregates signal across cells and utilizes z-scores to identify specifically enriched peaks.

Aggregate values in adata.X in a row-wise fashion. This is used to compute RPKM or RPM values stratified by user-provided groupings.

tl.marker_regions这个函数对每类细胞进行summarize(R中的函数),aggregates(python中的聚合),再对的到的matrix归一化,画热图

marker_peaks = snap.tl.marker_regions(peak_mat, groupby="CellType", pvalue=0.01)
snap.pl.regions(peak_mat, groupby="CellType", peaks=marker_peaks, interactive=False,show=True)

每类细胞的转录因子富集

Identify enriched transcription factor motifs.

genome_fasta参数,必须是绝对路径,没有压缩的fasta格式的基因组文件

motifs = snap.tl.motif_enrichment(
    motifs=snap.datasets.cis_bp(unique=True),
    regions=marker_peaks,
    genome_fasta="/User/victor/DataHub/Genomics/GENCODE/GRCh38.primary_assembly.genome.fa",
)
snap.pl.motif_enrichment(enrichment=motifs,min_log_fc=1, max_fdr=0.0001, height=1200, interactive=False)

Naive B 和Memory B细胞中特异的peaks

group1 = "Naive B"
group2 = "Memory B"
naive_B = np.array(peak_mat.obs['CellType'] == group1)
memory_B = np.array(peak_mat.obs['CellType'] == group2)
peaks_selected = np.logical_or(
    peak_mat.uns["Peaks_CellType"][group1].to_numpy(),
    peak_mat.uns["Peaks_CellType"][group2].to_numpy(),
)

cell_group1和cell_group2这两个参数一定要是np.array否则报错。

diff_peaks = snap.tl.diff_test(
    data=peak_mat,
    cell_group1=naive_B,
    cell_group2=memory_B,
    features=peaks_selected,
    direction="both"
    min_log_fc=0.25,
    min_pct=0.05
)
  • 根据p值过滤peaks,官网使用的是校准的p值,但是adjusted p-value全部大于0.05因此使用p-value
diff_peaks = diff_peaks.filter(pl.col('p-value') < 0.01)
diff_peaks
  • diff_peaks
shape: (490, 4)
┌───────────────────────────┬───────────────────┬──────────┬──────────────────┐
│ feature name ┆ log2(fold_change) ┆ p-value ┆ adjusted p-value │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 │
╞═══════════════════════════╪═══════════════════╪══════════╪══════════════════╡
│ chr1:40152971-40153472 ┆ -1.616357 ┆ 0.000054 ┆ 0.263484 │
│ chr13:110593401-110593902 ┆ 1.445044 ┆ 0.000052 ┆ 0.263484 │
│ chr16:11768789-11769290 ┆ -2.19686 ┆ 0.00004 ┆ 0.263484 │
│ chr11:59135818-59136319 ┆ 0.781031 ┆ 0.000163 ┆ 0.264818 │
│ … ┆ … ┆ … ┆ … │
│ chr9:35619345-35619846 ┆ 0.501729 ┆ 0.00957 ┆ 0.295918 │
│ chr9:94639596-94640097 ┆ 0.418962 ┆ 0.009847 ┆ 0.295918 │
│ chr9:99140650-99141151 ┆ 0.814029 ┆ 0.009244 ┆ 0.295918 │
│ chrX:10121779-10122280 ┆ 0.948257 ┆ 0.009888 ┆ 0.295918 │
└───────────────────────────┴───────────────────┴──────────┴──────────────────┘
snap.pl.regions(
    peak_mat,
    groupby = 'CellType',
    peaks = {
        group1: diff_peaks.filter(pl.col("log2(fold_change)") > 0)['feature name'].to_numpy(),
        group2: diff_peaks.filter(pl.col("log2(fold_change)") < 0)['feature name'].to_numpy(),
    },
    interactive = False,
)

memory_B细胞中特异的peaks

从非memory_B的细胞类型中各挑选100个细胞组成背景(background),通过差异分析找出memory_B相对于背景的特异的peaks

barcodes = np.array(peak_mat.obs_names)
background = []
for i in np.unique(peak_mat.obs['CellType']):
    if i != group2:
        cells = np.random.choice(
            barcodes[peak_mat.obs['CellType'] == i],
            size = 100,
            replace = False,
        )
        background.append(cells)
background = np.concatenate(background)
diff_peaks = snap.tl.diff_test(
    peak_mat,
    cell_group1 = memory_B,
    cell_group2 = background,
    features = peak_mat.uns["Peaks_CellType"][group2].to_numpy(),
    direction = "positive",
)
  • 根据p值过滤peaks,官网使用的是校准的p值,但是adjusted p-value全部大于0.05因此使用p-value
diff_peaks = diff_peaks.filter(pl.col('p-value') < 0.01)
diff_peaks
  • diff_peaks
shape: (55724)
┌──────────────────────────┬───────────────────┬──────────┬──────────────────┐
│ feature name             ┆ log2(fold_change) ┆ p-value  ┆ adjusted p-value │
│ ---                      ┆ ---               ┆ ---      ┆ ---              │
│ str                      ┆ f64               ┆ f64      ┆ f64              │
╞══════════════════════════╪═══════════════════╪══════════╪══════════════════╡
│ chr17:32252971-32253472  ┆ 1.046856          ┆ 0.000089 ┆ 0.138804         │
│ chr19:6459579-6460080    ┆ 0.994567          ┆ 0.000108 ┆ 0.138804         │
│ chr19:40529039-40529540  ┆ 0.48354           ┆ 0.000149 ┆ 0.138804         │
│ chr3:155870796-155871297 ┆ 0.606834          ┆ 0.000148 ┆ 0.138804         │
│ …                        ┆ …                 ┆ …        ┆ …                │
│ chr1:1629312-1629813     ┆ 0.283976          ┆ 0.986251 ┆ 0.986782         │
│ chr2:111898593-111899094 ┆ 0.392095          ┆ 0.987453 ┆ 0.987807         │
│ chr18:13562387-13562888  ┆ 0.252019          ┆ 0.996619 ┆ 0.996798         │
│ chr19:45667913-45668414  ┆ 0.31387           ┆ 0.999187 ┆ 0.999187         │
└──────────────────────────┴───────────────────┴──────────┴──────────────────┘
snap.pl.regions(
    peak_mat,
    groupby = 'CellType',
    peaks = {
        group2: diff_peaks['feature name'].to_numpy(),
    },
    interactive = False,
)
peak_mat.close()
  • 文件目录
├── 10x-Multiome-Pbmc10k-RNA.h5ad
├── ATAC.01.pp.ipynb
├── ATAC.02.annot.ipynb
├── ATAC.03.DAR.ipynb
├── atac_annot.h5ad
├── data
│ ├── pbmc_10k
│ │ ├── fragments.tsv.gz
│ │ └── web_summary.html
│ └── pbmc_5k
│ ├── fragments.tsv.gz
│ └── web_summary.html
├── pbmc.h5ads
├── pbmc_10k.h5ad
├── pbmc_5k.h5ad
├── peak_matrix.h5ad
├── quantify.sh.log
└── tmp
├── CD14 Mono.NarrowPeak.gz
├── CD14 Mono_insertion.bed.gz
├── CD4 Naive.NarrowPeak.gz
├── CD4 Naive_insertion.bed.gz
├── Intermediate B.NarrowPeak.gz
├── Intermediate B_insertion.bed.gz
├── MAIT.NarrowPeak.gz
├── MAIT_insertion.bed.gz
├── Memory B.NarrowPeak.gz
├── Memory B_insertion.bed.gz
├── NK.NarrowPeak.gz
├── NK_insertion.bed.gz
├── Naive B.NarrowPeak.gz
├── Naive B_insertion.bed.gz
├── Treg.NarrowPeak.gz
├── Treg_insertion.bed.gz
├── cDC.NarrowPeak.gz
├── cDC_insertion.bed.gz
├── pDC.NarrowPeak.gz
└── pDC_insertion.bed.gz

    转藏 分享 献花(0

    0条评论

    发表

    请遵守用户 评论公约

    类似文章 更多