import warningsimport numpy as npimport pandas as pdimport scanpy as scimport snapatac2 as snapimport polars as pl warnings.filterwarnings(action="ignore" )
对每个细胞call peaks snap.tl.call_peaks这个函数需要anndata对象中.obsm['insertion']和.uns['reference_sequences']两个数据去call peaks,但是atac_annot对象中没有,因此需要加进去。
dataset = snap.read_dataset("pbmc.h5ads" ) atac_annot=sc.read("atac_annot.h5ad" ) atac_annot.obsm['insertion' ]=dataset.adatas.obsm['insertion' ] pbmc_5k = sc.read("pbmc_5k.h5ad" ) atac_annot.uns['reference_sequences' ]=pbmc_5k.uns['reference_sequences' ]
del pbmc_5k dataset.close()
Use the callpeak
command in MACS2 to identify regions enriched with TN5 insertions. The parameters passed to MACS2 are: “-shift -100 -extsize 200 -nomodel -callsummits -nolambda -keep-dup all”
snap.tl.call_peaks(atac_annot, groupby="CellType" ,out_dir="tmp" ,key_added="Peaks_CellType" )
cell by peak count matrix peak_mat = snap.pp.make_peak_matrix(atac_annot, file="peak_matrix.h5ad" ,use_rep="Peaks_CellType" ) peak_mat.uns['Peaks_CellType' ]=atac_annot.uns["Peaks_CellType" ]
del atac_annot
每类细胞的marker peaks identify marker regions for each cell type
tl.marker_regions
function aggregates signal across cells and utilizes z-scores to identify specifically enriched peaks.
Aggregate values in adata.X in a row-wise fashion. This is used to compute RPKM or RPM values stratified by user-provided groupings.
tl.marker_regions
这个函数对每类细胞进行summarize(R中的函数),aggregates(python中的聚合),再对的到的matrix归一化,画热图
marker_peaks = snap.tl.marker_regions(peak_mat, groupby="CellType" , pvalue=0.01 ) snap.pl.regions(peak_mat, groupby="CellType" , peaks=marker_peaks, interactive=False ,show=True )
每类细胞的转录因子富集 Identify enriched transcription factor motifs.
genome_fasta参数,必须是绝对路径,没有压缩的fasta格式的基因组文件
motifs = snap.tl.motif_enrichment( motifs=snap.datasets.cis_bp(unique=True ), regions=marker_peaks, genome_fasta="/User/victor/DataHub/Genomics/GENCODE/GRCh38.primary_assembly.genome.fa" , ) snap.pl.motif_enrichment(enrichment=motifs,min_log_fc=1 , max_fdr=0.0001 , height=1200 , interactive=False )
Naive B 和Memory B细胞中特异的peaks group1 = "Naive B" group2 = "Memory B" naive_B = np.array(peak_mat.obs['CellType' ] == group1) memory_B = np.array(peak_mat.obs['CellType' ] == group2) peaks_selected = np.logical_or( peak_mat.uns["Peaks_CellType" ][group1].to_numpy(), peak_mat.uns["Peaks_CellType" ][group2].to_numpy(), )
cell_group1和cell_group2这两个参数一定要是np.array否则报错。
diff_peaks = snap.tl.diff_test( data=peak_mat, cell_group1=naive_B, cell_group2=memory_B, features=peaks_selected, direction="both" min_log_fc=0.25 , min_pct=0.05 )
根据p值过滤peaks,官网使用的是校准的p值,但是adjusted p-value全部大于0.05因此使用p-value diff_peaks = diff_peaks.filter(pl.col('p-value' ) < 0.01 ) diff_peaks
shape: (490, 4) ┌───────────────────────────┬───────────────────┬──────────┬──────────────────┐ │ feature name ┆ log2(fold_change) ┆ p-value ┆ adjusted p-value │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞═══════════════════════════╪═══════════════════╪══════════╪══════════════════╡ │ chr1:40152971-40153472 ┆ -1.616357 ┆ 0.000054 ┆ 0.263484 │ │ chr13:110593401-110593902 ┆ 1.445044 ┆ 0.000052 ┆ 0.263484 │ │ chr16:11768789-11769290 ┆ -2.19686 ┆ 0.00004 ┆ 0.263484 │ │ chr11:59135818-59136319 ┆ 0.781031 ┆ 0.000163 ┆ 0.264818 │ │ … ┆ … ┆ … ┆ … │ │ chr9:35619345-35619846 ┆ 0.501729 ┆ 0.00957 ┆ 0.295918 │ │ chr9:94639596-94640097 ┆ 0.418962 ┆ 0.009847 ┆ 0.295918 │ │ chr9:99140650-99141151 ┆ 0.814029 ┆ 0.009244 ┆ 0.295918 │ │ chrX:10121779-10122280 ┆ 0.948257 ┆ 0.009888 ┆ 0.295918 │ └───────────────────────────┴───────────────────┴──────────┴──────────────────┘
snap.pl.regions( peak_mat, groupby = 'CellType' , peaks = { group1: diff_peaks.filter(pl.col("log2(fold_change)" ) > 0 )['feature name' ].to_numpy(), group2: diff_peaks.filter(pl.col("log2(fold_change)" ) < 0 )['feature name' ].to_numpy(), }, interactive = False , )
memory_B细胞中特异的peaks 从非memory_B的细胞类型中各挑选100个细胞组成背景(background),通过差异分析找出memory_B相对于背景的特异的peaks
barcodes = np.array(peak_mat.obs_names) background = []for i in np.unique(peak_mat.obs['CellType' ]): if i != group2: cells = np.random.choice( barcodes[peak_mat.obs['CellType' ] == i], size = 100 , replace = False , ) background.append(cells) background = np.concatenate(background)
diff_peaks = snap.tl.diff_test( peak_mat, cell_group1 = memory_B, cell_group2 = background, features = peak_mat.uns["Peaks_CellType" ][group2].to_numpy(), direction = "positive" , )
根据p值过滤peaks,官网使用的是校准的p值,但是adjusted p-value全部大于0.05因此使用p-value diff_peaks = diff_peaks.filter(pl.col('p-value' ) < 0.01 ) diff_peaks
shape: (5572 , 4 ) ┌──────────────────────────┬───────────────────┬──────────┬──────────────────┐ │ feature name ┆ log2(fold_change) ┆ p-value ┆ adjusted p-value │ │ --- ┆ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 ┆ f64 │ ╞══════════════════════════╪═══════════════════╪══════════╪══════════════════╡ │ chr17:32252971 -32253472 ┆ 1.046856 ┆ 0.000089 ┆ 0.138804 │ │ chr19:6459579 -6460080 ┆ 0.994567 ┆ 0.000108 ┆ 0.138804 │ │ chr19:40529039 -40529540 ┆ 0.48354 ┆ 0.000149 ┆ 0.138804 │ │ chr3:155870796 -155871297 ┆ 0.606834 ┆ 0.000148 ┆ 0.138804 │ │ … ┆ … ┆ … ┆ … │ │ chr1:1629312 -1629813 ┆ 0.283976 ┆ 0.986251 ┆ 0.986782 │ │ chr2:111898593 -111899094 ┆ 0.392095 ┆ 0.987453 ┆ 0.987807 │ │ chr18:13562387 -13562888 ┆ 0.252019 ┆ 0.996619 ┆ 0.996798 │ │ chr19:45667913 -45668414 ┆ 0.31387 ┆ 0.999187 ┆ 0.999187 │ └──────────────────────────┴───────────────────┴──────────┴──────────────────┘
snap.pl.regions( peak_mat, groupby = 'CellType' , peaks = { group2: diff_peaks['feature name' ].to_numpy(), }, interactive = False , )
peak_mat.close()
├── 10x-Multiome-Pbmc10k-RNA.h5ad ├── ATAC.01.pp.ipynb ├── ATAC.02.annot.ipynb ├── ATAC.03.DAR.ipynb ├── atac_annot.h5ad ├── data │ ├── pbmc_10k │ │ ├── fragments.tsv.gz │ │ └── web_summary.html │ └── pbmc_5k │ ├── fragments.tsv.gz │ └── web_summary.html ├── pbmc.h5ads ├── pbmc_10k.h5ad ├── pbmc_5k.h5ad ├── peak_matrix.h5ad ├── quantify.sh.log └── tmp ├── CD14 Mono.NarrowPeak.gz ├── CD14 Mono_insertion.bed.gz ├── CD4 Naive.NarrowPeak.gz ├── CD4 Naive_insertion.bed.gz ├── Intermediate B.NarrowPeak.gz ├── Intermediate B_insertion.bed.gz ├── MAIT.NarrowPeak.gz ├── MAIT_insertion.bed.gz ├── Memory B.NarrowPeak.gz ├── Memory B_insertion.bed.gz ├── NK.NarrowPeak.gz ├── NK_insertion.bed.gz ├── Naive B.NarrowPeak.gz ├── Naive B_insertion.bed.gz ├── Treg.NarrowPeak.gz ├── Treg_insertion.bed.gz ├── cDC.NarrowPeak.gz ├── cDC_insertion.bed.gz ├── pDC.NarrowPeak.gz └── pDC_insertion.bed.gz