from glob import glob
import anndata
import pandas as pd
from scipy import io
from scipy import sparse
from tqdm import tqdm
import anndata
# Important: anndata behavior changed in later versions, this notebook requires anndata 0.6.18
# for setting the layers matrix through a view
anndata.__version__
'0.6.18'
# create a dictionary with the paths for the subsampling folder where snakemake outputs are for each dataset
datasets = ['heart10k', 'pbmc10k', 'neurons10k']
subsampling_paths = {
'heart10k' :'/home/projects/seqdepth/subsampling/heart_10k_v3/genecounts/',
'pbmc10k' :'/home/projects/seqdepth/subsampling/pbmc_10k_v3/genecounts/',
'neurons10k' :'/home/projects/seqdepth/subsampling/neuron_10k_v3/genecounts/',
}
# these are annotation files downloaded from the 10x website, they are used to parse the barcodes and total number of cells in each dataset
cluster_files = {
'heart10k' :'./heart10k_clusters.csv',
'pbmc10k' :'./pbmc10k_clusters.csv',
'neurons10k' :'./neurons10k_clusters.csv',
}
# check that output folders eist
for ds in datasets:
print('\n files for dataset', ds, ':')
!ls -v {subsampling_paths[ds]}
files for dataset heart10k : genecounts_subsampled_0 genecounts_subsampled_6400000 genecounts_subsampled_100000 genecounts_subsampled_9050967 genecounts_subsampled_141421 genecounts_subsampled_12800000 genecounts_subsampled_200000 genecounts_subsampled_18101934 genecounts_subsampled_282843 genecounts_subsampled_25600000 genecounts_subsampled_400000 genecounts_subsampled_36203867 genecounts_subsampled_565685 genecounts_subsampled_51200000 genecounts_subsampled_800000 genecounts_subsampled_72407734 genecounts_subsampled_1131371 genecounts_subsampled_102400000 genecounts_subsampled_1600000 genecounts_subsampled_144815469 genecounts_subsampled_2262742 genecounts_subsampled_204800000 genecounts_subsampled_3200000 genecounts_subsampled_289630938 genecounts_subsampled_4525483 files for dataset pbmc10k : genecounts_subsampled_0 genecounts_subsampled_9050967 genecounts_subsampled_100000 genecounts_subsampled_12800000 genecounts_subsampled_141421 genecounts_subsampled_18101934 genecounts_subsampled_200000 genecounts_subsampled_25600000 genecounts_subsampled_282843 genecounts_subsampled_36203867 genecounts_subsampled_400000 genecounts_subsampled_51200000 genecounts_subsampled_565685 genecounts_subsampled_72407734 genecounts_subsampled_800000 genecounts_subsampled_102400000 genecounts_subsampled_1131371 genecounts_subsampled_144815469 genecounts_subsampled_1600000 genecounts_subsampled_204800000 genecounts_subsampled_2262742 genecounts_subsampled_289630938 genecounts_subsampled_3200000 genecounts_subsampled_409600000 genecounts_subsampled_4525483 genecounts_subsampled_579261875 genecounts_subsampled_6400000 files for dataset neurons10k : genecounts_subsampled_0 genecounts_subsampled_6400000 genecounts_subsampled_100000 genecounts_subsampled_9050967 genecounts_subsampled_141421 genecounts_subsampled_12800000 genecounts_subsampled_200000 genecounts_subsampled_18101934 genecounts_subsampled_282843 genecounts_subsampled_25600000 genecounts_subsampled_400000 genecounts_subsampled_36203867 genecounts_subsampled_565685 genecounts_subsampled_51200000 genecounts_subsampled_800000 genecounts_subsampled_72407734 genecounts_subsampled_1131371 genecounts_subsampled_102400000 genecounts_subsampled_1600000 genecounts_subsampled_144815469 genecounts_subsampled_2262742 genecounts_subsampled_204800000 genecounts_subsampled_3200000 genecounts_subsampled_289630938 genecounts_subsampled_4525483
# check one folder
!ls -lah {subsampling_paths['pbmc10k']}/genecounts_subsampled_100000/
total 5.0M drwxrwxr-x. 2 munfred munfred 4.0K Aug 6 03:43 . drwxrwxr-x. 29 munfred munfred 4.0K Aug 6 03:41 .. -rw-rw-r--. 1 munfred munfred 223K Aug 6 03:43 genecounts.barcodes.txt -rw-rw-r--. 1 munfred munfred 640K Aug 6 03:43 genecounts.genes.txt -rw-rw-r--. 1 munfred munfred 557K Aug 6 03:43 genecounts.mtx -rw-rw-r--. 1 munfred munfred 1.7M Aug 6 03:43 output.correct.sort.bus -rw-rw-r--. 1 munfred munfred 2.0M Aug 6 03:43 output.correct.sort.bus.txt
for ds in datasets:
print(' Now processing dataset: ', ds)
annotation = pd.read_csv(cluster_files[ds], index_col=0)
annotation.index = annotation.index.str.split('-').str.get(0)
display(annotation.head(2))
genes = pd.read_csv(subsampling_paths[ds] + 'genecounts_subsampled_0/genecounts.genes.txt', names=['gene_id']).set_index('gene_id')
display(genes.head(2))
base_ad = anndata.AnnData(obs=annotation, var=genes)
for mtx_file in tqdm(glob( subsampling_paths[ds] + '/genecounts_subsampled_*/genecounts.mtx')):
ss_depth = mtx_file.split('ed_')[-1].split('/')[0]
X = io.mmread(mtx_file).tocsr()
barcodes = pd.read_csv(
mtx_file.replace('.mtx', '.barcodes.txt'),
names=['barcode'],
index_col=0
)
curr_adata = anndata.AnnData(X=X, obs=barcodes, var=genes)
idx = curr_adata.obs.index.intersection(base_ad.obs.index)
base_ad.layers[ss_depth] = sparse.csr_matrix(base_ad.shape)
base_ad[idx].layers[ss_depth] = curr_adata[idx].X
# This is a check to ensure the anndata layers are correct
for depth in base_ad.layers.keys():
print('UMIs: \t', int(base_ad.layers[depth].sum()),'\t Reads: \t', depth )
print(base_ad)
# base_ad.write( ds + '_subamples.h5ad')
break
Now processing dataset: heart10k
100%|██████████| 25/25 [10:53<00:00, 25.47s/it]
UMIs: 19373016 Reads: 51200000 UMIs: 1341886 Reads: 3200000 UMIs: 950966 Reads: 2262742 UMIs: 72563226 Reads: 289630938 UMIs: 59635 Reads: 141421 UMIs: 72679923 Reads: 0 UMIs: 1891433 Reads: 4525483 UMIs: 3747318 Reads: 9050967 UMIs: 337292 Reads: 800000 UMIs: 5256131 Reads: 12800000 UMIs: 673164 Reads: 1600000 UMIs: 45811720 Reads: 144815469 UMIs: 238550 Reads: 565685 UMIs: 84409 Reads: 200000 UMIs: 2664470 Reads: 6400000 UMIs: 476734 Reads: 1131371 UMIs: 35021220 Reads: 102400000 UMIs: 42202 Reads: 100000 UMIs: 7346236 Reads: 18101934 UMIs: 58484104 Reads: 204800000 UMIs: 168862 Reads: 400000 UMIs: 10223782 Reads: 25600000 UMIs: 14134634 Reads: 36203867 UMIs: 26246409 Reads: 72407734 UMIs: 119301 Reads: 282843 AnnData object with n_obs × n_vars = 7713 × 36047 obs: 'Cluster' layers: '51200000', '3200000', '2262742', '289630938', '141421', '0', '4525483', '9050967', '800000', '12800000', '1600000', '144815469', '565685', '200000', '6400000', '1131371', '102400000', '100000', '18101934', '204800000', '400000', '25600000', '36203867', '72407734', '282843'
ls -lah
total 6.4G drwxrwxr-x. 9 munfred munfred 4.0K Aug 31 16:13 ./ drwxrwxr-x. 12 munfred munfred 4.0K Aug 31 12:33 ../ drwxrwxr-x. 5 munfred munfred 4.0K Aug 25 20:47 benchmarks/ -rw-rw-r--. 1 munfred munfred 2.4K Aug 25 20:03 cell-depth-tradeoff-metadata.tsv -rw-rw-r--. 1 munfred munfred 7.8K Aug 25 20:21 cell-depth-tradeoff-subsampling.py -rw-rw-r--. 1 munfred munfred 13K Aug 24 21:03 create_H5AD-Copy1.ipynb -rw-rw-r--. 1 munfred munfred 12K Aug 31 16:13 create_H5AD-Copy2.ipynb -rw-rw-r--. 1 munfred munfred 24K Aug 30 17:41 create_H5AD.ipynb -rw-rw-r--. 1 munfred munfred 160K Aug 25 17:35 heart10k_clusters.csv -rw-rw-r--. 1 munfred munfred 1.6G Aug 29 20:44 heart10k_subsamples.h5ad drwxrwxr-x. 5 munfred munfred 4.0K Aug 25 21:15 heart_10k_v3/ drwxrwxr-x. 2 munfred munfred 4.0K Aug 31 16:01 .ipynb_checkpoints/ drwxrwxr-x. 5 munfred munfred 4.0K Aug 25 21:15 neuron_10k_v3/ -rw-rw-r--. 1 munfred munfred 246K Aug 25 17:35 neurons10k_clusters.csv -rw-rw-r--. 1 munfred munfred 2.3G Aug 29 21:14 neurons10k_subsamples.h5ad -rw-rw-r--. 1 munfred munfred 244K Aug 25 17:35 pbmc10k_clusters.csv -rw-rw-r--. 1 munfred munfred 2.6G Aug 29 20:59 pbmc10k_subsamples.h5ad drwxrwxr-x. 5 munfred munfred 4.0K Aug 25 21:15 pbmc_10k_v3/ -rw-rw-r--. 1 munfred munfred 75K Aug 30 18:59 run_scVI.ipynb -rw-rw-r--. 1 munfred munfred 5.7K Aug 30 18:57 run_scvi.py -rw-rw-r--. 1 munfred munfred 23K Aug 30 18:35 scVI❤️heart10k❤️-Copy1.ipynb drwxrwxr-x. 2 munfred munfred 20K Aug 31 16:10 scvi_output_neurons10k/ drwxrwxr-x. 9 munfred munfred 4.0K Aug 25 17:26 .snakemake/