In [1]:

from glob import glob
import anndata
import pandas as pd
from scipy import io
from scipy import sparse
from tqdm import tqdm
import anndata
# Important: anndata behavior changed in later versions, this notebook requires anndata 0.6.18 
#  for setting the layers matrix through a view 

In [2]:

anndata.__version__

Out[2]:

'0.6.18'

In [3]:

# create a dictionary with the paths for the subsampling folder where snakemake outputs are for each dataset

datasets = ['heart10k', 'pbmc10k', 'neurons10k']

subsampling_paths = {
'heart10k' :'/home/projects/seqdepth/subsampling/heart_10k_v3/genecounts/',    
'pbmc10k' :'/home/projects/seqdepth/subsampling/pbmc_10k_v3/genecounts/',    
'neurons10k' :'/home/projects/seqdepth/subsampling/neuron_10k_v3/genecounts/',    
}

# these are annotation files downloaded from the 10x website, they are used to parse the barcodes and total number of cells in each dataset
cluster_files = {
'heart10k' :'./heart10k_clusters.csv',    
'pbmc10k' :'./pbmc10k_clusters.csv',    
'neurons10k' :'./neurons10k_clusters.csv',     
    
}

In [4]:

# check that output folders eist
for ds in datasets:
    print('\n files for dataset', ds, ':')
    !ls -v {subsampling_paths[ds]}

 files for dataset heart10k :
genecounts_subsampled_0        genecounts_subsampled_6400000
genecounts_subsampled_100000   genecounts_subsampled_9050967
genecounts_subsampled_141421   genecounts_subsampled_12800000
genecounts_subsampled_200000   genecounts_subsampled_18101934
genecounts_subsampled_282843   genecounts_subsampled_25600000
genecounts_subsampled_400000   genecounts_subsampled_36203867
genecounts_subsampled_565685   genecounts_subsampled_51200000
genecounts_subsampled_800000   genecounts_subsampled_72407734
genecounts_subsampled_1131371  genecounts_subsampled_102400000
genecounts_subsampled_1600000  genecounts_subsampled_144815469
genecounts_subsampled_2262742  genecounts_subsampled_204800000
genecounts_subsampled_3200000  genecounts_subsampled_289630938
genecounts_subsampled_4525483

 files for dataset pbmc10k :
genecounts_subsampled_0        genecounts_subsampled_9050967
genecounts_subsampled_100000   genecounts_subsampled_12800000
genecounts_subsampled_141421   genecounts_subsampled_18101934
genecounts_subsampled_200000   genecounts_subsampled_25600000
genecounts_subsampled_282843   genecounts_subsampled_36203867
genecounts_subsampled_400000   genecounts_subsampled_51200000
genecounts_subsampled_565685   genecounts_subsampled_72407734
genecounts_subsampled_800000   genecounts_subsampled_102400000
genecounts_subsampled_1131371  genecounts_subsampled_144815469
genecounts_subsampled_1600000  genecounts_subsampled_204800000
genecounts_subsampled_2262742  genecounts_subsampled_289630938
genecounts_subsampled_3200000  genecounts_subsampled_409600000
genecounts_subsampled_4525483  genecounts_subsampled_579261875
genecounts_subsampled_6400000

 files for dataset neurons10k :
genecounts_subsampled_0        genecounts_subsampled_6400000
genecounts_subsampled_100000   genecounts_subsampled_9050967
genecounts_subsampled_141421   genecounts_subsampled_12800000
genecounts_subsampled_200000   genecounts_subsampled_18101934
genecounts_subsampled_282843   genecounts_subsampled_25600000
genecounts_subsampled_400000   genecounts_subsampled_36203867
genecounts_subsampled_565685   genecounts_subsampled_51200000
genecounts_subsampled_800000   genecounts_subsampled_72407734
genecounts_subsampled_1131371  genecounts_subsampled_102400000
genecounts_subsampled_1600000  genecounts_subsampled_144815469
genecounts_subsampled_2262742  genecounts_subsampled_204800000
genecounts_subsampled_3200000  genecounts_subsampled_289630938
genecounts_subsampled_4525483

In [5]:

# check one folder
!ls -lah {subsampling_paths['pbmc10k']}/genecounts_subsampled_100000/

total 5.0M
drwxrwxr-x.  2 munfred munfred 4.0K Aug  6 03:43 .
drwxrwxr-x. 29 munfred munfred 4.0K Aug  6 03:41 ..
-rw-rw-r--.  1 munfred munfred 223K Aug  6 03:43 genecounts.barcodes.txt
-rw-rw-r--.  1 munfred munfred 640K Aug  6 03:43 genecounts.genes.txt
-rw-rw-r--.  1 munfred munfred 557K Aug  6 03:43 genecounts.mtx
-rw-rw-r--.  1 munfred munfred 1.7M Aug  6 03:43 output.correct.sort.bus
-rw-rw-r--.  1 munfred munfred 2.0M Aug  6 03:43 output.correct.sort.bus.txt

In [8]:

for ds in datasets:
    print(' Now processing dataset: ', ds)
    annotation = pd.read_csv(cluster_files[ds], index_col=0)
    annotation.index = annotation.index.str.split('-').str.get(0)
    display(annotation.head(2))


    genes = pd.read_csv(subsampling_paths[ds] + 'genecounts_subsampled_0/genecounts.genes.txt', names=['gene_id']).set_index('gene_id')
    display(genes.head(2))

    base_ad = anndata.AnnData(obs=annotation, var=genes)

    for mtx_file in tqdm(glob( subsampling_paths[ds] + '/genecounts_subsampled_*/genecounts.mtx')):
        ss_depth = mtx_file.split('ed_')[-1].split('/')[0]
        X = io.mmread(mtx_file).tocsr()

        barcodes = pd.read_csv(
            mtx_file.replace('.mtx', '.barcodes.txt'),
            names=['barcode'],
            index_col=0
        )
        curr_adata = anndata.AnnData(X=X, obs=barcodes, var=genes)
        idx = curr_adata.obs.index.intersection(base_ad.obs.index)

        base_ad.layers[ss_depth] = sparse.csr_matrix(base_ad.shape)
        base_ad[idx].layers[ss_depth] = curr_adata[idx].X

    # This is a check to ensure the anndata layers are correct
    for depth in base_ad.layers.keys():
        print('UMIs: \t', int(base_ad.layers[depth].sum()),'\t Reads: \t', depth )
    print(base_ad)


#     base_ad.write( ds + '_subamples.h5ad')
    
    break

 Now processing dataset:  heart10k

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } Cluster Barcode AAACCCAAGCGAGTCA 2 AAACCCAAGGTCATTC 1

.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } gene_id ENSMUSG00000051951.5 ENSMUSG00000102851.1

100%|██████████| 25/25 [10:53<00:00, 25.47s/it]

UMIs: 	 19373016 	 Reads: 	 51200000
UMIs: 	 1341886 	 Reads: 	 3200000
UMIs: 	 950966 	 Reads: 	 2262742
UMIs: 	 72563226 	 Reads: 	 289630938
UMIs: 	 59635 	 Reads: 	 141421
UMIs: 	 72679923 	 Reads: 	 0
UMIs: 	 1891433 	 Reads: 	 4525483
UMIs: 	 3747318 	 Reads: 	 9050967
UMIs: 	 337292 	 Reads: 	 800000
UMIs: 	 5256131 	 Reads: 	 12800000
UMIs: 	 673164 	 Reads: 	 1600000
UMIs: 	 45811720 	 Reads: 	 144815469
UMIs: 	 238550 	 Reads: 	 565685
UMIs: 	 84409 	 Reads: 	 200000
UMIs: 	 2664470 	 Reads: 	 6400000
UMIs: 	 476734 	 Reads: 	 1131371
UMIs: 	 35021220 	 Reads: 	 102400000
UMIs: 	 42202 	 Reads: 	 100000
UMIs: 	 7346236 	 Reads: 	 18101934
UMIs: 	 58484104 	 Reads: 	 204800000
UMIs: 	 168862 	 Reads: 	 400000
UMIs: 	 10223782 	 Reads: 	 25600000
UMIs: 	 14134634 	 Reads: 	 36203867
UMIs: 	 26246409 	 Reads: 	 72407734
UMIs: 	 119301 	 Reads: 	 282843
AnnData object with n_obs × n_vars = 7713 × 36047 
    obs: 'Cluster'
    layers: '51200000', '3200000', '2262742', '289630938', '141421', '0', '4525483', '9050967', '800000', '12800000', '1600000', '144815469', '565685', '200000', '6400000', '1131371', '102400000', '100000', '18101934', '204800000', '400000', '25600000', '36203867', '72407734', '282843'

In [7]:

ls -lah

total 6.4G
drwxrwxr-x.  9 munfred munfred 4.0K Aug 31 16:13 ./
drwxrwxr-x. 12 munfred munfred 4.0K Aug 31 12:33 ../
drwxrwxr-x.  5 munfred munfred 4.0K Aug 25 20:47 benchmarks/
-rw-rw-r--.  1 munfred munfred 2.4K Aug 25 20:03 cell-depth-tradeoff-metadata.tsv
-rw-rw-r--.  1 munfred munfred 7.8K Aug 25 20:21 cell-depth-tradeoff-subsampling.py
-rw-rw-r--.  1 munfred munfred  13K Aug 24 21:03 create_H5AD-Copy1.ipynb
-rw-rw-r--.  1 munfred munfred  12K Aug 31 16:13 create_H5AD-Copy2.ipynb
-rw-rw-r--.  1 munfred munfred  24K Aug 30 17:41 create_H5AD.ipynb
-rw-rw-r--.  1 munfred munfred 160K Aug 25 17:35 heart10k_clusters.csv
-rw-rw-r--.  1 munfred munfred 1.6G Aug 29 20:44 heart10k_subsamples.h5ad
drwxrwxr-x.  5 munfred munfred 4.0K Aug 25 21:15 heart_10k_v3/
drwxrwxr-x.  2 munfred munfred 4.0K Aug 31 16:01 .ipynb_checkpoints/
drwxrwxr-x.  5 munfred munfred 4.0K Aug 25 21:15 neuron_10k_v3/
-rw-rw-r--.  1 munfred munfred 246K Aug 25 17:35 neurons10k_clusters.csv
-rw-rw-r--.  1 munfred munfred 2.3G Aug 29 21:14 neurons10k_subsamples.h5ad
-rw-rw-r--.  1 munfred munfred 244K Aug 25 17:35 pbmc10k_clusters.csv
-rw-rw-r--.  1 munfred munfred 2.6G Aug 29 20:59 pbmc10k_subsamples.h5ad
drwxrwxr-x.  5 munfred munfred 4.0K Aug 25 21:15 pbmc_10k_v3/
-rw-rw-r--.  1 munfred munfred  75K Aug 30 18:59 run_scVI.ipynb
-rw-rw-r--.  1 munfred munfred 5.7K Aug 30 18:57 run_scvi.py
-rw-rw-r--.  1 munfred munfred  23K Aug 30 18:35 scVI❤️heart10k❤️-Copy1.ipynb
drwxrwxr-x.  2 munfred munfred  20K Aug 31 16:10 scvi_output_neurons10k/
drwxrwxr-x.  9 munfred munfred 4.0K Aug 25 17:26 .snakemake/

In [ ]: