%pylab inline
import anndata
import pandas as pd
from scipy import optimize
from scipy.special import gammaln
from scipy.special import psi
from scipy.special import factorial
from scipy.optimize import fmin_l_bfgs_b as optim
from tqdm import tqdm
zheng = anndata.read('../Data/zheng_gemcode_control.h5ad')
macos = anndata.read('../Data/macosko_dropseq_control_GSM1629193.h5ad')
klein = anndata.read('../Data/klein_indrops_control_GSM1599501.h5ad')
svens = anndata.read('../Data/svensson_chromium_control.h5ad')
hek29 = anndata.read('../Data/10x_v3_5k_HEK293T.h5ad')
nih3t = anndata.read('../Data/10x_v3_5k_NIH3T3.h5ad')
pbmcs = anndata.read('../Data/10x_v3_1k_PBMC.h5ad')
pedav = anndata.read('../Data/fluidigm_GSE66053.h5ad')
sven1 = svens[svens.obs.query('sample == "20311"').index]
sven2 = svens[svens.obs.query('sample == "20312"').index]
zheng.uns['name'] = 'Zheng et al 2017'
macos.uns['name'] = 'Macosko et al 2015'
klein.uns['name'] = 'Klein et al 2015'
sven1.uns['name'] = 'Svensson et al 2017 (1)'
sven2.uns['name'] = 'Svensson et al 2017 (2)'
hek29.uns['name'] = '10x v3 HEK293T'
nih3t.uns['name'] = '10x v3 NIH3T3'
pbmcs.uns['name'] = '10x v3 PBMC'
pedav.uns['name'] = 'Padovan-Merhar et al 2015 (SMARTer)'
Populating the interactive namespace from numpy and matplotlib
Trying to set attribute `.uns` of view, making a copy. Trying to set attribute `.uns` of view, making a copy.
# Remember to rerun with all data! Just redoing Macosko now
datasets = [pbmcs, hek29, nih3t, klein, macos, zheng, sven1, sven2, pedav]
# X is a numpy array representing the data
# initial params is a numpy array representing the initial values of
# size and prob parameters
def fit_nbinom(X, initial_params=None):
''' This code is adapted from https://github.com/gokceneraslan/fit_nbinom
'''
infinitesimal = np.finfo(np.float).eps
def log_likelihood(params, *args):
r, p = params
X = args[0]
N = X.size
# MLE estimate based on the formula on Wikipedia:
# http://en.wikipedia.org/wiki/Negative_binomial_distribution#Maximum_likelihood_estimation
result = np.sum(gammaln(X + r)) \
- np.sum(np.log(factorial(X))) \
- N * (gammaln(r)) \
+ N * r * np.log(p) \
+ np.sum(X * np.log(1 - (p if p < 1 else 1 - infinitesimal)))
return -result
if initial_params is None:
# reasonable initial values (from fitdistr function in R)
m = np.mean(X)
v = np.var(X)
size = (m ** 2) / (v-m) if v > m else 10
# convert mu/size parameterization to prob/size
p0 = size / ((size + m) if size + m != 0 else 1)
r0 = size
initial_params = np.array([r0, p0])
bounds = [(infinitesimal, None), (infinitesimal, 1)]
optimres = optim(log_likelihood,
x0=initial_params,
args=(X,),
approx_grad=1,
bounds=bounds)
params = optimres[0]
return {'size': params[0], 'prob': params[1]}
First determine the empirical statistics for each gene.
Then fit NB parameters to the each gene by ML.
def fit_per_gene_stats(adata):
adata.var['empirical_mean'] = np.nan
adata.var['empirical_variance'] = np.nan
adata.var['empirical_zero_fraction'] = np.nan
adata.var['ml_mean'] = np.nan
adata.var['genewise_dispersion'] = np.nan
for g in tqdm(adata.var.index):
y = adata[:, g].X
adata.var.loc[g, 'empirical_mean'] = y.mean(0)
adata.var.loc[g, 'empirical_variance'] = y.var(0)
adata.var.loc[g, 'empirical_zero_fraction'] = 1 - (y > 0).sum(0) / y.shape[0]
result = fit_nbinom(y)
mu = ((1. - result['prob']) * result['size']) / (result['prob'])
adata.var.loc[g, 'ml_mean'] = mu
adata.var.loc[g, 'genewise_dispersion'] = 1. / result['size']
for adata in datasets:
fit_per_gene_stats(adata)
Now fit global parameters to each data set
def var_fun(mu, phi):
return mu + phi * mu ** 2
def prob_zero_fun(mu, phi):
if phi == .0:
return np.exp(-mu)
phi_1 = 1. / phi
return (phi_1 / (mu + phi_1)) ** phi_1
def prob_zero_fun_vec(mu, phi):
phi_1 = 1. / phi
return (phi_1 / (mu + phi_1)) ** phi_1
for adata in datasets:
phi_hat, _ = optimize.curve_fit(var_fun, adata.var['empirical_mean'], adata.var['empirical_variance'])
adata.uns['global_dispersion'] = phi_hat
Calculate predicted values to compare the empirical values with
for adata in datasets:
adata.var['global_zero_fraction'] = prob_zero_fun_vec(adata.var['empirical_mean'],
adata.uns['global_dispersion'])
adata.var['genewise_zero_fraction'] = prob_zero_fun_vec(adata.var['empirical_mean'],
adata.var['genewise_dispersion'])
Finally save the results for making plots
for adata in datasets:
adata.write('../Data/output/' + adata.uns['name'] + '.h5ad')