import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats
import statsmodels.stats.multitest as multi
import pickle
sns.set_style('white')
%matplotlib inline
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16
mpl.rcParams['font.weight'] = 'bold'
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
C:\Users\Jake\Anaconda3\lib\site-packages\statsmodels\tools\_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead. import pandas.util.testing as tm
df_rel_sort_list = pickle.load(open('pickle_files/rel_sort_lod_list_paired.pkl', 'rb'))
df_abs_sort_list = pickle.load(open('pickle_files/abs_sort_lod_list_paired.pkl', 'rb'))
df_pseudo_rel_sort_list = pickle.load(open('pickle_files/pseudo_rel_sort_lod_list_paired.pkl', 'rb'))
df_pseudo_abs_sort_list = pickle.load(open('pickle_files/pseudo_abs_sort_lod_list_paired.pkl', 'rb'))
df_col_names_list = pickle.load(open('pickle_files/col_names_lod_list_paired.pkl', 'rb'))
seq_metadata = pickle.load(open('pickle_files/seq_paired_metadata_paired.pkl', 'rb'))
total_load = pickle.load(open('pickle_files/total_load_paired.pkl', 'rb'))
metadata = seq_metadata.merge(total_load[['mod_ID', 'Copies/mL', 'Log Copies/mL']], left_index=True, right_on='mod_ID').set_index('mod_ID')
for i in range(6):
df_abs_sort_list[i] = df_abs_sort_list[i].merge(metadata, left_index=True, right_index=True)
df_rel_sort_list[i] = df_rel_sort_list[i].merge(metadata, left_index=True, right_index=True)
df_pseudo_abs_sort_list[i] = df_pseudo_abs_sort_list[i].merge(metadata, left_index=True, right_index=True)
df_pseudo_rel_sort_list[i] = df_pseudo_rel_sort_list[i].merge(metadata, left_index=True, right_index=True)
comp = 'Body_Site'
cond1 = 'Duodenum'
cond2 = 'Saliva'
comp_exp = df_abs_sort_list[0][comp].value_counts().to_dict()
comp_exp
{'Saliva': 21, 'Duodenum': 21}
df_stats_list = [None]*6
# Calculate both kruskal statistic and fisher-exact test for each comparison with benjamini-hochberg correction
for i in range(6):
df_stats_list[i] = pd.DataFrame(columns=['Comp', 'ID', 'Group', 'Kruskal p-value', 'Rel_cond1_average', 'Rel_cond2_average', 'Fisher p-value', 'cond1_nonzero_N', 'cond2_nonzero_N', 'total_nonzero', 'log2 Rel FC'])
count=0
for col in df_col_names_list[i].index.tolist():
if (col not in df_rel_sort_list[i].columns):
continue
rel_cond1 = df_rel_sort_list[i][(df_rel_sort_list[i][comp] == cond1)][col].tolist()
rel_cond2 = df_rel_sort_list[i][(df_rel_sort_list[i][comp] == cond2)][col].tolist()
pseudo_rel_cond1 = df_pseudo_rel_sort_list[i][(df_pseudo_rel_sort_list[i][comp] == cond1)][col].tolist()
pseudo_rel_cond2 = df_pseudo_rel_sort_list[i][(df_pseudo_rel_sort_list[i][comp] == cond2)][col].tolist()
log2_rel_fc = np.log2(np.mean(pseudo_rel_cond1)/np.mean(pseudo_rel_cond2))
nonzero_cond1 = np.count_nonzero(rel_cond1)
nonzero_cond2 = np.count_nonzero(rel_cond2)
comp_cond1 = comp_exp[cond1]
comp_cond2 = comp_exp[cond2]
if nonzero_cond1+nonzero_cond2 > 5:
kruskal_stat, kruskal_p_value = stats.kruskal(rel_cond1, rel_cond2)
fisher_stat, fisher_p_value = stats.fisher_exact([[nonzero_cond1, comp_cond1],[nonzero_cond2, comp_cond2]])
df_stats_list[i].loc[count] = [str(cond1) + ' v ' + str(cond2), col, df_col_names_list[i].loc[col]['label'], kruskal_p_value, np.mean(pseudo_rel_cond1), np.mean(pseudo_rel_cond2), fisher_p_value, nonzero_cond1, nonzero_cond2, nonzero_cond1+nonzero_cond2, log2_rel_fc]
count+=1
df_stats_list[i]['fdr Kruskal p-value'] = multi.multipletests(df_stats_list[i]['Kruskal p-value'].tolist(), method='fdr_bh')[1]
df_stats_list[i]['fdr Fisher p-value'] = multi.multipletests(df_stats_list[i]['Fisher p-value'].tolist(), method='fdr_bh')[1]
df_stats_list[i]['-log10 Kruskal fdr'] = -np.log10(df_stats_list[i]['fdr Kruskal p-value'])
df_stats_list[i]['-log10 Fisher fdr'] = -np.log10(df_stats_list[i]['fdr Fisher p-value'])
df_stats_list[i]['abs log2 Rel FC'] = abs(df_stats_list[i]['log2 Rel FC'])
pal = sns.cubehelix_palette(20, rot=-.25, light=0.7)
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
fig = plt.figure(figsize=(6,6))
ax1 = fig.add_subplot(111)
sns.scatterplot(ax=ax1, x='log2 Rel FC', y='-log10 Kruskal fdr', data=df_stats_list[5][df_stats_list[5]['total_nonzero']>5], color='lightgrey', s=150, edgecolor='k', alpha=0.7)
ax1.set_ylim(-0.1, 5)
ax1.set_xlim(-15,15)
ax1.axhline(y=-np.log10(0.1), linestyle='--', color='#DF2935')
ax1.set_ylabel('-log10 Kruskal fdr', fontsize=16, fontweight='bold')
ax1.set_xlabel('log2 (Duodenum / Saliva)', fontsize=16, fontweight='bold')
ax1.set_title('', fontsize=16, fontweight='bold')
sig_taxa = df_stats_list[5][(df_stats_list[5]['-log10 Kruskal fdr']>-np.log10(0.1))]
count=0
for index, row in sig_taxa.iterrows():
if row['log2 Rel FC']>10:
ax1.text(row['log2 Rel FC']-11.6, row['-log10 Kruskal fdr']-0.2, 'Streptococcus\nundefined sp.', color='k', fontsize=14)
else:
if row['-log10 Kruskal fdr']>3:
ax1.text(row['log2 Rel FC']-10, row['-log10 Kruskal fdr']-0.05, 'Campylobacter', color='k', fontsize=14)
#fig.savefig('duo_sal_volcano.png', dpi=200, bbox_inches='tight')
#fig.savefig('duo_sal_volcano.pdf', bbox_inches='tight', transparent=True)
df_stats_list[5].sort_values('fdr Kruskal p-value').head(n=10)
# Count number of saliva samples each taxon is in
_df = pd.DataFrame(df_abs_sort_list[5][df_abs_sort_list[5]['Body_Site']=='Saliva'][df_abs_sort_list[5].columns[:-4]].astype(bool).sum().sort_values(ascending=False)).rename(columns={0:'Saliva Prevalence (N=21 total)'})
# Count number of duodenum samples each taxon is in
_df['Duodenum Prevalence (N=21 total)'] = df_abs_sort_list[5][df_abs_sort_list[5]['Body_Site']=='Duodenum'][_df.index.tolist()].astype(bool).sum().tolist()
#Calculate average relative abundance for each taxon in saliva and duodenum
saliva_rel_abundance = []
duodenum_rel_abundance = []
for index,row in _df.iterrows():
saliva_rel_abundance.append(np.mean(df_rel_sort_list[5][df_rel_sort_list[5]['Body_Site']=='Saliva'][index].tolist()))
duodenum_rel_abundance.append(np.mean(df_rel_sort_list[5][df_rel_sort_list[5]['Body_Site']=='Duodenum'][index].tolist()))
_df['Saliva Rel. Abundance (%)'] = saliva_rel_abundance
_df['Duodenum Rel. Abundance (%)'] = duodenum_rel_abundance
# Merge taxa identifiers with taxonomy labels
_df = _df.merge(df_col_names_list[5][['taxonomy', 'label']], left_index=True, right_index=True)
_df = _df[_df['Saliva Prevalence (N=21 total)']+_df['Duodenum Prevalence (N=21 total)']>5]
# Create column counting the difference in number of samples each taxon is present in between saliva and duodenum
_df['Difference'] = abs(_df['Saliva Prevalence (N=21 total)']-_df['Duodenum Prevalence (N=21 total)'])
_df.sort_values('Difference', ascending=False, inplace=True)
_df.to_excel('STable_3.xlsx')