import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
from scipy import stats
import statsmodels.stats.multitest as multi
from sklearn.decomposition import PCA
import pickle
sns.set_style('white')
%matplotlib inline
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['font.weight'] = 'bold'
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
df_rel_sort_list = pickle.load(open('pickle_files/rel_sort_lod_list.pkl', 'rb'))
df_abs_sort_list = pickle.load(open('pickle_files/abs_sort_lod_list.pkl', 'rb'))
df_pseudo_rel_sort_list = pickle.load(open('pickle_files/pseudo_rel_sort_lod_list.pkl', 'rb'))
df_pseudo_abs_sort_list = pickle.load(open('pickle_files/pseudo_abs_sort_lod_list.pkl', 'rb'))
df_col_names_list = pickle.load(open('pickle_files/col_names_lod_list.pkl', 'rb'))
df_metadata = pickle.load(open('pickle_files/metadata_all.pkl', 'rb'))
# Color palette
pal = sns.cubehelix_palette(20, rot=-.25, light=0.7)
def get_important_features(PC1, PC2, components_, columns):
"""
This function will return the most "important"
features so we can determine which have the most
effect on PCA
"""
num_columns = len(columns)
# Scale the principal components by the max value in
# the transformed set belonging to that component
xvector = components_[0] * max(PC1)
yvector = components_[1] * max(PC2)
# Sort each column by it's length.
important_features = { columns[i] : (xvector[i], yvector[i], np.sqrt(xvector[i]**2 + yvector[i]**2)) for i in range(num_columns) }
return important_features
# Perform log transform on pseudocount values
df_pseudo_abs_log_list = [None]*6
for i in range(6):
df_pseudo_abs_log_list[i] = np.log10(df_pseudo_abs_sort_list[i])
# Calculate top 10 principal components for each of the taxonomic levels
pca_list = [PCA(n_components=10), PCA(n_components=10), PCA(n_components=10), PCA(n_components=10), PCA(n_components=10), PCA(n_components=10)]
df_pca_list = [None]*6
for i in range(6):
df_pca_list[i] = pd.DataFrame(pca_list[i].fit_transform(df_pseudo_abs_log_list[i]), columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5',
'PC6', 'PC7', 'PC8', 'PC9', 'PC10'])
df_pca_list[i]['Sample'] = df_pseudo_abs_log_list[i].index.tolist()
#Merge with metadata
df_pca_list[i] = df_pca_list[i].merge(df_metadata, left_on='Sample', right_index=True)
df_pca_list[0]
250 rows × 101 columns
df_features_list = [None]*6
# Calculate the feature loadings for the first two principal components
for i in range(6):
df_features_list[i] = pd.DataFrame.from_dict(data=get_important_features(np.array(df_pca_list[i]['PC1']),
np.array(df_pca_list[i]['PC2']),
pca_list[i].components_[0:2],
df_pseudo_abs_log_list[i].columns.values),
orient='index', columns=['PC1 Loading', 'PC2 Loading', 'Magnitude'])
df_features_list[4].sort_values('PC2 Loading', ascending=False)[0:10]
mpl.rcParams['ytick.labelsize'] = 16
sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
fig = plt.figure(figsize=(7,6))
ax1 = fig.add_subplot(111)
_df = df_features_list[4].sort_values('PC2 Loading', ascending=True)
clrs = [pal[9] if (x > _df['PC2 Loading'].tolist()[-6]) else pal[0] if (x < _df['PC2 Loading'].tolist()[5]) else 'grey' for x in _df['PC2 Loading'].tolist()]
sns.barplot(ax=ax1, x=_df.index, y='PC2 Loading', data=_df, palette=clrs, edgecolor=clrs)
ax1.set_xticklabels('')
ax1.set_ylabel('PC2 Loading', fontsize=16, fontweight='bold')
ax1.set_xlabel('')
ax1.set_ylim(-4,4)
ax1.set_xticks([])
leg1 = ax1.legend(pd.DataFrame(_df.head(n=5)).merge(df_col_names_list[4], left_index=True, right_index=True)['label'],
loc='upper right', fontsize=14, bbox_to_anchor=(0.6, 0.4))
leg2 = ax1.legend(pd.DataFrame(_df.tail(n=5).iloc[::-1]).merge(df_col_names_list[4], left_index=True, right_index=True)['label'],
loc='upper right', fontsize=14, bbox_to_anchor=(0.9, 1))
ax1.add_artist(leg1)
leg2.legendHandles[0].set_color(pal[9])
leg2.legendHandles[1].set_color(pal[9])
leg2.legendHandles[2].set_color(pal[9])
leg2.legendHandles[3].set_color(pal[9])
leg2.legendHandles[4].set_color(pal[9])
leg1.legendHandles[0].set_color(pal[0])
leg1.legendHandles[1].set_color(pal[0])
leg1.legendHandles[2].set_color(pal[0])
leg1.legendHandles[3].set_color(pal[0])
leg1.legendHandles[4].set_color(pal[0])
#fig.savefig('PC2_feature_genus.png', dpi=200, bbox_inches='tight')
#fig.savefig('PC2_feature_genus.pdf', bbox_inches='tight', transparent=True)
continuous_var = ['Copies/mL', 'bloating_vas', 'excess_gas_vas', 'incomplete_evac_vas',
'constipation_vas', 'diarrhea_vas', 'urgency_vas', 'IFNY', 'GM-CSF', 'IL13', 'IL4',
'IL12P70', 'IL5', 'IL6', 'IL10', 'TNFA', 'IL8', 'IL2', 'IL1B', 'MCP1', 'Age (years)', 'weight (lbs)', 'anaerobic_percent',
'bloating>50th', 'excess_gas>50th', 'incomplete_evac>50th',
'constipation>50th', 'diarrhea>50th', 'urgency>50th', 'Shannon_Diversity', 'Evenness']
df_pca_corr = pd.DataFrame(columns=['PC1_corr', 'PC2_corr', 'Magnitude', 'PC1_scaled', 'PC2_scaled', 'Magnitude_scaled', 'PC1_pvalue', 'PC2_pvalue'])
level=4
for var in continuous_var:
pc1_corr, pc1_pvalue = stats.spearmanr(df_pca_list[level]['PC1'], df_pca_list[level][var], nan_policy='omit')
pc2_corr, pc2_pvalue = stats.spearmanr(df_pca_list[level]['PC2'], df_pca_list[level][var], nan_policy='omit')
# Calculate scaled correlations to overlay on PC plot
if pc1_corr>0:
pc1_scaled = pc1_corr*max(df_pca_list[level]['PC1'])
else:
pc1_scaled = pc1_corr*min(df_pca_list[level]['PC1'])
if pc2_corr>0:
pc2_scaled = pc2_corr*max(df_pca_list[level]['PC2'])
else:
pc2_scaled = -1*pc2_corr*min(df_pca_list[level]['PC2'])
df_pca_corr.loc[var] = [pc1_corr, pc2_corr, np.sqrt(pc1_corr**2+pc2_corr**2), pc1_scaled, pc2_scaled, np.sqrt(pc1_scaled**2+pc2_scaled**2), pc1_pvalue, pc2_pvalue]
# Sort by the magnitude in PC1/2 space
df_pca_corr.sort_values('Magnitude', ascending=False, inplace=True)
df_pca_corr
mpl.rcParams['xtick.labelsize'] = 24
mpl.rcParams['ytick.labelsize'] = 24
mpl.rcParams['font.weight'] = 'bold'
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8,8))
tax_level_list = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
ax1 = fig.add_subplot(111)
_df = df_pca_list[4]
_df['PC1_inverse'] = _df['PC1']*-1
sns.scatterplot(ax=ax1, x='PC1_inverse', y='PC2', hue='SIBO', data=_df, s=250, palette=['lightgrey', 'PeachPuff'], edgecolor='k')
ax1.set_xticklabels([])
ax1.set_yticklabels([])
ax1.set_xticks([])
ax1.set_yticks([])
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles[1:], ['non-SIBO', 'SIBO'], loc='upper left', markerscale=2.5, fontsize=16)
ax1.set_xlabel('PC1 (' + str(round(pca_list[4].explained_variance_ratio_[0]*100,2)) + '%)', fontsize=16, fontweight='bold')
ax1.set_ylabel('PC2 (' + str(round(pca_list[4].explained_variance_ratio_[1]*100,2)) + '%)', fontsize=16, fontweight='bold')
ax1.set_title('', fontsize=20, fontweight='bold')
#fig.savefig('genus_pca_SIBO.png', dpi=200, bbox_inches='tight')
#fig.savefig('genus_pca_SIBO.pdf', bbox_inches='tight', transparent=True)
Text(0.5, 1.0, '')
mpl.rcParams['xtick.labelsize'] = 24
mpl.rcParams['ytick.labelsize'] = 24
mpl.rcParams['font.weight'] = 'bold'
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8,8))
tax_level_list = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
ax1 = fig.add_subplot(111)
_df = df_pca_list[4]
colors = ['lightgrey'] + sns.color_palette('Set3', 12)
sns.scatterplot(ax=ax1, x='PC1_inverse', y='PC2', data=_df, s=250, color=pal[0], edgecolor='k')
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles[1:], ['no SIBO', 'SIBO'], loc='upper left', markerscale=2.5, fontsize=16)
ax1.get_xaxis().set_ticks([])
ax1.get_yaxis().set_ticks([])
ax1.set_xlabel('PC1 (' + str(round(pca_list[4].explained_variance_ratio_[0]*100,2)) + '%)', fontsize=16, fontweight='bold')
ax1.set_ylabel('PC2 (' + str(round(pca_list[4].explained_variance_ratio_[1]*100,2)) + '%)', fontsize=16, fontweight='bold')
ax1.set_title('', fontsize=20, fontweight='bold')
ax1.get_legend().remove()
ax1.arrow(0, 0, df_pca_corr.iloc[0]['PC1_scaled'], df_pca_corr.iloc[0]['PC2_scaled'], color='k', width=0.05, head_width=1, alpha=0.75)
ax1.arrow(0, 0, df_pca_corr.iloc[1]['PC1_scaled'], df_pca_corr.iloc[1]['PC2_scaled'], color='k', width=0.05, head_width=0.6, alpha=0.75)
ax1.text(df_pca_corr.iloc[0]['PC1_scaled']-6.5, df_pca_corr.iloc[0]['PC2_scaled']-3, 'Total\nLoad', color='k', fontsize=20)
ax1.text(df_pca_corr.iloc[1]['PC1_scaled'], df_pca_corr.iloc[1]['PC2_scaled']-3, 'Shannon\nDiversity', color='k', fontsize=20)
#fig.savefig('genus_pca.png', dpi=200, bbox_inches='tight')
#fig.savefig('genus_pca.pdf', bbox_inches='tight', transparent=True)
Text(9.919860605101945, -9.026754837992854, 'Shannon\nDiversity')
continuous_var = ['bloating_vas', 'excess_gas_vas', 'incomplete_evac_vas',
'constipation_vas', 'diarrhea_vas', 'urgency_vas']
df_pca_corr = pd.DataFrame(columns=['PC1_corr', 'PC2_corr', 'Magnitude', 'PC1_scaled', 'PC2_scaled', 'Magnitude_scaled', 'PC1_pvalue', 'PC2_pvalue'])
level=4
for var in continuous_var:
pc1_corr, pc1_pvalue = stats.spearmanr(df_pca_list[level]['PC1'], df_pca_list[level][var], nan_policy='omit')
pc2_corr, pc2_pvalue = stats.spearmanr(df_pca_list[level]['PC2'], df_pca_list[level][var], nan_policy='omit')
if pc1_corr>0:
pc1_scaled = pc1_corr*max(df_pca_list[level]['PC1'])
else:
pc1_scaled = -1*pc1_corr*min(df_pca_list[level]['PC1'])
if pc2_corr>0:
pc2_scaled = pc2_corr*max(df_pca_list[level]['PC2'])
else:
pc2_scaled = -1*pc2_corr*min(df_pca_list[level]['PC2'])
df_pca_corr.loc[var] = [pc1_corr, pc2_corr, np.sqrt(pc1_corr**2+pc2_corr**2), pc1_scaled, pc2_scaled, np.sqrt(pc1_scaled**2+pc2_scaled**2), pc1_pvalue, pc2_pvalue]
df_pca_corr_PC2_symptom = df_pca_corr.sort_values('PC2_pvalue')[['PC2_corr', 'PC2_pvalue']].reset_index()
df_pca_corr_PC2_symptom['color'] = np.where(df_pca_corr_PC2_symptom['PC2_corr']>0.16, 'lightblue', 'lightgrey')
df_pca_corr_PC2_symptom
fig = plt.figure(figsize=(6,5))
ax1 = fig.add_subplot(111)
sns.barplot(x='PC2_corr', y='index', data=df_pca_corr_PC2_symptom, palette=[pal[0]]*3+['lightgrey']*3, edgecolor='k')
ax1.set_ylabel('')
ax1.set_xlabel('Spearman Correlation Coefficient', fontsize=16, fontweight='bold')
ax1.set_yticklabels(['Bloating', 'Incomplete Evac.', 'Constipation', 'Urgency', 'Excess Gas', 'Diarrhea'], fontsize=16, fontweight='bold')
ax1.axvline(0.16, ls='--', color='red')
ax1.set_xlim(0.1, 0.28)
#fig.savefig('symptom_correlation.png', bbox_inches='tight', dpi=200)
#fig.savefig('symptom_correlation.pdf', bbox_inches='tight', transparent=True)
(0.1, 0.28)
continuous_var = ['IL2','IL4','IL5','IL6','IL8','IL10','IL12P70','IL13','IL1B','TNFA','IFNY','GM-CSF', 'MCP1']
df_pca_corr = pd.DataFrame(columns=['PC1_corr', 'PC2_corr', 'Magnitude', 'PC1_scaled', 'PC2_scaled', 'Magnitude_scaled', 'PC1_pvalue', 'PC2_pvalue'])
level=4
for var in continuous_var:
pc1_corr, pc1_pvalue = stats.spearmanr(df_pca_list[level]['PC1'], df_pca_list[level][var], nan_policy='omit')
pc2_corr, pc2_pvalue = stats.spearmanr(df_pca_list[level]['PC2'], df_pca_list[level][var], nan_policy='omit')
if pc1_corr>0:
pc1_scaled = pc1_corr*max(df_pca_list[level]['PC1'])
else:
pc1_scaled = -1*pc1_corr*min(df_pca_list[level]['PC1'])
if pc2_corr>0:
pc2_scaled = pc2_corr*max(df_pca_list[level]['PC2'])
else:
pc2_scaled = -1*pc2_corr*min(df_pca_list[level]['PC2'])
df_pca_corr.loc[var] = [pc1_corr, pc2_corr, np.sqrt(pc1_corr**2+pc2_corr**2), pc1_scaled, pc2_scaled, np.sqrt(pc1_scaled**2+pc2_scaled**2), pc1_pvalue, pc2_pvalue]
df_pca_corr_PC2_cytokine = df_pca_corr.sort_values('PC2_pvalue')[['PC2_corr', 'PC2_pvalue']].reset_index()
df_pca_corr_PC2_cytokine['abs PC2_corr'] = abs(df_pca_corr_PC2_cytokine['PC2_corr'])
df_pca_corr_PC2_cytokine['color'] = np.where(df_pca_corr_PC2_cytokine['PC2_corr']>0.16, 'lightblue', 'lightgrey')
df_pca_corr_PC2_cytokine
mpl.rcParams['ytick.labelsize'] = 20
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
fig = plt.figure(figsize=(6,5))
ax1 = fig.add_subplot(111)
sns.barplot(x='PC2_corr', y='index', data=df_pca_corr_PC2_cytokine, palette=[pal[0]]+['lightgrey']*12, edgecolor='k')
ax1.set_ylabel('')
ax1.set_xlabel('Spearman Correlation Coefficient', fontsize=16, fontweight='bold')
ax1.axvline(0.16, ls='--', color='red')
ax1.axvline(-0.16, ls='--', color='red')
ax1.set_xlim(-0.28, 0.28)
#fig.savefig('cytokine_correlation.png', bbox_inches='tight', dpi=200)
#fig.savefig('cytokine_correlation.pdf', bbox_inches='tight', transparent=True)
(-0.28, 0.28)
pvalues = df_pca_corr_PC2_cytokine['PC2_pvalue'].tolist()+df_pca_corr_PC2_symptom['PC2_pvalue'].tolist()
coeffs = df_pca_corr_PC2_cytokine['PC2_corr'].tolist()+df_pca_corr_PC2_symptom['PC2_corr'].tolist()
qvalues = multi.multipletests(pvalues, method='fdr_bh')[1]
sns.scatterplot(x=coeffs, y=-np.log10(qvalues))
# q-value equal to 5% FDR
plt.axhline(-np.log10(0.05))
plt.axvline(0.16)
plt.axvline(-0.16)
plt.xlim(-0.25,0.25)
(-0.25, 0.25)
df_pca_list[4]['Lacto-noSIBO'] = np.where((df_pca_list[4]['SIBO']==0) & (df_pca_list[4]['Lacto']==1), 1, 0)
mpl.rcParams['xtick.labelsize'] = 24
mpl.rcParams['ytick.labelsize'] = 24
mpl.rcParams['font.weight'] = 'bold'
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8,8))
tax_level_list = ['Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
ax1 = fig.add_subplot(111)
_df = df_pca_list[4]
_df['PC1_inverse'] = _df['PC1']*-1
sns.scatterplot(ax=ax1, x='PC1_inverse', y='PC2', hue='SIBO', style='Lacto-noSIBO', data=_df, s=250, palette=['lightgrey', 'PeachPuff'], edgecolor='k')
ax1.set_xticklabels([])
ax1.set_yticklabels([])
ax1.set_xticks([])
ax1.set_yticks([])
handles, labels = ax1.get_legend_handles_labels()
ax1.legend(handles[1:3]+handles[4:], ['non-SIBO', 'SIBO'], loc='upper left', markerscale=2.5, fontsize=16)
ax1.set_xlabel('PC1 (' + str(round(pca_list[4].explained_variance_ratio_[0]*100,2)) + '%)', fontsize=16, fontweight='bold')
ax1.set_ylabel('PC2 (' + str(round(pca_list[4].explained_variance_ratio_[1]*100,2)) + '%)', fontsize=16, fontweight='bold')
ax1.set_title('', fontsize=20, fontweight='bold')
#fig.savefig('genus_pca_SIBO_Lacto.png', dpi=200, bbox_inches='tight')
#fig.savefig('genus_pca_SIBO_Lacto.pdf', bbox_inches='tight', transparent=True)
Text(0.5, 1.0, '')