import numpy as np
import pandas as pd
import pickle
Samples were subsampled to 45,386 reads. Samples with fewer than this number of reads after DADA2 processing were removed.
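The subsampling itself was done upstream of this notebook; a minimal sketch of rarefying a single sample's counts to a fixed depth (the rarefy helper below is illustrative, not part of the pipeline):
# Sketch: rarefy one sample's taxon counts to a fixed depth without replacement
def rarefy(counts, depth, seed=0):
    rng = np.random.default_rng(seed)
    reads = np.repeat(np.arange(len(counts)), counts)    # expand to one entry per read
    keep = rng.choice(reads, size=depth, replace=False)  # draw `depth` reads
    return np.bincount(keep, minlength=len(counts))
Samples whose total count is below the target depth cannot be rarefied, which is why they are dropped.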
df_seq_orig_species = pd.read_csv('data_files/species_counts_duodenum_45386.csv').set_index('index')
# All samples were rarefied to the same depth, so take the first row's total
read_depth = df_seq_orig_species.sum(axis=1).iloc[0]
read_depth
45386.0
num_metadata_cols = 2
df_seq_orig_species = df_seq_orig_species.drop(['387_Duo', '388_Duo', '390_Duo', '391_Duo', '392_Duo', '394_Duo', '409_Duo', '410_Duo', '418_Duo', '423_Duo', '425_Duo', '433_Duo'])
df_seq_orig_species.rename({'417_Duo':'417', '434_Duo':'434', '437_Duo':'437', '438_Duo':'438', '441_Duo':'441', '446_Duo':'446', '447_Duo':'447', '448_Duo':'448', '449_Duo':'449', '451_Duo':'451'}, axis='index', inplace=True)
df_seq_orig_species.sort_index(inplace=True)
df_seq_orig_species.index = df_seq_orig_species.index.astype(int)
df_seq_orig_species
254 rows × 1068 columns
# This taxon appeared only in the second batch of sequenced duodenum samples, likely indicating it is a
# contaminant. It is removed because it interferes with a plot comparing saliva to duodenum samples.
df_seq_orig_species.drop(['D_0__Bacteria;D_1__Firmicutes;D_2__Bacilli;D_3__Bacillales;D_4__Paenibacillaceae;D_5__Paenibacillus;D_6__Paenibacillus darwinianus'], axis=1, inplace=True)
df_total_load = pd.read_excel('dPCR data/dPCR_total_loads_duodenum.xlsx', index_col=0)
df_weights = pd.read_csv('data_files/sample weights.csv')
# Merge the two dataframes together based on the sample ID
df_total_load = df_total_load.merge(df_weights, left_on='Sample', right_on='Study ID')
# Flag whether a numeric sample weight was recorded (True) or the field is missing/non-numeric (False)
df_total_load['Weight (True/False)'] = df_total_load.apply(lambda x: x['Weight (mL)'][0].isdigit(), axis=1)
# Determine the average sample weight for all samples
mean_weight = df_total_load[df_total_load['Weight (True/False)']==True]['Weight (mL)'].astype(float).mean()
# Create new column where any sample with a missing weight is set to the average weight of all samples
df_total_load['Corrected Weight (mL)'] = df_total_load.apply(lambda x: float(x['Weight (mL)']) if x['Weight (True/False)'] else mean_weight, axis=1)
# Print out the samples without weights for reference (N=11)
df_total_load[~df_total_load['Weight (True/False)']]
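An equivalent route (a sketch, not the pipeline's code) is pd.to_numeric with errors='coerce', which marks non-numeric weights as NaN and lets fillna substitute the mean in one step:
# Sketch of an equivalent route: coerce non-numeric weights to NaN, impute the mean
weights = pd.to_numeric(df_total_load['Weight (mL)'], errors='coerce')
weight_check = weights.fillna(weights.mean())  # matches 'Corrected Weight (mL)'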
## Set the lower dPCR threshold. The 95% CI is ±1x, and the dPCR blanks are <1 cp/uL with a +3-std-dev bound of ~1 cp/uL.
## This means we have ~2x resolution at 2 cp/uL, so samples at or below 2 cp/uL are dropped.
df_total_load = df_total_load[(df_total_load['Concentration']>2)]
## Calculate Copies/mL
df_total_load['Copies/mL'] = df_total_load['Corrected Concentration']/df_total_load['Corrected Weight (mL)']
df_total_load['Log Copies/mL'] = np.log10(df_total_load['Copies/mL'])
df_total_load[['Sample', 'Copies/mL']].to_excel('duodenum_total_loads.xlsx')
# These samples were diluted before input to the library prep reaction because inhibitors
# prevented amplification of the undiluted sample.
# Note: sample 423 originally appeared twice in this dict (100x and 50x); in a dict literal the
# later entry wins, so the effective dilution is 50x and only that entry is kept here.
diluted_samples = {437:10, 438:10, 441:10, 446:10, 447:10, 448:10, 449:10, 451:10,
                   395:100, 198:50, 423:50, 427:50, 373:10, 321:10, 169:10, 375:10, 353:10,
                   242:10, 411:10, 312:10, 433:2, 366:2}
# Create column to account for the fact that some samples were diluted before input into library prep reaction
df_total_load['Seq_Dilution'] = df_total_load['Sample'].map(diluted_samples).fillna(1)
# uL added to the amplification rxn
seq_volume = 3.5
copy_input_threshold = 3  # mean template copies giving ~95% chance of >=1 copy in the reaction
df_total_load['Copies in Amp Rxn'] = df_total_load['Concentration']*df_total_load['Dilution']/df_total_load['Seq_Dilution']*seq_volume
df_total_load['Rel. Abundance LOD (%)'] = copy_input_threshold/df_total_load['Copies in Amp Rxn']*100
df_total_load['Abs. Abundance LOD'] = df_total_load['Rel. Abundance LOD (%)']*df_total_load['Copies/mL']/100
df_total_load
256 rows × 20 columns
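As a worked example with hypothetical values (not a real sample): 100 cp/uL measured after a 10x dPCR dilution, sequenced undiluted, gives:
# Worked example (hypothetical values): concentration 100 cp/uL, 10x dPCR dilution, no seq dilution
copies_in_rxn = 100 * 10 / 1 * seq_volume             # 3500 copies into the amp reaction
rel_lod = copy_input_threshold / copies_in_rxn * 100  # ~0.086% relative-abundance LOD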
total_load_dict = dict(zip(df_total_load['Sample'], df_total_load['Copies/mL']))
len(total_load_dict)
256
# Empirical sequencing LLOQ (% relative abundance) as a power-law function of read depth
# (~0.018% at 45,386 reads)
seq_lloq = 7.115*(read_depth**(-0.556))
# Floor each sample's dPCR-derived LOD at the sequencing LLOQ
df_total_load['Rel. Abundance LOD (%) Corrected'] = df_total_load['Rel. Abundance LOD (%)'].where(df_total_load['Rel. Abundance LOD (%)']>seq_lloq, seq_lloq)
# Convert each sample's %-LOD into a read-count threshold at the fixed read depth
lod_dict = dict(zip(df_total_load['Sample'], df_total_load['Rel. Abundance LOD (%) Corrected']*read_depth/100))
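The .where call above is just an elementwise maximum of the two detection limits; an equivalent sketch:
# Equivalent formulation: elementwise maximum of the dPCR LOD and the sequencing LLOQ
lod_check = np.maximum(df_total_load['Rel. Abundance LOD (%)'], seq_lloq)
# lod_check matches 'Rel. Abundance LOD (%) Corrected'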
# Keep only samples with a total-load measurement and drop the metadata columns
df_seq_samples = df_seq_orig_species.loc[df_seq_orig_species.index.isin(total_load_dict.keys()),
                                         df_seq_orig_species.columns[:-num_metadata_cols]]
# Only num_metadata_cols-1 columns are kept here because the description column is not
# needed: it is already stored as the index
seq_metadata = df_seq_orig_species[df_seq_orig_species.columns[-(num_metadata_cols-1):]]
df_seq_samples
250 rows × 1065 columns
This is defined as the load at which there should be a ~95% chance of at least one copy being loaded into the amplification reaction (3-copy average): for Poisson-distributed input with mean 3, P(≥1 copy) = 1 − e⁻³ ≈ 0.95.
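A quick check of the Poisson claim behind the 3-copy threshold:
# Probability of at least one copy at a mean of 3 copies
1 - np.exp(-3)  # ~0.9502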
df_species_lod_filter = pd.DataFrame()
for col in df_seq_samples.columns:
    # Zero out any count at or below the sample's read-count LOD
    df_species_lod_filter[col] = df_seq_samples.apply(
        lambda x: x[col] if x[col] > lod_dict[x.name] else 0, axis=1)
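The same filter can be expressed vectorized; a sketch equivalent to the loop above, and considerably faster on a 250 × 1065 table:
# Vectorized equivalent: compare all counts against each sample's LOD at once
lod_per_sample = pd.Series(lod_dict).reindex(df_seq_samples.index)
df_filter_check = df_seq_samples.where(df_seq_samples.gt(lod_per_sample, axis=0), 0)
# df_filter_check matches df_species_lod_filter from the loop above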
# Remove rows (samples) that have zero total counts after filtering
df_species_lod_filter = df_species_lod_filter[df_species_lod_filter.sum(axis=1)>0]
# Remove columns (taxa) that have zero counts in every sample after filtering
df_species_lod_filter = df_species_lod_filter.loc[:, (df_species_lod_filter != 0).any(axis=0)]
df_species_lod_filter
250 rows × 546 columns
# Confirm which samples, if any, were dropped by the LOD filter (expect none)
orig_indexes = df_seq_samples.index.tolist()
filter_indexes = df_species_lod_filter.index.tolist()
lost = list(set(orig_indexes) - set(filter_indexes))
df_total_load[df_total_load['Sample'].isin(lost)]
0 rows × 21 columns
def collapse_taxonomy(_df, level):
    """Sum columns that share the same taxonomy prefix at the requested level."""
    # Map the requested taxonomy level to how many trailing ranks to trim off
    level_index = {'Genus': -1, 'Family': -2, 'Order': -3, 'Class': -4, 'Phylum': -5}
    if level not in level_index:
        raise ValueError('Could not interpret taxonomy level. Please use (Phylum, Class, Order, Family, Genus)')
    index = level_index[level]
    # Iterate through columns, summing counts whenever the collapsed name already exists
    collapsed_dict = {}
    for col in _df:
        new_col = ";".join(col.split(';')[:index])
        if new_col in collapsed_dict:
            collapsed_dict[new_col] += np.array(_df[col])
        else:
            collapsed_dict[new_col] = np.array(_df[col])
    df_collapsed = pd.DataFrame.from_dict(collapsed_dict).set_index(_df.index)
    return df_collapsed
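The same collapse can be written with a pandas groupby over the transposed table; a sketch of an equivalent one-liner (n_trailing = 1 for Genus, ..., 5 for Phylum), not the pipeline's code:
def collapse_taxonomy_groupby(_df, n_trailing):
    # Group columns by their taxonomy string truncated by n_trailing ranks, then sum
    return _df.T.groupby(lambda c: ';'.join(c.split(';')[:-n_trailing])).sum().T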
df_lod_list = [None]*6
df_lod_list[0] = collapse_taxonomy(df_species_lod_filter, 'Phylum')
df_lod_list[1] = collapse_taxonomy(df_species_lod_filter, 'Class')
df_lod_list[2] = collapse_taxonomy(df_species_lod_filter, 'Order')
df_lod_list[3] = collapse_taxonomy(df_species_lod_filter, 'Family')
df_lod_list[4] = collapse_taxonomy(df_species_lod_filter, 'Genus')
df_lod_list[5] = df_species_lod_filter
df_rel_lod_list = [None]*6
df_abs_lod_list = [None]*6
df_pseudo_rel_lod_list = [None]*6
df_pseudo_abs_lod_list = [None]*6
for index, df in enumerate(df_lod_list):
    # Relative abundance in percent
    df_rel_lod_list[index] = df.div(read_depth, axis=0).multiply(100)
    # Absolute abundance: scale each sample's relative abundances by its total load
    df_abs_lod_list[index] = df_rel_lod_list[index].apply(lambda x: x*total_load_dict[x.name], axis=1).div(100)
    # Add a pseudocount of 0.1 reads (expressed in percent)
    df_pseudo_rel_lod_list[index] = df_rel_lod_list[index]+(0.1/read_depth)*100
    df_pseudo_abs_lod_list[index] = df_pseudo_rel_lod_list[index].apply(lambda x: x*total_load_dict[x.name], axis=1).div(100)
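The rel-to-abs conversion above is just a per-row scaling; a vectorized check using the tables defined above:
# Vectorized equivalent of the apply-based conversion
loads = pd.Series(total_load_dict).reindex(df_rel_lod_list[5].index)
abs_check = df_rel_lod_list[5].mul(loads, axis=0).div(100)
np.allclose(abs_check, df_abs_lod_list[5])  # expected True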
df_pseudo_abs_lod_list[5]
250 rows × 546 columns
Columns are renamed to unique ASV identifiers. This overcomes a downstream issue that arises when multiple columns share the same name.
df_col_names_lod_list = [None]*6
for index, df in enumerate(df_rel_lod_list):
    num_cols = len(df.columns)
    col_names = ['ASV' + str(x) for x in range(num_cols)]
    # Keep a lookup table from the new ASV IDs back to the full taxonomy strings
    df_col_names_lod_list[index] = pd.DataFrame(index=col_names, data={'taxonomy':df.columns.tolist()})
    df_rel_lod_list[index].columns = col_names
    df_abs_lod_list[index].columns = col_names
    df_pseudo_rel_lod_list[index].columns = col_names
    df_pseudo_abs_lod_list[index].columns = col_names
df_col_names_lod_list[0]
# Placeholder names that carry no taxonomic information and should not be used as labels
exclusion_list = ['', 'uncultured bacterium', 'metagenome', 'uncultured',
                  'gut metagenome', 'uncultured organism', 'unidentified',
                  'uncultured Bacteroidales bacterium', 'uncultured Mollicutes bacterium',
                  'uncultured archaeon']
ranks = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
for i in range(6):
    # Split the taxonomy string into one column per rank (level i keeps i+2 ranks)
    level_ranks = ranks[:i+2]
    df_col_names_lod_list[i][level_ranks] = df_col_names_lod_list[i]['taxonomy'].str.split(';', expand=True)
    # Build a short display label: walk from the deepest rank upward to the first
    # informative name and tag it with the rank's initial, e.g. 'Prevotella(g)'
    labels_list = []
    for _, row in df_col_names_lod_list[i].iterrows():
        for depth in range(1, len(level_ranks) + 1):
            name = row.iloc[-depth][5:]  # strip the 'D_x__' prefix
            if name not in exclusion_list:
                labels_list.append(name + '(' + level_ranks[-depth][0].lower() + ')')
                break
        else:
            # Every rank was uninformative; fall back to the kingdom name
            labels_list.append(row.iloc[1][5:] + '(' + level_ranks[0][0].lower() + ')')
    df_col_names_lod_list[i]['label'] = labels_list
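Spot-check the labels against their full taxonomy strings; entries with uninformative species names should show a shallower-rank label, e.g. a genus-level 'Prevotella(g)' (hypothetical example):
# Labels next to the taxonomy strings they were derived from
df_col_names_lod_list[5][['taxonomy', 'label']].head()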
df_rel_sort_lod_list = [None]*6
df_abs_sort_lod_list = [None]*6
df_pseudo_rel_sort_lod_list = [None]*6
df_pseudo_abs_sort_lod_list = [None]*6
for i in range(6):
    # Order taxa by mean absolute abundance, most abundant first
    taxa_sorted = df_abs_lod_list[i].mean().sort_values(ascending=False).index
    df_rel_sort_lod_list[i] = df_rel_lod_list[i].loc[:, taxa_sorted]
    df_abs_sort_lod_list[i] = df_abs_lod_list[i].loc[:, taxa_sorted]
    df_pseudo_rel_sort_lod_list[i] = df_pseudo_rel_lod_list[i].loc[:, taxa_sorted]
    df_pseudo_abs_sort_lod_list[i] = df_pseudo_abs_lod_list[i].loc[:, taxa_sorted]
df_abs_sort_lod_list[4]
250 rows × 286 columns
# Write the processed tables to disk for downstream notebooks, closing each file handle
for obj, fname in [(df_rel_sort_lod_list, 'rel_sort_lod_list.pkl'),
                   (df_abs_sort_lod_list, 'abs_sort_lod_list.pkl'),
                   (df_pseudo_rel_sort_lod_list, 'pseudo_rel_sort_lod_list.pkl'),
                   (df_pseudo_abs_sort_lod_list, 'pseudo_abs_sort_lod_list.pkl'),
                   (df_col_names_lod_list, 'col_names_lod_list.pkl'),
                   (df_total_load, 'total_load_duodenum.pkl'),
                   (seq_metadata, 'seq_duodenum_metadata.pkl')]:
    with open('pickle_files/' + fname, 'wb') as f:
        pickle.dump(obj, f)
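Downstream notebooks can reload any of these with a matching pickle.load call, e.g.:
# Example reload in a downstream notebook
with open('pickle_files/abs_sort_lod_list.pkl', 'rb') as f:
    df_abs_sort_lod_list = pickle.load(f)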
df_col_names_lod_list[5].loc[df_abs_sort_lod_list[5].columns[:10].tolist()]