import numpy as np
import pandas as pd
import pickle
num_metadata_cols = 3
Samples were subsampled to 43,922 reads. Samples with fewer than this number of reads after DADA2 processing were removed.
df_seq_orig_species = pd.read_csv('data_files/species_counts_paired_43922.csv').set_index('index')
# All samples share the same subsampled read depth, so take the first row's total
read_depth = df_seq_orig_species[df_seq_orig_species.columns[:-num_metadata_cols]].sum(axis=1).iloc[0]
read_depth
43922
# This taxon was present only in the second batch of sequenced duodenum samples, likely indicating that it is a contaminant.
# It is removed because it interferes with a plot comparing saliva and duodenum samples.
df_seq_orig_species.drop(['D_0__Bacteria;D_1__Firmicutes;D_2__Bacilli;D_3__Bacillales;D_4__Paenibacillaceae;D_5__Paenibacillus;D_6__Paenibacillus darwinianus'], axis=1, inplace=True)
df_total_load = pd.read_excel('dPCR data/dPCR_total_loads_paired.xlsx', index_col=0)
df_total_load['Sample'] = df_total_load['Sample'].astype(str)
Several saliva volumes (weights) were missing. The volume was set to the average of all other saliva samples.
Stool samples did not have recorded weights, so they were approximated: the median collected stool weight is 0.5 g, and stool is collected in Omnigut tubes containing 2 mL of fluid, of which 1/8 is used for extraction. The stool weight was therefore set to 0.5/8 g (0.0625 g).
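A sketch of that imputation (the 'Type' column name and its labels are hypothetical; the actual values may already be filled in upstream in the Excel file):
# Hypothetical sketch; 'Type' and its labels are assumptions, not from the data file
saliva = df_total_load['Type'] == 'Saliva'
df_total_load.loc[saliva, 'Weight'] = df_total_load.loc[saliva, 'Weight'].fillna(df_total_load.loc[saliva, 'Weight'].mean())
df_total_load.loc[df_total_load['Type'] == 'Stool', 'Weight'] = 0.5/8  # 0.0625 g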
## Filter out samples below LLOQ
df_total_load = df_total_load[df_total_load['Concentration']>2]
## Calculate Copies/mL
df_total_load['Copies/mL'] = df_total_load['Corrected Concentration']/df_total_load['Weight']
df_total_load['Log Copies/mL'] = np.log10(df_total_load['Copies/mL'])
# These samples were diluted before being added to the library prep reaction because inhibitors prevented amplification in the undiluted sample
seq_dilutions = pd.read_csv('dPCR data/seq_dilution_paired.csv')
seq_dilutions_dict = dict(zip(seq_dilutions['mod_ID'], seq_dilutions['Seq_Dilution']))
# Create column to account for the fact that some samples were diluted before input into library prep reaction
df_total_load['Seq_Dilution'] = df_total_load.apply(lambda x: seq_dilutions_dict[x['mod_ID']], axis=1)
# Volume (uL) of sample added to the amplification reaction
seq_volume = 3.5
# Minimum average copy input considered reliably detectable (see the LOD definition below)
copy_input_threshold = 3
df_total_load['Copies in Amp Rxn'] = df_total_load['Concentration']*df_total_load['Dilution']/df_total_load['Seq_Dilution']*seq_volume
df_total_load['Rel. Abundance LOD (%)'] = copy_input_threshold/df_total_load['Copies in Amp Rxn']*100
df_total_load['Abs. Abundance LOD'] = df_total_load['Rel. Abundance LOD (%)']*df_total_load['Copies/mL']/100
df_total_load
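To make the unit bookkeeping concrete, here is the same chain evaluated with illustrative numbers (not taken from the dataset):
# Illustrative values only, not from the data
concentration = 100.0  # measured dPCR concentration, copies/uL
dilution = 10.0        # dilution factor applied for the dPCR measurement (assumed meaning of 'Dilution')
seq_dilution = 2.0     # extra dilution before library prep
copies_in_rxn = concentration * dilution / seq_dilution * seq_volume  # 100*10/2*3.5 = 1750 copies
rel_lod_pct = copy_input_threshold / copies_in_rxn * 100              # 3/1750*100 ~ 0.17 %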
total_load_dict = dict(zip(df_total_load['mod_ID'], df_total_load['Copies/mL']))
len(total_load_dict)
42
# Sequencing LLOQ (%), a power-law function of read depth
seq_lloq = 7.115*(read_depth**(-0.556))
# Floor each sample's relative-abundance LOD at the sequencing LLOQ
df_total_load['Rel. Abundance LOD (%) Corrected'] = df_total_load['Rel. Abundance LOD (%)'].where(df_total_load['Rel. Abundance LOD (%)']>seq_lloq, seq_lloq)
# Express each sample's LOD as a read-count threshold
lod_dict = dict(zip(df_total_load['mod_ID'], df_total_load['Rel. Abundance LOD (%) Corrected']*read_depth/100))
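For scale at this read depth:
print(seq_lloq)                 # ~0.019 %, the sequencing LLOQ floor
print(0.17 / 100 * read_depth)  # ~75 reads, the illustrative 0.17 % LOD above as a count threshold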
df_seq_samples = df_seq_orig_species.loc[df_seq_orig_species.index.isin(total_load_dict.keys()), df_seq_orig_species.columns[:-num_metadata_cols]]
# This is num_metadata_cols-1 because we don't need the description column since it is already stored as the index
seq_metadata = df_seq_orig_species[df_seq_orig_species.columns[-1*(num_metadata_cols-1):]]
df_seq_samples
42 rows × 755 columns
Counts below each sample's limit of detection (LOD) are now zeroed out. The LOD is defined as the abundance at which there is a 95% chance of at least one copy being loaded into the amplification reaction, i.e. a 3-copy average, since under Poisson loading 1 - e^-3 ≈ 0.95.
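A quick numerical check of that Poisson claim:
import math
# Probability of loading at least one copy when the mean input is 3 copies
print(1 - math.exp(-3))  # 0.9502...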
df_species_lod_filter = pd.DataFrame()
for col in df_seq_samples.columns:
df_species_lod_filter[col] = df_seq_samples.apply(lambda x: x[col] if x[col]>lod_dict[x.name] else 0, axis=1)
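An equivalent vectorized form of this filter (a sketch over the same inputs) avoids the per-column apply:
# Vectorized equivalent: pd.Series(lod_dict) aligns per-sample thresholds on the row index,
# and where() zeroes out every count at or below its sample's LOD
lod_series = pd.Series(lod_dict)
df_species_lod_filter = df_seq_samples.where(df_seq_samples.gt(lod_series, axis=0), 0)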
# Remove rows (samples) that have zero counts after filtering
df_species_lod_filter = df_species_lod_filter[df_species_lod_filter.sum(axis=1)>0]
# Remove columns (taxa) that have zero counts after filtering
df_species_lod_filter = df_species_lod_filter.loc[:, (df_species_lod_filter != 0).any(axis=0)]
df_species_lod_filter
42 rows × 222 columns
orig_indexes = df_seq_samples.index.tolist()
filter_indexes = df_species_lod_filter.index.tolist()
lost = list(set(orig_indexes) - set(filter_indexes))
df_total_load[df_total_load['Sample'].isin(lost)]
0 rows × 21 columns
def collapse_taxonomy(_df, level):
collapsed_dict = {}
index=0
# Evaluate the selected taxonomy level to collapse to
if level == 'Genus':
index = -1
elif level == 'Family':
index = -2
elif level == 'Order':
index = -3
elif level == 'Class':
index = -4
elif level == 'Phylum':
index = -5
else:
        raise ValueError('Could not interpret taxonomy level. Please use (Phylum, Class, Order, Family, Genus)')
# Iterate through columns adding values together for each sample if the new column name already exists
for col in _df:
new_col = ";".join(col.split(';')[:index])
if new_col in collapsed_dict.keys():
collapsed_dict[new_col] += np.array(_df[col])
else:
collapsed_dict[new_col] = np.array(_df[col])
df_collapsed = pd.DataFrame.from_dict(collapsed_dict).set_index(_df.index)
return df_collapsed
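A more compact equivalent uses transpose-and-groupby (a sketch; note that groupby sorts the collapsed taxa alphabetically instead of preserving first-seen column order):
def collapse_taxonomy_groupby(_df, level):
    # Assumes the same semicolon-delimited 'D_0__...;D_1__...' taxonomy strings
    offsets = {'Genus': -1, 'Family': -2, 'Order': -3, 'Class': -4, 'Phylum': -5}
    new_cols = [';'.join(c.split(';')[:offsets[level]]) for c in _df.columns]
    # Group the transposed taxa rows by their truncated names and sum counts
    return _df.T.groupby(new_cols).sum().T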
df_lod_list = [None]*6
df_lod_list[0] = collapse_taxonomy(df_species_lod_filter, 'Phylum')
df_lod_list[1] = collapse_taxonomy(df_species_lod_filter, 'Class')
df_lod_list[2] = collapse_taxonomy(df_species_lod_filter, 'Order')
df_lod_list[3] = collapse_taxonomy(df_species_lod_filter, 'Family')
df_lod_list[4] = collapse_taxonomy(df_species_lod_filter, 'Genus')
df_lod_list[5] = df_species_lod_filter
df_rel_lod_list = [None]*6
df_abs_lod_list = [None]*6
df_pseudo_rel_lod_list = [None]*6
df_pseudo_abs_lod_list = [None]*6
for index, df in enumerate(df_lod_list):
    # Relative abundance (%) of each taxon
    df_rel_lod_list[index] = df.div(read_depth, axis=0).multiply(100)
    # Absolute abundance (copies/mL): relative abundance scaled by each sample's total load
    df_abs_lod_list[index] = df_rel_lod_list[index].apply(lambda x: x*total_load_dict[x.name], axis=1).div(100)
    # Add a pseudocount equivalent to 0.01 reads (~2.3e-5 % at this depth) so that
    # zero values remain defined under downstream log transforms
    df_pseudo_rel_lod_list[index] = df_rel_lod_list[index]+(0.01/read_depth)*100
    df_pseudo_abs_lod_list[index] = df_pseudo_rel_lod_list[index].apply(lambda x: x*total_load_dict[x.name], axis=1).div(100)
df_rel_lod_list[0]
Renaming columns to unique ASV identifiers avoids a downstream issue in which multiple collapsed taxa share the same column name; the original taxonomy strings are kept in a lookup table.
df_col_names_lod_list = [None]*6
for index, df in enumerate(df_rel_lod_list):
num_cols = len(df.columns)
col_names = ['ASV' + str(x) for x in range(num_cols)]
df_col_names_lod_list[index] = pd.DataFrame(index=col_names, data={'taxonomy':df.columns.tolist()})
df_rel_lod_list[index].columns = col_names
df_abs_lod_list[index].columns = col_names
df_pseudo_rel_lod_list[index].columns = col_names
df_pseudo_abs_lod_list[index].columns = col_names
df_col_names_lod_list[0]
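The lookup table maps each generated identifier back to its full taxonomy string, e.g.:
# Recover the taxonomy behind an identifier (phylum-level table shown)
df_col_names_lod_list[0].loc['ASV0', 'taxonomy']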
exclusion_list = ['', 'uncultured bacterium', 'metagenome', 'uncultured',
'gut metagenome', 'uncultured organism', 'unidentified',
'uncultured Bacteroidales bacterium', 'uncultured Mollicutes bacterium', 'uncultured archaeon']
rank_names = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
for i in range(6):
    # Split the taxonomy string into one column per rank; the table at level i has
    # i+2 ranks (Phylum-level taxa are 'Kingdom;Phylum', species-level taxa have all 7)
    df_col_names_lod_list[i][rank_names[:i + 2]] = df_col_names_lod_list[i]['taxonomy'].str.split(';', expand=True)
    labels_list = []
    for index, row in df_col_names_lod_list[i].iterrows():
        # Walk from the most specific rank upward and keep the first informative name,
        # suffixed with the lowercase initial of its rank (e.g. '(g)' for Genus).
        # The Kingdom name is used unconditionally if every lower rank is uninformative.
        for depth in range(1, len(row)):
            name = row.iloc[-depth][5:]  # strip the 'D_k__' prefix
            if name not in exclusion_list or depth == len(row) - 1:
                labels_list.append(name + '(' + df_col_names_lod_list[i].columns[-depth][0].lower() + ')')
                break
df_col_names_lod_list[i]['label'] = labels_list
df_rel_sort_lod_list = [None]*6
df_abs_sort_lod_list = [None]*6
df_pseudo_rel_sort_lod_list = [None]*6
df_pseudo_abs_sort_lod_list = [None]*6
for i in range(6):
    # Order taxa by mean absolute abundance, most abundant first
    taxa_sorted = df_abs_lod_list[i].mean().sort_values(ascending=False).index
df_rel_sort_lod_list[i] = df_rel_lod_list[i].loc[:, taxa_sorted]
df_abs_sort_lod_list[i] = df_abs_lod_list[i].loc[:, taxa_sorted]
df_pseudo_rel_sort_lod_list[i] = df_pseudo_rel_lod_list[i].loc[:, taxa_sorted]
df_pseudo_abs_sort_lod_list[i] = df_pseudo_abs_lod_list[i].loc[:, taxa_sorted]
df_abs_sort_lod_list[4]
42 rows × 104 columns
# Persist the processed tables for downstream notebooks, closing each file promptly
outputs = {
    'pickle_files/rel_sort_lod_list_paired.pkl': df_rel_sort_lod_list,
    'pickle_files/abs_sort_lod_list_paired.pkl': df_abs_sort_lod_list,
    'pickle_files/pseudo_rel_sort_lod_list_paired.pkl': df_pseudo_rel_sort_lod_list,
    'pickle_files/pseudo_abs_sort_lod_list_paired.pkl': df_pseudo_abs_sort_lod_list,
    'pickle_files/col_names_lod_list_paired.pkl': df_col_names_lod_list,
    'pickle_files/total_load_paired.pkl': df_total_load,
    'pickle_files/seq_paired_metadata_paired.pkl': seq_metadata,
}
for path, obj in outputs.items():
    with open(path, 'wb') as f:
        pickle.dump(obj, f)
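These pickles can then be loaded in the downstream analysis notebooks, e.g.:
with open('pickle_files/abs_sort_lod_list_paired.pkl', 'rb') as f:
    df_abs_sort_lod_list = pickle.load(f)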