import numpy as np
import pandas as pd
import pickle
Samples were subsampled to 45,386 reads. Samples with fewer than this number of reads after DADA2 processing were removed.
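The subsampling itself was done upstream of this notebook; a minimal sketch of rarefying a single sample's counts to a fixed depth (the rarefy helper below is illustrative, not part of the pipeline):
# Sketch: rarefy one sample's taxon counts to a fixed depth without replacement
def rarefy(counts, depth, seed=0):
    rng = np.random.default_rng(seed)
    reads = np.repeat(np.arange(len(counts)), counts)    # expand to one entry per read
    keep = rng.choice(reads, size=depth, replace=False)  # draw `depth` reads
    return np.bincount(keep, minlength=len(counts))
Samples whose total count is below the target depth cannot be rarefied, which is why they are dropped.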
df_seq_orig_species = pd.read_csv('data_files/species_counts_duodenum_45386.csv').set_index('index')
# All samples were rarefied to the same depth, so take the first row's total
read_depth = df_seq_orig_species.sum(axis=1).iloc[0]
read_depth
45386.0
num_metadata_cols = 2
df_seq_orig_species = df_seq_orig_species.drop(['387_Duo', '388_Duo', '390_Duo', '391_Duo', '392_Duo', '394_Duo', '409_Duo', '410_Duo', '418_Duo', '423_Duo', '425_Duo', '433_Duo'])
df_seq_orig_species.rename({'417_Duo':'417', '434_Duo':'434', '437_Duo':'437', '438_Duo':'438', '441_Duo':'441', '446_Duo':'446', '447_Duo':'447', '448_Duo':'448', '449_Duo':'449', '451_Duo':'451'}, axis='index', inplace=True)
df_seq_orig_species.sort_index(inplace=True)
df_seq_orig_species.index = df_seq_orig_species.index.astype(int)
df_seq_orig_species
254 rows × 1068 columns
# This taxon appeared only in the second batch of sequenced duodenum samples, likely indicating it is a
# contaminant. It is removed because it interferes with a plot comparing saliva to duodenum samples.
df_seq_orig_species.drop(['D_0__Bacteria;D_1__Firmicutes;D_2__Bacilli;D_3__Bacillales;D_4__Paenibacillaceae;D_5__Paenibacillus;D_6__Paenibacillus darwinianus'], axis=1, inplace=True)
df_total_load = pd.read_excel('dPCR data/dPCR_total_loads_duodenum.xlsx', index_col=0)
df_weights = pd.read_csv('data_files/sample weights.csv')
# Merge the two dataframes together based on the sample ID
df_total_load = df_total_load.merge(df_weights, left_on='Sample', right_on='Study ID')
# Flag whether a numeric sample weight was recorded (True) or the field is missing/non-numeric (False)
df_total_load['Weight (True/False)'] = df_total_load.apply(lambda x: x['Weight (mL)'][0].isdigit(), axis=1)
# Determine the average sample weight for all samples
mean_weight = df_total_load[df_total_load['Weight (True/False)']==True]['Weight (mL)'].astype(float).mean()
# Create new column where any sample with a missing weight is set to the average weight of all samples
df_total_load['Corrected Weight (mL)'] = df_total_load.apply(lambda x: float(x['Weight (mL)']) if x['Weight (True/False)'] else mean_weight, axis=1)
# Print out the samples without weights for reference (N=11)
df_total_load[~df_total_load['Weight (True/False)']]
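An equivalent route (a sketch, not the pipeline's code) is pd.to_numeric with errors='coerce', which marks non-numeric weights as NaN and lets fillna substitute the mean in one step:
# Sketch of an equivalent route: coerce non-numeric weights to NaN, impute the mean
weights = pd.to_numeric(df_total_load['Weight (mL)'], errors='coerce')
weight_check = weights.fillna(weights.mean())  # matches 'Corrected Weight (mL)'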
## Set the lower dPCR threshold. The 95% CI is ±1x, and the dPCR blanks are <1 cp/uL with a +3-std-dev bound of ~1 cp/uL.
## This means we have ~2x resolution at 2 cp/uL, so samples at or below 2 cp/uL are dropped.
df_total_load = df_total_load[(df_total_load['Concentration']>2)]
## Calculate Copies/mL
df_total_load['Copies/mL'] = df_total_load['Corrected Concentration']/df_total_load['Corrected Weight (mL)']
df_total_load['Log Copies/mL'] = np.log10(df_total_load['Copies/mL'])
df_total_load[['Sample', 'Copies/mL']].to_excel('duodenum_total_loads.xlsx')
# These samples were diluted before input to the library prep reaction because inhibitors
# prevented amplification of the undiluted sample.
# Note: sample 423 originally appeared twice in this dict (100x and 50x); in a dict literal the
# later entry wins, so the effective dilution is 50x and only that entry is kept here.
diluted_samples = {437:10, 438:10, 441:10, 446:10, 447:10, 448:10, 449:10, 451:10,
                   395:100, 198:50, 423:50, 427:50, 373:10, 321:10, 169:10, 375:10, 353:10,
                   242:10, 411:10, 312:10, 433:2, 366:2}
# Create column to account for the fact that some samples were diluted before input into library prep reaction
df_total_load['Seq_Dilution'] = df_total_load['Sample'].map(diluted_samples).fillna(1)
# uL added to the amplification rxn
seq_volume = 3.5
copy_input_threshold = 3  # mean template copies giving ~95% chance of >=1 copy in the reaction
df_total_load['Copies in Amp Rxn'] = df_total_load['Concentration']*df_total_load['Dilution']/df_total_load['Seq_Dilution']*seq_volume
df_total_load['Rel. Abundance LOD (%)'] = copy_input_threshold/df_total_load['Copies in Amp Rxn']*100
df_total_load['Abs. Abundance LOD'] = df_total_load['Rel. Abundance LOD (%)']*df_total_load['Copies/mL']/100
df_total_load
256 rows × 20 columns
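As a worked example with hypothetical values (not a real sample): 100 cp/uL measured after a 10x dPCR dilution, sequenced undiluted, gives:
# Worked example (hypothetical values): concentration 100 cp/uL, 10x dPCR dilution, no seq dilution
copies_in_rxn = 100 * 10 / 1 * seq_volume             # 3500 copies into the amp reaction
rel_lod = copy_input_threshold / copies_in_rxn * 100  # ~0.086% relative-abundance LOD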
total_load_dict = dict(zip(df_total_load['Sample'], df_total_load['Copies/mL']))
len(total_load_dict)
256
# Empirical sequencing LLOQ (% relative abundance) as a power-law function of read depth
# (~0.018% at 45,386 reads)
seq_lloq = 7.115*(read_depth**(-0.556))
# Floor each sample's dPCR-derived LOD at the sequencing LLOQ
df_total_load['Rel. Abundance LOD (%) Corrected'] = df_total_load['Rel. Abundance LOD (%)'].where(df_total_load['Rel. Abundance LOD (%)']>seq_lloq, seq_lloq)
# Convert each sample's %-LOD into a read-count threshold at the fixed read depth
lod_dict = dict(zip(df_total_load['Sample'], df_total_load['Rel. Abundance LOD (%) Corrected']*read_depth/100))
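The .where call above is just an elementwise maximum of the two detection limits; an equivalent sketch:
# Equivalent formulation: elementwise maximum of the dPCR LOD and the sequencing LLOQ
lod_check = np.maximum(df_total_load['Rel. Abundance LOD (%)'], seq_lloq)
# lod_check matches 'Rel. Abundance LOD (%) Corrected'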
# Keep only samples with a total-load measurement and drop the metadata columns
df_seq_samples = df_seq_orig_species.loc[df_seq_orig_species.index.isin(total_load_dict.keys()),
                                         df_seq_orig_species.columns[:-num_metadata_cols]]
# Only num_metadata_cols-1 columns are kept here because the description column is not
# needed: it is already stored as the index
seq_metadata = df_seq_orig_species[df_seq_orig_species.columns[-(num_metadata_cols-1):]]
df_seq_samples
250 rows × 1065 columns
This is defined as the load at which there should be a ~95% chance of at least one copy being loaded into the amplification reaction (3-copy average): for Poisson-distributed input with mean 3, P(≥1 copy) = 1 − e⁻³ ≈ 0.95.
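A quick check of the Poisson claim behind the 3-copy threshold:
# Probability of at least one copy at a mean of 3 copies
1 - np.exp(-3)  # ~0.9502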
df_species_lod_filter = pd.DataFrame()
for col in df_seq_samples.columns:
    # Zero out any count at or below the sample's read-count LOD
    df_species_lod_filter[col] = df_seq_samples.apply(
        lambda x: x[col] if x[col] > lod_dict[x.name] else 0, axis=1)
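The same filter can be expressed vectorized; a sketch equivalent to the loop above, and considerably faster on a 250 × 1065 table:
# Vectorized equivalent: compare all counts against each sample's LOD at once
lod_per_sample = pd.Series(lod_dict).reindex(df_seq_samples.index)
df_filter_check = df_seq_samples.where(df_seq_samples.gt(lod_per_sample, axis=0), 0)
# df_filter_check matches df_species_lod_filter from the loop above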
# Remove rows (samples) that have zero total counts after filtering
df_species_lod_filter = df_species_lod_filter[df_species_lod_filter.sum(axis=1)>0]
# Remove columns (taxa) that have zero counts in every sample after filtering
df_species_lod_filter = df_species_lod_filter.loc[:, (df_species_lod_filter != 0).any(axis=0)]
df_species_lod_filter
250 rows × 546 columns
# Confirm which samples, if any, were dropped by the LOD filter (expect none)
orig_indexes = df_seq_samples.index.tolist()
filter_indexes = df_species_lod_filter.index.tolist()
lost = list(set(orig_indexes) - set(filter_indexes))
df_total_load[df_total_load['Sample'].isin(lost)]
0 rows × 21 columns
def collapse_taxonomy(_df, level):
    """Sum columns that share the same taxonomy prefix at the requested level."""
    # Map the requested taxonomy level to how many trailing ranks to trim off
    level_index = {'Genus': -1, 'Family': -2, 'Order': -3, 'Class': -4, 'Phylum': -5}
    if level not in level_index:
        raise ValueError('Could not interpret taxonomy level. Please use (Phylum, Class, Order, Family, Genus)')
    index = level_index[level]
    # Iterate through columns, summing counts whenever the collapsed name already exists
    collapsed_dict = {}
    for col in _df:
        new_col = ";".join(col.split(';')[:index])
        if new_col in collapsed_dict:
            collapsed_dict[new_col] += np.array(_df[col])
        else:
            collapsed_dict[new_col] = np.array(_df[col])
    df_collapsed = pd.DataFrame.from_dict(collapsed_dict).set_index(_df.index)
    return df_collapsed
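The same collapse can be written with a pandas groupby over the transposed table; a sketch of an equivalent one-liner (n_trailing = 1 for Genus, ..., 5 for Phylum), not the pipeline's code:
def collapse_taxonomy_groupby(_df, n_trailing):
    # Group columns by their taxonomy string truncated by n_trailing ranks, then sum
    return _df.T.groupby(lambda c: ';'.join(c.split(';')[:-n_trailing])).sum().T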
df_lod_list = [None]*6
df_lod_list[0] = collapse_taxonomy(df_species_lod_filter, 'Phylum')
df_lod_list[1] = collapse_taxonomy(df_species_lod_filter, 'Class')
df_lod_list[2] = collapse_taxonomy(df_species_lod_filter, 'Order')
df_lod_list[3] = collapse_taxonomy(df_species_lod_filter, 'Family')
df_lod_list[4] = collapse_taxonomy(df_species_lod_filter, 'Genus')
df_lod_list[5] = df_species_lod_filter
df_rel_lod_list = [None]*6
df_abs_lod_list = [None]*6
df_pseudo_rel_lod_list = [None]*6
df_pseudo_abs_lod_list = [None]*6
for index, df in enumerate(df_lod_list):
    # Relative abundance in percent
    df_rel_lod_list[index] = df.div(read_depth, axis=0).multiply(100)
    # Absolute abundance: scale each sample's relative abundances by its total load
    df_abs_lod_list[index] = df_rel_lod_list[index].apply(lambda x: x*total_load_dict[x.name], axis=1).div(100)
    # Add a pseudocount of 0.1 reads (expressed in percent)
    df_pseudo_rel_lod_list[index] = df_rel_lod_list[index]+(0.1/read_depth)*100
    df_pseudo_abs_lod_list[index] = df_pseudo_rel_lod_list[index].apply(lambda x: x*total_load_dict[x.name], axis=1).div(100)
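The rel-to-abs conversion above is just a per-row scaling; a vectorized check using the tables defined above:
# Vectorized equivalent of the apply-based conversion
loads = pd.Series(total_load_dict).reindex(df_rel_lod_list[5].index)
abs_check = df_rel_lod_list[5].mul(loads, axis=0).div(100)
np.allclose(abs_check, df_abs_lod_list[5])  # expected True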
df_pseudo_abs_lod_list[5]
250 rows × 546 columns
Columns are renamed to unique ASV identifiers. This overcomes a downstream issue that arises when multiple columns share the same name.
df_col_names_lod_list = [None]*6
for index, df in enumerate(df_rel_lod_list):
    num_cols = len(df.columns)
    col_names = ['ASV' + str(x) for x in range(num_cols)]
    # Keep a lookup table from the new ASV IDs back to the full taxonomy strings
    df_col_names_lod_list[index] = pd.DataFrame(index=col_names, data={'taxonomy':df.columns.tolist()})
    df_rel_lod_list[index].columns = col_names
    df_abs_lod_list[index].columns = col_names
    df_pseudo_rel_lod_list[index].columns = col_names
    df_pseudo_abs_lod_list[index].columns = col_names
df_col_names_lod_list[0]
# Placeholder names that carry no taxonomic information and should not be used as labels
exclusion_list = ['', 'uncultured bacterium', 'metagenome', 'uncultured',
                  'gut metagenome', 'uncultured organism', 'unidentified',
                  'uncultured Bacteroidales bacterium', 'uncultured Mollicutes bacterium',
                  'uncultured archaeon']
ranks = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
for i in range(6):
    # Split the taxonomy string into one column per rank (level i keeps i+2 ranks)
    level_ranks = ranks[:i+2]
    df_col_names_lod_list[i][level_ranks] = df_col_names_lod_list[i]['taxonomy'].str.split(';', expand=True)
    # Build a short display label: walk from the deepest rank upward to the first
    # informative name and tag it with the rank's initial, e.g. 'Prevotella(g)'
    labels_list = []
    for _, row in df_col_names_lod_list[i].iterrows():
        for depth in range(1, len(level_ranks) + 1):
            name = row.iloc[-depth][5:]  # strip the 'D_x__' prefix
            if name not in exclusion_list:
                labels_list.append(name + '(' + level_ranks[-depth][0].lower() + ')')
                break
        else:
            # Every rank was uninformative; fall back to the kingdom name
            labels_list.append(row.iloc[1][5:] + '(' + level_ranks[0][0].lower() + ')')
    df_col_names_lod_list[i]['label'] = labels_list
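Spot-check the labels against their full taxonomy strings; entries with uninformative species names should show a shallower-rank label, e.g. a genus-level 'Prevotella(g)' (hypothetical example):
# Labels next to the taxonomy strings they were derived from
df_col_names_lod_list[5][['taxonomy', 'label']].head()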
df_rel_sort_lod_list = [None]*6
df_abs_sort_lod_list = [None]*6
df_pseudo_rel_sort_lod_list = [None]*6
df_pseudo_abs_sort_lod_list = [None]*6
for i in range(6):
    # Order taxa by mean absolute abundance, most abundant first
    taxa_sorted = df_abs_lod_list[i].mean().sort_values(ascending=False).index
    df_rel_sort_lod_list[i] = df_rel_lod_list[i].loc[:, taxa_sorted]
    df_abs_sort_lod_list[i] = df_abs_lod_list[i].loc[:, taxa_sorted]
    df_pseudo_rel_sort_lod_list[i] = df_pseudo_rel_lod_list[i].loc[:, taxa_sorted]
    df_pseudo_abs_sort_lod_list[i] = df_pseudo_abs_lod_list[i].loc[:, taxa_sorted]
df_abs_sort_lod_list[4]
250 rows × 286 columns
# Write the processed tables to disk for downstream notebooks, closing each file handle
for obj, fname in [(df_rel_sort_lod_list, 'rel_sort_lod_list.pkl'),
                   (df_abs_sort_lod_list, 'abs_sort_lod_list.pkl'),
                   (df_pseudo_rel_sort_lod_list, 'pseudo_rel_sort_lod_list.pkl'),
                   (df_pseudo_abs_sort_lod_list, 'pseudo_abs_sort_lod_list.pkl'),
                   (df_col_names_lod_list, 'col_names_lod_list.pkl'),
                   (df_total_load, 'total_load_duodenum.pkl'),
                   (seq_metadata, 'seq_duodenum_metadata.pkl')]:
    with open('pickle_files/' + fname, 'wb') as f:
        pickle.dump(obj, f)
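Downstream notebooks can reload any of these with a matching pickle.load call, e.g.:
# Example reload in a downstream notebook
with open('pickle_files/abs_sort_lod_list.pkl', 'rb') as f:
    df_abs_sort_lod_list = pickle.load(f)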
df_col_names_lod_list[5].loc[df_abs_sort_lod_list[5].columns[:10].tolist()]