In [1]:
import numpy as np
import pandas as pd
import pickle

### Load the raw metadata, colony counts, and processed total microbial loads from dPCR

In [2]:
# Spreadsheet inputs: raw metadata and culture colony counts.
# The first column of each sheet is used as the row index.
df_raw_metadata = pd.read_excel(
    'data_files/metadata_raw.xlsx',
    index_col=0,
)
df_colony_counts = pd.read_excel(
    'data_files/Colony counts.xlsx',
    index_col=0,
)

# Processed total load table, re-indexed by its 'Sample' column.
# NOTE: unpickling can execute arbitrary code; only load pickle files produced by this project.
df_total_load = pd.read_pickle('pickle_files/total_load_duodenum.pkl')
df_total_load = df_total_load.set_index('Sample')

### Merge relevant total load columns with raw metadata and colony counts

In [3]:
# Combine the selected total-load columns with the raw metadata and the colony
# counts, matching rows on the index of each table. merge() uses its default
# inner join here, so only index values present in all three tables are kept.
df_metadata = (
    df_total_load[
        ['Weight (mL)', 'Copies/mL', 'Log Copies/mL', 'Rel. Abundance LOD (%) Corrected']
    ]
    .merge(df_raw_metadata, left_index=True, right_index=True)
    .merge(df_colony_counts, left_index=True, right_index=True)
)

### Generate binary variables used in downstream analysis

In [5]:
# Highest culture load available for each patient. Some patients had samples from
# more than one small-intestine location; missing locations are skipped, and a row
# with no value at any location ends up as NaN.
df_metadata['max aerobic'] = df_metadata[
    ['MacConkey Duodenum', 'MacConkey Jejunum', 'MacConkey Ileum']
].max(axis='columns', skipna=True)
df_metadata['max anaerobic'] = df_metadata[
    ['Blood Duodenum', 'Blood Jejunum', 'Blood Ileum']
].max(axis='columns', skipna=True)

# SIBO classification: 1 when the aerobic (MacConkey agar) load is at least
# 1e3 CFU/mL, otherwise 0. A NaN load compares as False, so it is also coded 0.
df_metadata['SIBO'] = np.where(df_metadata['max aerobic'].ge(1000), 1, 0)

# Binary symptom categories: 1 when the patient's VAS score is at or above the
# cohort median for that symptom, 0 when below it, and NaN when the score is
# missing (so missing answers are not silently counted as "below median").
# Series.median() ignores NaN values by default.
for symptom in ['bloating', 'constipation', 'excess_gas', 'incomplete_evac', 'diarrhea', 'urgency']:
    vas_scores = df_metadata[f'{symptom}_vas']
    at_or_above_median = np.where(vas_scores >= vas_scores.median(), 1, 0)
    df_metadata[f'{symptom}>50th'] = np.where(np.isnan(vas_scores), np.nan, at_or_above_median)

# Binary category for current smoker: 1 where 'quit_smoking' equals 0, else 0.
# Missing 'quit_smoking' values compare as False and are therefore coded 0.
# NOTE(review): presumably quit_smoking == 0 means "smokes and has not quit" - confirm against the data dictionary.
df_metadata['current_smoker'] = np.where(df_metadata['quit_smoking'].eq(0), 1, 0)

# Binary category for any probiotic / prebiotic use.
probiotics_list = ['VSL#3', 'Align', 'Culturelle', 'Cultura', 'Yakult', 'Vifit', 'Floraster', 'Other pro or prebiotics']

# A missing entry for an individual product is treated as "not taken".
df_metadata[probiotics_list] = df_metadata[probiotics_list].fillna(0)

# 'probiotics sum' is not created in any visible cell of this notebook. If the
# merged inputs do not already provide it, derive it from the per-product columns
# so the notebook survives Restart & Run All instead of raising KeyError.
# An existing 'probiotics sum' column is used unchanged.
if 'probiotics sum' not in df_metadata.columns:
    df_metadata['probiotics sum'] = df_metadata[probiotics_list].sum(axis=1)

# 1 when the summed probiotic indicator is positive, otherwise 0 (NaN sums compare as False -> 0).
df_metadata['any_probiotics'] = np.where(df_metadata['probiotics sum'] > 0, 1, 0)

# Binary category for gender: 1 for 'Male', 0 for every other value (including missing).
df_metadata['Gender_binary'] = np.where(df_metadata['Gender'].eq('Male'), 1, 0)

# Missing 'abx_current' entries are recorded as 0.
df_metadata['abx_current'] = df_metadata['abx_current'].replace(np.nan, 0)

# Binary category for PPI: a missing 'Med' entry is first relabelled 'None';
# any other entry is then coded 1 and 'None' is coded 0.
# NOTE(review): presumably 'Med' only lists PPI medications - confirm.
df_metadata['Med'] = df_metadata['Med'].replace(np.nan, 'None')
df_metadata['any_PPI'] = np.where(df_metadata['Med'].ne('None'), 1, 0)

# Show the assembled metadata table as the cell output.
df_metadata

Unnamed: 0,Weight (mL),Copies/mL,Log Copies/mL,Rel. Abundance LOD (%) Corrected,Age (years),Gender,weight (lbs),GM-CSF,IFNY,IL10,...,bloating>50th,constipation>50th,excess_gas>50th,incomplete_evac>50th,diarrhea>50th,urgency>50th,current_smoker,any_probiotics,Gender_binary,any_PPI
141,1.6259,2.460176e+05,5.390966,0.021429,42,Female,437,1.089782,0.320000,9.613930,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0
142,0.2091,1.162123e+05,5.065252,0.352734,81,Male,137,3.464436,5.116849,11.172813,...,1.0,0.0,1.0,1.0,0.0,0.0,0,0,1,1
145,1.77,1.053672e+07,7.022706,0.018320,68,Female,213,2.194994,0.881870,12.464125,...,0.0,0.0,0.0,1.0,0.0,1.0,0,1,0,0
146,1.6973,7.270371e+05,5.861557,0.018320,72,Female,201,0.320000,1.413385,5.787293,...,1.0,0.0,1.0,0.0,1.0,1.0,0,0,0,1
147,1.7476,3.673610e+05,5.565093,0.018320,56,Female,195,2.847633,0.080797,1.501182,...,1.0,1.0,1.0,1.0,1.0,1.0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451,0.605,4.801653e+08,8.681391,0.018320,34,Male,160,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,0,0,1,0
207,1.8095,2.127660e+08,8.327902,0.018320,52,Female,200,0.320000,0.320000,1.438906,...,0.0,0.0,1.0,0.0,0.0,0.0,0,0,0,0
274,1.083,3.924284e+08,8.593760,0.018320,55,Male,195,34.464303,90.534830,24.358041,...,,,,,,,0,0,1,0
322,0.219,9.246575e+08,8.965981,0.018320,73,Male,176,0.320000,0.996686,3.348208,...,1.0,1.0,1.0,1.0,0.0,0.0,0,0,1,1


### Save the data as a pickle file

In [6]:
# Write the completed metadata table, ordered by its index, to a pickle file.
(
    df_metadata
    .sort_index()
    .to_pickle('pickle_files/metadata_comp.pkl')
)