import numpy as np
import pandas as pd
import pickle
# Load the three input tables. The two Excel sheets are indexed by their first
# column; the total-load table is re-indexed by its 'Sample' column.
df_raw_metadata = pd.read_excel('data_files/metadata_raw.xlsx', index_col=0)
df_colony_counts = pd.read_excel('data_files/Colony counts.xlsx', index_col=0)
df_total_load = pd.read_pickle('pickle_files/total_load_duodenum.pkl').set_index('Sample')

# Keep only the load measurements used downstream, then attach the raw metadata
# and the colony counts by index. merge() defaults to an inner join, so only
# index values present in all three tables end up in df_metadata.
load_columns = ['Weight (mL)', 'Copies/mL', 'Log Copies/mL', 'Rel. Abundance LOD (%) Corrected']
df_load_subset = df_total_load[load_columns]
df_load_with_metadata = df_load_subset.merge(df_raw_metadata, left_index=True, right_index=True)
df_metadata = df_load_with_metadata.merge(df_colony_counts, left_index=True, right_index=True)
# Determine the max culture load available from each patient. Some patients had
# samples from more than one SI location, so take the row-wise maximum across
# the duodenum, jejunum and ileum columns, ignoring missing values.
aerobic_columns = ['MacConkey Duodenum', 'MacConkey Jejunum', 'MacConkey Ileum']
anaerobic_columns = ['Blood Duodenum', 'Blood Jejunum', 'Blood Ileum']
df_metadata['max aerobic'] = df_metadata[aerobic_columns].max(axis=1, skipna=True)
df_metadata['max anaerobic'] = df_metadata[anaerobic_columns].max(axis=1, skipna=True)

# SIBO classification: 1 when the max aerobic (MacConkey agar) load is at least
# 1e3 CFU/mL, otherwise 0. The threshold is inclusive (>= 1000).
# NOTE(review): rows with no MacConkey count at all have a NaN maximum, which
# compares False and is therefore labelled 0 rather than missing - confirm this
# is intended.
meets_sibo_threshold = df_metadata['max aerobic'] >= 1000
df_metadata['SIBO'] = np.where(meets_sibo_threshold, 1, 0)
# Generate binary categories from the symptom VAS scores: 1 when a score is at
# or above that symptom's median (the comparison is >=, so the median itself
# counts as 1), 0 when below, and NaN when the score is missing. The median is
# taken over the rows of df_metadata with missing values skipped.
symptom_names = (
    'bloating',
    'constipation',
    'excess_gas',
    'incomplete_evac',
    'diarrhea',
    'urgency',
)
for symptom in symptom_names:
    vas_scores = df_metadata[symptom + '_vas']
    at_or_above_median = np.where(vas_scores >= vas_scores.median(), 1, 0)
    # Keep missing scores as NaN instead of letting them fall into the 0 group.
    df_metadata[symptom + '>50th'] = np.where(np.isnan(vas_scores), np.nan, at_or_above_median)
# Binary category for current smoker: 1 where 'quit_smoking' equals 0, else 0.
# Missing 'quit_smoking' values compare unequal to 0 and so are labelled 0.
is_current_smoker = df_metadata['quit_smoking'] == 0
df_metadata['current_smoker'] = np.where(is_current_smoker, 1, 0)

# Binary category for any probiotics. Missing entries in the individual product
# columns are treated as 0.
probiotics_list = [
    'VSL#3',
    'Align',
    'Culturelle',
    'Cultura',
    'Yakult',
    'Vifit',
    'Floraster',
    'Other pro or prebiotics',
]
df_metadata[probiotics_list] = df_metadata[probiotics_list].fillna(0)
# NOTE(review): 'probiotics sum' is not computed anywhere in this script;
# presumably it is a column of one of the input tables - confirm it exists and
# is consistent with the product columns filled above.
uses_probiotics = df_metadata['probiotics sum'] > 0
df_metadata['any_probiotics'] = np.where(uses_probiotics, 1, 0)

# Binary category for gender: 1 where 'Gender' is exactly 'Male', else 0
# (any other value, including missing, maps to 0).
is_male = df_metadata['Gender'] == 'Male'
df_metadata['Gender_binary'] = np.where(is_male, 1, 0)

# Update abx_current column: missing values become 0.
abx_current_filled = df_metadata['abx_current'].replace(np.nan, 0)
df_metadata['abx_current'] = abx_current_filled

# Binary category for PPI: missing 'Med' entries become the string 'None', and
# any other value is flagged as 1.
# NOTE(review): assumes every non-missing 'Med' entry is a PPI - confirm.
med_filled = df_metadata['Med'].replace(np.nan, 'None')
df_metadata['Med'] = med_filled
has_no_med = df_metadata['Med'] == 'None'
df_metadata['any_PPI'] = np.where(has_no_med, 0, 1)
# Notebook preview of the assembled table. When this cell was last run the
# notebook reported "250 rows × 69 columns"; that output text had been pasted
# into the script as code, which is a syntax error, so it is kept here as a
# comment only.
df_metadata

# Save the compiled metadata, sorted by index, for downstream use.
df_metadata.sort_index().to_pickle('pickle_files/metadata_comp.pkl')