import numpy as np
import pandas as pd
import pickle

# ### Load in the raw metadata file and processed total microbial loads from dPCR

# Raw metadata and colony counts: both spreadsheets use their first column as the row index.
df_raw_metadata = pd.read_excel('data_files/metadata_raw.xlsx', index_col=0)
df_colony_counts = pd.read_excel('data_files/Colony counts.xlsx', index_col=0)

# Total load table, re-indexed by its 'Sample' column.
# NOTE(review): presumably 'Sample' carries the same identifiers as the spreadsheet
# indexes, since the merges below join on index -- confirm.
df_total_load = pd.read_pickle('pickle_files/total_load_duodenum.pkl')
df_total_load = df_total_load.set_index('Sample')

# ### Merge relevant total load columns with raw metadata

# Only these total load columns are carried into the combined table.
total_load_columns = ['Weight (mL)', 'Copies/mL', 'Log Copies/mL', 'Rel. Abundance LOD (%) Corrected']

# Index-on-index merges using pandas' default join type, applied in two steps:
# total load + raw metadata first, then the colony counts.
load_with_metadata = df_total_load[total_load_columns].merge(
    df_raw_metadata, left_index=True, right_index=True
)
df_metadata = load_with_metadata.merge(
    df_colony_counts, left_index=True, right_index=True
)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Weight (mL)Copies/mLLog Copies/mLRel. Abundance LOD (%) CorrectedAge (years)Genderweight (lbs)GM-CSFIFNYIL10...bloating>50thconstipation>50thexcess_gas>50thincomplete_evac>50thdiarrhea>50thurgency>50thcurrent_smokerany_probioticsGender_binaryany_PPI
1411.62592.460176e+055.3909660.02142942Female4371.0897820.3200009.613930...0.00.00.00.00.00.00000
1420.20911.162123e+055.0652520.35273481Male1373.4644365.11684911.172813...1.00.01.01.00.00.00011
1451.771.053672e+077.0227060.01832068Female2132.1949940.88187012.464125...0.00.00.01.00.01.00100
1461.69737.270371e+055.8615570.01832072Female2010.3200001.4133855.787293...1.00.01.00.01.01.00001
1471.74763.673610e+055.5650930.01832056Female1952.8476330.0807971.501182...1.01.01.01.01.01.00000
..................................................................
4510.6054.801653e+088.6813910.01832034Male160NaNNaNNaN...1.01.01.01.01.01.00010
2071.80952.127660e+088.3279020.01832052Female2000.3200000.3200001.438906...0.00.01.00.00.00.00000
2741.0833.924284e+088.5937600.01832055Male19534.46430390.53483024.358041...NaNNaNNaNNaNNaNNaN0010
3220.2199.246575e+088.9659810.01832073Male1760.3200000.9966863.348208...1.01.01.01.00.00.00011
3950.4914.358452e+088.6393320.01832064Female12310.5641264.2892186.107177...1.00.01.01.01.01.00000
\n", "

250 rows × 69 columns

\n", "
" ], "text/plain": [ " Weight (mL) Copies/mL Log Copies/mL \\\n", "141 1.6259 2.460176e+05 5.390966 \n", "142 0.2091 1.162123e+05 5.065252 \n", "145 1.77 1.053672e+07 7.022706 \n", "146 1.6973 7.270371e+05 5.861557 \n", "147 1.7476 3.673610e+05 5.565093 \n", ".. ... ... ... \n", "451 0.605 4.801653e+08 8.681391 \n", "207 1.8095 2.127660e+08 8.327902 \n", "274 1.083 3.924284e+08 8.593760 \n", "322 0.219 9.246575e+08 8.965981 \n", "395 0.491 4.358452e+08 8.639332 \n", "\n", " Rel. Abundance LOD (%) Corrected Age (years) Gender weight (lbs) \\\n", "141 0.021429 42 Female 437 \n", "142 0.352734 81 Male 137 \n", "145 0.018320 68 Female 213 \n", "146 0.018320 72 Female 201 \n", "147 0.018320 56 Female 195 \n", ".. ... ... ... ... \n", "451 0.018320 34 Male 160 \n", "207 0.018320 52 Female 200 \n", "274 0.018320 55 Male 195 \n", "322 0.018320 73 Male 176 \n", "395 0.018320 64 Female 123 \n", "\n", " GM-CSF IFNY IL10 ... bloating>50th constipation>50th \\\n", "141 1.089782 0.320000 9.613930 ... 0.0 0.0 \n", "142 3.464436 5.116849 11.172813 ... 1.0 0.0 \n", "145 2.194994 0.881870 12.464125 ... 0.0 0.0 \n", "146 0.320000 1.413385 5.787293 ... 1.0 0.0 \n", "147 2.847633 0.080797 1.501182 ... 1.0 1.0 \n", ".. ... ... ... ... ... ... \n", "451 NaN NaN NaN ... 1.0 1.0 \n", "207 0.320000 0.320000 1.438906 ... 0.0 0.0 \n", "274 34.464303 90.534830 24.358041 ... NaN NaN \n", "322 0.320000 0.996686 3.348208 ... 1.0 1.0 \n", "395 10.564126 4.289218 6.107177 ... 1.0 0.0 \n", "\n", " excess_gas>50th incomplete_evac>50th diarrhea>50th urgency>50th \\\n", "141 0.0 0.0 0.0 0.0 \n", "142 1.0 1.0 0.0 0.0 \n", "145 0.0 1.0 0.0 1.0 \n", "146 1.0 0.0 1.0 1.0 \n", "147 1.0 1.0 1.0 1.0 \n", ".. ... ... ... ... 
# ### Generate binary variables used in downstream analysis

# Culture load (CFU/mL) at or above which a patient is classified as SIBO positive.
SIBO_CFU_THRESHOLD = 1000

# Symptoms scored in a '<symptom>_vas' column; each gets a '<symptom>>50th' flag below.
VAS_SYMPTOMS = ['bloating', 'constipation', 'excess_gas', 'incomplete_evac', 'diarrhea', 'urgency']


def flag_at_or_above_median(scores):
    """Split a score column at its own median.

    Parameters
    ----------
    scores : pandas.Series
        Numeric scores; missing entries are allowed.

    Returns
    -------
    numpy.ndarray
        Float array aligned with ``scores``: 1.0 where the score is greater
        than or equal to the median of the column, 0.0 where it is below, and
        NaN where the score itself is missing.
    """
    at_or_above = (scores >= scores.median()).astype(float)
    # Keep missing scores missing instead of letting the failed comparison read as 0.
    return np.where(scores.isna(), np.nan, at_or_above)


# Determine the max culture load available from each patient.
# Some patients had samples from more than one SI location.
df_metadata['max aerobic'] = df_metadata[['MacConkey Duodenum', 'MacConkey Jejunum', 'MacConkey Ileum']].max(axis=1, skipna=True)
df_metadata['max anaerobic'] = df_metadata[['Blood Duodenum', 'Blood Jejunum', 'Blood Ileum']].max(axis=1, skipna=True)

# SIBO classification: >= 1e3 CFU/mL culturable microbes on aerobic MacConkey agar.
# NOTE(review): a patient whose 'max aerobic' is NaN fails the comparison and is
# therefore coded 0, unlike the symptom flags below which stay NaN -- confirm this
# is the intended handling of missing cultures.
df_metadata['SIBO'] = np.where(df_metadata['max aerobic'] >= SIBO_CFU_THRESHOLD, 1, 0)

# Binary symptom categories: 1 when the VAS score is at or above the cohort median.
for symptom in VAS_SYMPTOMS:
    df_metadata[f'{symptom}>50th'] = flag_at_or_above_median(df_metadata[f'{symptom}_vas'])

# Binary category for current smoker: 1 only where quit_smoking equals 0;
# every other value, including a missing one, gives 0.
df_metadata['current_smoker'] = np.where(df_metadata['quit_smoking'] == 0, 1, 0)

# Binary category for any probiotics.
probiotics_list = ['VSL#3', 'Align', 'Culturelle', 'Cultura', 'Yakult', 'Vifit', 'Floraster', 'Other pro or prebiotics']
df_metadata[probiotics_list] = df_metadata[probiotics_list].fillna(0)
# NOTE(review): 'probiotics sum' is not created anywhere in the visible notebook;
# presumably it ships with the raw metadata sheet -- confirm. When it is absent,
# derive it from the individual product columns so a fresh Run All still works.
if 'probiotics sum' not in df_metadata.columns:
    df_metadata['probiotics sum'] = df_metadata[probiotics_list].sum(axis=1)
df_metadata['any_probiotics'] = np.where(df_metadata['probiotics sum'] > 0, 1, 0)

# Binary category for gender: 1 for 'Male', 0 for any other value.
df_metadata['Gender_binary'] = np.where(df_metadata['Gender'] == 'Male', 1, 0)

# Update abx_current column: missing entries become 0.
df_metadata['abx_current'] = df_metadata['abx_current'].fillna(0)

# Binary category for PPI: a missing 'Med' entry is relabelled 'None' and coded 0;
# any other entry is coded 1.
df_metadata['Med'] = df_metadata['Med'].fillna('None')
df_metadata['any_PPI'] = np.where(df_metadata['Med'] == 'None', 0, 1)

df_metadata

# ### Save the data as a pickle file

df_metadata.sort_index().to_pickle('pickle_files/metadata_comp.pkl')