From f9500d5324f84084a55904b35c766d831d5164bd Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 10 Jul 2020 15:24:57 +0100 Subject: [PATCH] added sanity checks for or_kin --- scripts/or_kinship_link.py | 98 +++++++++++++++++++++++++++++++------- 1 file changed, 80 insertions(+), 18 deletions(-) diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py index de1d93b..82f4ed0 100755 --- a/scripts/or_kinship_link.py +++ b/scripts/or_kinship_link.py @@ -286,17 +286,36 @@ else: , '\nGot:', len(dfm2_mis.columns)) sys.exit() -del(ncols_add) -#%% formatting data for output -print('no of cols pre-formatting data:', len(dfm2_mis.columns) +del(ncols_add, orig_len) +#%% Calculating OR from beta coeff +df_ncols = dfm2_mis.shape[1] +print('No. of cols pre-formatting data:', df_ncols , '\n======================================') -#1) Add column: OR for kinship calculated from beta coeff -print('converting beta coeff to OR by exponent function\n:' - , dfm2_mis['beta'].head() - , '\n======================================') -dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta']) -print(dfm2_mis['or_kin'].head()) -print('No. of cols after adidng OR kinship:', len(dfm2_mis.columns)) + +#1) Add column: OR for kinship calculated from beta coef + +ncols_add = 0 +if not 'or_kin' in dfm2_mis.columns: + dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta']) + print(dfm2_mis['or_kin'].head()) + ncols_add+=1 + print('Calculating OR from beta coeff by exponent function and adding column:' + , '\nNo. of cols added:', ncols_add + , dfm2_mis['beta'].head() + , '\n======================================') + +if dfm2_mis.shape[1] == df_ncols + ncols_add: + print('PASS: Dimension of df match' + , '\nDim of df:', dfm2_mis.shape + , '\n====================================') +else: + print('FAIL: Dim mismatch' + , '\nOriginal no. of cols:', df_ncols + , '\nExpected no. of cols:', df_ncols + ncols_add + , '\nGot:', dfm2_mis.shape[1]) + sys.exit() + +print('No. 
of cols after adding OR kinship:', len(dfm2_mis.columns)) #2) rename af column dfm2_mis.rename(columns = {'af': 'af_kin' @@ -314,31 +333,63 @@ dfm2_mis.rename(columns = {'af': 'af_kin' #3a) drop duplicate columns dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns) -print('Total no of duplciate columns:', len(dup_cols) +print('Total no of duplicate columns:', len(dup_cols) , '\nDuplicate columns identified:', dup_cols) #dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff -del(dfm2_mis2) +del(dfm2_mis2, df_ncols, ncols_add) #print('removing duplicate columns: kept one of the dup_cols i.e tot_diff') print('Removing duplicate columns' , '\nOriginal dim:', dfm2_mis.shape) +df_ncols = dfm2_mis.shape[1] dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True) -print('\nRevised dim:', dfm2_mis.shape, 'after removing', len(dup_cols), 'columns') + +if dfm2_mis.shape[1] == df_ncols - len(dup_cols): + print('PASS: Dimensions match' + , '\nDim:', dfm2_mis.shape + , '\nRemoved', len(dup_cols), 'columns from' , df_ncols + , '\n======================================') +else: + print('FAIL: Dimensions mismatch' + , '\nOriginal no. of cols:', df_ncols + , '\nNo. 
of cols to drop:', len(dup_cols) + , '\nExpected:', df_ncols - len(dup_cols) + , '\nGot:', dfm2_mis.shape[1]) + sys.exit() + +del(df_ncols) #3b) other not useful columns cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ] +df_ncols = dfm2_mis.shape[1] + dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True) #dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True) - + + +if dfm2_mis.shape[1] == df_ncols - len(cols_to_drop): + print('PASS:', len(cols_to_drop), 'columns successfully dropped' + , '\nDim:', dfm2_mis.shape + , '\nRemoved', len(cols_to_drop), 'columns from', df_ncols + , '\n===========================================') +else: + print('FAIL: Dimensions mismatch' + , '\nOriginal no. of cols:', df_ncols + , '\nExpected:', df_ncols - len(cols_to_drop) + , '\nGot:', dfm2_mis.shape[1]) + sys.exit() + +del(df_ncols) print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape) #print(dfm2_mis.columns) -#4) reorder columns -#orkin_linked = dfm2_mis.copy() +#4) reorder columnn +print('Reordering', dfm2_mis.shape[1], 'columns' + , '\n===============================================') -orkin_linked = dfm2_mis[['mutation', +column_order = ['mutation', 'mutationinformation', 'wild_type', 'position', @@ -375,8 +426,19 @@ orkin_linked = dfm2_mis[['mutation', # 'tot_diff', 'n_miss', 'wt_3let', - 'mt_3let']] + 'mt_3let'] +#orkin_linked = dfm2_mis.copy() +if len(column_order) == dfm2_mis.shape[1]: + print('PASS: Column order generated for', len(column_order), 'columns' + , '\nApplying column order to df...' ) + orkin_linked = dfm2_mis[column_order] +else: + print('FAIL: Mismatch in no. of cols to reorder' + , '\nNo. 
of cols in df to reorder:', dfm2_mis.shape[1] + , '\nOrder generated for:', len(column_order), 'columns' + , '\n', dfm2_mis.shape[1], 'should match', len(column_order)) + sys.exit() # sanity check after reassigning columns if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns):