added sanity checks for or_kin

This commit is contained in:
Tanushree Tunstall 2020-07-10 15:24:57 +01:00
parent 5677175423
commit f9500d5324

View file

@ -286,17 +286,36 @@ else:
, '\nGot:', len(dfm2_mis.columns))
sys.exit()
del(ncols_add)
#%% formatting data for output
print('no of cols pre-formatting data:', len(dfm2_mis.columns)
# Temporary variables from the previous step are no longer needed.
del ncols_add, orig_len
#%% Calculating OR from beta coeff
# Baseline column count; later sanity checks compare against this value.
df_ncols = len(dfm2_mis.columns)
print('No. of cols pre-formatting data:', df_ncols
      , '\n======================================')
#1) Add column: OR for kinship calculated from beta coeff
print('converting beta coeff to OR by exponent function\n:'
, dfm2_mis['beta'].head()
, '\n======================================')
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
print(dfm2_mis['or_kin'].head())
print('No. of cols after adidng OR kinship:', len(dfm2_mis.columns))
#1) Add column: OR for kinship calculated from beta coeff
# OR = exp(beta). The column is only created when it is not already present;
# ncols_add records how many columns this step actually added so the
# dimension check below knows what to expect.
ncols_add = 0
if 'or_kin' not in dfm2_mis.columns:
    dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
    print(dfm2_mis['or_kin'].head())
    ncols_add += 1

# NOTE(review): source indentation was lost; this report is assumed to run
# whether or not the column was added -- confirm against the script.
print('Calculating OR from beta coeff by exponent function and adding column:'
      , '\nNo. of cols added:', ncols_add
      , dfm2_mis['beta'].head()
      , '\n======================================')

# Sanity check: the column count must have grown by exactly ncols_add.
if dfm2_mis.shape[1] == df_ncols + ncols_add:
    print('PASS: Dimension of df match'
          , '\nDim of df:', dfm2_mis.shape
          , '\n====================================')  # was 'n===' (missing backslash)
else:
    print('FAIL: Dim mismatch'
          , '\nOriginal no. of cols:', df_ncols
          , '\nExpected no. of cols:', df_ncols + ncols_add
          , '\nGot:', dfm2_mis.shape[1])
    sys.exit()

print('No. of cols after adding OR kinship:', len(dfm2_mis.columns))
#2) rename af column
dfm2_mis.rename(columns = {'af': 'af_kin'
@ -314,31 +333,63 @@ dfm2_mis.rename(columns = {'af': 'af_kin'
#3a) drop duplicate columns
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
print('Total no of duplciate columns:', len(dup_cols)
print('Total no of duplicate columns:', len(dup_cols)
, '\nDuplicate columns identified:', dup_cols)
#dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff
del(dfm2_mis2)
del(dfm2_mis2, df_ncols, ncols_add)
#print('removing duplicate columns: kept one of the dup_cols i.e tot_diff')
# Drop the duplicate columns collected in dup_cols and verify that the
# number of columns shrank by exactly len(dup_cols).
print('Removing duplicate columns'
      , '\nOriginal dim:', dfm2_mis.shape)

# The column count must be recorded BEFORE the drop: the check below compares
# the post-drop width against (pre-drop width - number of dropped columns).
df_ncols = dfm2_mis.shape[1]

dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
print('\nRevised dim:', dfm2_mis.shape, 'after removing', len(dup_cols), 'columns')

if dfm2_mis.shape[1] == df_ncols - len(dup_cols):
    print('PASS: Dimensions match'
          , '\nDim:', dfm2_mis.shape
          , '\nRemoved', len(dup_cols), 'columns from' , df_ncols
          , '\n======================================')
else:
    print('FAIL: Dimensions mismatch'
          , '\nOriginal no. of cols:', df_ncols
          , '\nNo. of cols to drop:', len(dup_cols)
          , '\nExpected:', df_ncols - len(dup_cols)  # was the undefined name df_cols
          , '\nGot:', dfm2_mis.shape[1])
    sys.exit()

# df_ncols is only a scratch value for this check.
del(df_ncols)
#3b) other not useful columns
# Remove a fixed list of columns from dfm2_mis and confirm that the column
# count dropped by exactly the number of columns requested.
cols_to_drop = [
    'chromosome_text',
    'n_diff',
    'chr',
    'symbol',
    '_merge',
]

# Width before the drop, used as the reference for the check below.
df_ncols = len(dfm2_mis.columns)
dfm2_mis.drop(columns = cols_to_drop, inplace = True)
#dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)

# Guard clause: stop the script as soon as the width is not what we expect.
if dfm2_mis.shape[1] != df_ncols - len(cols_to_drop):
    print('FAIL: Dimensions mismatch'
          , '\nOriginal no. of cols:', df_ncols
          , '\nExpected:', df_ncols - len(cols_to_drop)
          , '\nGot:', dfm2_mis.shape[1])
    sys.exit()

print('PASS:', len(cols_to_drop), 'columns successfully dropped'
      , '\nDim:', dfm2_mis.shape
      , '\nRemoved', len(cols_to_drop), 'columns from', df_ncols
      , '\n===========================================')

del df_ncols

print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape)
#print(dfm2_mis.columns)
#4) reorder columns
#orkin_linked = dfm2_mis.copy()
#4) reorder columns
print('Reordering', dfm2_mis.shape[1], 'columns'
, '\n===============================================')
orkin_linked = dfm2_mis[['mutation',
column_order = ['mutation',
'mutationinformation',
'wild_type',
'position',
@ -375,8 +426,19 @@ orkin_linked = dfm2_mis[['mutation',
# 'tot_diff',
'n_miss',
'wt_3let',
'mt_3let']]
'mt_3let']
#orkin_linked = dfm2_mis.copy()
# Apply column_order to dfm2_mis only when it lists as many names as the df
# has columns; otherwise report the mismatch and stop the script.
if len(column_order) != dfm2_mis.shape[1]:
    print('FAIL: Mismatch in no. of cols to reorder'
          , '\nNo. of cols in df to reorder:', dfm2_mis.shape[1]
          , '\nOrder generated for:', len(column_order), 'columns'
          , '\n', dfm2_mis.shape[1], 'should match', len(column_order))
    sys.exit()

print('PASS: Column order generated for', len(column_order), 'columns'
      , '\nApplying column order to df...' )
orkin_linked = dfm2_mis[column_order]
# sanity check after reassigning columns
if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns):