added sanity checks for or_kin

This commit is contained in:
Tanushree Tunstall 2020-07-10 15:24:57 +01:00
parent 5677175423
commit f9500d5324

View file

@ -286,17 +286,36 @@ else:
, '\nGot:', len(dfm2_mis.columns)) , '\nGot:', len(dfm2_mis.columns))
sys.exit() sys.exit()
del(ncols_add) del(ncols_add, orig_len)
#%% formatting data for output #%% Calculating OR from beta coeff
print('no of cols pre-formatting data:', len(dfm2_mis.columns) df_ncols = dfm2_mis.shape[1]
print('No. of cols pre-formatting data:', df_ncols
, '\n======================================') , '\n======================================')
#1) Add column: OR for kinship calculated from beta coeff
print('converting beta coeff to OR by exponent function\n:' #1) Add column: OR for kinship calculated from beta coef
, dfm2_mis['beta'].head()
, '\n======================================') ncols_add = 0
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta']) if not 'or_kin' in dfm2_mis.columns:
print(dfm2_mis['or_kin'].head()) dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
print('No. of cols after adidng OR kinship:', len(dfm2_mis.columns)) print(dfm2_mis['or_kin'].head())
ncols_add+=1
# Report how many columns the OR calculation added, followed by the first few
# beta coefficients that or_kin is derived from.
print('Calculating OR from beta coeff by exponent function and adding column:'
      , '\nNo. of cols added:', ncols_add
      , dfm2_mis['beta'].head()
      , '\n======================================')

# Sanity check: the frame must have grown by exactly ncols_add columns
# (0 when or_kin was already present, 1 when it was just created).
if dfm2_mis.shape[1] == df_ncols + ncols_add:
    # Separator starts with a real newline (it was a literal 'n' before).
    print('PASS: Dimension of df match'
          , '\nDim of df:', dfm2_mis.shape
          , '\n====================================')
else:
    print('FAIL: Dim mismatch'
          , '\nOriginal no. of cols:', df_ncols
          , '\nExpected no. of cols:', df_ncols + ncols_add
          , '\nGot:', dfm2_mis.shape[1])
    sys.exit()
print('No. of cols after adding OR kinship:', len(dfm2_mis.columns))
#2) rename af column #2) rename af column
dfm2_mis.rename(columns = {'af': 'af_kin' dfm2_mis.rename(columns = {'af': 'af_kin'
@ -314,31 +333,63 @@ dfm2_mis.rename(columns = {'af': 'af_kin'
#3a) drop duplicate columns #3a) drop duplicate columns
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns) dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
print('Total no of duplciate columns:', len(dup_cols) print('Total no of duplicate columns:', len(dup_cols)
, '\nDuplicate columns identified:', dup_cols) , '\nDuplicate columns identified:', dup_cols)
#dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff #dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff
del(dfm2_mis2) del(dfm2_mis2, df_ncols, ncols_add)
#print('removing duplicate columns: kept one of the dup_cols i.e tot_diff') #print('removing duplicate columns: kept one of the dup_cols i.e tot_diff')
print('Removing duplicate columns' print('Removing duplicate columns'
, '\nOriginal dim:', dfm2_mis.shape) , '\nOriginal dim:', dfm2_mis.shape)
dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True) dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
# Sanity check for step 3a. The drop has already been applied by this point,
# so the pre-drop column count can no longer be read off the frame; it is
# rebuilt from the number of dropped labels (assumes column labels are unique).
df_ncols = dfm2_mis.shape[1] + len(dup_cols)
print('\nRevised dim:', dfm2_mis.shape, 'after removing', len(dup_cols), 'columns')

# The drop worked only if none of the duplicate labels is still a column.
leftover_cols = dup_cols.intersection(dfm2_mis.columns)
if not leftover_cols:
    print('PASS: Dimensions match'
          , '\nDim:', dfm2_mis.shape
          , '\nRemoved', len(dup_cols), 'columns from', df_ncols
          , '\n======================================')
else:
    print('FAIL: Dimensions mismatch'
          , '\nNo. of cols to drop:', len(dup_cols)
          , '\nCols still present after drop:', leftover_cols
          , '\nGot:', dfm2_mis.shape[1])
    sys.exit()
del(df_ncols, leftover_cols)
#3b) other not useful columns #3b) other not useful columns
cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ] cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ]
df_ncols = dfm2_mis.shape[1]
dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True) dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True)
#dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True) #dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)
# Verify step 3b: the frame should now be narrower by len(cols_to_drop).
if dfm2_mis.shape[1] != df_ncols - len(cols_to_drop):
    # Wrong column count: report the numbers and stop the script.
    print(
        'FAIL: Dimensions mismatch',
        '\nOriginal no. of cols:', df_ncols,
        '\nExpected:', df_ncols - len(cols_to_drop),
        '\nGot:', dfm2_mis.shape[1],
    )
    sys.exit()
else:
    print(
        'PASS:', len(cols_to_drop), 'columns successfully dropped',
        '\nDim:', dfm2_mis.shape,
        '\nRemoved', len(cols_to_drop), 'columns from', df_ncols,
        '\n===========================================',
    )
# The pre-drop count is only needed for this check.
del(df_ncols)
print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape) print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape)
#print(dfm2_mis.columns) #print(dfm2_mis.columns)
#4) reorder columns #4) reorder columns
#orkin_linked = dfm2_mis.copy() print('Reordering', dfm2_mis.shape[1], 'columns'
, '\n===============================================')
orkin_linked = dfm2_mis[['mutation', column_order = ['mutation',
'mutationinformation', 'mutationinformation',
'wild_type', 'wild_type',
'position', 'position',
@ -375,8 +426,19 @@ orkin_linked = dfm2_mis[['mutation',
# 'tot_diff', # 'tot_diff',
'n_miss', 'n_miss',
'wt_3let', 'wt_3let',
'mt_3let']] 'mt_3let']
#orkin_linked = dfm2_mis.copy()
# Reorder only when column_order names exactly the columns of dfm2_mis.
# Comparing counts alone would let a misspelt or missing name through to a
# KeyError at the indexing step without saying which column is at fault.
missing_cols = [col for col in column_order if col not in dfm2_mis.columns]
unlisted_cols = [col for col in dfm2_mis.columns if col not in column_order]
if (len(column_order) == dfm2_mis.shape[1]
        and not missing_cols and not unlisted_cols):
    print('PASS: Column order generated for', len(column_order), 'columns'
          , '\nApplying column order to df...' )
    orkin_linked = dfm2_mis[column_order]
else:
    print('FAIL: Mismatch in cols to reorder'
          , '\nNo. of cols in df to reorder:', dfm2_mis.shape[1]
          , '\nOrder generated for:', len(column_order), 'columns'
          , '\n', dfm2_mis.shape[1], 'should match', len(column_order)
          , '\nIn column order but not in df:', missing_cols
          , '\nIn df but not in column order:', unlisted_cols)
    sys.exit()
# Helper lists are only needed for this check.
del(missing_cols, unlisted_cols)
# sanity check after reassigning columns # sanity check after reassigning columns
if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns): if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns):