added sanity checks for or_kin
This commit is contained in:
parent
5677175423
commit
f9500d5324
1 changed files with 80 additions and 18 deletions
|
@ -286,17 +286,36 @@ else:
|
||||||
, '\nGot:', len(dfm2_mis.columns))
|
, '\nGot:', len(dfm2_mis.columns))
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
del(ncols_add)
|
del(ncols_add, orig_len)
|
||||||
#%% formatting data for output
|
#%% Calculating OR from beta coeff
|
||||||
print('no of cols pre-formatting data:', len(dfm2_mis.columns)
|
df_ncols = dfm2_mis.shape[1]
|
||||||
|
print('No. of cols pre-formatting data:', df_ncols
|
||||||
, '\n======================================')
|
, '\n======================================')
|
||||||
#1) Add column: OR for kinship calculated from beta coeff
|
|
||||||
print('converting beta coeff to OR by exponent function\n:'
|
#1) Add column: OR for kinship calculated from beta coef
|
||||||
, dfm2_mis['beta'].head()
|
|
||||||
|
ncols_add = 0
|
||||||
|
if not 'or_kin' in dfm2_mis.columns:
|
||||||
|
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
|
||||||
|
print(dfm2_mis['or_kin'].head())
|
||||||
|
ncols_add+=1
|
||||||
|
print('Calculating OR from beta coeff by exponent function and adding column:'
|
||||||
|
, '\nNo. of cols added:', ncols_add
|
||||||
|
, dfm2_mis['beta'].head()
|
||||||
, '\n======================================')
|
, '\n======================================')
|
||||||
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
|
|
||||||
print(dfm2_mis['or_kin'].head())
|
if dfm2_mis.shape[1] == df_ncols + ncols_add:
|
||||||
print('No. of cols after adidng OR kinship:', len(dfm2_mis.columns))
|
print('PASS: Dimension of df match'
|
||||||
|
, '\nDim of df:', dfm2_mis.shape
|
||||||
|
, '\n====================================')
|
||||||
|
else:
|
||||||
|
print('FAIL: Dim mismatch'
|
||||||
|
, '\nOriginal no. of cols:', df_ncols
|
||||||
|
, '\nExpected no. of cols:', df_ncols + ncols_add
|
||||||
|
, '\nGot:', dfm2_mis.shape[1])
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
print('No. of cols after adding OR kinship:', len(dfm2_mis.columns))
|
||||||
|
|
||||||
#2) rename af column
|
#2) rename af column
|
||||||
dfm2_mis.rename(columns = {'af': 'af_kin'
|
dfm2_mis.rename(columns = {'af': 'af_kin'
|
||||||
|
@ -314,31 +333,63 @@ dfm2_mis.rename(columns = {'af': 'af_kin'
|
||||||
#3a) drop duplicate columns
|
#3a) drop duplicate columns
|
||||||
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used
|
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used
|
||||||
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
|
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
|
||||||
print('Total no of duplciate columns:', len(dup_cols)
|
print('Total no of duplicate columns:', len(dup_cols)
|
||||||
, '\nDuplicate columns identified:', dup_cols)
|
, '\nDuplicate columns identified:', dup_cols)
|
||||||
#dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff
|
#dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff
|
||||||
|
|
||||||
del(dfm2_mis2)
|
del(dfm2_mis2, df_ncols, ncols_add)
|
||||||
|
|
||||||
#print('removing duplicate columns: kept one of the dup_cols i.e tot_diff')
|
#print('removing duplicate columns: kept one of the dup_cols i.e tot_diff')
|
||||||
print('Removing duplicate columns'
|
print('Removing duplicate columns'
|
||||||
, '\nOriginal dim:', dfm2_mis.shape)
|
, '\nOriginal dim:', dfm2_mis.shape)
|
||||||
dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
|
dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
|
||||||
print('\nRevised dim:', dfm2_mis.shape, 'after removing', len(dup_cols), 'columns')
|
|
||||||
|
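df_ncols = dfm2_mis.shape[1]  # NOTE(review): taken after the in-place drop above, so the check below subtracts len(dup_cols) a second time; capture this before the drop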
|
||||||
|
if dfm2_mis.shape[1] == df_ncols - len(dup_cols):
|
||||||
|
print('PASS: Dimensions match'
|
||||||
|
, '\nDim:', dfm2_mis.shape
|
||||||
|
, '\nRemoved', len(dup_cols), 'columns from' , df_ncols
|
||||||
|
, '\n======================================')
|
||||||
|
else:
|
||||||
|
print('FAIL: Dimensions mismatch'
|
||||||
|
, '\nOriginal no. of cols:', df_ncols
|
||||||
|
, '\nNo. of cols to drop:', len(dup_cols)
|
||||||
|
, '\nExpected:', df_ncols - len(dup_cols)
|
||||||
|
, '\nGot:', dfm2_mis.shape[1])
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
del(df_ncols)
|
||||||
|
|
||||||
#3b) other not useful columns
|
#3b) other not useful columns
|
||||||
cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ]
|
cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ]
|
||||||
|
|
||||||
|
df_ncols = dfm2_mis.shape[1]
|
||||||
|
|
||||||
dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True)
|
dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True)
|
||||||
#dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)
|
#dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)
|
||||||
|
|
||||||
|
|
||||||
|
if dfm2_mis.shape[1] == df_ncols - len(cols_to_drop):
|
||||||
|
print('PASS:', len(cols_to_drop), 'columns successfully dropped'
|
||||||
|
, '\nDim:', dfm2_mis.shape
|
||||||
|
, '\nRemoved', len(cols_to_drop), 'columns from', df_ncols
|
||||||
|
, '\n===========================================')
|
||||||
|
else:
|
||||||
|
print('FAIL: Dimensions mismatch'
|
||||||
|
, '\nOriginal no. of cols:', df_ncols
|
||||||
|
, '\nExpected:', df_ncols - len(cols_to_drop)
|
||||||
|
, '\nGot:', dfm2_mis.shape[1])
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
|
del(df_ncols)
|
||||||
print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape)
|
print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape)
|
||||||
#print(dfm2_mis.columns)
|
#print(dfm2_mis.columns)
|
||||||
|
|
||||||
#4) reorder columns
|
#4) reorder columns
|
||||||
#orkin_linked = dfm2_mis.copy()
|
print('Reordering', dfm2_mis.shape[1], 'columns'
|
||||||
|
, '\n===============================================')
|
||||||
|
|
||||||
orkin_linked = dfm2_mis[['mutation',
|
column_order = ['mutation',
|
||||||
'mutationinformation',
|
'mutationinformation',
|
||||||
'wild_type',
|
'wild_type',
|
||||||
'position',
|
'position',
|
||||||
|
@ -375,8 +426,19 @@ orkin_linked = dfm2_mis[['mutation',
|
||||||
# 'tot_diff',
|
# 'tot_diff',
|
||||||
'n_miss',
|
'n_miss',
|
||||||
'wt_3let',
|
'wt_3let',
|
||||||
'mt_3let']]
|
'mt_3let']
|
||||||
|
#orkin_linked = dfm2_mis.copy()
|
||||||
|
|
||||||
|
if len(column_order) == dfm2_mis.shape[1]:
|
||||||
|
print('PASS: Column order generated for', len(column_order), 'columns'
|
||||||
|
, '\nApplying column order to df...' )
|
||||||
|
orkin_linked = dfm2_mis[column_order]
|
||||||
|
else:
|
||||||
|
print('FAIL: Mismatch in no. of cols to reorder'
|
||||||
|
, '\nNo. of cols in df to reorder:', dfm2_mis.shape[1]
|
||||||
|
, '\nOrder generated for:', len(column_order), 'columns'
|
||||||
|
, '\n', dfm2_mis.shape[1], 'should match', len(column_order))
|
||||||
|
sys.exit()
|
||||||
|
|
||||||
# sanity check after reassigning columns
|
# sanity check after reassigning columns
|
||||||
if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns):
|
if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue