added sanity checks for or_kinship calcs

This commit is contained in:
Tanushree Tunstall 2020-07-13 11:37:43 +01:00
parent 167b051ae7
commit 5655af42c0
2 changed files with 73 additions and 59 deletions

View file

@ -33,7 +33,7 @@ def find_missense(test_df, ref_allele_column, alt_allele_column, n_diff_colname
@alt_a_colname: user defined colname containing extracted alt allele
@type: str
returns df: with 4 columns. If column names clash, the function column
returns df: with 4 added columns. If column names clash, the function column
name will override original column
@rtype: pandas df
"""

View file

@ -161,24 +161,29 @@ else:
# Hence the merge needs to be performed on a unique set of attributes which in our case
# would be chromosome_number, ref_allele and alt_allele
orig_len = len(or_df.columns)
df_ncols = len(or_df.columns)
print('Dim of df:',or_df.shape
, '\nExtracting missense muts as single letters')
#find_missense(or_df, 'ref_allele1', 'alt_allele0')
# adds columns to df passed
find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')
ncols_add = 4
if len(or_df.columns) == orig_len + ncols_add:
print('Dim of revised df:', or_df.shape, ' after extraction of missense muts')
# FIXME: import this from function
ncols_from_func = 4
if or_df.shape[1] == df_ncols + ncols_from_func:
print('PASS: Succesfuly extracted ref and alt alleles for missense muts')
else:
print('FAIL: No. of cols mismatch'
,'\noriginal length:', orig_len
, '\nExpected no. of cols:', orig_len + ncols_add
, '\nGot no. of cols:', len(or_df.columns)
,'\nOriginal length:', df_ncols
, '\nExpected no. of cols:', df_ncols + ncols_from_func
, '\nGot:', or_df.shape[1]
, '\nCheck hardcoded value of ncols_add?')
sys.exit()
del(orig_len, ncols_add)
del(df_ncols, ncols_from_func)
#%% TRY MERGE
# check dtypes
@ -198,116 +203,129 @@ if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df
print('PASS: all genomic locs in or_df have meta datain info.txt')
else:
sys.exit('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt')
#%% Perform merge
#%% perform merge
#my_join = 'inner'
#my_join = 'outer'
my_join = 'left'
#my_join = 'right'
merging_cols = ['chromosome_number', 'ref_allele', 'alt_allele']
print('Merging 2 dfs: or_df and info_df using join type:', my_join
, '\nColumns to merge on:', merging_cols
, '\n=================================================')
#dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = my_join, indicator = True) # not unique!
dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
dfm1 = pd.merge(or_df, info_df, on = merging_cols, how = my_join, indicator = True)
dfm1['_merge'].value_counts()
# count no. of missense mutations ONLY
dfm1.snp_info.str.count(r'(missense.*)').sum()
print('Expected no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum())
dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
# Merge with info_df2 has this has extra columns due to bash preformatting
# These extra columns are just 'snp_info' column split on '|'
print('Merging with info_df2 as it has,', len(set(info_df2.columns).difference(info_df.columns))
, 'extra columns relevant for downstream analyses:\n\n'
, set(info_df2.columns).difference(info_df.columns))
dfm2 = pd.merge(or_df, info_df2, on = merging_cols, how = my_join, indicator = True)
dfm2['_merge'].value_counts()
print(or_df.columns, info_df2.columns)
# count no. of nan
print('No. of NA in dfm2:', dfm2['mut_type'].isna().sum())
# drop nan
#dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
# drop nan from dfm2_mis
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
#%% sanity check
# count no. of missense muts
if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum():
#if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum():
if dfm2_mis.shape[0] == dfm1.snp_info.str.count(r'(missense.*)').sum():
print('PASSED: numbers cross checked'
, '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum()
, '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum())
else:
print('FAIL: numbers mismatch'
, '\Expected no. of rows:',dfm1.snp_info.str.count(r'(missense.*)').sum()
, '\nGot:', dfm2_mis.shape[0]
, '\nExpected no. of cols:', dfm1.shape[1] + len(set(info_df2.columns).difference(info_df.columns))-1)
# two ways to filter to get only missense muts
# two ways to filter to get only missense muts
test = dfm1[dfm1['snp_info'].str.count('missense.*')>0]
dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True]
test.equals(dfm1_mis)
# drop nan from dfm2
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]):
print('PASS: Further cross checks successful')
else:
sys.exit('FAIL: Second cross check unsuccessfull!')
sys.exit('FAIL: Second cross check unsuccessful!')
del(test, dfm1_mis)
#%% extract mut info into three cols
orig_len = len(dfm2_mis.columns)
df_ncols = len(dfm2_mis.columns)
print('Dim of df to add cols to:', dfm2_mis.shape)
# column names already present, wrap this in a if and perform sanity check
ncols_add = 0
if not 'wild_type' in dfm2_mis.columns:
print('Extracting and adding column: wild_type'
, '\n======================================')
, '\n===============================================================')
dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>')
ncols_add+=1
if not 'position' in dfm2_mis.columns:
print('Extracting and adding column: position'
, '\n======================================')
, '\n===============================================================')
dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)')
ncols_add+=1
if not 'mutant_type' in dfm2_mis.columns:
print('Extracting and adding column: mutant_type'
, '\n======================================')
, '\n================================================================')
dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})')
ncols_add+=1
if not 'mutationinformation' in dfm2_mis.columns:
print('combining to create column: mutationinformation'
, '\n======================================')
, '\n===============================================================')
dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type']
ncols_add+=1
print('No. of cols added:', ncols_add)
if len(dfm2_mis.columns) == orig_len + ncols_add:
if len(dfm2_mis.columns) == df_ncols + ncols_add:
print('PASS: mcsm style muts present in df'
, '\n======================================')
, '\n===============================================================')
else:
print('FAIL: No. of cols mismatch'
,'\nOriginal length:', orig_len
, '\nExpected no. of cols:', orig_len + ncols_add
,'\nOriginal length:', df_ncols
, '\nExpected no. of cols:', df_ncols + ncols_add
, '\nGot:', len(dfm2_mis.columns))
sys.exit()
del(ncols_add, orig_len)
del(df_ncols, ncols_add)
#%% Calculating OR from beta coeff
print('Calculating OR...')
df_ncols = dfm2_mis.shape[1]
print('No. of cols pre-formatting data:', df_ncols
, '\n======================================')
, '\n===================================================================')
#1) Add column: OR for kinship calculated from beta coef
ncols_add = 0
if not 'or_kin' in dfm2_mis.columns:
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
print(dfm2_mis['or_kin'].head())
ncols_add+=1
print('Calculating OR from beta coeff by exponent function and adding column:'
, '\nNo. of cols added:', ncols_add
, '\nNo. of cols added:', ncols_add, '\n'
, dfm2_mis['beta'].head()
, '\n======================================')
, '\n===================================================================')
if dfm2_mis.shape[1] == df_ncols + ncols_add:
print('PASS: Dimension of df match'
, '\nDim of df:', dfm2_mis.shape
, 'n====================================')
, 'n================================================================')
else:
print('FAIL: Dim mismatch'
, '\nOriginal no. of cols:', df_ncols
@ -315,46 +333,47 @@ else:
, '\nGot:', dfm2_mis.shape[1])
sys.exit()
print('No. of cols after adding OR kinship:', len(dfm2_mis.columns))
print('No. of cols after adding OR_kin:', len(dfm2_mis.columns))
#2) rename af column
#2) rename columns to reflect that it is coming from kinship matrix adjustment
dfm2_mis.rename(columns = {'af': 'af_kin'
, 'beta': 'beta_kin'
, 'p_wald': 'pwald_kin'
, 'se': 'se_kin', 'logl_H1': 'logl_H1_kin'
, 'l_remle': 'l_remle_kin'
, 'ref_allele1': 'reference_allele'}, inplace = True)
del(df_ncols, ncols_add)
#%%==============================!!!!!!!=======================================
# FIXME: should be at source
# checking tot_diff column
#print((dfm2_mis['tot_diff']==1).all()) and remove these cols
#%%==============================!!!!!!!=======================================
#3) drop some not required cols (including duplicate if you want)
#3a) drop duplicate columns
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
print('Total no of duplicate columns:', len(dup_cols)
, '\nDuplicate columns identified:', dup_cols)
, '\nDuplicate columns identified:', dup_cols
, '\n===================================================================')
#dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff
del(dfm2_mis2, df_ncols, ncols_add)
#print('removing duplicate columns: kept one of the dup_cols i.e tot_diff')
df_ncols = dfm2_mis.shape[1]
print('Removing duplicate columns'
, '\nOriginal dim:', dfm2_mis.shape)
dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
df_ncols = dfm2_mis.shape[1]
if dfm2_mis.shape[1] == df_ncols - len(dup_cols):
print('PASS: Dimensions match'
, '\nDim:', dfm2_mis.shape
, '\nRemoved', len(dup_cols), 'columns from' , df_ncols
, '\n======================================')
, '\n===============================================================')
else:
print('FAIL: Dimensions mismatch'
, '\nOriginal no. of cols:', df_ncols
, '\nNo. of cols to drop:', len(dup_cols)
, '\nExpected:', df_cols - len(dup_cols)
, '\nExpected:', df_ncols - len(dup_cols)
, '\nGot:', dfm2_mis.shape[1])
sys.exit()
@ -362,17 +381,15 @@ del(df_ncols)
#3b) other not useful columns
cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ]
df_ncols = dfm2_mis.shape[1]
dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True)
#dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)
if dfm2_mis.shape[1] == df_ncols - len(cols_to_drop):
print('PASS:', len(cols_to_drop), 'columns successfully dropped'
, '\nDim:', dfm2_mis.shape
, '\nRemoved', len(cols_to_drop), 'columns from', df_ncols
, '\nDim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape
, '\n===========================================')
else:
print('FAIL: Dimensions mismatch'
@ -382,9 +399,7 @@ else:
sys.exit()
del(df_ncols)
print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape)
#print(dfm2_mis.columns)
#%%
#4) reorder columnn
print('Reordering', dfm2_mis.shape[1], 'columns'
, '\n===============================================')
@ -427,7 +442,6 @@ column_order = ['mutation',
'n_miss',
'wt_3let',
'mt_3let']
#orkin_linked = dfm2_mis.copy()
if len(column_order) == dfm2_mis.shape[1]:
print('PASS: Column order generated for', len(column_order), 'columns'