added sanity checks for or_kinship calcs
This commit is contained in:
parent
f9500d5324
commit
57a966c7c4
2 changed files with 73 additions and 59 deletions
|
@ -33,7 +33,7 @@ def find_missense(test_df, ref_allele_column, alt_allele_column, n_diff_colname
|
|||
@alt_a_colname: user defined colname containing extracted alt allele
|
||||
@type: str
|
||||
|
||||
returns df: with 4 columns. If column names clash, the function column
|
||||
returns df: with 4 added columns. If column names clash, the function column
|
||||
name will override original column
|
||||
@rtype: pandas df
|
||||
"""
|
||||
|
|
|
@ -161,24 +161,29 @@ else:
|
|||
# Hence the merge needs to be performed on a unique set of attributes which in our case
|
||||
# would be chromosome_number, ref_allele and alt_allele
|
||||
|
||||
orig_len = len(or_df.columns)
|
||||
df_ncols = len(or_df.columns)
|
||||
print('Dim of df:',or_df.shape
|
||||
, '\nExtracting missense muts as single letters')
|
||||
|
||||
#find_missense(or_df, 'ref_allele1', 'alt_allele0')
|
||||
# adds columns to df passed
|
||||
find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')
|
||||
|
||||
ncols_add = 4
|
||||
if len(or_df.columns) == orig_len + ncols_add:
|
||||
print('Dim of revised df:', or_df.shape, ' after extraction of missense muts')
|
||||
|
||||
# FIXME: import this from function
|
||||
ncols_from_func = 4
|
||||
if or_df.shape[1] == df_ncols + ncols_from_func:
|
||||
print('PASS: Succesfuly extracted ref and alt alleles for missense muts')
|
||||
else:
|
||||
print('FAIL: No. of cols mismatch'
|
||||
,'\noriginal length:', orig_len
|
||||
, '\nExpected no. of cols:', orig_len + ncols_add
|
||||
, '\nGot no. of cols:', len(or_df.columns)
|
||||
,'\nOriginal length:', df_ncols
|
||||
, '\nExpected no. of cols:', df_ncols + ncols_from_func
|
||||
, '\nGot:', or_df.shape[1]
|
||||
, '\nCheck hardcoded value of ncols_add?')
|
||||
sys.exit()
|
||||
|
||||
del(orig_len, ncols_add)
|
||||
del(df_ncols, ncols_from_func)
|
||||
|
||||
#%% TRY MERGE
|
||||
# check dtypes
|
||||
|
@ -198,116 +203,129 @@ if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df
|
|||
print('PASS: all genomic locs in or_df have meta datain info.txt')
|
||||
else:
|
||||
sys.exit('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt')
|
||||
#%% Perform merge
|
||||
#%% perform merge
|
||||
|
||||
#my_join = 'inner'
|
||||
#my_join = 'outer'
|
||||
my_join = 'left'
|
||||
#my_join = 'right'
|
||||
|
||||
merging_cols = ['chromosome_number', 'ref_allele', 'alt_allele']
|
||||
|
||||
print('Merging 2 dfs: or_df and info_df using join type:', my_join
|
||||
, '\nColumns to merge on:', merging_cols
|
||||
, '\n=================================================')
|
||||
#dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = my_join, indicator = True) # not unique!
|
||||
dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
|
||||
dfm1 = pd.merge(or_df, info_df, on = merging_cols, how = my_join, indicator = True)
|
||||
dfm1['_merge'].value_counts()
|
||||
|
||||
# count no. of missense mutations ONLY
|
||||
dfm1.snp_info.str.count(r'(missense.*)').sum()
|
||||
print('Expected no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum())
|
||||
|
||||
dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
|
||||
# Merge with info_df2 has this has extra columns due to bash preformatting
|
||||
# These extra columns are just 'snp_info' column split on '|'
|
||||
print('Merging with info_df2 as it has,', len(set(info_df2.columns).difference(info_df.columns))
|
||||
, 'extra columns relevant for downstream analyses:\n\n'
|
||||
, set(info_df2.columns).difference(info_df.columns))
|
||||
|
||||
dfm2 = pd.merge(or_df, info_df2, on = merging_cols, how = my_join, indicator = True)
|
||||
dfm2['_merge'].value_counts()
|
||||
|
||||
print(or_df.columns, info_df2.columns)
|
||||
|
||||
# count no. of nan
|
||||
print('No. of NA in dfm2:', dfm2['mut_type'].isna().sum())
|
||||
|
||||
# drop nan
|
||||
#dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
|
||||
# drop nan from dfm2_mis
|
||||
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
|
||||
|
||||
#%% sanity check
|
||||
# count no. of missense muts
|
||||
if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum():
|
||||
|
||||
#if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum():
|
||||
if dfm2_mis.shape[0] == dfm1.snp_info.str.count(r'(missense.*)').sum():
|
||||
print('PASSED: numbers cross checked'
|
||||
, '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum()
|
||||
, '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum())
|
||||
else:
|
||||
print('FAIL: numbers mismatch'
|
||||
, '\Expected no. of rows:',dfm1.snp_info.str.count(r'(missense.*)').sum()
|
||||
, '\nGot:', dfm2_mis.shape[0]
|
||||
, '\nExpected no. of cols:', dfm1.shape[1] + len(set(info_df2.columns).difference(info_df.columns))-1)
|
||||
|
||||
# two ways to filter to get only missense muts
|
||||
test = dfm1[dfm1['snp_info'].str.count('missense.*')>0]
|
||||
dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True]
|
||||
test.equals(dfm1_mis)
|
||||
|
||||
# drop nan from dfm2
|
||||
dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
|
||||
|
||||
if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]):
|
||||
print('PASS: Further cross checks successful')
|
||||
else:
|
||||
sys.exit('FAIL: Second cross check unsuccessfull!')
|
||||
sys.exit('FAIL: Second cross check unsuccessful!')
|
||||
|
||||
del(test, dfm1_mis)
|
||||
|
||||
#%% extract mut info into three cols
|
||||
orig_len = len(dfm2_mis.columns)
|
||||
df_ncols = len(dfm2_mis.columns)
|
||||
print('Dim of df to add cols to:', dfm2_mis.shape)
|
||||
|
||||
# column names already present, wrap this in a if and perform sanity check
|
||||
ncols_add = 0
|
||||
if not 'wild_type' in dfm2_mis.columns:
|
||||
print('Extracting and adding column: wild_type'
|
||||
, '\n======================================')
|
||||
, '\n===============================================================')
|
||||
dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>')
|
||||
ncols_add+=1
|
||||
|
||||
if not 'position' in dfm2_mis.columns:
|
||||
print('Extracting and adding column: position'
|
||||
, '\n======================================')
|
||||
, '\n===============================================================')
|
||||
dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)')
|
||||
ncols_add+=1
|
||||
|
||||
if not 'mutant_type' in dfm2_mis.columns:
|
||||
print('Extracting and adding column: mutant_type'
|
||||
, '\n======================================')
|
||||
, '\n================================================================')
|
||||
dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})')
|
||||
ncols_add+=1
|
||||
|
||||
if not 'mutationinformation' in dfm2_mis.columns:
|
||||
print('combining to create column: mutationinformation'
|
||||
, '\n======================================')
|
||||
, '\n===============================================================')
|
||||
dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type']
|
||||
ncols_add+=1
|
||||
|
||||
print('No. of cols added:', ncols_add)
|
||||
|
||||
|
||||
if len(dfm2_mis.columns) == orig_len + ncols_add:
|
||||
if len(dfm2_mis.columns) == df_ncols + ncols_add:
|
||||
print('PASS: mcsm style muts present in df'
|
||||
, '\n======================================')
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: No. of cols mismatch'
|
||||
,'\nOriginal length:', orig_len
|
||||
, '\nExpected no. of cols:', orig_len + ncols_add
|
||||
,'\nOriginal length:', df_ncols
|
||||
, '\nExpected no. of cols:', df_ncols + ncols_add
|
||||
, '\nGot:', len(dfm2_mis.columns))
|
||||
sys.exit()
|
||||
|
||||
del(ncols_add, orig_len)
|
||||
del(df_ncols, ncols_add)
|
||||
#%% Calculating OR from beta coeff
|
||||
print('Calculating OR...')
|
||||
df_ncols = dfm2_mis.shape[1]
|
||||
print('No. of cols pre-formatting data:', df_ncols
|
||||
, '\n======================================')
|
||||
, '\n===================================================================')
|
||||
|
||||
#1) Add column: OR for kinship calculated from beta coef
|
||||
|
||||
ncols_add = 0
|
||||
if not 'or_kin' in dfm2_mis.columns:
|
||||
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
|
||||
print(dfm2_mis['or_kin'].head())
|
||||
ncols_add+=1
|
||||
print('Calculating OR from beta coeff by exponent function and adding column:'
|
||||
, '\nNo. of cols added:', ncols_add
|
||||
, '\nNo. of cols added:', ncols_add, '\n'
|
||||
, dfm2_mis['beta'].head()
|
||||
, '\n======================================')
|
||||
, '\n===================================================================')
|
||||
|
||||
if dfm2_mis.shape[1] == df_ncols + ncols_add:
|
||||
print('PASS: Dimension of df match'
|
||||
, '\nDim of df:', dfm2_mis.shape
|
||||
, 'n====================================')
|
||||
, 'n================================================================')
|
||||
else:
|
||||
print('FAIL: Dim mismatch'
|
||||
, '\nOriginal no. of cols:', df_ncols
|
||||
|
@ -315,46 +333,47 @@ else:
|
|||
, '\nGot:', dfm2_mis.shape[1])
|
||||
sys.exit()
|
||||
|
||||
print('No. of cols after adding OR kinship:', len(dfm2_mis.columns))
|
||||
print('No. of cols after adding OR_kin:', len(dfm2_mis.columns))
|
||||
|
||||
#2) rename af column
|
||||
#2) rename columns to reflect that it is coming from kinship matrix adjustment
|
||||
dfm2_mis.rename(columns = {'af': 'af_kin'
|
||||
, 'beta': 'beta_kin'
|
||||
, 'p_wald': 'pwald_kin'
|
||||
, 'se': 'se_kin', 'logl_H1': 'logl_H1_kin'
|
||||
, 'l_remle': 'l_remle_kin'
|
||||
, 'ref_allele1': 'reference_allele'}, inplace = True)
|
||||
|
||||
del(df_ncols, ncols_add)
|
||||
#%%==============================!!!!!!!=======================================
|
||||
# FIXME: should be at source
|
||||
# checking tot_diff column
|
||||
#print((dfm2_mis['tot_diff']==1).all()) and remove these cols
|
||||
|
||||
#%%==============================!!!!!!!=======================================
|
||||
#3) drop some not required cols (including duplicate if you want)
|
||||
#3a) drop duplicate columns
|
||||
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used
|
||||
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
|
||||
print('Total no of duplicate columns:', len(dup_cols)
|
||||
, '\nDuplicate columns identified:', dup_cols)
|
||||
, '\nDuplicate columns identified:', dup_cols
|
||||
, '\n===================================================================')
|
||||
#dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff
|
||||
|
||||
del(dfm2_mis2, df_ncols, ncols_add)
|
||||
|
||||
#print('removing duplicate columns: kept one of the dup_cols i.e tot_diff')
|
||||
df_ncols = dfm2_mis.shape[1]
|
||||
|
||||
print('Removing duplicate columns'
|
||||
, '\nOriginal dim:', dfm2_mis.shape)
|
||||
dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
|
||||
|
||||
df_ncols = dfm2_mis.shape[1]
|
||||
if dfm2_mis.shape[1] == df_ncols - len(dup_cols):
|
||||
print('PASS: Dimensions match'
|
||||
, '\nDim:', dfm2_mis.shape
|
||||
, '\nRemoved', len(dup_cols), 'columns from' , df_ncols
|
||||
, '\n======================================')
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: Dimensions mismatch'
|
||||
, '\nOriginal no. of cols:', df_ncols
|
||||
, '\nNo. of cols to drop:', len(dup_cols)
|
||||
, '\nExpected:', df_cols - len(dup_cols)
|
||||
, '\nExpected:', df_ncols - len(dup_cols)
|
||||
, '\nGot:', dfm2_mis.shape[1])
|
||||
sys.exit()
|
||||
|
||||
|
@ -362,17 +381,15 @@ del(df_ncols)
|
|||
|
||||
#3b) other not useful columns
|
||||
cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ]
|
||||
|
||||
df_ncols = dfm2_mis.shape[1]
|
||||
|
||||
dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True)
|
||||
#dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)
|
||||
|
||||
|
||||
if dfm2_mis.shape[1] == df_ncols - len(cols_to_drop):
|
||||
print('PASS:', len(cols_to_drop), 'columns successfully dropped'
|
||||
, '\nDim:', dfm2_mis.shape
|
||||
, '\nRemoved', len(cols_to_drop), 'columns from', df_ncols
|
||||
, '\nDim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape
|
||||
, '\n===========================================')
|
||||
else:
|
||||
print('FAIL: Dimensions mismatch'
|
||||
|
@ -382,9 +399,7 @@ else:
|
|||
sys.exit()
|
||||
|
||||
del(df_ncols)
|
||||
print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape)
|
||||
#print(dfm2_mis.columns)
|
||||
|
||||
#%%
|
||||
#4) reorder columnn
|
||||
print('Reordering', dfm2_mis.shape[1], 'columns'
|
||||
, '\n===============================================')
|
||||
|
@ -427,7 +442,6 @@ column_order = ['mutation',
|
|||
'n_miss',
|
||||
'wt_3let',
|
||||
'mt_3let']
|
||||
#orkin_linked = dfm2_mis.copy()
|
||||
|
||||
if len(column_order) == dfm2_mis.shape[1]:
|
||||
print('PASS: Column order generated for', len(column_order), 'columns'
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue