From 5655af42c07b15515b27b2e893447e0ccaa72c53 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 13 Jul 2020 11:37:43 +0100 Subject: [PATCH] added sanity checks for or_kinship calcs --- scripts/find_missense.py | 2 +- scripts/or_kinship_link.py | 130 ++++++++++++++++++++----------------- 2 files changed, 73 insertions(+), 59 deletions(-) diff --git a/scripts/find_missense.py b/scripts/find_missense.py index ae52065..71f2c5d 100755 --- a/scripts/find_missense.py +++ b/scripts/find_missense.py @@ -33,7 +33,7 @@ def find_missense(test_df, ref_allele_column, alt_allele_column, n_diff_colname @alt_a_colname: user defined colname containing extracted alt allele @type: str - returns df: with 4 columns. If column names clash, the function column + returns df: with 4 added columns. If column names clash, the function column name will override original column @rtype: pandas df """ diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py index 82f4ed0..2ddac12 100755 --- a/scripts/or_kinship_link.py +++ b/scripts/or_kinship_link.py @@ -161,24 +161,29 @@ else: # Hence the merge needs to be performed on a unique set of attributes which in our case # would be chromosome_number, ref_allele and alt_allele -orig_len = len(or_df.columns) +df_ncols = len(or_df.columns) +print('Dim of df:',or_df.shape + , '\nExtracting missense muts as single letters') #find_missense(or_df, 'ref_allele1', 'alt_allele0') # adds columns to df passed find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0') -ncols_add = 4 -if len(or_df.columns) == orig_len + ncols_add: +print('Dim of revised df:', or_df.shape, ' after extraction of missense muts') + +# FIXME: import this from function +ncols_from_func = 4 +if or_df.shape[1] == df_ncols + ncols_from_func: print('PASS: Succesfuly extracted ref and alt alleles for missense muts') else: print('FAIL: No. of cols mismatch' - ,'\noriginal length:', orig_len - , '\nExpected no. of cols:', orig_len + ncols_add - , '\nGot no. of cols:', len(or_df.columns) + ,'\nOriginal length:', df_ncols + , '\nExpected no. of cols:', df_ncols + ncols_from_func + , '\nGot:', or_df.shape[1] , '\nCheck hardcoded value of ncols_add?') sys.exit() -del(orig_len, ncols_add) +del(df_ncols, ncols_from_func) #%% TRY MERGE # check dtypes @@ -198,116 +203,129 @@ if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df print('PASS: all genomic locs in or_df have meta datain info.txt') else: sys.exit('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt') -#%% Perform merge +#%% perform merge #my_join = 'inner' #my_join = 'outer' my_join = 'left' #my_join = 'right' +merging_cols = ['chromosome_number', 'ref_allele', 'alt_allele'] + +print('Merging 2 dfs: or_df and info_df using join type:', my_join + , '\nColumns to merge on:', merging_cols + , '\n=================================================') #dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = my_join, indicator = True) # not unique! -dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True) +dfm1 = pd.merge(or_df, info_df, on = merging_cols, how = my_join, indicator = True) dfm1['_merge'].value_counts() - # count no. of missense mutations ONLY -dfm1.snp_info.str.count(r'(missense.*)').sum() +print('Expected no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum()) -dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True) +# Merge with info_df2 has this has extra columns due to bash preformatting +# These extra columns are just 'snp_info' column split on '|' +print('Merging with info_df2 as it has,', len(set(info_df2.columns).difference(info_df.columns)) + , 'extra columns relevant for downstream analyses:\n\n' + , set(info_df2.columns).difference(info_df.columns)) + +dfm2 = pd.merge(or_df, info_df2, on = merging_cols, how = my_join, indicator = True) dfm2['_merge'].value_counts() -print(or_df.columns, info_df2.columns) - # count no. of nan -print('No. of NA in dfm2:', dfm2['mut_type'].isna().sum()) +print('No. of NA in dfm2:', dfm2['mut_type'].isna().sum()) -# drop nan -#dfm2_mis = dfm2[dfm2['mut_type'].notnull()] +# drop nan from dfm2_mis +dfm2_mis = dfm2[dfm2['mut_type'].notnull()] #%% sanity check # count no. of missense muts -if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum(): + +#if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum(): +if dfm2_mis.shape[0] == dfm1.snp_info.str.count(r'(missense.*)').sum(): print('PASSED: numbers cross checked' , '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum() , '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum()) - -# two ways to filter to get only missense muts +else: + print('FAIL: numbers mismatch' + , '\Expected no. of rows:',dfm1.snp_info.str.count(r'(missense.*)').sum() + , '\nGot:', dfm2_mis.shape[0] + , '\nExpected no. of cols:', dfm1.shape[1] + len(set(info_df2.columns).difference(info_df.columns))-1) + +# two ways to filter to get only missense muts test = dfm1[dfm1['snp_info'].str.count('missense.*')>0] dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True] test.equals(dfm1_mis) -# drop nan from dfm2 -dfm2_mis = dfm2[dfm2['mut_type'].notnull()] - if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]): print('PASS: Further cross checks successful') else: - sys.exit('FAIL: Second cross check unsuccessfull!') + sys.exit('FAIL: Second cross check unsuccessful!') + +del(test, dfm1_mis) #%% extract mut info into three cols -orig_len = len(dfm2_mis.columns) +df_ncols = len(dfm2_mis.columns) print('Dim of df to add cols to:', dfm2_mis.shape) # column names already present, wrap this in a if and perform sanity check ncols_add = 0 if not 'wild_type' in dfm2_mis.columns: print('Extracting and adding column: wild_type' - , '\n======================================') + , '\n===============================================================') dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>') ncols_add+=1 if not 'position' in dfm2_mis.columns: print('Extracting and adding column: position' - , '\n======================================') + , '\n===============================================================') dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)') ncols_add+=1 if not 'mutant_type' in dfm2_mis.columns: print('Extracting and adding column: mutant_type' - , '\n======================================') + , '\n================================================================') dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})') ncols_add+=1 if not 'mutationinformation' in dfm2_mis.columns: print('combining to create column: mutationinformation' - , '\n======================================') + , '\n===============================================================') dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type'] ncols_add+=1 print('No. of cols added:', ncols_add) - -if len(dfm2_mis.columns) == orig_len + ncols_add: +if len(dfm2_mis.columns) == df_ncols + ncols_add: print('PASS: mcsm style muts present in df' - , '\n======================================') + , '\n===============================================================') else: print('FAIL: No. of cols mismatch' - ,'\nOriginal length:', orig_len - , '\nExpected no. of cols:', orig_len + ncols_add + ,'\nOriginal length:', df_ncols + , '\nExpected no. of cols:', df_ncols + ncols_add , '\nGot:', len(dfm2_mis.columns)) sys.exit() -del(ncols_add, orig_len) +del(df_ncols, ncols_add) #%% Calculating OR from beta coeff +print('Calculating OR...') df_ncols = dfm2_mis.shape[1] print('No. of cols pre-formatting data:', df_ncols - , '\n======================================') + , '\n===================================================================') #1) Add column: OR for kinship calculated from beta coef - ncols_add = 0 if not 'or_kin' in dfm2_mis.columns: dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta']) print(dfm2_mis['or_kin'].head()) ncols_add+=1 print('Calculating OR from beta coeff by exponent function and adding column:' - , '\nNo. of cols added:', ncols_add + , '\nNo. of cols added:', ncols_add, '\n' , dfm2_mis['beta'].head() - , '\n======================================') + , '\n===================================================================') if dfm2_mis.shape[1] == df_ncols + ncols_add: print('PASS: Dimension of df match' , '\nDim of df:', dfm2_mis.shape - , 'n====================================') + , 'n================================================================') else: print('FAIL: Dim mismatch' , '\nOriginal no. of cols:', df_ncols @@ -315,46 +333,47 @@ else: , '\nGot:', dfm2_mis.shape[1]) sys.exit() -print('No. of cols after adding OR kinship:', len(dfm2_mis.columns)) +print('No. of cols after adding OR_kin:', len(dfm2_mis.columns)) -#2) rename af column +#2) rename columns to reflect that it is coming from kinship matrix adjustment dfm2_mis.rename(columns = {'af': 'af_kin' , 'beta': 'beta_kin' , 'p_wald': 'pwald_kin' , 'se': 'se_kin', 'logl_H1': 'logl_H1_kin' , 'l_remle': 'l_remle_kin' , 'ref_allele1': 'reference_allele'}, inplace = True) - +del(df_ncols, ncols_add) +#%%==============================!!!!!!!======================================= # FIXME: should be at source # checking tot_diff column #print((dfm2_mis['tot_diff']==1).all()) and remove these cols - +#%%==============================!!!!!!!======================================= #3) drop some not required cols (including duplicate if you want) #3a) drop duplicate columns dfm2_mis2 = dfm2_mis.T.drop_duplicates().T #changes dtypes in cols, so not used dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns) print('Total no of duplicate columns:', len(dup_cols) - , '\nDuplicate columns identified:', dup_cols) + , '\nDuplicate columns identified:', dup_cols + , '\n===================================================================') #dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff -del(dfm2_mis2, df_ncols, ncols_add) - #print('removing duplicate columns: kept one of the dup_cols i.e tot_diff') +df_ncols = dfm2_mis.shape[1] + print('Removing duplicate columns' , '\nOriginal dim:', dfm2_mis.shape) dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True) -df_ncols = dfm2_mis.shape[1] -if dfm2_mis.shape[1] == df_ncols - len(dup_cols): +if dfm2_mis.shape[1] == df_ncols - len(dup_cols): print('PASS: Dimensions match' , '\nDim:', dfm2_mis.shape , '\nRemoved', len(dup_cols), 'columns from' , df_ncols - , '\n======================================') + , '\n===============================================================') else: print('FAIL: Dimensions mismatch' , '\nOriginal no. of cols:', df_ncols , '\nNo. of cols to drop:', len(dup_cols) - , '\nExpected:', df_cols - len(dup_cols) + , '\nExpected:', df_ncols - len(dup_cols) , '\nGot:', dfm2_mis.shape[1]) sys.exit() @@ -362,17 +381,15 @@ del(df_ncols) #3b) other not useful columns cols_to_drop = ['chromosome_text', 'n_diff', 'chr', 'symbol', '_merge' ] - df_ncols = dfm2_mis.shape[1] - dfm2_mis.drop(cols_to_drop, axis = 1, inplace = True) #dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True) - if dfm2_mis.shape[1] == df_ncols - len(cols_to_drop): print('PASS:', len(cols_to_drop), 'columns successfully dropped' , '\nDim:', dfm2_mis.shape , '\nRemoved', len(cols_to_drop), 'columns from', df_ncols + , '\nDim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape , '\n===========================================') else: print('FAIL: Dimensions mismatch' @@ -382,9 +399,7 @@ else: sys.exit() del(df_ncols) -print('Dim after dropping', len(cols_to_drop), 'columns:', dfm2_mis.shape) -#print(dfm2_mis.columns) - +#%% #4) reorder columnn print('Reordering', dfm2_mis.shape[1], 'columns' , '\n===============================================') @@ -427,7 +442,6 @@ column_order = ['mutation', 'n_miss', 'wt_3let', 'mt_3let'] -#orkin_linked = dfm2_mis.copy() if len(column_order) == dfm2_mis.shape[1]: print('PASS: Column order generated for', len(column_order), 'columns'