diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index b33d36f..fce24e8 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -8,7 +8,7 @@ Created on Tue Aug 6 12:56:03 2019
 #=======================================================================
 # Task: combining all dfs to a single one
-# Input: 8 dfs
+# Input: 12/13/14 dfs
 #1) .lower()'_complex_mcsm_norm.csv'
 #2) .lower()_foldx.csv'
 #3) .lower()_dssp.csv'
@@ -16,20 +16,16 @@ Created on Tue Aug 6 12:56:03 2019
 #5) .lower()_rd.csv'
 #6) 'ns' + .lower()_snp_info.csv'
 #7) .lower()_af_or.csv'
-#8) .lower() _af_or_kinship.csv
+#8) .lower() _af_or_kinship.csv (ONLY for pncA, but omitted for the final run)
+#9) .lower()'_dynamut2.csv'
+#10) .lower()'_dynamut.csv'
+#11) .lower()'_mcsm_na.csv'
+#12) .lower()'_mcsm_ppi2.csv'
+#13) .lower()'_consurf.csv'
+#14) .lower()'_snap2.csv'

 # combining order
-#Merge1 = 1 + 2
-#Merge2 = 3 + 4
-#Merge3 = Merge2 + 5
-
-#Merge4 = Merge1 + Merge3
-
-#Merge5 = 6 + 7
-#Merge6 = Merge5 + 8
-
-#Merge7 = Merge4 + Merge6

 # Output: single csv of all 8 dfs combined

 # useful link
@@ -53,10 +49,10 @@ homedir = os.path.expanduser('~')

 # set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()

-# FIXME: local imports
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
 from reference_dict import oneletter_aa_dict
@@ -119,6 +115,7 @@ gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
 if gene.lower() == "gid":
     print("\nReading mCSM file for gene:", gene)
     in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
+
 if gene.lower() == "embb":
     print("\nReading mCSM file for gene:", gene)
     #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
@@ -183,15 +180,12 @@ infile_snap2 = outdir + 'snap2/'+ infilename_snap2
 snap2_df = pd.read_csv(infile_snap2, sep = ',')

 #------------------------------------------------------------------------------
-# ONLY:for gene pnca and gid: End logic should pick this up!
+# ONLY: for genes 'gid' and 'rpob': End logic should pick this up!
 geneL_na = ['gid', 'rpob']
 if gene.lower() in geneL_na:
     print("\nGene:", gene.lower()
           , "\nReading mCSM_na files")
-    # infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
-    # infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
-    # dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
-
+
     infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
     infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
     mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
@@ -199,18 +193,13 @@ if gene.lower() in geneL_na:
 geneL_dy = ['gid']
 if gene.lower() in geneL_dy:
     print("\nGene:", gene.lower()
-          , "\nReading Dynamut and mCSM_na files")
+          , "\nReading Dynamut files")
     infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
     infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
     dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
-
-    # infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
-    # infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
-    # mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')

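The gene-conditional blocks above and below all follow the same pattern: a small list of genes gates which optional result files get read, and the "End logic should pick this up" notes promise that the final dfs_list honours the same membership. A minimal sketch (not part of this patch) of how that mapping could live in one place; OPTIONAL_RESULTS, read_optional_dfs and the file-name pattern are hypothetical, for illustration only:

import pandas as pd

# Hypothetical gene -> optional-results mapping (illustration only)
OPTIONAL_RESULTS = {'gid' : ['dynamut', 'mcsm_na'],
                    'rpob': ['mcsm_na', 'mcsm_ppi2'],
                    'alr' : ['mcsm_ppi2'],
                    'embb': ['mcsm_ppi2'],
                    'katg': ['mcsm_ppi2']}

def read_optional_dfs(gene, outdir):
    """Read only the optional per-gene result csvs (assumed naming scheme)."""
    dfs = {}
    for tag in OPTIONAL_RESULTS.get(gene.lower(), []):
        infile = outdir + tag + '_results/' + gene.lower() + '_' + tag + '_norm.csv'
        dfs[tag] = pd.read_csv(infile, sep = ',')
    return dfs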
-# ONLY:for gene embb and alr and katg: End logic should pick this up!
-geneL_ppi2 = ['embb', 'alr']
-#if gene.lower() == "embb" or "alr":
+# ONLY: for genes 'alr', 'embb', 'katg' and 'rpob': End logic should pick this up!
+geneL_ppi2 = ['alr', 'embb', 'katg', 'rpob']
 if gene.lower() in geneL_ppi2:
     infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
     infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
@@ -224,10 +213,17 @@ else:
 #=======
 # output
 #=======
+# outfile 3
 out_filename_comb = gene.lower() + '_all_params.csv'
 outfile_comb = outdir + out_filename_comb
-print('\nOutput filename:', outfile_comb
-      , '\n===================================================================')
+
+# outfile 2
+out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
+outfile_comb_afor = outdir + out_filename_comb_afor
+
+# outfile 1
+out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
+outfile_stab_struc = outdir + out_filename_stab_struc

 # end of variable assignment for input and output files
 #%%############################################################################
@@ -235,6 +231,22 @@ print('\nOutput filename:', outfile_comb
 # some preprocessing
 #=====================

+#===========
+# KD
+#===========
+kd_df.shape
+
+# geneL_kd = ['alr']
+# if gene.lower() in geneL_kd:
+#     print('\nRunning gene:', gene.lower()
+#           ,'\nChecking start numbering')
+
+if kd_df['wild_type_kd'].str.contains('X').any():
+    print('\nDetected X in wild_type_kd'
+          , '\nRunning gene:', gene.lower()
+          , '\nChecking start numbering')
+    kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
+
 #===========
 # FoldX
 #===========
@@ -305,7 +317,6 @@ else:

 #=======================
 # Deepddg
-# TODO: RERUN 'gid'
 #=======================
 deepddg_df.shape

@@ -324,7 +335,8 @@ print('\nSelecting chain:', sel_chain, 'for gene:', gene)
 deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]

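Both preprocessing steps in this hunk — dropping KD rows whose wild_type_kd carries the 'X' placeholder, and keeping only the selected chain in the deepddg df — are plain boolean-mask filters. A self-contained toy sketch of the idiom (toy frames, illustration only, not part of the patch):

import pandas as pd

kd_toy = pd.DataFrame({'wild_type_kd': ['X', 'M', 'T'], 'kd_values': [0.0, 1.2, 0.7]})

# drop rows flagged with the 'X' placeholder, as in the KD block above
if kd_toy['wild_type_kd'].str.contains('X').any():
    kd_toy = kd_toy[~kd_toy['wild_type_kd'].str.contains('X')]

deepddg_toy = pd.DataFrame({'chain_id': ['A', 'B'], 'deepddg': [-0.3, 0.5]})

# keep only the chain of interest, as in the deepddg block above
deepddg_toy = deepddg_toy[deepddg_toy['chain_id'] == 'A']

print(len(kd_toy), len(deepddg_toy))   # 2 1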
 #--------------------------
-# Drop chain id col as other targets don't have it.Check for duplicates
+# Drop chain_id col as other
+# targets don't have it.
 #--------------------------
 col_to_drop = ['chain_id']
 deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
@@ -374,14 +386,40 @@ else:
           , '\nGot:', deepddg_pos2
           , '\n======================================================')

+if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
+    print('\nPASS: Deepddg data is scaled between -1 and 1',
+          '\nproceeding with merge')
+
 #--------------------------
 # Deepddg outcome category
 #--------------------------
-deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
-deepddg_df[deepddg_df['deepddg']>=0].count()
-doc = deepddg_df['deepddg_outcome'].value_counts()
+if 'deepddg_outcome' not in deepddg_df.columns:
+    print('\nCreating column: deepddg_outcome')
+    deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+    deepddg_df[deepddg_df['deepddg']>=0].count()
+    doc = deepddg_df['deepddg_outcome'].value_counts()
+    print(doc)
+else:
+    print('\nColumn exists: deepddg_outcome')
+    t1 = deepddg_df['deepddg_outcome'].value_counts()
+    deepddg_df['deepddg_outcome2'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+    t2 = deepddg_df['deepddg_outcome2'].value_counts()
+    print('\n', t1, '\n', t2)
+    #--------------------------
+    # Drop deepddg_outcome2 col
+    #--------------------------
+    col_to_drop2 = ['deepddg_outcome2']
+    deepddg_df = deepddg_df.drop(col_to_drop2, axis = 1)

-if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
+    if all(t1 == t2):
+        print('\nPASS: Deepddg_outcome category checked!')
+        doc = deepddg_df['deepddg_outcome'].value_counts()
+    else:
+        print('\nMISmatch in deepddg_outcome counts'
+              , '\n:', t1
+              , '\n:', t2)
+
+if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
     print('\nPASS: Deepddg outcome category created')
 else:
     print('\nFAIL: Deepddg outcome category could NOT be created'
@@ -389,19 +427,12 @@ else:
           , '\nGot:', doc[0]
           , '\n======================================================')
     sys.exit()
-
-if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
-    print('\nPASS: Deepddg data is scaled between -1 and 1',
-          '\nproceeding with merge')
-
+
 #=======================
 # Consurf
 #=======================
 consurf_df.shape

-# drop row 0: as it contains no value but hangover text
-consurf_df = consurf_df.drop(index=0)
-
 #----------------------
 # rename colums
 #----------------------
@@ -418,9 +449,9 @@ if gene.lower() in geneL_consurf:
     print('\nAdding offset value for gene:', gene.lower())

     if gene.lower() == 'alr':
-        offset_val = 34
-
+        offset_val = 34
         print('\nUsing offset val:', offset_val)
+
     if gene.lower() == 'katg':
         offset_val = 23
         print('\nUsing offset val:', offset_val)
@@ -443,7 +474,7 @@ consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type'
                                         , 'MSADATA' : 'consurf_msa_data'
                                         , 'RESIDUEVARIETY' : 'consurf_aa_variety'})
 # quick check
-if len(consurf_df) == len(rd_df):
+if len(consurf_df) == len(kd_df):
     print('\nPASS: length of consurf df is as expected'
           , '\nProceeding to format consurf df')
 else:
@@ -458,6 +489,7 @@ consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\
 consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)

 consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
+# non-struc positions are assigned a '*'; replace it with 0 so the column is all integers
 consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)

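The consurf columns parsed above are packed strings, so each derived column is a str.extract or str.replace with a small regex. A toy sketch of the same parsing (toy values, illustration only; regex = True is spelled out because newer pandas requires it for pattern replaces):

import pandas as pd

toy = pd.DataFrame({'consurf_ci': ['-0.5:0.8', '0.1:0.9'],
                    'consurf_colour_str': ['7', '3*']})

# split the ci string on ':' into numeric upper/lower bounds
toy['consurf_ci_upper'] = toy['consurf_ci'].str.extract(r'(.*):', expand = False).astype(float)
toy['consurf_ci_lower'] = toy['consurf_ci'].str.extract(r':(.*)', expand = False).astype(float)

# '*' marks non-structural positions; map such entries to 0 so the column is int
toy['consurf_colour_rev'] = (toy['consurf_colour_str']
                             .str.replace(r'.\*', '0', regex = True)
                             .astype(int))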
 consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
@@ -468,10 +500,10 @@ consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)

 #consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
 #consurf_df['wt_3upper_f']

-consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
-
 consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')

+consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
+
 #-------------------------
 # scale consurf values
 #-------------------------
@@ -517,21 +549,35 @@ consurf_df.columns

 #---------------------------
 # select columns
 # (and also determine order)
+# this removes redundant cols:
+    # consurf_colour_str
+    # consurf_ci
 #---------------------------
-consurf_df_f = consurf_df[['position'
-                           , 'wild_type'
-                           , 'chain'
-                           , 'wt_3upper'
-                           , 'consurf_score'
-                           , 'consurf_scaled'
-                           , 'consurf_colour'
-                           , 'consurf_colour_rev'
-                           , 'consurf_ci_upper'
-                           , 'consurf_ci_lower'
-                           , 'consurf_ci_colour'
-                           , 'consurf_msa_data'
-                           , 'consurf_aa_variety']]
+consurf_col_order = ['position'
+                     , 'wild_type'
+                     , 'chain'
+                     , 'wt_3upper'
+                     , 'consurf_score'
+                     , 'consurf_scaled'
+                     , 'consurf_colour'
+                     , 'consurf_colour_rev'
+                     , 'consurf_ci_upper'
+                     , 'consurf_ci_lower'
+                     , 'consurf_ci_colour'
+                     , 'consurf_msa_data'
+                     , 'consurf_aa_variety']
+
+consurf_df_f = consurf_df[consurf_col_order]
+# CHECK: whether a general rule or a gene-specific rule!
+
+if consurf_df_f['chain'].isna().sum() > 0:
+    print('\nNaN detected in column chain for consurf df')
+#if gene.lower() == 'embb':
+    print('\nFurther consurf df processing for gene:', gene.lower())
+    print('\nDropping NaN from column chain')
+    consurf_df_f = consurf_df_f[consurf_df_f['chain'].notna()]
+
 #=======================
 # SNAP2
 #=======================
@@ -610,10 +656,12 @@ else:
           , '\nGot:', snap2_pos2
           , '\n======================================================')

-#---------------------------
+#-------------------------------------
 # select columns
 # (and also determine order)
-#---------------------------
+# renumbering already done using
+# bash and corrected file is read in
+#-------------------------------------
 snap2_df.dtypes
 snap2_df.columns
@@ -718,7 +766,7 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
 else:
     print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')

-#%%
+#%%============================================================================
 print('==================================='
       , '\nSecond merge: mcsm_foldx_dfs + deepddg'
       , '\n===================================')
@@ -735,7 +783,7 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)

 mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')

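Every merge in this script follows the same two-step pattern: detect the columns the two frames share (detect_common_cols, imported from combining_FIXME earlier in the diff), then pd.merge on exactly those columns so the dimensions line up. A sketch with an assumed stand-in for detect_common_cols — the repo's real implementation may differ:

import pandas as pd

def detect_common_cols_sketch(df1, df2):
    # assumed equivalent of the repo's detect_common_cols (illustration only)
    return list(df1.columns.intersection(df2.columns))

left  = pd.DataFrame({'mutationinformation': ['M1T'], 'position': [1], 'foldx': [0.2]})
right = pd.DataFrame({'mutationinformation': ['M1T'], 'position': [1], 'deepddg': [-0.4]})

merging_cols = detect_common_cols_sketch(left, right)
merged = pd.merge(left, right, on = merging_cols, how = 'inner')
print(merged.shape)   # (1, 4): the shared cols are counted once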
 #%%============================================================================
-#FIXME: select df with 'chain' to allow corret dim merging!
+# Select df with 'chain' to allow correct dim merging!
 print('==================================='
       , '\nThird merge: dssp + kd'
       , '\n===================================')
@@ -755,7 +803,6 @@ dssp_kd_dfs = pd.merge(dssp_df
                        #, how = "outer")
                        , how = "inner")

-
 print('\n\nResult of third merge:', dssp_kd_dfs.shape
       , '\n===================================================================')
 #%%============================================================================
@@ -816,7 +863,7 @@ combined_df = pd.merge(mcsm_foldx_deepddg_dfs

 combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)

-# FIXME: check logic, doesn't effect anything else!
+# Check whether this logic affects anything else!
 if not gene == "embB":
     print("\nGene is:", gene)
     if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
@@ -859,16 +906,13 @@ combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
 combined_df_clean.columns
 del(foo)
 #%%============================================================================
-# Output columns
-out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
-outfile_stab_struc = outdir + out_filename_stab_struc
-print('Output filename:', outfile_stab_struc
-      , '\n===================================================================')
+#---------------------
+# Output 1: write csv
+#---------------------
+print('\nWriting file: combined stability and structural parameters'
+      , '\nOutput 1 filename:', outfile_stab_struc
+      , '\n===================================================================\n')

-combined_df_clean
-
-# write csv
-print('\nWriting file: combined stability and structural parameters')
 combined_df_clean.to_csv(outfile_stab_struc, index = False)
 print('\nFinished writing file:'
       , '\nNo. of rows:', combined_df_clean.shape[0]
@@ -943,14 +987,14 @@ else:
     sys.exit('\nFAIL: merge unsuccessful for af and or')
 #%%============================================================================
-# Output columns: when dynamut, dynamut2 and others weren't being combined
-out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
-outfile_comb_afor = outdir + out_filename_comb_afor
-print('Output filename:', outfile_comb_afor
-      , '\n===================================================================')
+#---------------------
+# Output 2: write csv
+# when dynamut, dynamut2 and others weren't being combined
+#---------------------
+print('\nWriting file: combined stability and afor'
+      , '\nOutput 2 filename:', outfile_comb_afor
+      , '\n===================================================================\n')

-# write csv
-print('Writing file: combined stability and afor')
 combined_stab_afor.to_csv(outfile_comb_afor, index = False)
 print('\nFinished writing file:'
       , '\nNo. of rows:', combined_stab_afor.shape[0]
@@ -966,9 +1010,9 @@ if gene.lower() == "gid":
 if gene.lower() == "embb":
     dfs_list = [dynamut2_df, mcsm_ppi2_df]
 if gene.lower() == "katg":
-    dfs_list = [dynamut2_df]
+    dfs_list = [dynamut2_df, mcsm_ppi2_df]
 if gene.lower() == "rpob":
-    dfs_list = [dynamut2_df, mcsm_na_df]
+    dfs_list = [dynamut2_df, mcsm_na_df, mcsm_ppi2_df]
 if gene.lower() == "alr":
     dfs_list = [dynamut2_df, mcsm_ppi2_df]
@@ -1014,7 +1058,6 @@ else:
           , '\nExpected nrows:', expected_nrows
           , '\nGot:', len(dfs_merged_clean) )

-# FIXME: need to extract 'cols_to_drop' programatically
 # Drop cols
 if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
     print('\nDuplicate column names detected...'
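The FIXME removed in this hunk asked for cols_to_drop to be derived programmatically rather than hard-coded. One possible shape for that, reusing the same _x$|_y$ regex the script already uses to detect merge-suffixed duplicates (toy frame, illustration only):

import pandas as pd

toy = pd.DataFrame({'position': [1], 'ddg_x': [0.1], 'ddg_y': [0.1]})

dup_mask = toy.columns.str.contains(r'_x$|_y$', regex = True)
if dup_mask.any():
    cols_to_drop = toy.columns[dup_mask].tolist()   # derived, not hard-coded
    toy = toy.drop(cols_to_drop, axis = 1)          # or keep one copy and rename it

print(toy.columns.tolist())   # ['position']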
@@ -1027,10 +1070,14 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
 else:
     print('\nNo duplicate column names detected, just writing file'
           , '\nTotal cols:', len(combined_all_params.columns) )
-#del(foo)
-#%% Done for gid on 10/09/2021
-# write csv
-print('Writing file: all params')
+#%%============================================================================
+#---------------------
+# Output 3: write csv
+#---------------------
+print('\nWriting file: all params')
+print('\nOutput 3 filename:', outfile_comb
+      , '\n===================================================================\n')
+
 combined_all_params.to_csv(outfile_comb, index = False)
 print('\nFinished writing file:'
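For reference, the overall shape of the final combine step that feeds output 3 above: the per-gene dfs_list is folded into one frame by repeated merges. merge_all below is a hypothetical convenience wrapper sketched for illustration, not the script's actual code, which merges step by step with explicit row/column checks:

from functools import reduce
import pandas as pd

def merge_all(dfs, on_cols, how = 'outer'):
    """Fold a list of dfs into one by repeated pd.merge (sketch only)."""
    return reduce(lambda left, right: pd.merge(left, right, on = on_cols, how = how), dfs)

a = pd.DataFrame({'mutationinformation': ['M1T'], 'score1': [1]})
b = pd.DataFrame({'mutationinformation': ['M1T'], 'score2': [2]})
c = pd.DataFrame({'mutationinformation': ['M1T'], 'score3': [3]})

combined = merge_all([a, b, c], on_cols = ['mutationinformation'])
print(combined.shape)   # (1, 4)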