thorough checking and updates for final running of all gene targets

2022-01-05 17:55:35 +00:00 · 2022-01-05 17:55:35 +00:00 · bffa3c376c
commit bffa3c376c
parent b66cf31219
1 changed files with 137 additions and 90 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -8,7 +8,7 @@ Created on Tue Aug  6 12:56:03 2019
 #=======================================================================
 # Task: combining all dfs to a single one
-# Input:  8 dfs
+# Input: 12/13/14 dfs
 #1) <gene>.lower()'_complex_mcsm_norm.csv'
 #2) <gene>.lower()_foldx.csv'
 #3) <gene>.lower()_dssp.csv'
@ -16,20 +16,16 @@ Created on Tue Aug  6 12:56:03 2019
 #5) <gene>.lower()_rd.csv'
 #6) 'ns' + <gene>.lower()_snp_info.csv'
 #7) <gene>.lower()_af_or.csv'
-#8) <gene>.lower() _af_or_kinship.csv
+#8) <gene>.lower() _af_or_kinship.csv (ONLY for pncA, but omitted for the final run)
 #9) <gene>.lower()'_dynamut2.csv'
 #10) <gene>.lower()'_dynamut.csv'
 #11) <gene>.lower()'_mcsm_na.csv'
 #12) <gene>.lower()'_mcsm_ppi2.csv'
 #13) <gene>.lower()'_consurf.csv'
 #14) <gene>.lower()'_snap2.csv'
 # combining order
 #Merge1 = 1 + 2
 #Merge2 = 3 + 4
 #Merge3 = Merge2 + 5
 #Merge4 = Merge1 + Merge3
 #Merge5 = 6 + 7
 #Merge6 = Merge5 + 8
 #Merge7 = Merge4 + Merge6
 # Output: single csv of all 8 dfs combined
 # useful link
@ -53,10 +49,10 @@ homedir = os.path.expanduser('~')
 # set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
 sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()
 # FIXME: local imports
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
 from reference_dict import oneletter_aa_dict
@ -119,6 +115,7 @@ gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
 if gene.lower() == "gid":
    print("\nReading mCSM file for gene:", gene)
    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
 if gene.lower() == "embb":
    print("\nReading mCSM file for gene:", gene)
    #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
@ -183,15 +180,12 @@ infile_snap2           = outdir + 'snap2/'+ infilename_snap2
 snap2_df               = pd.read_csv(infile_snap2, sep = ',')
 #------------------------------------------------------------------------------
-# ONLY:for gene pnca and gid: End logic should pick this up!
+# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
 geneL_na = ['gid', 'rpob']
 if gene.lower() in geneL_na:
    print("\nGene:", gene.lower()
          , "\nReading mCSM_na files")
-    # infilename_dynamut    = gene.lower() + '_dynamut_norm.csv' # gid
+
    # infile_dynamut        = outdir + 'dynamut_results/' + infilename_dynamut
    # dynamut_df            = pd.read_csv(infile_dynamut, sep = ',')
    infilename_mcsm_na    = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
    infile_mcsm_na        = outdir + 'mcsm_na_results/' + infilename_mcsm_na 
    mcsm_na_df            = pd.read_csv(infile_mcsm_na, sep = ',')
@ -199,18 +193,13 @@ if gene.lower() in geneL_na:
 geneL_dy = ['gid']
 if gene.lower() in geneL_dy:
    print("\nGene:", gene.lower()
-          , "\nReading Dynamut and mCSM_na files")
+          , "\nReading Dynamut files")
    infilename_dynamut    = gene.lower() + '_dynamut_norm.csv' # gid
    infile_dynamut        = outdir + 'dynamut_results/' + infilename_dynamut
    dynamut_df            = pd.read_csv(infile_dynamut, sep = ',')
    # infilename_mcsm_na    = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
    # infile_mcsm_na        = outdir + 'mcsm_na_results/' + infilename_mcsm_na 
    # mcsm_na_df            = pd.read_csv(infile_mcsm_na, sep = ',')
-# ONLY:for gene embb and alr and katg: End logic should pick this up!
+# ONLY: for genes 'alr', 'embb', 'katg' and 'rpob': End logic should pick this up!
-geneL_ppi2 = ['embb', 'alr']
+geneL_ppi2 = ['alr', 'embb', 'katg', 'rpob']
 #if gene.lower() == "embb" or "alr":
 if gene.lower() in geneL_ppi2:
    infilename_mcsm_ppi2   = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
    infile_mcsm_ppi2       = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
@ -224,10 +213,17 @@ else:
 #=======
 # output 
 #=======
 # outfile 3
 out_filename_comb = gene.lower() + '_all_params.csv'
 outfile_comb =  outdir + out_filename_comb
-print('\nOutput filename:', outfile_comb
+
-      , '\n===================================================================')
+# outfile 2
 out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
 outfile_comb_afor =  outdir + out_filename_comb_afor
 # outfile 1
 out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
 outfile_stab_struc =  outdir + out_filename_stab_struc
 # end of variable assignment for input and output files
 #%%############################################################################  
@ -235,6 +231,22 @@ print('\nOutput filename:', outfile_comb
 # some preprocessing
 #=====================
 #===========
 # KD
 #===========
 kd_df.shape
 # geneL_kd = ['alr']
 # if gene.lower() in geneL_kd:
 #     print('\nRunning gene:', gene.lower()
 #           ,'\nChecking start numbering')
 if kd_df['wild_type_kd'].str.contains('X').any():
    print('\nDetected X in wild_type_kd'
          , '\nRunning gene:', gene.lower()
          , '\nChecking start numbering')
    kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
 #===========
 # FoldX
 #===========
@ -305,7 +317,6 @@ else:
 #=======================
 # Deepddg
 # TODO: RERUN 'gid'
 #=======================
 deepddg_df.shape
@ -324,7 +335,8 @@ print('\nSelecting chain:', sel_chain, 'for gene:', gene)
 deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
 #--------------------------
-# Drop chain id col as other targets don't have it.Check for duplicates
+# Drop chain_id col as other
 # targets don't have it. 
 #--------------------------
 col_to_drop = ['chain_id']
 deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
@ -374,14 +386,40 @@ else:
          , '\nGot:', deepddg_pos2
          , '\n======================================================')    
 if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
    print('\nPASS: Deepddg data is scaled between -1 and 1',
           '\nproceeding with merge')
 #--------------------------
 # Deepddg outcome category
 #--------------------------
-deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+if 'deepddg_outcome' not in deepddg_df.columns:
-deepddg_df[deepddg_df['deepddg']>=0].count()
+    print('\nCreating column: deepddg_outcome')
-doc = deepddg_df['deepddg_outcome'].value_counts()
+    deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
    deepddg_df[deepddg_df['deepddg']>=0].count()
    doc = deepddg_df['deepddg_outcome'].value_counts()
    print(doc)
 else:
    print('\nColumn exists: deepddg_outcome')
    t1 = deepddg_df['deepddg_outcome'].value_counts()
    deepddg_df['deepddg_outcome2'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
    t2 = deepddg_df['deepddg_outcome2'].value_counts()
    print('\n', t1, '\n', t2)
    #--------------------------
    # Drop deepddg_outcome2 col 
    #--------------------------
    col_to_drop2 = ['deepddg_outcome2']
    deepddg_df = deepddg_df.drop(col_to_drop2, axis = 1)
-if doc['Stabilising'] == deepddg_pos and  doc['Stabilising'] == deepddg_pos2:
+if all(t1 == t2):
    print('\nPASS: Deepddg_outcome category checked!')
    doc = deepddg_df['deepddg_outcome'].value_counts()
 else:
    print('\nMISmatch in deepddg_outcome counts'
          , '\n:', t1
          , '\n:', t2)
 if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
    print('\nPASS: Deepddg outcome category created')
 else:
    print('\nFAIL: Deepddg outcome category could NOT be created'
@ -389,19 +427,12 @@ else:
          , '\nGot:', doc[0]
          , '\n======================================================')
    sys.exit()
-        
+
 if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
    print('\nPASS: Deepddg data is scaled between -1 and 1',
           '\nproceeding with merge')
 #=======================
 # Consurf
 #=======================
 consurf_df.shape
 # drop row 0: as it contains no value but hangover text
 consurf_df = consurf_df.drop(index=0)
 #----------------------
 # rename columns
 #----------------------   
@ -418,9 +449,9 @@ if gene.lower() in geneL_consurf:
    print('\nAdding offset value for gene:', gene.lower())
    if gene.lower() == 'alr':
-        offset_val = 34
+        offset_val = 34      
        print('\nUsing offset val:', offset_val)
    if gene.lower() == 'katg':
        offset_val = 23
        print('\nUsing offset val:', offset_val)
@ -443,7 +474,7 @@ consurf_df = consurf_df.rename(columns={'SEQ'     : 'wild_type'
                                        , 'MSADATA'  : 'consurf_msa_data'
                                        , 'RESIDUEVARIETY' : 'consurf_aa_variety'})
 # quick check
-if len(consurf_df) == len(rd_df):
+if len(consurf_df) == len(kd_df):
    print('\nPASS: length of consurf df is as expected'
          , '\nProceeding to format consurf df')
 else:
@ -458,6 +489,7 @@ consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\
 consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
 consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
 # non struc positions are assigned a *; replacing that with a 0 so it's all integers
 consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
 consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
@ -468,10 +500,10 @@ consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
 #consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
 #consurf_df['wt_3upper_f']
 consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
 consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
 consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
 #-------------------------
 # scale consurf values
 #-------------------------
@ -517,21 +549,35 @@ consurf_df.columns
 #---------------------------
 # select columns 
 # (and also determine order)
 # this removes redundant cols:
    # consurf_colour_str
    # consurf_ci
 #---------------------------
-consurf_df_f = consurf_df[['position'
+consurf_col_order = ['position'
-                           , 'wild_type'
+                     , 'wild_type'
-                           , 'chain'
+                     , 'chain'
-                           , 'wt_3upper'
+                     , 'wt_3upper'
-                           , 'consurf_score'
+                     , 'consurf_score'
-                           , 'consurf_scaled'
+                     , 'consurf_scaled'
-                           , 'consurf_colour'
+                     , 'consurf_colour'
-                           , 'consurf_colour_rev'
+                     , 'consurf_colour_rev'
-                           , 'consurf_ci_upper'
+                     , 'consurf_ci_upper'
-                           , 'consurf_ci_lower'
+                     , 'consurf_ci_lower'
-                           , 'consurf_ci_colour'
+                     , 'consurf_ci_colour'
-                           , 'consurf_msa_data'
+                     , 'consurf_msa_data'
-                           , 'consurf_aa_variety']]
+                     , 'consurf_aa_variety']
 consurf_df_f = consurf_df[consurf_col_order]
 # CHECK: whether a general rule or a gene specific rule!
 if consurf_df_f['chain'].isna().sum() > 0:
    print('\nNaN detected in column chain for consurf df')
 #if gene.lower() == 'embb':
    print('\nFurther consurf df processing for gene:', gene.lower())
    print('\nDropping Nan from column name chain')
    consurf_df_f = consurf_df_f[consurf_df_f['chain'].notna()]
 #=======================
 # SNAP2
 #=======================
@ -610,10 +656,12 @@ else:
          , '\nGot:', snap2_pos2
          , '\n======================================================')
-#---------------------------
+#-------------------------------------
 # select columns 
 # (and also determine order)
-#---------------------------
+# renumbering already done using 
 # bash and corrected file is read in
 #-------------------------------------
 snap2_df.dtypes
 snap2_df.columns
@ -718,7 +766,7 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
 else:
    print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
-#%%
+#%%============================================================================
 print('==================================='
      , '\nSecond merge: mcsm_foldx_dfs + deepddg'
      , '\n===================================')
@ -735,7 +783,7 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
 mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
 #%%============================================================================
-#FIXME: select df with 'chain' to allow corret dim merging!
+# Select df with 'chain' to allow correct dim merging!
 print('==================================='
      , '\nThird merge: dssp + kd'
      , '\n===================================')
@ -755,7 +803,6 @@ dssp_kd_dfs = pd.merge(dssp_df
                       #, how = "outer")
                       , how = "inner")
 print('\n\nResult of third merge:', dssp_kd_dfs.shape
      , '\n===================================================================')
 #%%============================================================================
@ -816,7 +863,7 @@ combined_df = pd.merge(mcsm_foldx_deepddg_dfs
 combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
-# FIXME: check logic, doesn't effect anything else!
+# Check: whether logic affects anything else!
 if not gene == "embB":
    print("\nGene is:", gene)
    if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
@ -859,16 +906,13 @@ combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
 combined_df_clean.columns
 del(foo)
 #%%============================================================================
-# Output columns
+#---------------------
-out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
+# Output 1: write csv
-outfile_stab_struc =  outdir + out_filename_stab_struc
+#---------------------
-print('Output filename:', outfile_stab_struc
+print('\nWriting file: combined stability and structural parameters'
-      , '\n===================================================================')
+      , '\nOutput 1 filename:', outfile_stab_struc
      , '\n===================================================================\n')
 combined_df_clean
 # write csv
 print('\nWriting file: combined stability and structural parameters')
 combined_df_clean.to_csv(outfile_stab_struc, index = False)
 print('\nFinished writing file:'
      , '\nNo. of rows:', combined_df_clean.shape[0]
@ -943,14 +987,14 @@ else:
    sys.exit('\nFAIL: merge unsuccessful for af and or')    
 #%%============================================================================
-# Output columns: when dynamut, dynamut2 and others weren't being combined
+#---------------------
-out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
+# Output 2: write csv
-outfile_comb_afor =  outdir + out_filename_comb_afor
+# when dynamut, dynamut2 and others weren't being combined
-print('Output filename:', outfile_comb_afor
+#---------------------
-      , '\n===================================================================')
+print('\nWriting file: combined stability and afor'
      , '\nOutput 2 filename:', outfile_comb_afor
      , '\n===================================================================\n')
 # write csv
 print('Writing file: combined stability and afor')
 combined_stab_afor.to_csv(outfile_comb_afor, index = False)
 print('\nFinished writing file:'
      , '\nNo. of rows:', combined_stab_afor.shape[0]
@ -966,9 +1010,9 @@ if gene.lower() == "gid":
 if gene.lower() == "embb":
    dfs_list = [dynamut2_df, mcsm_ppi2_df]
 if gene.lower() == "katg":
-    dfs_list = [dynamut2_df]
+    dfs_list = [dynamut2_df, mcsm_ppi2_df]
 if gene.lower() == "rpob":
-    dfs_list = [dynamut2_df, mcsm_na_df]
+    dfs_list = [dynamut2_df, mcsm_na_df, mcsm_ppi2_df]
 if gene.lower() == "alr":
    dfs_list = [dynamut2_df, mcsm_ppi2_df]
@ -1014,7 +1058,6 @@ else:
          , '\nExpected nrows:', expected_nrows
          , '\nGot:', len(dfs_merged_clean) )
 # FIXME: need to extract 'cols_to_drop' programmatically
 # Drop cols
 if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
     print('\nDuplicate column names detected...'
@ -1027,10 +1070,14 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
 else:
    print('\nNo duplicate column names detected, just writing file'
          , '\nTotal cols:', len(combined_all_params.columns) )
-#del(foo)
+#%%============================================================================
-#%% Done for gid on 10/09/2021
+#---------------------
-# write csv
+# Output 3: write csv
-print('Writing file: all params')
+#---------------------
 print('\nWriting file: all params')
 print('\nOutput 3 filename:', outfile_comb
      , '\n===================================================================\n')
 combined_all_params.to_csv(outfile_comb, index = False)
 print('\nFinished writing file:'