thorough checking and updates for final running of all gene targets

2022-01-05 17:55:35 +00:00 · 2022-01-05 17:55:35 +00:00 · bffa3c376c
commit bffa3c376c
parent b66cf31219
1 changed files with 137 additions and 90 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -8,7 +8,7 @@ Created on Tue Aug  6 12:56:03 2019
 #=======================================================================
 # Task: combining all dfs to a single one

-# Input:  8 dfs
+# Input: 12/13/14 dfs
 #1) <gene>.lower()'_complex_mcsm_norm.csv'
 #2) <gene>.lower()_foldx.csv'
 #3) <gene>.lower()_dssp.csv'
@ -16,20 +16,16 @@ Created on Tue Aug  6 12:56:03 2019
 #5) <gene>.lower()_rd.csv'
 #6) 'ns' + <gene>.lower()_snp_info.csv'
 #7) <gene>.lower()_af_or.csv'
-#8) <gene>.lower() _af_or_kinship.csv
+#8) <gene>.lower() _af_or_kinship.csv (ONLY for pncA, but omitted for the final run)
+#9) <gene>.lower()'_dynamut2.csv'
+#10) <gene>.lower()'_dynamut.csv'
+#11) <gene>.lower()'_mcsm_na.csv'
+#12) <gene>.lower()'_mcsm_ppi2.csv'
+#13) <gene>.lower()'_consurf.csv'
+#14) <gene>.lower()'_snap2.csv'

 # combining order
-#Merge1 = 1 + 2

-#Merge2 = 3 + 4
-#Merge3 = Merge2 + 5
-
-#Merge4 = Merge1 + Merge3
-
-#Merge5 = 6 + 7
-#Merge6 = Merge5 + 8
-
-#Merge7 = Merge4 + Merge6

 # Output: single csv of all 8 dfs combined
 # useful link
@ -53,10 +49,10 @@ homedir = os.path.expanduser('~')

 # set working dir
 os.getcwd()
-os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
 os.getcwd()

-# FIXME: local imports
 #from combining import combine_dfs_with_checks
 from combining_FIXME import detect_common_cols
 from reference_dict import oneletter_aa_dict
@ -119,6 +115,7 @@ gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
 if gene.lower() == "gid":
    print("\nReading mCSM file for gene:", gene)
    in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
+
 if gene.lower() == "embb":
    print("\nReading mCSM file for gene:", gene)
    #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
@ -183,15 +180,12 @@ infile_snap2           = outdir + 'snap2/'+ infilename_snap2
 snap2_df               = pd.read_csv(infile_snap2, sep = ',')

 #------------------------------------------------------------------------------
-# ONLY:for gene pnca and gid: End logic should pick this up!
+# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
 geneL_na = ['gid', 'rpob']
 if gene.lower() in geneL_na:
    print("\nGene:", gene.lower()
          , "\nReading mCSM_na files")
-    # infilename_dynamut    = gene.lower() + '_dynamut_norm.csv' # gid
-    # infile_dynamut        = outdir + 'dynamut_results/' + infilename_dynamut
-    # dynamut_df            = pd.read_csv(infile_dynamut, sep = ',')
-    
+
    infilename_mcsm_na    = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
    infile_mcsm_na        = outdir + 'mcsm_na_results/' + infilename_mcsm_na 
    mcsm_na_df            = pd.read_csv(infile_mcsm_na, sep = ',')
@ -199,18 +193,13 @@ if gene.lower() in geneL_na:
 geneL_dy = ['gid']
 if gene.lower() in geneL_dy:
    print("\nGene:", gene.lower()
-          , "\nReading Dynamut and mCSM_na files")
+          , "\nReading Dynamut files")
    infilename_dynamut    = gene.lower() + '_dynamut_norm.csv' # gid
    infile_dynamut        = outdir + 'dynamut_results/' + infilename_dynamut
    dynamut_df            = pd.read_csv(infile_dynamut, sep = ',')
-    
-    # infilename_mcsm_na    = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
-    # infile_mcsm_na        = outdir + 'mcsm_na_results/' + infilename_mcsm_na 
-    # mcsm_na_df            = pd.read_csv(infile_mcsm_na, sep = ',')

-# ONLY:for gene embb and alr and katg: End logic should pick this up!
-geneL_ppi2 = ['embb', 'alr']
-#if gene.lower() == "embb" or "alr":
+# ONLY: for genes 'alr', 'embb', 'katg' and 'rpob': End logic should pick this up!
+geneL_ppi2 = ['alr', 'embb', 'katg', 'rpob']
 if gene.lower() in geneL_ppi2:
    infilename_mcsm_ppi2   = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
    infile_mcsm_ppi2       = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
@ -224,10 +213,17 @@ else:
 #=======
 # output 
 #=======
+# outfile 3
 out_filename_comb = gene.lower() + '_all_params.csv'
 outfile_comb =  outdir + out_filename_comb
-print('\nOutput filename:', outfile_comb
-      , '\n===================================================================')
+
+# outfile 2
+out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
+outfile_comb_afor =  outdir + out_filename_comb_afor
+
+# outfile 1
+out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
+outfile_stab_struc =  outdir + out_filename_stab_struc

 # end of variable assignment for input and output files
 #%%############################################################################  
@ -235,6 +231,22 @@ print('\nOutput filename:', outfile_comb
 # some preprocessing
 #=====================

+#===========
+# KD
+#===========
+kd_df.shape
+
+# geneL_kd = ['alr']
+# if gene.lower() in geneL_kd:
+#     print('\nRunning gene:', gene.lower()
+#           ,'\nChecking start numbering')
+    
+if kd_df['wild_type_kd'].str.contains('X').any():
+    print('\nDetected X in wild_type_kd'
+          , '\nRunning gene:', gene.lower()
+          , '\nChecking start numbering')
+    kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
+
 #===========
 # FoldX
 #===========
@ -305,7 +317,6 @@ else:

 #=======================
 # Deepddg
-# TODO: RERUN 'gid'
 #=======================
 deepddg_df.shape

@ -324,7 +335,8 @@ print('\nSelecting chain:', sel_chain, 'for gene:', gene)
 deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]

 #--------------------------
-# Drop chain id col as other targets don't have it.Check for duplicates
+# Drop chain_id col as other
+# targets don't have it. 
 #--------------------------
 col_to_drop = ['chain_id']
 deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
@ -374,14 +386,40 @@ else:
          , '\nGot:', deepddg_pos2
          , '\n======================================================')    

+if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
+    print('\nPASS: Deepddg data is scaled between -1 and 1',
+           '\nproceeding with merge')
+    
 #--------------------------
 # Deepddg outcome category
 #--------------------------
-deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
-deepddg_df[deepddg_df['deepddg']>=0].count()
-doc = deepddg_df['deepddg_outcome'].value_counts()
+if 'deepddg_outcome' not in deepddg_df.columns:
+    print('\nCreating column: deepddg_outcome')
+    deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+    deepddg_df[deepddg_df['deepddg']>=0].count()
+    doc = deepddg_df['deepddg_outcome'].value_counts()
+    print(doc)
+else:
+    print('\nColumn exists: deepddg_outcome')
+    t1 = deepddg_df['deepddg_outcome'].value_counts()
+    deepddg_df['deepddg_outcome2'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
+    t2 = deepddg_df['deepddg_outcome2'].value_counts()
+    print('\n', t1, '\n', t2)
+    #--------------------------
+    # Drop deepddg_outcome2 col 
+    #--------------------------
+    col_to_drop2 = ['deepddg_outcome2']
+    deepddg_df = deepddg_df.drop(col_to_drop2, axis = 1)

-if doc['Stabilising'] == deepddg_pos and  doc['Stabilising'] == deepddg_pos2:
+if all(t1 == t2):
+    print('\nPASS: Deepddg_outcome category checked!')
+    doc = deepddg_df['deepddg_outcome'].value_counts()
+else:
+    print('\nMISmatch in deepddg_outcome counts'
+          , '\n:', t1
+          , '\n:', t2)
+
+if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
    print('\nPASS: Deepddg outcome category created')
 else:
    print('\nFAIL: Deepddg outcome category could NOT be created'
@ -389,19 +427,12 @@ else:
          , '\nGot:', doc[0]
          , '\n======================================================')
    sys.exit()
-        
-if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
-    print('\nPASS: Deepddg data is scaled between -1 and 1',
-           '\nproceeding with merge')
-     
+
 #=======================
 # Consurf
 #=======================
 consurf_df.shape

-# drop row 0: as it contains no value but hangover text
-consurf_df = consurf_df.drop(index=0)
-
 #----------------------
 # rename colums
 #----------------------   
@ -418,9 +449,9 @@ if gene.lower() in geneL_consurf:
    print('\nAdding offset value for gene:', gene.lower())
        
    if gene.lower() == 'alr':
-        offset_val = 34
-        
+        offset_val = 34      
        print('\nUsing offset val:', offset_val)
+    
    if gene.lower() == 'katg':
        offset_val = 23
        print('\nUsing offset val:', offset_val)
@ -443,7 +474,7 @@ consurf_df = consurf_df.rename(columns={'SEQ'     : 'wild_type'
                                        , 'MSADATA'  : 'consurf_msa_data'
                                        , 'RESIDUEVARIETY' : 'consurf_aa_variety'})
 # quick check
-if len(consurf_df) == len(rd_df):
+if len(consurf_df) == len(kd_df):
    print('\nPASS: length of consurf df is as expected'
          , '\nProceeding to format consurf df')
 else:
@ -458,6 +489,7 @@ consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\
 consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)

 consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
+# non struc position are assigned a *, replacing that with a 0 so its all integer
 consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)

 consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
@ -468,10 +500,10 @@ consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)

 #consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
 #consurf_df['wt_3upper_f']
-consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
-
 consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')

+consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
+
 #-------------------------
 # scale consurf values
 #-------------------------
@ -517,21 +549,35 @@ consurf_df.columns
 #---------------------------
 # select columns 
 # (and also determine order)
+# this removes redundant cols:
+    # consurf_colour_str
+    # consurf_ci
 #---------------------------
-consurf_df_f = consurf_df[['position'
-                           , 'wild_type'
-                           , 'chain'
-                           , 'wt_3upper'
-                           , 'consurf_score'
-                           , 'consurf_scaled'
-                           , 'consurf_colour'
-                           , 'consurf_colour_rev'
-                           , 'consurf_ci_upper'
-                           , 'consurf_ci_lower'
-                           , 'consurf_ci_colour'
-                           , 'consurf_msa_data'
-                           , 'consurf_aa_variety']]
+consurf_col_order = ['position'
+                     , 'wild_type'
+                     , 'chain'
+                     , 'wt_3upper'
+                     , 'consurf_score'
+                     , 'consurf_scaled'
+                     , 'consurf_colour'
+                     , 'consurf_colour_rev'
+                     , 'consurf_ci_upper'
+                     , 'consurf_ci_lower'
+                     , 'consurf_ci_colour'
+                     , 'consurf_msa_data'
+                     , 'consurf_aa_variety']
+   
+consurf_df_f = consurf_df[consurf_col_order]

+# CHECK: whether a general rule or a gene specific rule!
+
+if consurf_df_f['chain'].isna().sum() > 0:
+    print('\nNaN detected in column chain for consurf df')
+#if gene.lower() == 'embb':
+    print('\nFurther consurf df processing for gene:', gene.lower())
+    print('\nDropping Nan from column name chain')
+    consurf_df_f = consurf_df_f[consurf_df_f['chain'].notna()]
+   
 #=======================
 # SNAP2
 #=======================
@ -610,10 +656,12 @@ else:
          , '\nGot:', snap2_pos2
          , '\n======================================================')

-#---------------------------
+#-------------------------------------
 # select columns 
 # (and also determine order)
-#---------------------------
+# renumbering already done using 
+# bash and corrected file is read in
+#-------------------------------------
 snap2_df.dtypes
 snap2_df.columns

@ -718,7 +766,7 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
 else:
    print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
    
-#%%
+#%%============================================================================
 print('==================================='
      , '\nSecond merge: mcsm_foldx_dfs + deepddg'
      , '\n===================================')
@ -735,7 +783,7 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
 mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')

 #%%============================================================================
-#FIXME: select df with 'chain' to allow corret dim merging!
+# Select df with 'chain' to allow corret dim merging!
 print('==================================='
      , '\nThird merge: dssp + kd'
      , '\n===================================')
@ -755,7 +803,6 @@ dssp_kd_dfs = pd.merge(dssp_df
                       #, how = "outer")
                       , how = "inner")

-
 print('\n\nResult of third merge:', dssp_kd_dfs.shape
      , '\n===================================================================')
 #%%============================================================================
@ -816,7 +863,7 @@ combined_df = pd.merge(mcsm_foldx_deepddg_dfs

 combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)

-# FIXME: check logic, doesn't effect anything else!
+# Check: whether logic effects anything else!
 if not gene == "embB":
    print("\nGene is:", gene)
    if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
@ -859,16 +906,13 @@ combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
 combined_df_clean.columns
 del(foo)
 #%%============================================================================
-# Output columns
-out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
-outfile_stab_struc =  outdir + out_filename_stab_struc
-print('Output filename:', outfile_stab_struc
-      , '\n===================================================================')
+#---------------------
+# Output 1: write csv
+#---------------------
+print('\nWriting file: combined stability and structural parameters'
+      , '\nOutput 1 filename:', outfile_stab_struc
+      , '\n===================================================================\n')

-combined_df_clean
-
-# write csv
-print('\nWriting file: combined stability and structural parameters')
 combined_df_clean.to_csv(outfile_stab_struc, index = False)
 print('\nFinished writing file:'
      , '\nNo. of rows:', combined_df_clean.shape[0]
@ -943,14 +987,14 @@ else:
    sys.exit('\nFAIL: merge unsuccessful for af and or')    
    
 #%%============================================================================
-# Output columns: when dynamut, dynamut2 and others weren't being combined
-out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
-outfile_comb_afor =  outdir + out_filename_comb_afor
-print('Output filename:', outfile_comb_afor
-      , '\n===================================================================')
+#---------------------
+# Output 2: write csv
+# when dynamut, dynamut2 and others weren't being combined
+#---------------------
+print('\nWriting file: combined stability and afor'
+      , '\nOutput 2 filename:', outfile_comb_afor
+      , '\n===================================================================\n')

-# write csv
-print('Writing file: combined stability and afor')
 combined_stab_afor.to_csv(outfile_comb_afor, index = False)
 print('\nFinished writing file:'
      , '\nNo. of rows:', combined_stab_afor.shape[0]
@ -966,9 +1010,9 @@ if gene.lower() == "gid":
 if gene.lower() == "embb":
    dfs_list = [dynamut2_df, mcsm_ppi2_df]
 if gene.lower() == "katg":
-    dfs_list = [dynamut2_df]
+    dfs_list = [dynamut2_df, mcsm_ppi2_df]
 if gene.lower() == "rpob":
-    dfs_list = [dynamut2_df, mcsm_na_df]
+    dfs_list = [dynamut2_df, mcsm_na_df, mcsm_ppi2_df]
 if gene.lower() == "alr":
    dfs_list = [dynamut2_df, mcsm_ppi2_df]

@ -1014,7 +1058,6 @@ else:
          , '\nExpected nrows:', expected_nrows
          , '\nGot:', len(dfs_merged_clean) )

-# FIXME: need to extract 'cols_to_drop' programatically
 # Drop cols
 if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
     print('\nDuplicate column names detected...'
@ -1027,10 +1070,14 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
 else:
    print('\nNo duplicate column names detected, just writing file'
          , '\nTotal cols:', len(combined_all_params.columns) )
-#del(foo)
-#%% Done for gid on 10/09/2021
-# write csv
-print('Writing file: all params')
+#%%============================================================================
+#---------------------
+# Output 3: write csv
+#---------------------
+print('\nWriting file: all params')
+print('\nOutput 3 filename:', outfile_comb
+      , '\n===================================================================\n')
+
 combined_all_params.to_csv(outfile_comb, index = False)

 print('\nFinished writing file:'