thorough checking and updates for final running of all gene targets

This commit is contained in:
Tanushree Tunstall 2022-01-05 17:55:35 +00:00
parent b66cf31219
commit bffa3c376c

View file

@ -8,7 +8,7 @@ Created on Tue Aug 6 12:56:03 2019
#======================================================================= #=======================================================================
# Task: combining all dfs to a single one # Task: combining all dfs to a single one
# Input: 8 dfs # Input: 12/13/14 dfs
#1) <gene>.lower()'_complex_mcsm_norm.csv' #1) <gene>.lower()'_complex_mcsm_norm.csv'
#2) <gene>.lower()_foldx.csv' #2) <gene>.lower()_foldx.csv'
#3) <gene>.lower()_dssp.csv' #3) <gene>.lower()_dssp.csv'
@ -16,20 +16,16 @@ Created on Tue Aug 6 12:56:03 2019
#5) <gene>.lower()_rd.csv' #5) <gene>.lower()_rd.csv'
#6) 'ns' + <gene>.lower()_snp_info.csv' #6) 'ns' + <gene>.lower()_snp_info.csv'
#7) <gene>.lower()_af_or.csv' #7) <gene>.lower()_af_or.csv'
#8) <gene>.lower() _af_or_kinship.csv #8) <gene>.lower() _af_or_kinship.csv (ONLY for pncA, but omitted for the final run)
#9) <gene>.lower()'_dynamut2.csv'
#10) <gene>.lower()'_dynamut.csv'
#11) <gene>.lower()'_mcsm_na.csv'
#12) <gene>.lower()'_mcsm_ppi2.csv'
#13) <gene>.lower()'_consurf.csv'
#14) <gene>.lower()'_snap2.csv'
# combining order # combining order
#Merge1 = 1 + 2
#Merge2 = 3 + 4
#Merge3 = Merge2 + 5
#Merge4 = Merge1 + Merge3
#Merge5 = 6 + 7
#Merge6 = Merge5 + 8
#Merge7 = Merge4 + Merge6
# Output: single csv of all 8 dfs combined # Output: single csv of all 8 dfs combined
# useful link # useful link
@ -53,10 +49,10 @@ homedir = os.path.expanduser('~')
# set working dir # set working dir
os.getcwd() os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts') #os.chdir(homedir + '/git/LSHTM_analysis/scripts')
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd() os.getcwd()
# FIXME: local imports
#from combining import combine_dfs_with_checks #from combining import combine_dfs_with_checks
from combining_FIXME import detect_common_cols from combining_FIXME import detect_common_cols
from reference_dict import oneletter_aa_dict from reference_dict import oneletter_aa_dict
@ -119,6 +115,7 @@ gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
if gene.lower() == "gid": if gene.lower() == "gid":
print("\nReading mCSM file for gene:", gene) print("\nReading mCSM file for gene:", gene)
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
if gene.lower() == "embb": if gene.lower() == "embb":
print("\nReading mCSM file for gene:", gene) print("\nReading mCSM file for gene:", gene)
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798 #in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
@ -183,14 +180,11 @@ infile_snap2 = outdir + 'snap2/'+ infilename_snap2
snap2_df = pd.read_csv(infile_snap2, sep = ',') snap2_df = pd.read_csv(infile_snap2, sep = ',')
#------------------------------------------------------------------------------ #------------------------------------------------------------------------------
# ONLY:for gene pnca and gid: End logic should pick this up! # ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
geneL_na = ['gid', 'rpob'] geneL_na = ['gid', 'rpob']
if gene.lower() in geneL_na: if gene.lower() in geneL_na:
print("\nGene:", gene.lower() print("\nGene:", gene.lower()
, "\nReading mCSM_na files") , "\nReading mCSM_na files")
# infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
# infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
# dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
@ -199,18 +193,13 @@ if gene.lower() in geneL_na:
geneL_dy = ['gid'] geneL_dy = ['gid']
if gene.lower() in geneL_dy: if gene.lower() in geneL_dy:
print("\nGene:", gene.lower() print("\nGene:", gene.lower()
, "\nReading Dynamut and mCSM_na files") , "\nReading Dynamut files")
infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
dynamut_df = pd.read_csv(infile_dynamut, sep = ',') dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
# infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid # ONLY: for genes 'alr', 'embb', 'katg' and 'rpob': End logic should pick this up!
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na geneL_ppi2 = ['alr', 'embb', 'katg', 'rpob']
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
# ONLY:for gene embb and alr and katg: End logic should pick this up!
geneL_ppi2 = ['embb', 'alr']
#if gene.lower() == "embb" or "alr":
if gene.lower() in geneL_ppi2: if gene.lower() in geneL_ppi2:
infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv' infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2 infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
@ -224,10 +213,17 @@ else:
#======= #=======
# output # output
#======= #=======
# outfile 3
out_filename_comb = gene.lower() + '_all_params.csv' out_filename_comb = gene.lower() + '_all_params.csv'
outfile_comb = outdir + out_filename_comb outfile_comb = outdir + out_filename_comb
print('\nOutput filename:', outfile_comb
, '\n===================================================================') # outfile 2
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
outfile_comb_afor = outdir + out_filename_comb_afor
# outfile 1
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
outfile_stab_struc = outdir + out_filename_stab_struc
# end of variable assignment for input and output files # end of variable assignment for input and output files
#%%############################################################################ #%%############################################################################
@ -235,6 +231,22 @@ print('\nOutput filename:', outfile_comb
# some preprocessing # some preprocessing
#===================== #=====================
#===========
# KD
#===========
kd_df.shape
# geneL_kd = ['alr']
# if gene.lower() in geneL_kd:
# print('\nRunning gene:', gene.lower()
# ,'\nChecking start numbering')
if kd_df['wild_type_kd'].str.contains('X').any():
print('\nDetected X in wild_type_kd'
, '\nRunning gene:', gene.lower()
, '\nChecking start numbering')
kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
#=========== #===========
# FoldX # FoldX
#=========== #===========
@ -305,7 +317,6 @@ else:
#======================= #=======================
# Deepddg # Deepddg
# TODO: RERUN 'gid'
#======================= #=======================
deepddg_df.shape deepddg_df.shape
@ -324,7 +335,8 @@ print('\nSelecting chain:', sel_chain, 'for gene:', gene)
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain] deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
#-------------------------- #--------------------------
# Drop chain id col as other targets don't have it.Check for duplicates # Drop chain_id col as other
# targets don't have it.
#-------------------------- #--------------------------
col_to_drop = ['chain_id'] col_to_drop = ['chain_id']
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1) deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
@ -374,12 +386,38 @@ else:
, '\nGot:', deepddg_pos2 , '\nGot:', deepddg_pos2
, '\n======================================================') , '\n======================================================')
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
print('\nPASS: Deepddg data is scaled between -1 and 1',
'\nproceeding with merge')
#-------------------------- #--------------------------
# Deepddg outcome category # Deepddg outcome category
#-------------------------- #--------------------------
deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising') if 'deepddg_outcome' not in deepddg_df.columns:
deepddg_df[deepddg_df['deepddg']>=0].count() print('\nCreating column: deepddg_outcome')
doc = deepddg_df['deepddg_outcome'].value_counts() deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
deepddg_df[deepddg_df['deepddg']>=0].count()
doc = deepddg_df['deepddg_outcome'].value_counts()
print(doc)
else:
print('\nColumn exists: deepddg_outcome')
t1 = deepddg_df['deepddg_outcome'].value_counts()
deepddg_df['deepddg_outcome2'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
t2 = deepddg_df['deepddg_outcome2'].value_counts()
print('\n', t1, '\n', t2)
#--------------------------
# Drop deepddg_outcome2 col
#--------------------------
col_to_drop2 = ['deepddg_outcome2']
deepddg_df = deepddg_df.drop(col_to_drop2, axis = 1)
if all(t1 == t2):
print('\nPASS: Deepddg_outcome category checked!')
doc = deepddg_df['deepddg_outcome'].value_counts()
else:
print('\nMISmatch in deepddg_outcome counts'
, '\n:', t1
, '\n:', t2)
if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2: if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
print('\nPASS: Deepddg outcome category created') print('\nPASS: Deepddg outcome category created')
@ -390,18 +428,11 @@ else:
, '\n======================================================') , '\n======================================================')
sys.exit() sys.exit()
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
print('\nPASS: Deepddg data is scaled between -1 and 1',
'\nproceeding with merge')
#======================= #=======================
# Consurf # Consurf
#======================= #=======================
consurf_df.shape consurf_df.shape
# drop row 0: as it contains no value but hangover text
consurf_df = consurf_df.drop(index=0)
#---------------------- #----------------------
# rename columns # rename columns
#---------------------- #----------------------
@ -419,8 +450,8 @@ if gene.lower() in geneL_consurf:
if gene.lower() == 'alr': if gene.lower() == 'alr':
offset_val = 34 offset_val = 34
print('\nUsing offset val:', offset_val) print('\nUsing offset val:', offset_val)
if gene.lower() == 'katg': if gene.lower() == 'katg':
offset_val = 23 offset_val = 23
print('\nUsing offset val:', offset_val) print('\nUsing offset val:', offset_val)
@ -443,7 +474,7 @@ consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type'
, 'MSADATA' : 'consurf_msa_data' , 'MSADATA' : 'consurf_msa_data'
, 'RESIDUEVARIETY' : 'consurf_aa_variety'}) , 'RESIDUEVARIETY' : 'consurf_aa_variety'})
# quick check # quick check
if len(consurf_df) == len(rd_df): if len(consurf_df) == len(kd_df):
print('\nPASS: length of consurf df is as expected' print('\nPASS: length of consurf df is as expected'
, '\nProceeding to format consurf df') , '\nProceeding to format consurf df')
else: else:
@ -458,6 +489,7 @@ consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\
consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int) consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0') consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
# non-struct positions are assigned a *; replacing that with a 0 so it's all integer
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int) consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):') consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
@ -468,10 +500,10 @@ consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)') #consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
#consurf_df['wt_3upper_f'] #consurf_df['wt_3upper_f']
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)') consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
#------------------------- #-------------------------
# scale consurf values # scale consurf values
#------------------------- #-------------------------
@ -517,8 +549,11 @@ consurf_df.columns
#--------------------------- #---------------------------
# select columns # select columns
# (and also determine order) # (and also determine order)
# this removes redundant cols:
# consurf_colour_str
# consurf_ci
#--------------------------- #---------------------------
consurf_df_f = consurf_df[['position' consurf_col_order = ['position'
, 'wild_type' , 'wild_type'
, 'chain' , 'chain'
, 'wt_3upper' , 'wt_3upper'
@ -530,7 +565,18 @@ consurf_df_f = consurf_df[['position'
, 'consurf_ci_lower' , 'consurf_ci_lower'
, 'consurf_ci_colour' , 'consurf_ci_colour'
, 'consurf_msa_data' , 'consurf_msa_data'
, 'consurf_aa_variety']] , 'consurf_aa_variety']
consurf_df_f = consurf_df[consurf_col_order]
# CHECK: whether a general rule or a gene specific rule!
if consurf_df_f['chain'].isna().sum() > 0:
print('\nNaN detected in column chain for consurf df')
#if gene.lower() == 'embb':
print('\nFurther consurf df processing for gene:', gene.lower())
print('\nDropping Nan from column name chain')
consurf_df_f = consurf_df_f[consurf_df_f['chain'].notna()]
#======================= #=======================
# SNAP2 # SNAP2
@ -610,10 +656,12 @@ else:
, '\nGot:', snap2_pos2 , '\nGot:', snap2_pos2
, '\n======================================================') , '\n======================================================')
#--------------------------- #-------------------------------------
# select columns # select columns
# (and also determine order) # (and also determine order)
#--------------------------- # renumbering already done using
# bash and corrected file is read in
#-------------------------------------
snap2_df.dtypes snap2_df.dtypes
snap2_df.columns snap2_df.columns
@ -718,7 +766,7 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
else: else:
print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df') print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
#%% #%%============================================================================
print('===================================' print('==================================='
, '\nSecond merge: mcsm_foldx_dfs + deepddg' , '\nSecond merge: mcsm_foldx_dfs + deepddg'
, '\n===================================') , '\n===================================')
@ -735,7 +783,7 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64') mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
#%%============================================================================ #%%============================================================================
#FIXME: select df with 'chain' to allow correct dim merging! # Select df with 'chain' to allow correct dim merging!
print('===================================' print('==================================='
, '\nThird merge: dssp + kd' , '\nThird merge: dssp + kd'
, '\n===================================') , '\n===================================')
@ -755,7 +803,6 @@ dssp_kd_dfs = pd.merge(dssp_df
#, how = "outer") #, how = "outer")
, how = "inner") , how = "inner")
print('\n\nResult of third merge:', dssp_kd_dfs.shape print('\n\nResult of third merge:', dssp_kd_dfs.shape
, '\n===================================================================') , '\n===================================================================')
#%%============================================================================ #%%============================================================================
@ -816,7 +863,7 @@ combined_df = pd.merge(mcsm_foldx_deepddg_dfs
combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4) combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
# FIXME: check logic, doesn't affect anything else! # Check: whether logic affects anything else!
if not gene == "embB": if not gene == "embB":
print("\nGene is:", gene) print("\nGene is:", gene)
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols: if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
@ -859,16 +906,13 @@ combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
combined_df_clean.columns combined_df_clean.columns
del(foo) del(foo)
#%%============================================================================ #%%============================================================================
# Output columns #---------------------
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv' # Output 1: write csv
outfile_stab_struc = outdir + out_filename_stab_struc #---------------------
print('Output filename:', outfile_stab_struc print('\nWriting file: combined stability and structural parameters'
, '\n===================================================================') , '\nOutput 1 filename:', outfile_stab_struc
, '\n===================================================================\n')
combined_df_clean
# write csv
print('\nWriting file: combined stability and structural parameters')
combined_df_clean.to_csv(outfile_stab_struc, index = False) combined_df_clean.to_csv(outfile_stab_struc, index = False)
print('\nFinished writing file:' print('\nFinished writing file:'
, '\nNo. of rows:', combined_df_clean.shape[0] , '\nNo. of rows:', combined_df_clean.shape[0]
@ -943,14 +987,14 @@ else:
sys.exit('\nFAIL: merge unsuccessful for af and or') sys.exit('\nFAIL: merge unsuccessful for af and or')
#%%============================================================================ #%%============================================================================
# Output columns: when dynamut, dynamut2 and others weren't being combined #---------------------
out_filename_comb_afor = gene.lower() + '_comb_afor.csv' # Output 2: write csv
outfile_comb_afor = outdir + out_filename_comb_afor # when dynamut, dynamut2 and others weren't being combined
print('Output filename:', outfile_comb_afor #---------------------
, '\n===================================================================') print('\nWriting file: combined stability and afor'
, '\nOutput 2 filename:', outfile_comb_afor
, '\n===================================================================\n')
# write csv
print('Writing file: combined stability and afor')
combined_stab_afor.to_csv(outfile_comb_afor, index = False) combined_stab_afor.to_csv(outfile_comb_afor, index = False)
print('\nFinished writing file:' print('\nFinished writing file:'
, '\nNo. of rows:', combined_stab_afor.shape[0] , '\nNo. of rows:', combined_stab_afor.shape[0]
@ -966,9 +1010,9 @@ if gene.lower() == "gid":
if gene.lower() == "embb": if gene.lower() == "embb":
dfs_list = [dynamut2_df, mcsm_ppi2_df] dfs_list = [dynamut2_df, mcsm_ppi2_df]
if gene.lower() == "katg": if gene.lower() == "katg":
dfs_list = [dynamut2_df] dfs_list = [dynamut2_df, mcsm_ppi2_df]
if gene.lower() == "rpob": if gene.lower() == "rpob":
dfs_list = [dynamut2_df, mcsm_na_df] dfs_list = [dynamut2_df, mcsm_na_df, mcsm_ppi2_df]
if gene.lower() == "alr": if gene.lower() == "alr":
dfs_list = [dynamut2_df, mcsm_ppi2_df] dfs_list = [dynamut2_df, mcsm_ppi2_df]
@ -1014,7 +1058,6 @@ else:
, '\nExpected nrows:', expected_nrows , '\nExpected nrows:', expected_nrows
, '\nGot:', len(dfs_merged_clean) ) , '\nGot:', len(dfs_merged_clean) )
# FIXME: need to extract 'cols_to_drop' programatically
# Drop cols # Drop cols
if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any(): if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
print('\nDuplicate column names detected...' print('\nDuplicate column names detected...'
@ -1027,10 +1070,14 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
else: else:
print('\nNo duplicate column names detected, just writing file' print('\nNo duplicate column names detected, just writing file'
, '\nTotal cols:', len(combined_all_params.columns) ) , '\nTotal cols:', len(combined_all_params.columns) )
#del(foo) #%%============================================================================
#%% Done for gid on 10/09/2021 #---------------------
# write csv # Output 3: write csv
print('Writing file: all params') #---------------------
print('\nWriting file: all params')
print('\nOutput 3 filename:', outfile_comb
, '\n===================================================================\n')
combined_all_params.to_csv(outfile_comb, index = False) combined_all_params.to_csv(outfile_comb, index = False)
print('\nFinished writing file:' print('\nFinished writing file:'