thorough checking and updates for final running of all gene targets

This commit is contained in:
Tanushree Tunstall 2022-01-05 17:55:35 +00:00
parent b66cf31219
commit bffa3c376c

View file

@ -8,7 +8,7 @@ Created on Tue Aug 6 12:56:03 2019
#=======================================================================
# Task: combining all dfs to a single one
# Input: 8 dfs
# Input: 12/13/14 dfs
#1) <gene>.lower()'_complex_mcsm_norm.csv'
#2) <gene>.lower()_foldx.csv'
#3) <gene>.lower()_dssp.csv'
@ -16,20 +16,16 @@ Created on Tue Aug 6 12:56:03 2019
#5) <gene>.lower()_rd.csv'
#6) 'ns' + <gene>.lower()_snp_info.csv'
#7) <gene>.lower()_af_or.csv'
#8) <gene>.lower() _af_or_kinship.csv
#8) <gene>.lower() _af_or_kinship.csv (ONLY for pncA, but omitted for the final run)
#9) <gene>.lower()'_dynamut2.csv'
#10) <gene>.lower()'_dynamut.csv'
#11) <gene>.lower()'_mcsm_na.csv'
#12) <gene>.lower()'_mcsm_ppi2.csv'
#13) <gene>.lower()'_consurf.csv'
#14) <gene>.lower()'_snap2.csv'
# combining order
#Merge1 = 1 + 2
#Merge2 = 3 + 4
#Merge3 = Merge2 + 5
#Merge4 = Merge1 + Merge3
#Merge5 = 6 + 7
#Merge6 = Merge5 + 8
#Merge7 = Merge4 + Merge6
# Output: single csv of all 8 dfs combined
# useful link
@ -53,10 +49,10 @@ homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
sys.path.append(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
# FIXME: local imports
#from combining import combine_dfs_with_checks
from combining_FIXME import detect_common_cols
from reference_dict import oneletter_aa_dict
@ -119,6 +115,7 @@ gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
if gene.lower() == "gid":
print("\nReading mCSM file for gene:", gene)
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
if gene.lower() == "embb":
print("\nReading mCSM file for gene:", gene)
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
@ -183,14 +180,11 @@ infile_snap2 = outdir + 'snap2/'+ infilename_snap2
snap2_df = pd.read_csv(infile_snap2, sep = ',')
#------------------------------------------------------------------------------
# ONLY:for gene pnca and gid: End logic should pick this up!
# ONLY: for gene 'gid' and 'rpob': End logic should pick this up!
geneL_na = ['gid', 'rpob']
if gene.lower() in geneL_na:
print("\nGene:", gene.lower()
, "\nReading mCSM_na files")
# infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
# infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
# dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
@ -199,18 +193,13 @@ if gene.lower() in geneL_na:
geneL_dy = ['gid']
if gene.lower() in geneL_dy:
print("\nGene:", gene.lower()
, "\nReading Dynamut and mCSM_na files")
, "\nReading Dynamut files")
infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
# infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
# ONLY:for gene embb and alr and katg: End logic should pick this up!
geneL_ppi2 = ['embb', 'alr']
#if gene.lower() == "embb" or "alr":
# ONLY: for genes 'alr', 'embb', 'katg' and 'rpob': End logic should pick this up!
geneL_ppi2 = ['alr', 'embb', 'katg', 'rpob']
if gene.lower() in geneL_ppi2:
infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
@ -224,10 +213,17 @@ else:
#=======
# output
#=======
# outfile 3
out_filename_comb = gene.lower() + '_all_params.csv'
outfile_comb = outdir + out_filename_comb
print('\nOutput filename:', outfile_comb
, '\n===================================================================')
# outfile 2
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
outfile_comb_afor = outdir + out_filename_comb_afor
# outfile 1
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
outfile_stab_struc = outdir + out_filename_stab_struc
# end of variable assignment for input and output files
#%%############################################################################
@ -235,6 +231,22 @@ print('\nOutput filename:', outfile_comb
# some preprocessing
#=====================
#===========
# KD
#===========
kd_df.shape
# geneL_kd = ['alr']
# if gene.lower() in geneL_kd:
# print('\nRunning gene:', gene.lower()
# ,'\nChecking start numbering')
if kd_df['wild_type_kd'].str.contains('X').any():
print('\nDetected X in wild_type_kd'
, '\nRunning gene:', gene.lower()
, '\nChecking start numbering')
kd_df = kd_df[~kd_df['wild_type_kd'].str.contains('X')]
#===========
# FoldX
#===========
@ -305,7 +317,6 @@ else:
#=======================
# Deepddg
# TODO: RERUN 'gid'
#=======================
deepddg_df.shape
@ -324,7 +335,8 @@ print('\nSelecting chain:', sel_chain, 'for gene:', gene)
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
#--------------------------
# Drop chain id col as other targets don't have it. Check for duplicates
# Drop chain_id col as other
# targets don't have it.
#--------------------------
col_to_drop = ['chain_id']
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
@ -374,12 +386,38 @@ else:
, '\nGot:', deepddg_pos2
, '\n======================================================')
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
print('\nPASS: Deepddg data is scaled between -1 and 1',
'\nproceeding with merge')
#--------------------------
# Deepddg outcome category
#--------------------------
deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
deepddg_df[deepddg_df['deepddg']>=0].count()
doc = deepddg_df['deepddg_outcome'].value_counts()
if 'deepddg_outcome' not in deepddg_df.columns:
print('\nCreating column: deepddg_outcome')
deepddg_df['deepddg_outcome'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
deepddg_df[deepddg_df['deepddg']>=0].count()
doc = deepddg_df['deepddg_outcome'].value_counts()
print(doc)
else:
print('\nColumn exists: deepddg_outcome')
t1 = deepddg_df['deepddg_outcome'].value_counts()
deepddg_df['deepddg_outcome2'] = deepddg_df['deepddg'].apply(lambda x: 'Stabilising' if x >= 0 else 'Destabilising')
t2 = deepddg_df['deepddg_outcome2'].value_counts()
print('\n', t1, '\n', t2)
#--------------------------
# Drop deepddg_outcome2 col
#--------------------------
col_to_drop2 = ['deepddg_outcome2']
deepddg_df = deepddg_df.drop(col_to_drop2, axis = 1)
if all(t1 == t2):
print('\nPASS: Deepddg_outcome category checked!')
doc = deepddg_df['deepddg_outcome'].value_counts()
else:
print('\nMISmatch in deepddg_outcome counts'
, '\n:', t1
, '\n:', t2)
if doc['Stabilising'] == deepddg_pos and doc['Stabilising'] == deepddg_pos2:
print('\nPASS: Deepddg outcome category created')
@ -390,18 +428,11 @@ else:
, '\n======================================================')
sys.exit()
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
print('\nPASS: Deepddg data is scaled between -1 and 1',
'\nproceeding with merge')
#=======================
# Consurf
#=======================
consurf_df.shape
# drop row 0: as it contains no value but hangover text
consurf_df = consurf_df.drop(index=0)
#----------------------
# rename columns
#----------------------
@ -419,8 +450,8 @@ if gene.lower() in geneL_consurf:
if gene.lower() == 'alr':
offset_val = 34
print('\nUsing offset val:', offset_val)
if gene.lower() == 'katg':
offset_val = 23
print('\nUsing offset val:', offset_val)
@ -443,7 +474,7 @@ consurf_df = consurf_df.rename(columns={'SEQ' : 'wild_type'
, 'MSADATA' : 'consurf_msa_data'
, 'RESIDUEVARIETY' : 'consurf_aa_variety'})
# quick check
if len(consurf_df) == len(rd_df):
if len(consurf_df) == len(kd_df):
print('\nPASS: length of consurf df is as expected'
, '\nProceeding to format consurf df')
else:
@ -458,6 +489,7 @@ consurf_df['consurf_colour'] = consurf_df['consurf_colour_str'].str.extract(r'(\
consurf_df['consurf_colour'] = consurf_df['consurf_colour'].astype(int)
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_str'].str.replace(r'.\*','0')
# non-structural positions are assigned a *; replacing that with a 0 so it's all integers
consurf_df['consurf_colour_rev'] = consurf_df['consurf_colour_rev'].astype(int)
consurf_df['consurf_ci_upper'] = consurf_df['consurf_ci'].str.extract(r'(.*):')
@ -468,10 +500,10 @@ consurf_df['consurf_ci_lower'] = consurf_df['consurf_ci_lower'].astype(float)
#consurf_df['wt_3upper_f'] = consurf_df['wt_3upper'].str.extract(r'^\w{3}(\d+.*)')
#consurf_df['wt_3upper_f']
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
consurf_df['chain'] = consurf_df['wt_3upper'].str.extract(r':(.*)')
consurf_df['wt_3upper'] = consurf_df['wt_3upper'].str.replace(r'(\d+:.*)', '')
#-------------------------
# scale consurf values
#-------------------------
@ -517,8 +549,11 @@ consurf_df.columns
#---------------------------
# select columns
# (and also determine order)
# this removes redundant cols:
# consurf_colour_str
# consurf_ci
#---------------------------
consurf_df_f = consurf_df[['position'
consurf_col_order = ['position'
, 'wild_type'
, 'chain'
, 'wt_3upper'
@ -530,7 +565,18 @@ consurf_df_f = consurf_df[['position'
, 'consurf_ci_lower'
, 'consurf_ci_colour'
, 'consurf_msa_data'
, 'consurf_aa_variety']]
, 'consurf_aa_variety']
consurf_df_f = consurf_df[consurf_col_order]
# CHECK: whether a general rule or a gene specific rule!
if consurf_df_f['chain'].isna().sum() > 0:
print('\nNaN detected in column chain for consurf df')
#if gene.lower() == 'embb':
print('\nFurther consurf df processing for gene:', gene.lower())
print('\nDropping Nan from column name chain')
consurf_df_f = consurf_df_f[consurf_df_f['chain'].notna()]
#=======================
# SNAP2
@ -610,10 +656,12 @@ else:
, '\nGot:', snap2_pos2
, '\n======================================================')
#---------------------------
#-------------------------------------
# select columns
# (and also determine order)
#---------------------------
# renumbering already done using
# bash and corrected file is read in
#-------------------------------------
snap2_df.dtypes
snap2_df.columns
@ -718,7 +766,7 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
else:
print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')
#%%
#%%============================================================================
print('==================================='
, '\nSecond merge: mcsm_foldx_dfs + deepddg'
, '\n===================================')
@ -735,7 +783,7 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')
#%%============================================================================
#FIXME: select df with 'chain' to allow correct dim merging!
# Select df with 'chain' to allow correct dim merging!
print('==================================='
, '\nThird merge: dssp + kd'
, '\n===================================')
@ -755,7 +803,6 @@ dssp_kd_dfs = pd.merge(dssp_df
#, how = "outer")
, how = "inner")
print('\n\nResult of third merge:', dssp_kd_dfs.shape
, '\n===================================================================')
#%%============================================================================
@ -816,7 +863,7 @@ combined_df = pd.merge(mcsm_foldx_deepddg_dfs
combined_df_expected_cols = ncols_deepddg_merge + ncols_m3 - len(merging_cols_m4)
# FIXME: check logic, doesn't affect anything else!
# Check: whether logic affects anything else!
if not gene == "embB":
print("\nGene is:", gene)
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
@ -859,16 +906,13 @@ combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
combined_df_clean.columns
del(foo)
#%%============================================================================
# Output columns
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
outfile_stab_struc = outdir + out_filename_stab_struc
print('Output filename:', outfile_stab_struc
, '\n===================================================================')
#---------------------
# Output 1: write csv
#---------------------
print('\nWriting file: combined stability and structural parameters'
, '\nOutput 1 filename:', outfile_stab_struc
, '\n===================================================================\n')
combined_df_clean
# write csv
print('\nWriting file: combined stability and structural parameters')
combined_df_clean.to_csv(outfile_stab_struc, index = False)
print('\nFinished writing file:'
, '\nNo. of rows:', combined_df_clean.shape[0]
@ -943,14 +987,14 @@ else:
sys.exit('\nFAIL: merge unsuccessful for af and or')
#%%============================================================================
# Output columns: when dynamut, dynamut2 and others weren't being combined
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
outfile_comb_afor = outdir + out_filename_comb_afor
print('Output filename:', outfile_comb_afor
, '\n===================================================================')
#---------------------
# Output 2: write csv
# when dynamut, dynamut2 and others weren't being combined
#---------------------
print('\nWriting file: combined stability and afor'
, '\nOutput 2 filename:', outfile_comb_afor
, '\n===================================================================\n')
# write csv
print('Writing file: combined stability and afor')
combined_stab_afor.to_csv(outfile_comb_afor, index = False)
print('\nFinished writing file:'
, '\nNo. of rows:', combined_stab_afor.shape[0]
@ -966,9 +1010,9 @@ if gene.lower() == "gid":
if gene.lower() == "embb":
dfs_list = [dynamut2_df, mcsm_ppi2_df]
if gene.lower() == "katg":
dfs_list = [dynamut2_df]
dfs_list = [dynamut2_df, mcsm_ppi2_df]
if gene.lower() == "rpob":
dfs_list = [dynamut2_df, mcsm_na_df]
dfs_list = [dynamut2_df, mcsm_na_df, mcsm_ppi2_df]
if gene.lower() == "alr":
dfs_list = [dynamut2_df, mcsm_ppi2_df]
@ -1014,7 +1058,6 @@ else:
, '\nExpected nrows:', expected_nrows
, '\nGot:', len(dfs_merged_clean) )
# FIXME: need to extract 'cols_to_drop' programmatically
# Drop cols
if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
print('\nDuplicate column names detected...'
@ -1027,10 +1070,14 @@ if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
else:
print('\nNo duplicate column names detected, just writing file'
, '\nTotal cols:', len(combined_all_params.columns) )
#del(foo)
#%% Done for gid on 10/09/2021
# write csv
print('Writing file: all params')
#%%============================================================================
#---------------------
# Output 3: write csv
#---------------------
print('\nWriting file: all params')
print('\nOutput 3 filename:', outfile_comb
, '\n===================================================================\n')
combined_all_params.to_csv(outfile_comb, index = False)
print('\nFinished writing file:'