saving work after running combining_dfs.py
parent 1ddc5045d5
commit 45c48485f1
7 changed files with 136 additions and 92 deletions
@@ -119,7 +119,7 @@ gene_list_normal = ["pnca", "katg", "rpob", "alr"]
#FIXME: for gid, this should be SRY as this is the drug...please check!!!!
if gene.lower() == "gid":
print("\nReading mCSM file for gene:", gene)
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv'
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
if gene.lower() == "embb":
print("\nReading mCSM file for gene:", gene)
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv'
@@ -140,7 +140,7 @@ deepddg_df = pd.read_csv(infile_deepddg, sep = ',')

in_filename_dssp = gene.lower() + '_dssp.csv'
infile_dssp = outdir + in_filename_dssp
dssp_df = pd.read_csv(infile_dssp, sep = ',')
dssp_df_raw = pd.read_csv(infile_dssp, sep = ',')

in_filename_kd = gene.lower() + '_kd.csv'
infile_kd = outdir + in_filename_kd
@@ -164,10 +164,13 @@ infilename_dynamut2 = gene.lower() + '_dynamut2_norm.csv'
infile_dynamut2 = outdir + 'dynamut_results/dynamut2/' + infilename_dynamut2
dynamut2_df = pd.read_csv(infile_dynamut2, sep = ',')

#------------------------------------------------------------
infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)

#------------------------------------------------------------------------------
# ONLY:for gene pnca and gid: End logic should pick this up!
geneL_dy_na = ["pnca", "gid"]
#if gene.lower() == "pnca" or "gid" :
geneL_dy_na = ['gid']
if gene.lower() in geneL_dy_na :
print("\nGene:", gene.lower()
, "\nReading Dynamut and mCSM_na files")
@@ -179,27 +182,28 @@ if gene.lower() in geneL_dy_na :
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')

# FIXME: ppi2, not extracted as expected for alr
# TODO: get mcsm_ppi2 data for alr
# ONLY:for gene embb and alr: End logic should pick this up!
geneL_ppi2 = ["embb", "alr"]
geneL_ppi2 = ['embb', 'alr']
#if gene.lower() == "embb" or "alr":
if gene.lower() in "embb" or "alr":
if gene.lower() in geneL_ppi2:
infilename_mcsm_ppi2 = gene.lower() + '_complex_mcsm_ppi2_norm.csv'
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',')
#--------------------------------------------------------------
infilename_mcsm_f_snps = gene.lower() + '_mcsm_formatted_snps.csv'
infile_mcsm_f_snps = outdir + infilename_mcsm_f_snps
mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mutationinformation'], header = None)


if gene.lower() == "embb":
sel_chain = "B"
else:
sel_chain = "A"
#------------------------------------------------------------------------------
#=======
# output
#=======
out_filename_comb = gene.lower() + '_all_params.csv'
outfile_comb = outdir + out_filename_comb
print('Output filename:', outfile_comb
print('\nOutput filename:', outfile_comb
, '\n===================================================================')

# end of variable assignment for input and output files
#%%############################################################################
#=====================
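The condition replaced in the hunk above is a common Python pitfall: gene.lower() in "embb" or "alr" parses as (gene.lower() in "embb") or "alr", and a non-empty string literal is always truthy, so the old test passed for every gene. Membership in a list, as in the new line, behaves as intended; a quick illustration (the gene value here is made up):

gene = "katg"
print(gene.lower() in "embb" or "alr")    # prints 'alr' (truthy), so the old test never fails
print(gene.lower() in ["embb", "alr"])    # prints False, the intended membership check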
@@ -233,7 +237,7 @@ len(foldx_df.loc[foldx_df['ddg_foldx'] < 0])
foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed')

foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale)
print('Raw foldx scores:\n', foldx_df['ddg_foldx']
print('\nRaw foldx scores:\n', foldx_df['ddg_foldx']
, '\n---------------------------------------------------------------'
, '\nScaled foldx scores:\n', foldx_df['foldx_scaled'])
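The lambda above rescales ddG so that negative values are divided by the magnitude of the column minimum and non-negative values by the column maximum, landing everything in [-1, 1]. A self-contained sketch of that idea; the names follow the script, the numbers are illustrative only:

import pandas as pd

foldx_df = pd.DataFrame({'ddg_foldx': [-2.0, -0.5, 0.0, 1.5, 3.0]})   # made-up values
foldx_min = foldx_df['ddg_foldx'].min()                               # -2.0
foldx_max = foldx_df['ddg_foldx'].max()                               #  3.0

# negatives / |min| -> [-1, 0); non-negatives / max -> [0, 1]
foldx_scale = lambda x : x/abs(foldx_min) if x < 0 else (x/foldx_max if x >= 0 else 'failed')
foldx_df['foldx_scaled'] = foldx_df['ddg_foldx'].apply(foldx_scale)
print(foldx_df['foldx_scaled'].tolist())   # [-1.0, -0.25, 0.0, 0.5, 1.0]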
@@ -276,9 +280,42 @@ else:

#=======================
# Deepddg
# TODO: RERUN 'gid'
#=======================
deepddg_df.shape

#--------------------------
# check if >1 chain
#--------------------------
deepddg_df.loc[:,'chain_id'].value_counts()

if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
print("\nChains detected: >1"
, "\nGene:", gene
, "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index)

print('\nSelecting chain:', sel_chain, 'for gene:', gene)

deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]

#--------------------------
# Check for duplicates
#--------------------------
if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
print("\nFAIL: Duplicates detected in DeepDDG infile"
, "\nNo. of duplicates:"
, deepddg_df['mutationinformation'].duplicated().value_counts()[1]
, "\nformat deepDDG infile before proceeding")
sys.exit()
else:
print("\nPASS: No duplicates detected in DeepDDG infile")

#--------------------------
# Drop chain id col as other targets don't have it.Check for duplicates
#--------------------------
col_to_drop = ['chain_id']
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)

#-------------------------
# scale Deepddg values
#-------------------------
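The duplicate check in the hunk above counts duplicates indirectly, via a second entry in value_counts(). An equivalent, slightly more direct form of the same check, shown only as a sketch (same column and messages as the script):

import sys

n_dups = int(deepddg_df['mutationinformation'].duplicated().sum())   # rows flagged as repeats
if n_dups > 0:
    print("\nFAIL: Duplicates detected in DeepDDG infile"
          , "\nNo. of duplicates:", n_dups
          , "\nformat deepDDG infile before proceeding")
    sys.exit()
else:
    print("\nPASS: No duplicates detected in DeepDDG infile")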
@@ -290,7 +327,7 @@ deepddg_max = deepddg_df['deepddg'].max()
deepddg_scale = lambda x : x/abs(deepddg_min) if x < 0 else (x/deepddg_max if x >= 0 else 'failed')

deepddg_df['deepddg_scaled'] = deepddg_df['deepddg'].apply(deepddg_scale)
print('Raw deepddg scores:\n', deepddg_df['deepddg']
print('\nRaw deepddg scores:\n', deepddg_df['deepddg']
, '\n---------------------------------------------------------------'
, '\nScaled deepddg scores:\n', deepddg_df['deepddg_scaled'])
@@ -310,8 +347,8 @@ else:
print('\nFAIL: deepddg values scaled numbers MISmatch'
, '\nExpected number:', deepddg_pos
, '\nGot:', deepddg_pos2
, '\n======================================================')

, '\n======================================================')

#--------------------------
# Deepddg outcome category
#--------------------------
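The mismatch message above compares deepddg_pos and deepddg_pos2; their definitions sit outside this hunk, but the intent appears to be that scaling must not change how many scores are positive, since dividing by positive constants preserves sign. A sketch of that invariant, assuming the two variables are the before/after counts:

deepddg_pos  = (deepddg_df['deepddg'] > 0).sum()           # positives in the raw scores
deepddg_pos2 = (deepddg_df['deepddg_scaled'] > 0).sum()    # positives after scaling
assert deepddg_pos == deepddg_pos2, 'deepddg scaled value counts mismatch'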
@@ -327,49 +364,15 @@ else:
, '\nGot:', doc[0]
, '\n======================================================')
sys.exit()

#--------------------------
# check if >1 chain
#--------------------------
deepddg_df.loc[:,'chain_id'].value_counts()

if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
print("\nChains detected: >1"
, "\nGene:", gene
, "\nChains:", deepddg_df.loc[:,'chain_id'].value_counts().index)

#--------------------------
# FIXME: This needs to happen BEFORE scaling as it will vary
# subset chain
#--------------------------
if gene.lower() == "embb":
sel_chain = "B"
else:
sel_chain = "A"

deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
print('\nPASS: Deepddg data is scaled between -1 and 1',
'\nproceeding with merge')

#--------------------------
# Check for duplicates
#--------------------------
if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
print("\nFAIL: Duplicates detected in DeepDDG infile"
, "\nNo. of duplicates:"
, deepddg_df['mutationinformation'].duplicated().value_counts()[1]
, "\nformat deepDDG infile before proceeding")
sys.exit()
else:
print("\nPASS: No duplicates detected in DeepDDG infile")

#--------------------------
# Drop chain id col as other targets don't have itCheck for duplicates
#--------------------------
col_to_drop = ['chain_id']
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)

#%%=============================================================================
#%%============================================================================
# Now merges begin
#%%=============================================================================

print('==================================='
, '\nFirst merge: mcsm + foldx'
, '\n===================================')
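Most of the block above is the removed copy of the chain-subsetting and duplicate-check logic; the FIXME in it ("This needs to happen BEFORE scaling") is why those steps now sit earlier in the script: the scaling bounds come from the min/max of the rows that are kept, so the chain has to be selected first. A sketch of the required order, using the script's names:

sel_chain = "B" if gene.lower() == "embb" else "A"              # per-gene chain choice
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]    # subset the chain first...
deepddg_min = deepddg_df['deepddg'].min()                       # ...then derive the scaling bounds
deepddg_max = deepddg_df['deepddg'].max()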
@@ -399,10 +402,11 @@ print('\n\nResult of first merge:', mcsm_foldx_dfs.shape
mcsm_foldx_dfs[merging_cols_m1].apply(len)
mcsm_foldx_dfs[merging_cols_m1].apply(len) == len(mcsm_foldx_dfs)

#%% for embB and any other targets where mCSM-lig hasn't run for
# get the empty cells to be full of meaningful info
#%% for embB and any other targets where mCSM-lig hasn't run for ALL nsSNPs.
# Get the empty cells to be full of meaningful info
if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
print ("NAs detected in mcsm cols after merge")
print ('\nNAs detected in mcsm cols after merge.'
, '\nCleaning data before merging deepddg_df')

##############################
# Extract relevant col values
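The guard above catches rows where mCSM-lig has not run: after the merge those rows carry NaN across the mCSM-derived columns, which the block that follows rebuilds. A small diagnostic along the same lines, counting NAs per column before deciding to clean (column bounds as in the script):

mcsm_block = mcsm_foldx_dfs.loc[:, 'wild_type':'mut_aa_3lower']   # the mCSM-derived columns
na_counts = mcsm_block.isnull().sum()                             # NAs per column
if na_counts.any():
    print('\nNAs detected in mcsm cols after merge:\n', na_counts[na_counts > 0])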
@@ -446,6 +450,8 @@ if mcsm_foldx_dfs.loc[:,'wild_type': 'mut_aa_3lower'].isnull().values.any():
mcsm_foldx_dfs['wt_aa_3lower'] = wt.map(lookup_dict)
mut = mcsm_foldx_dfs['mutant_type'].squeeze()
mcsm_foldx_dfs['mut_aa_3lower'] = mut.map(lookup_dict)
else:
print('\nNo NAs detected in mcsm_fold_dfs. Proceeding to merge deepddg_df')

#%%
print('==================================='
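The map() calls in the hunk above assume lookup_dict translates one-letter residue codes into lowercase three-letter codes; the dict itself is defined elsewhere in the project, so the version below is only a plausible reconstruction:

lookup_dict = {'A': 'ala', 'R': 'arg', 'N': 'asn', 'D': 'asp', 'C': 'cys',
               'Q': 'gln', 'E': 'glu', 'G': 'gly', 'H': 'his', 'I': 'ile',
               'L': 'leu', 'K': 'lys', 'M': 'met', 'F': 'phe', 'P': 'pro',
               'S': 'ser', 'T': 'thr', 'W': 'trp', 'Y': 'tyr', 'V': 'val'}

wt = mcsm_foldx_dfs['wild_type'].squeeze()             # one-letter wild-type codes
mcsm_foldx_dfs['wt_aa_3lower'] = wt.map(lookup_dict)   # -> 'ala', 'arg', ...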
@@ -464,14 +470,18 @@ ncols_deepddg_merge = len(mcsm_foldx_deepddg_dfs.columns)
mcsm_foldx_deepddg_dfs['position'] = mcsm_foldx_deepddg_dfs['position'].astype('int64')

#%%============================================================================
#FIXME: select df with 'chain' to allow corret dim merging!
print('==================================='
, '\Third merge: dssp + kd'
, '\nThird merge: dssp + kd'
, '\n===================================')

dssp_df.shape
dssp_df_raw.shape
kd_df.shape
rd_df.shape

dssp_df = dssp_df_raw[dssp_df_raw['chain_id'] == sel_chain]
dssp_df['chain_id'].value_counts()

#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = "outer")
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df
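detect_common_cols() is a project helper that is not part of this diff; a minimal stand-in consistent with how it is used here is the ordered intersection of the two frames' columns, followed by a merge on those columns. The outer join mirrors the commented-out combine_dfs_with_checks call and is an assumption, since the pd.merge call itself is cut off by the hunk boundary:

def detect_common_cols(df1, df2):
    # columns present in both frames, in df1's order
    return [c for c in df1.columns if c in df2.columns]

merging_cols_m2 = detect_common_cols(dssp_df, kd_df)
dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = 'outer')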
@@ -557,17 +567,19 @@ combined_df['wild_type'].equals(combined_df['wild_type_dssp'])
foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower', 'mut_aa_3lower']]

# Drop cols
cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps' ]
cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)

del(foo)
#%%============================================================================
# Output columns
out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv'
outfile_stab_struc = outdir + '/' + out_filename_stab_struc
outfile_stab_struc = outdir + out_filename_stab_struc
print('Output filename:', outfile_stab_struc
, '\n===================================================================')

combined_df_clean

# write csv
print('\nWriting file: combined stability and structural parameters')
combined_df_clean.to_csv(outfile_stab_struc, index = False)
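The output-path change above (and the matching one in the next hunk) drops the extra '/', which implies outdir already ends in a separator; os.path.join would make that intent explicit either way. Shown only as an alternative, not as what the script does:

import os
outfile_stab_struc = os.path.join(outdir, out_filename_stab_struc)   # handles the separator itself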
@@ -646,7 +658,7 @@ else:
#%%============================================================================
# Output columns: when dynamut, dynamut2 and others weren't being combined
out_filename_comb_afor = gene.lower() + '_comb_afor.csv'
outfile_comb_afor = outdir + '/' + out_filename_comb_afor
outfile_comb_afor = outdir + out_filename_comb_afor
print('Output filename:', outfile_comb_afor
, '\n===================================================================')
@@ -661,7 +673,7 @@ print('\nFinished writing file:'
#dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df] # gid

if gene.lower() == "pnca":
dfs_list = [dynamut_df, dynamut2_df]
dfs_list = [dynamut2_df]
if gene.lower() == "gid":
dfs_list = [dynamut_df, dynamut2_df, mcsm_na_df]
if gene.lower() == "embb":