merged changes from the combining_dfs.py file from branch 'embb_dev'
This commit is contained in:
commit
46e2c93885
1 changed file with 60 additions and 26 deletions
|
@ -59,7 +59,7 @@ os.getcwd()
|
||||||
# FIXME: local imports
|
# FIXME: local imports
|
||||||
#from combining import combine_dfs_with_checks
|
#from combining import combine_dfs_with_checks
|
||||||
from combining_FIXME import detect_common_cols
|
from combining_FIXME import detect_common_cols
|
||||||
from reference_dict import oneletter_aa_dict
|
from reference_dict import oneletter_aa_dict
|
||||||
from reference_dict import low_3letter_dict
|
from reference_dict import low_3letter_dict
|
||||||
|
|
||||||
from aa_code import get_aa_3lower
|
from aa_code import get_aa_3lower
|
||||||
|
@ -114,16 +114,16 @@ if not outdir:
|
||||||
#=======
|
#=======
|
||||||
# input
|
# input
|
||||||
#=======
|
#=======
|
||||||
gene_list_normal = ["pnca", "katg", "rpob", "alr"]
|
gene_list_normal = ['pnca', 'katg', 'rpob', 'alr']
|
||||||
|
|
||||||
#FIXME: for gid, this should be SRY as this is the drug...please check!!!!
|
|
||||||
if gene.lower() == "gid":
|
if gene.lower() == "gid":
|
||||||
print("\nReading mCSM file for gene:", gene)
|
print("\nReading mCSM file for gene:", gene)
|
||||||
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
|
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SRY.csv' # was incorrectly SAM previously
|
||||||
if gene.lower() == "embb":
|
if gene.lower() == "embb":
|
||||||
print("\nReading mCSM file for gene:", gene)
|
print("\nReading mCSM file for gene:", gene)
|
||||||
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
|
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
|
||||||
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
|
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
|
||||||
|
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm3.csv' #851
|
||||||
|
|
||||||
if gene.lower() in gene_list_normal:
|
if gene.lower() in gene_list_normal:
|
||||||
print("\nReading mCSM file for gene:", gene)
|
print("\nReading mCSM file for gene:", gene)
|
||||||
|
@ -172,17 +172,29 @@ mcsm_f_snps = pd.read_csv(infile_mcsm_f_snps, sep = ',', names = ['mu
|
||||||
|
|
||||||
#------------------------------------------------------------------------------
|
#------------------------------------------------------------------------------
|
||||||
# ONLY:for gene pnca and gid: End logic should pick this up!
|
# ONLY:for gene pnca and gid: End logic should pick this up!
|
||||||
geneL_dy_na = ['gid']
|
geneL_na = ['gid', 'rpob']
|
||||||
if gene.lower() in geneL_dy_na :
|
if gene.lower() in geneL_na:
|
||||||
|
print("\nGene:", gene.lower()
|
||||||
|
, "\nReading mCSM_na files")
|
||||||
|
# infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
|
||||||
|
# infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
|
||||||
|
# dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
|
||||||
|
|
||||||
|
infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
|
||||||
|
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
||||||
|
mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
||||||
|
|
||||||
|
geneL_dy = ['gid']
|
||||||
|
if gene.lower() in geneL_dy:
|
||||||
print("\nGene:", gene.lower()
|
print("\nGene:", gene.lower()
|
||||||
, "\nReading Dynamut and mCSM_na files")
|
, "\nReading Dynamut and mCSM_na files")
|
||||||
infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
|
infilename_dynamut = gene.lower() + '_dynamut_norm.csv' # gid
|
||||||
infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
|
infile_dynamut = outdir + 'dynamut_results/' + infilename_dynamut
|
||||||
dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
|
dynamut_df = pd.read_csv(infile_dynamut, sep = ',')
|
||||||
|
|
||||||
infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
|
# infilename_mcsm_na = gene.lower() + '_complex_mcsm_na_norm.csv' # gid
|
||||||
infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
# infile_mcsm_na = outdir + 'mcsm_na_results/' + infilename_mcsm_na
|
||||||
mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
# mcsm_na_df = pd.read_csv(infile_mcsm_na, sep = ',')
|
||||||
|
|
||||||
# ONLY:for gene embb and alr: End logic should pick this up!
|
# ONLY:for gene embb and alr: End logic should pick this up!
|
||||||
geneL_ppi2 = ['embb', 'alr']
|
geneL_ppi2 = ['embb', 'alr']
|
||||||
|
@ -192,7 +204,6 @@ if gene.lower() in geneL_ppi2:
|
||||||
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
|
infile_mcsm_ppi2 = outdir + 'mcsm_ppi2/' + infilename_mcsm_ppi2
|
||||||
mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',')
|
mcsm_ppi2_df = pd.read_csv(infile_mcsm_ppi2, sep = ',')
|
||||||
|
|
||||||
|
|
||||||
if gene.lower() == "embb":
|
if gene.lower() == "embb":
|
||||||
sel_chain = "B"
|
sel_chain = "B"
|
||||||
else:
|
else:
|
||||||
|
@ -227,7 +238,7 @@ foldx_df['ddg_foldx']
|
||||||
|
|
||||||
# Rescale values in Foldx_change col b/w -1 and 1 so negative numbers
|
# Rescale values in Foldx_change col b/w -1 and 1 so negative numbers
|
||||||
# stay neg and pos numbers stay positive
|
# stay neg and pos numbers stay positive
|
||||||
foldx_min = foldx_df['ddg_foldx'].min()
|
foldx_min = foldx_df['ddg_foldx'].min()
|
||||||
foldx_max = foldx_df['ddg_foldx'].max()
|
foldx_max = foldx_df['ddg_foldx'].max()
|
||||||
foldx_min
|
foldx_min
|
||||||
foldx_max
|
foldx_max
|
||||||
|
@ -299,7 +310,13 @@ if len(deepddg_df.loc[:,'chain_id'].value_counts()) > 1:
|
||||||
print('\nSelecting chain:', sel_chain, 'for gene:', gene)
|
print('\nSelecting chain:', sel_chain, 'for gene:', gene)
|
||||||
|
|
||||||
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
|
deepddg_df = deepddg_df[deepddg_df['chain_id'] == sel_chain]
|
||||||
|
|
||||||
|
#--------------------------
|
||||||
|
# Drop chain id col as other targets don't have it. Check for duplicates
|
||||||
|
#--------------------------
|
||||||
|
col_to_drop = ['chain_id']
|
||||||
|
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
|
||||||
|
|
||||||
#--------------------------
|
#--------------------------
|
||||||
# Check for duplicates
|
# Check for duplicates
|
||||||
#--------------------------
|
#--------------------------
|
||||||
|
@ -312,12 +329,6 @@ if len(deepddg_df['mutationinformation'].duplicated().value_counts())> 1:
|
||||||
else:
|
else:
|
||||||
print("\nPASS: No duplicates detected in DeepDDG infile")
|
print("\nPASS: No duplicates detected in DeepDDG infile")
|
||||||
|
|
||||||
#--------------------------
|
|
||||||
# Drop chain id col as other targets don't have it.Check for duplicates
|
|
||||||
#--------------------------
|
|
||||||
col_to_drop = ['chain_id']
|
|
||||||
deepddg_df = deepddg_df.drop(col_to_drop, axis = 1)
|
|
||||||
|
|
||||||
#-------------------------
|
#-------------------------
|
||||||
# scale Deepddg values
|
# scale Deepddg values
|
||||||
#-------------------------
|
#-------------------------
|
||||||
|
@ -366,8 +377,7 @@ else:
|
||||||
, '\nGot:', doc[0]
|
, '\nGot:', doc[0]
|
||||||
, '\n======================================================')
|
, '\n======================================================')
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
|
|
||||||
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
if deepddg_df['deepddg_scaled'].min() == -1 and deepddg_df['deepddg_scaled'].max() == 1:
|
||||||
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
print('\nPASS: Deepddg data is scaled between -1 and 1',
|
||||||
'\nproceeding with merge')
|
'\nproceeding with merge')
|
||||||
|
@ -571,7 +581,7 @@ foo = combined_df[['wild_type', 'wild_type_kd', 'wt_3letter_caps', 'wt_aa_3lower
|
||||||
# Drop cols
|
# Drop cols
|
||||||
cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
|
cols_to_drop = ['chain_id', 'wild_type_kd', 'wild_type_dssp', 'wt_3letter_caps']
|
||||||
combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
|
combined_df_clean = combined_df.drop(cols_to_drop, axis = 1)
|
||||||
|
combined_df_clean.columns
|
||||||
del(foo)
|
del(foo)
|
||||||
#%%============================================================================
|
#%%============================================================================
|
||||||
# Output columns
|
# Output columns
|
||||||
|
@ -611,7 +621,7 @@ get_aa_1upper(df = afor_df
|
||||||
afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
|
afor_df['mutationinformation'] = afor_df['wild_type'] + afor_df['position'].map(str) + afor_df['mutant_type']
|
||||||
afor_cols = afor_df.columns
|
afor_cols = afor_df.columns
|
||||||
|
|
||||||
merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df)
|
merging_cols_m5 = detect_common_cols(combined_df_clean, afor_df)
|
||||||
|
|
||||||
# remove position so that merging can take place without dtype conflicts
|
# remove position so that merging can take place without dtype conflicts
|
||||||
merging_cols_m5.remove('position')
|
merging_cols_m5.remove('position')
|
||||||
|
@ -683,14 +693,24 @@ if gene.lower() == "embb":
|
||||||
if gene.lower() == "katg":
|
if gene.lower() == "katg":
|
||||||
dfs_list = [dynamut2_df]
|
dfs_list = [dynamut2_df]
|
||||||
if gene.lower() == "rpob":
|
if gene.lower() == "rpob":
|
||||||
dfs_list = [dynamut2_df]
|
dfs_list = [dynamut2_df, mcsm_na_df]
|
||||||
if gene.lower() == "alr":
|
if gene.lower() == "alr":
|
||||||
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
dfs_list = [dynamut2_df, mcsm_ppi2_df]
|
||||||
|
|
||||||
|
# noticed that with revised rpoB that mcsm-NA had one less position,
|
||||||
|
# Hence this condition else the last check fails with discrepancy for expected_nrows
|
||||||
|
if len(dfs_list) > 1:
|
||||||
|
join_type = 'outer'
|
||||||
|
else:
|
||||||
|
join_type = 'inner'
|
||||||
|
|
||||||
|
print('\nUsing join type: "', join_type, '" for the last but one merge')
|
||||||
|
|
||||||
dfs_merged = reduce(lambda left,right: pd.merge(left
|
dfs_merged = reduce(lambda left,right: pd.merge(left
|
||||||
, right
|
, right
|
||||||
, on = ['mutationinformation']
|
, on = ['mutationinformation']
|
||||||
, how = 'inner')
|
#, how = 'inner')
|
||||||
|
, how = join_type)
|
||||||
, dfs_list)
|
, dfs_list)
|
||||||
# drop excess columns
|
# drop excess columns
|
||||||
drop_cols = detect_common_cols(dfs_merged, combined_stab_afor)
|
drop_cols = detect_common_cols(dfs_merged, combined_stab_afor)
|
||||||
|
@ -718,7 +738,21 @@ else:
|
||||||
, '\nGot:', len(dfs_merged_clean.columns)
|
, '\nGot:', len(dfs_merged_clean.columns)
|
||||||
, '\nExpected nrows:', expected_nrows
|
, '\nExpected nrows:', expected_nrows
|
||||||
, '\nGot:', len(dfs_merged_clean) )
|
, '\nGot:', len(dfs_merged_clean) )
|
||||||
|
|
||||||
|
# FIXME: need to extract 'cols_to_drop' programmatically
|
||||||
|
# Drop cols
|
||||||
|
if combined_all_params.columns.str.contains(r'_x$|_y$', regex = True).any():
|
||||||
|
print('\nDuplicate column names detected...'
|
||||||
|
, '\nDropping these before writing file')
|
||||||
|
extra_cols_to_drop = list(combined_all_params.columns.str.extract(r'(.*_x$|.*_y$)', expand = True).dropna()[0])
|
||||||
|
print('\nTotal cols:', len(combined_all_params.columns)
|
||||||
|
,'\nDropping:', len(extra_cols_to_drop), 'columns')
|
||||||
|
#extra_cols_to_drop = ['chain_x', 'chain_y']
|
||||||
|
combined_all_params = combined_all_params.drop(extra_cols_to_drop, axis = 1)
|
||||||
|
else:
|
||||||
|
print('\nNo duplicate column names detected, just writing file'
|
||||||
|
, '\nTotal cols:', len(combined_all_params.columns) )
|
||||||
|
#del(foo)
|
||||||
#%% Done for gid on 10/09/2021
|
#%% Done for gid on 10/09/2021
|
||||||
# write csv
|
# write csv
|
||||||
print('Writing file: all params')
|
print('Writing file: all params')
|
||||||
|
@ -727,4 +761,4 @@ combined_all_params.to_csv(outfile_comb, index = False)
|
||||||
print('\nFinished writing file:'
|
print('\nFinished writing file:'
|
||||||
, '\nNo. of rows:', combined_all_params.shape[0]
|
, '\nNo. of rows:', combined_all_params.shape[0]
|
||||||
, '\nNo. of cols:', combined_all_params.shape[1])
|
, '\nNo. of cols:', combined_all_params.shape[1])
|
||||||
#%% end of script
|
#%% end of script
|
Loading…
Add table
Add a link
Reference in a new issue