added active site indication for merged_dfs in count_vars_ML.R and also added 'gene_name' in combining_dfs.py
This commit is contained in:
parent
1204f1faba
commit
f355846dae
3 changed files with 59 additions and 15 deletions
|
@ -124,7 +124,7 @@ if gene.lower() == "embb":
|
||||||
print("\nReading mCSM file for gene:", gene)
|
print("\nReading mCSM file for gene:", gene)
|
||||||
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
|
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm1.csv' #798
|
||||||
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
|
#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm2.csv' #844
|
||||||
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm3.csv' #851
|
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm3.csv' #858 [got all muts on 15/05/2022]
|
||||||
|
|
||||||
if gene.lower() in gene_list_normal:
|
if gene.lower() in gene_list_normal:
|
||||||
print("\nReading mCSM file for gene:", gene)
|
print("\nReading mCSM file for gene:", gene)
|
||||||
|
@ -1081,21 +1081,35 @@ combined_df = pd.merge(combined_7dfs
|
||||||
|
|
||||||
combined_df_expected_cols = len(combined_7dfs.columns) + len(snap2_df_f.columns) - len(merging_cols_m4_v2)
|
combined_df_expected_cols = len(combined_7dfs.columns) + len(snap2_df_f.columns) - len(merging_cols_m4_v2)
|
||||||
|
|
||||||
#%% Dimension checks for combined_df:
|
#%% Dimension checks for combined_df: especially because embB didn't have mCSM
|
||||||
|
# values for all muts. This was fixed in UQ, and this was rerun on 15/05/2022
|
||||||
|
#------------------------------------------------------------------------------
|
||||||
|
# No longer required since mCSM values were fetched for all muts!
|
||||||
# CHECK whether logic affects anything else!
|
# CHECK whether logic affects anything else!
|
||||||
if not gene == "embB":
|
# if not gene == "embB":
|
||||||
print("\nGene is:", gene)
|
# print("\nGene is:", gene)
|
||||||
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
|
# if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
|
||||||
print('PASS: successfully combined 8 dfs'
|
# print('PASS: successfully combined 8 dfs'
|
||||||
, '\nNo. of rows combined_df:', len(combined_df)
|
# , '\nNo. of rows combined_df:', len(combined_df)
|
||||||
, '\nNo. of cols combined_df:', len(combined_df.columns))
|
# , '\nNo. of cols combined_df:', len(combined_df.columns))
|
||||||
elif len(combined_df) == len(foldx_df) and len(combined_df.columns) == combined_df_expected_cols:
|
# elif len(combined_df) == len(foldx_df) and len(combined_df.columns) == combined_df_expected_cols:
|
||||||
|
# print('PASS: successfully combined 8 dfs'
|
||||||
|
# , '\nNo. of rows combined_df:', len(combined_df)
|
||||||
|
# , '\nNo. of cols combined_df:', len(combined_df.columns))
|
||||||
|
# else:
|
||||||
|
# sys.exit('FAIL: check individual merges for seventh merge')
|
||||||
|
#------------------------------------------------------------------------------
|
||||||
|
print("\nRunning Dimension check for combined_df for gene:", gene)
|
||||||
|
|
||||||
|
if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
|
||||||
print('PASS: successfully combined 8 dfs'
|
print('PASS: successfully combined 8 dfs'
|
||||||
, '\nNo. of rows combined_df:', len(combined_df)
|
, '\nNo. of rows combined_df:', len(combined_df)
|
||||||
, '\nNo. of cols combined_df:', len(combined_df.columns))
|
, '\nNo. of cols combined_df:', len(combined_df.columns))
|
||||||
else:
|
else:
|
||||||
sys.exit('FAIL: check individual merges for seventh merge')
|
sys.exit('FAIL: check individual merges for seventh merge')
|
||||||
|
|
||||||
|
print('\n====================================================')
|
||||||
|
|
||||||
print('\nResult of Seventh merge:', combined_df.shape
|
print('\nResult of Seventh merge:', combined_df.shape
|
||||||
, '\n===================================================================')
|
, '\n===================================================================')
|
||||||
|
|
||||||
|
@ -1361,6 +1375,12 @@ else:
|
||||||
, '\nGot:', len(combined_all_params_f2.columns)
|
, '\nGot:', len(combined_all_params_f2.columns)
|
||||||
, '\nExpected nrows:', expected_nrows
|
, '\nExpected nrows:', expected_nrows
|
||||||
, '\nGot:', len(combined_all_params_f2) )
|
, '\nGot:', len(combined_all_params_f2) )
|
||||||
|
|
||||||
|
#---------------------------------------
|
||||||
|
# Add gene name
|
||||||
|
#---------------------------------------
|
||||||
|
combined_all_params_f2['gene_name'] = gene.lower()
|
||||||
|
|
||||||
#---------------------------------------
|
#---------------------------------------
|
||||||
# Add pdb_file name at the end
|
# Add pdb_file name at the end
|
||||||
#---------------------------------------
|
#---------------------------------------
|
||||||
|
|
|
@ -2,13 +2,33 @@
|
||||||
|
|
||||||
#source("~/git/LSHTM_analysis/config/alr.R")
|
#source("~/git/LSHTM_analysis/config/alr.R")
|
||||||
#source("~/git/LSHTM_analysis/config/embb.R")
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
#source("~/git/LSHTM_analysis/config/gid.R")
|
##source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
#source("~/git/LSHTM_analysis/config/katg.R")
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
#source("~/git/LSHTM_analysis/config/pnca.R")
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
#source("~/git/LSHTM_analysis/config/rpob.R")
|
source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
|
||||||
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
|
|
||||||
|
################################################
|
||||||
|
# Add active site indication
|
||||||
|
###############################################
|
||||||
|
merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos)
|
||||||
|
merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
|
||||||
|
|
||||||
|
merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
|
||||||
|
merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
|
||||||
|
|
||||||
|
# sanity check
|
||||||
|
table(merged_df2$active_site)
|
||||||
|
table(merged_df3$active_site)
|
||||||
|
|
||||||
|
if( all(table(merged_df2$active_site) == table(as.integer(merged_df2$position %in% active_aa_pos))) &&
|
||||||
|
all(table(merged_df3$active_site) == table(as.integer(merged_df3$position %in% active_aa_pos)))
|
||||||
|
){
|
||||||
|
cat('\nActive site indications successfully applied to merged_dfs for gene:', tolower(gene))
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
gene
|
gene
|
||||||
gene_match
|
gene_match
|
||||||
|
|
||||||
|
@ -68,6 +88,9 @@ sum(table(merged_df3$drtype_mode_labels))
|
||||||
table(merged_df3$lineage)
|
table(merged_df3$lineage)
|
||||||
sum(table(merged_df3$lineage_labels))
|
sum(table(merged_df3$lineage_labels))
|
||||||
|
|
||||||
|
cat("\nWriting merged_df3 for:"
|
||||||
|
, "\nDrug:", drug
|
||||||
|
, "\nGene:", gene)
|
||||||
# write file
|
# write file
|
||||||
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||||
outfile_merged_df3
|
outfile_merged_df3
|
||||||
|
|
|
@ -87,6 +87,7 @@ merged_df2 = all_plot_dfs[[1]]
|
||||||
merged_df3 = all_plot_dfs[[2]]
|
merged_df3 = all_plot_dfs[[2]]
|
||||||
merged_df2_comp = all_plot_dfs[[3]]
|
merged_df2_comp = all_plot_dfs[[3]]
|
||||||
merged_df3_comp = all_plot_dfs[[4]]
|
merged_df3_comp = all_plot_dfs[[4]]
|
||||||
|
|
||||||
#======================================================================
|
#======================================================================
|
||||||
|
|
||||||
####################################################################
|
####################################################################
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue