added my_data4 after outputting merged_df3 for pnca to test the ml models
This commit is contained in:
parent
25a55ac914
commit
04e0267dd1
11 changed files with 5918 additions and 377 deletions
1
ml_data/.~lock.data_colnames.ods#
Normal file
1
ml_data/.~lock.data_colnames.ods#
Normal file
|
@ -0,0 +1 @@
|
|||
Tanushree Tunstall,tanu,panino.tunstall.in,03.03.2022 11:47,file:///home/tanu/.config/libreoffice/4;
|
107
ml_data/colnames_order.csv
Normal file
107
ml_data/colnames_order.csv
Normal file
|
@ -0,0 +1,107 @@
|
|||
mutationinformation
|
||||
lineage_labels
|
||||
ligand_id
|
||||
wild_type
|
||||
wild_pos
|
||||
position
|
||||
mutant_type
|
||||
pyrazinamide
|
||||
drtype
|
||||
mutation_info_labels
|
||||
wt_prop_water
|
||||
mut_prop_water
|
||||
wt_prop_polarity
|
||||
mut_prop_polarity
|
||||
wt_calcprop
|
||||
mut_calcprop
|
||||
ligand_distance
|
||||
ligand_affinity_change
|
||||
duet_stability_change
|
||||
ddg_foldx
|
||||
deepddg
|
||||
ddg_dynamut2
|
||||
snap2_score
|
||||
snap2_accuracy_pc
|
||||
consurf_score
|
||||
consurf_colour
|
||||
consurf_colour_rev
|
||||
asa
|
||||
rsa
|
||||
ss_class
|
||||
kd_values
|
||||
rd_values
|
||||
af
|
||||
or_mychisq
|
||||
or_logistic
|
||||
or_fisher
|
||||
est_chisq
|
||||
contacts
|
||||
electro_rr
|
||||
electro_mm
|
||||
electro_sm
|
||||
electro_ss
|
||||
disulfide_rr
|
||||
disulfide_mm
|
||||
disulfide_sm
|
||||
disulfide_ss
|
||||
hbonds_rr
|
||||
hbonds_mm
|
||||
hbonds_sm
|
||||
hbonds_ss
|
||||
partcov_rr
|
||||
partcov_mm
|
||||
partcov_sm
|
||||
partcov_ss
|
||||
vdwclashes_rr
|
||||
vdwclashes_mm
|
||||
vdwclashes_sm
|
||||
vdwclashes_ss
|
||||
volumetric_rr
|
||||
volumetric_mm
|
||||
volumetric_sm
|
||||
volumetric_ss
|
||||
affinity_scaled
|
||||
duet_scaled
|
||||
foldx_scaled
|
||||
deepddg_scaled
|
||||
ddg_dynamut2_scaled
|
||||
snap2_scaled
|
||||
consurf_scaled
|
||||
ligand_outcome
|
||||
duet_outcome
|
||||
foldx_outcome
|
||||
deepddg_outcome
|
||||
ddg_dynamut2_outcome
|
||||
snap2_outcome
|
||||
consurf_ci_upper
|
||||
consurf_ci_lower
|
||||
consurf_ci_colour
|
||||
consurf_msa_data
|
||||
consurf_aa_variety
|
||||
beta_logistic
|
||||
pval_logistic
|
||||
se_logistic
|
||||
zval_logistic
|
||||
ci_low_logistic
|
||||
ci_hi_logistic
|
||||
log10_or_mychisq
|
||||
pval_fisher
|
||||
neglog_pval_fisher
|
||||
ci_low_fisher
|
||||
ci_hi_fisher
|
||||
pval_chisq
|
||||
lineage
|
||||
mutation_info
|
||||
mut_3upper
|
||||
seq_offset4pdb
|
||||
id
|
||||
sample
|
||||
sublineage
|
||||
country_code
|
||||
geographic_source
|
||||
mutation
|
||||
chain
|
||||
ss
|
||||
wt_3upper
|
||||
wild_chain_pos
|
||||
pdb_file
|
|
108
ml_data/data_colnames.csv
Normal file
108
ml_data/data_colnames.csv
Normal file
|
@ -0,0 +1,108 @@
|
|||
"","x"
|
||||
"1","mutationinformation"
|
||||
"2","id"
|
||||
"3","sample"
|
||||
"4","lineage"
|
||||
"5","sublineage"
|
||||
"6","country_code"
|
||||
"7","geographic_source"
|
||||
"8","drtype"
|
||||
"9","pyrazinamide"
|
||||
"10","mutation"
|
||||
"11","mutation_info"
|
||||
"12","wild_type"
|
||||
"13","mutant_type"
|
||||
"14","position"
|
||||
"15","wt_prop_water"
|
||||
"16","mut_prop_water"
|
||||
"17","wt_prop_polarity"
|
||||
"18","mut_prop_polarity"
|
||||
"19","wt_calcprop"
|
||||
"20","mut_calcprop"
|
||||
"21","chain"
|
||||
"22","ligand_id"
|
||||
"23","ligand_distance"
|
||||
"24","duet_stability_change"
|
||||
"25","duet_outcome"
|
||||
"26","ligand_affinity_change"
|
||||
"27","ligand_outcome"
|
||||
"28","duet_scaled"
|
||||
"29","affinity_scaled"
|
||||
"30","wild_pos"
|
||||
"31","wild_chain_pos"
|
||||
"32","ddg_foldx"
|
||||
"33","contacts"
|
||||
"34","electro_rr"
|
||||
"35","electro_mm"
|
||||
"36","electro_sm"
|
||||
"37","electro_ss"
|
||||
"38","disulfide_rr"
|
||||
"39","disulfide_mm"
|
||||
"40","disulfide_sm"
|
||||
"41","disulfide_ss"
|
||||
"42","hbonds_rr"
|
||||
"43","hbonds_mm"
|
||||
"44","hbonds_sm"
|
||||
"45","hbonds_ss"
|
||||
"46","partcov_rr"
|
||||
"47","partcov_mm"
|
||||
"48","partcov_sm"
|
||||
"49","partcov_ss"
|
||||
"50","vdwclashes_rr"
|
||||
"51","vdwclashes_mm"
|
||||
"52","vdwclashes_sm"
|
||||
"53","vdwclashes_ss"
|
||||
"54","volumetric_rr"
|
||||
"55","volumetric_mm"
|
||||
"56","volumetric_sm"
|
||||
"57","volumetric_ss"
|
||||
"58","foldx_scaled"
|
||||
"59","foldx_outcome"
|
||||
"60","deepddg"
|
||||
"61","deepddg_outcome"
|
||||
"62","deepddg_scaled"
|
||||
"63","asa"
|
||||
"64","rsa"
|
||||
"65","ss"
|
||||
"66","ss_class"
|
||||
"67","kd_values"
|
||||
"68","rd_values"
|
||||
"69","wt_3upper"
|
||||
"70","consurf_score"
|
||||
"71","consurf_scaled"
|
||||
"72","consurf_colour"
|
||||
"73","consurf_colour_rev"
|
||||
"74","consurf_ci_upper"
|
||||
"75","consurf_ci_lower"
|
||||
"76","consurf_ci_colour"
|
||||
"77","consurf_msa_data"
|
||||
"78","consurf_aa_variety"
|
||||
"79","snap2_score"
|
||||
"80","snap2_scaled"
|
||||
"81","snap2_accuracy_pc"
|
||||
"82","snap2_outcome"
|
||||
"83","af"
|
||||
"84","beta_logistic"
|
||||
"85","or_logistic"
|
||||
"86","pval_logistic"
|
||||
"87","se_logistic"
|
||||
"88","zval_logistic"
|
||||
"89","ci_low_logistic"
|
||||
"90","ci_hi_logistic"
|
||||
"91","or_mychisq"
|
||||
"92","log10_or_mychisq"
|
||||
"93","or_fisher"
|
||||
"94","pval_fisher"
|
||||
"95","neglog_pval_fisher"
|
||||
"96","ci_low_fisher"
|
||||
"97","ci_hi_fisher"
|
||||
"98","est_chisq"
|
||||
"99","pval_chisq"
|
||||
"100","ddg_dynamut2"
|
||||
"101","ddg_dynamut2_scaled"
|
||||
"102","ddg_dynamut2_outcome"
|
||||
"103","mut_3upper"
|
||||
"104","seq_offset4pdb"
|
||||
"105","pdb_file"
|
||||
"106","mutation_info_labels"
|
||||
"107","lineage_labels"
|
|
BIN
ml_data/data_colnames.ods
Normal file
BIN
ml_data/data_colnames.ods
Normal file
Binary file not shown.
65
ml_data/ml_data.R
Normal file
65
ml_data/ml_data.R
Normal file
|
@ -0,0 +1,65 @@
|
|||
#!/usr/bin/env Rscript

# Purpose: reorder the columns of merged_df3 and merged_df2 for one gene
# according to a fixed column order read from colnames_order.csv, and write
# each reordered data frame to a CSV file for use by the ML models.
#
# NOTE(review): `outdir`, `gene`, `merged_df3` and `merged_df2` are not defined
# in this script; they are presumably created by the two source() calls
# below -- confirm against those files.

# target var options:
  # drtype: MDR, etc, full data
  # pyrazinamide: 0 and 1, loss of data
  # mutation_info_labels: DM and OM, full data
##################################################
# ONLY ONCE
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
#write.csv(colnames(merged_df3), "data_colnames.csv")
#---------------------------------------------------
# One column name per line, no header row, listed in the desired output order.
colnames_order_pnca <- read.csv(
  "~/git/ML_AI_training/ml_data/colnames_order.csv",
  header = FALSE
)
# Keep only the names. as.character() guards against V1 being a factor
# (R < 4.0 default), which would index columns by integer code, not by name.
colnames_order_pnca <- as.character(colnames_order_pnca$V1)
###################################################
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
#config_gene = c("alr", "embb")
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)

#source("~/git/LSHTM_analysis/config/alr.R")
# FIXME: "cycloserine" "mcsm_ppi2_affinity" "mcsm_ppi2_scaled" "mcsm_ppi2_outcome" "interface_dist"
# source("~/git/LSHTM_analysis/config/embb.R")
# source("~/git/LSHTM_analysis/config/gid.R")
# source("~/git/LSHTM_analysis/config/katg.R")
source("~/git/LSHTM_analysis/config/pnca.R")
# source("~/git/LSHTM_analysis/config/rpob.R")
##################################################
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")

# The checks below use `colnames_order`, but this script only ever created
# `colnames_order_pnca`. Fall back to the order read above unless one of the
# sourced scripts has already supplied `colnames_order`.
if (!exists("colnames_order")) {
  colnames_order <- colnames_order_pnca
}

######################################################
# merged_df3
mdf3_outName <- paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
mdf3_outName

# Reorder only when the order file names exactly the columns that are present:
# same count, and every requested name exists in the data frame.
if ((length(colnames_order) == ncol(merged_df3)) &&
    all(colnames_order %in% colnames(merged_df3))) {
  cat("\nProceeding with rearranging columns in merged_df3")
  merged_df3_o <- merged_df3[, colnames_order]
  cat("\nWriting output file:", mdf3_outName)
  write.csv(merged_df3_o, mdf3_outName, row.names = FALSE)
  cat("\nnrows:", nrow(merged_df3_o),
      "\nncols:", ncol(merged_df3_o))
} else {
  # Report the data frame columns that the order file does not mention.
  cat("length mismatch:",
      colnames(merged_df3)[!colnames(merged_df3) %in% colnames_order])
}

# merged_df2
mdf2_outName <- paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
mdf2_outName

if ((length(colnames_order) == ncol(merged_df2)) &&
    all(colnames_order %in% colnames(merged_df2))) {
  cat("\nProceeding with rearranging columns in merged_df2")
  merged_df2_o <- merged_df2[, colnames_order]
  cat("\nWriting output file:", mdf2_outName)
  write.csv(merged_df2_o, mdf2_outName, row.names = FALSE)
  cat("\nnrows:", nrow(merged_df2_o),
      "\nncols:", ncol(merged_df2_o))
} else {
  # Mirror the merged_df3 branch so a mismatch is not skipped silently.
  cat("length mismatch:",
      colnames(merged_df2)[!colnames(merged_df2) %in% colnames_order])
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue