added my_data4 after outputting merged_df3 for pnca to test the ml models

This commit is contained in:
Tanushree Tunstall 2022-03-03 13:35:05 +00:00
parent 25a55ac914
commit 04e0267dd1
11 changed files with 5918 additions and 377 deletions

View file

@ -0,0 +1 @@
Tanushree Tunstall,tanu,panino.tunstall.in,03.03.2022 11:47,file:///home/tanu/.config/libreoffice/4;

107
ml_data/colnames_order.csv Normal file
View file

@ -0,0 +1,107 @@
mutationinformation
lineage_labels
ligand_id
wild_type
wild_pos
position
mutant_type
pyrazinamide
drtype
mutation_info_labels
wt_prop_water
mut_prop_water
wt_prop_polarity
mut_prop_polarity
wt_calcprop
mut_calcprop
ligand_distance
ligand_affinity_change
duet_stability_change
ddg_foldx
deepddg
ddg_dynamut2
snap2_score
snap2_accuracy_pc
consurf_score
consurf_colour
consurf_colour_rev
asa
rsa
ss_class
kd_values
rd_values
af
or_mychisq
or_logistic
or_fisher
est_chisq
contacts
electro_rr
electro_mm
electro_sm
electro_ss
disulfide_rr
disulfide_mm
disulfide_sm
disulfide_ss
hbonds_rr
hbonds_mm
hbonds_sm
hbonds_ss
partcov_rr
partcov_mm
partcov_sm
partcov_ss
vdwclashes_rr
vdwclashes_mm
vdwclashes_sm
vdwclashes_ss
volumetric_rr
volumetric_mm
volumetric_sm
volumetric_ss
affinity_scaled
duet_scaled
foldx_scaled
deepddg_scaled
ddg_dynamut2_scaled
snap2_scaled
consurf_scaled
ligand_outcome
duet_outcome
foldx_outcome
deepddg_outcome
ddg_dynamut2_outcome
snap2_outcome
consurf_ci_upper
consurf_ci_lower
consurf_ci_colour
consurf_msa_data
consurf_aa_variety
beta_logistic
pval_logistic
se_logistic
zval_logistic
ci_low_logistic
ci_hi_logistic
log10_or_mychisq
pval_fisher
neglog_pval_fisher
ci_low_fisher
ci_hi_fisher
pval_chisq
lineage
mutation_info
mut_3upper
seq_offset4pdb
id
sample
sublineage
country_code
geographic_source
mutation
chain
ss
wt_3upper
wild_chain_pos
pdb_file
1 mutationinformation
2 lineage_labels
3 ligand_id
4 wild_type
5 wild_pos
6 position
7 mutant_type
8 pyrazinamide
9 drtype
10 mutation_info_labels
11 wt_prop_water
12 mut_prop_water
13 wt_prop_polarity
14 mut_prop_polarity
15 wt_calcprop
16 mut_calcprop
17 ligand_distance
18 ligand_affinity_change
19 duet_stability_change
20 ddg_foldx
21 deepddg
22 ddg_dynamut2
23 snap2_score
24 snap2_accuracy_pc
25 consurf_score
26 consurf_colour
27 consurf_colour_rev
28 asa
29 rsa
30 ss_class
31 kd_values
32 rd_values
33 af
34 or_mychisq
35 or_logistic
36 or_fisher
37 est_chisq
38 contacts
39 electro_rr
40 electro_mm
41 electro_sm
42 electro_ss
43 disulfide_rr
44 disulfide_mm
45 disulfide_sm
46 disulfide_ss
47 hbonds_rr
48 hbonds_mm
49 hbonds_sm
50 hbonds_ss
51 partcov_rr
52 partcov_mm
53 partcov_sm
54 partcov_ss
55 vdwclashes_rr
56 vdwclashes_mm
57 vdwclashes_sm
58 vdwclashes_ss
59 volumetric_rr
60 volumetric_mm
61 volumetric_sm
62 volumetric_ss
63 affinity_scaled
64 duet_scaled
65 foldx_scaled
66 deepddg_scaled
67 ddg_dynamut2_scaled
68 snap2_scaled
69 consurf_scaled
70 ligand_outcome
71 duet_outcome
72 foldx_outcome
73 deepddg_outcome
74 ddg_dynamut2_outcome
75 snap2_outcome
76 consurf_ci_upper
77 consurf_ci_lower
78 consurf_ci_colour
79 consurf_msa_data
80 consurf_aa_variety
81 beta_logistic
82 pval_logistic
83 se_logistic
84 zval_logistic
85 ci_low_logistic
86 ci_hi_logistic
87 log10_or_mychisq
88 pval_fisher
89 neglog_pval_fisher
90 ci_low_fisher
91 ci_hi_fisher
92 pval_chisq
93 lineage
94 mutation_info
95 mut_3upper
96 seq_offset4pdb
97 id
98 sample
99 sublineage
100 country_code
101 geographic_source
102 mutation
103 chain
104 ss
105 wt_3upper
106 wild_chain_pos
107 pdb_file

108
ml_data/data_colnames.csv Normal file
View file

@ -0,0 +1,108 @@
"","x"
"1","mutationinformation"
"2","id"
"3","sample"
"4","lineage"
"5","sublineage"
"6","country_code"
"7","geographic_source"
"8","drtype"
"9","pyrazinamide"
"10","mutation"
"11","mutation_info"
"12","wild_type"
"13","mutant_type"
"14","position"
"15","wt_prop_water"
"16","mut_prop_water"
"17","wt_prop_polarity"
"18","mut_prop_polarity"
"19","wt_calcprop"
"20","mut_calcprop"
"21","chain"
"22","ligand_id"
"23","ligand_distance"
"24","duet_stability_change"
"25","duet_outcome"
"26","ligand_affinity_change"
"27","ligand_outcome"
"28","duet_scaled"
"29","affinity_scaled"
"30","wild_pos"
"31","wild_chain_pos"
"32","ddg_foldx"
"33","contacts"
"34","electro_rr"
"35","electro_mm"
"36","electro_sm"
"37","electro_ss"
"38","disulfide_rr"
"39","disulfide_mm"
"40","disulfide_sm"
"41","disulfide_ss"
"42","hbonds_rr"
"43","hbonds_mm"
"44","hbonds_sm"
"45","hbonds_ss"
"46","partcov_rr"
"47","partcov_mm"
"48","partcov_sm"
"49","partcov_ss"
"50","vdwclashes_rr"
"51","vdwclashes_mm"
"52","vdwclashes_sm"
"53","vdwclashes_ss"
"54","volumetric_rr"
"55","volumetric_mm"
"56","volumetric_sm"
"57","volumetric_ss"
"58","foldx_scaled"
"59","foldx_outcome"
"60","deepddg"
"61","deepddg_outcome"
"62","deepddg_scaled"
"63","asa"
"64","rsa"
"65","ss"
"66","ss_class"
"67","kd_values"
"68","rd_values"
"69","wt_3upper"
"70","consurf_score"
"71","consurf_scaled"
"72","consurf_colour"
"73","consurf_colour_rev"
"74","consurf_ci_upper"
"75","consurf_ci_lower"
"76","consurf_ci_colour"
"77","consurf_msa_data"
"78","consurf_aa_variety"
"79","snap2_score"
"80","snap2_scaled"
"81","snap2_accuracy_pc"
"82","snap2_outcome"
"83","af"
"84","beta_logistic"
"85","or_logistic"
"86","pval_logistic"
"87","se_logistic"
"88","zval_logistic"
"89","ci_low_logistic"
"90","ci_hi_logistic"
"91","or_mychisq"
"92","log10_or_mychisq"
"93","or_fisher"
"94","pval_fisher"
"95","neglog_pval_fisher"
"96","ci_low_fisher"
"97","ci_hi_fisher"
"98","est_chisq"
"99","pval_chisq"
"100","ddg_dynamut2"
"101","ddg_dynamut2_scaled"
"102","ddg_dynamut2_outcome"
"103","mut_3upper"
"104","seq_offset4pdb"
"105","pdb_file"
"106","mutation_info_labels"
"107","lineage_labels"
1 x
2 1 mutationinformation
3 2 id
4 3 sample
5 4 lineage
6 5 sublineage
7 6 country_code
8 7 geographic_source
9 8 drtype
10 9 pyrazinamide
11 10 mutation
12 11 mutation_info
13 12 wild_type
14 13 mutant_type
15 14 position
16 15 wt_prop_water
17 16 mut_prop_water
18 17 wt_prop_polarity
19 18 mut_prop_polarity
20 19 wt_calcprop
21 20 mut_calcprop
22 21 chain
23 22 ligand_id
24 23 ligand_distance
25 24 duet_stability_change
26 25 duet_outcome
27 26 ligand_affinity_change
28 27 ligand_outcome
29 28 duet_scaled
30 29 affinity_scaled
31 30 wild_pos
32 31 wild_chain_pos
33 32 ddg_foldx
34 33 contacts
35 34 electro_rr
36 35 electro_mm
37 36 electro_sm
38 37 electro_ss
39 38 disulfide_rr
40 39 disulfide_mm
41 40 disulfide_sm
42 41 disulfide_ss
43 42 hbonds_rr
44 43 hbonds_mm
45 44 hbonds_sm
46 45 hbonds_ss
47 46 partcov_rr
48 47 partcov_mm
49 48 partcov_sm
50 49 partcov_ss
51 50 vdwclashes_rr
52 51 vdwclashes_mm
53 52 vdwclashes_sm
54 53 vdwclashes_ss
55 54 volumetric_rr
56 55 volumetric_mm
57 56 volumetric_sm
58 57 volumetric_ss
59 58 foldx_scaled
60 59 foldx_outcome
61 60 deepddg
62 61 deepddg_outcome
63 62 deepddg_scaled
64 63 asa
65 64 rsa
66 65 ss
67 66 ss_class
68 67 kd_values
69 68 rd_values
70 69 wt_3upper
71 70 consurf_score
72 71 consurf_scaled
73 72 consurf_colour
74 73 consurf_colour_rev
75 74 consurf_ci_upper
76 75 consurf_ci_lower
77 76 consurf_ci_colour
78 77 consurf_msa_data
79 78 consurf_aa_variety
80 79 snap2_score
81 80 snap2_scaled
82 81 snap2_accuracy_pc
83 82 snap2_outcome
84 83 af
85 84 beta_logistic
86 85 or_logistic
87 86 pval_logistic
88 87 se_logistic
89 88 zval_logistic
90 89 ci_low_logistic
91 90 ci_hi_logistic
92 91 or_mychisq
93 92 log10_or_mychisq
94 93 or_fisher
95 94 pval_fisher
96 95 neglog_pval_fisher
97 96 ci_low_fisher
98 97 ci_hi_fisher
99 98 est_chisq
100 99 pval_chisq
101 100 ddg_dynamut2
102 101 ddg_dynamut2_scaled
103 102 ddg_dynamut2_outcome
104 103 mut_3upper
105 104 seq_offset4pdb
106 105 pdb_file
107 106 mutation_info_labels
108 107 lineage_labels

BIN
ml_data/data_colnames.ods Normal file

Binary file not shown.

65
ml_data/ml_data.R Normal file
View file

@ -0,0 +1,65 @@
#!/usr/bin/env Rscript
# target var options:
# drtype: MDR, etc, full data
# pyrazinamide: 0 and 1, loss of data
# mutation_info_labels: DM and OM, full data
##################################################
# ONLY ONCE
#source("~/git/LSHTM_analysis/config/pnca.R")
#source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
#write.csv(colnames(merged_df3), "data_colnames.csv")
#---------------------------------------------------
# Desired column order for the pnca merged data.
# colnames_order.csv has no header row and holds one column name per line, so
# read.csv() names its single column V1; keep just that column as a vector.
colnames_order_pnca <- read.csv(
  "~/git/ML_AI_training/ml_data/colnames_order.csv",
  header = FALSE
)$V1
###################################################
# Gene configuration. Only the pnca config is active; the lines for the other
# genes (and the sapply() variant that would source several configs at once)
# are kept commented out for reference.
#config_gene = c("alr", "embb", "gid", "katg", "pnca", "rpob")
#config_gene = c("alr", "embb")
#sapply(config_gene, function(x) source(paste0("~/git/LSHTM_analysis/config/", x, ".R")), USE.NAMES = F)
#source("~/git/LSHTM_analysis/config/alr.R")
# FIXME: "cycloserine" "mcsm_ppi2_affinity" "mcsm_ppi2_scaled" "mcsm_ppi2_outcome" "interface_dist"
# NOTE(review): presumably these are columns present for alr that the pnca
# column order does not cover -- confirm before enabling alr.
# source("~/git/LSHTM_analysis/config/embb.R")
# source("~/git/LSHTM_analysis/config/gid.R")
# source("~/git/LSHTM_analysis/config/katg.R")
# NOTE(review): `gene` and `outdir`, used further down, are not defined in this
# script; presumably this config file sets them -- confirm.
source("~/git/LSHTM_analysis/config/pnca.R")
# source("~/git/LSHTM_analysis/config/rpob.R")
##################################################
# Sourced after the gene config, so keep this order.
# NOTE(review): presumably this script builds merged_df2 and merged_df3, which
# are written out below -- confirm.
source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
######################################################
# Write merged_df3 to <outdir>/<gene>_merged_df3.csv with its columns arranged
# in the order given by colnames_order_pnca (the vector read from
# colnames_order.csv at the top of this script).
mdf3_outName <- paste0(outdir, "/", tolower(gene), "_merged_df3.csv")
mdf3_outName

# Reorder only when the ordering covers exactly the columns of merged_df3:
# same count, and every ordered name present in the data.
if (length(colnames_order_pnca) == ncol(merged_df3) &&
    all(colnames_order_pnca %in% colnames(merged_df3))) {
  cat("\nProceeding with rearranging columns in merged_df3")
  merged_df3_o <- merged_df3[, colnames_order_pnca]

  cat("\nWriting output file:", mdf3_outName)
  write.csv(merged_df3_o, mdf3_outName, row.names = FALSE)

  cat("\nnrows:", nrow(merged_df3_o),
      "\nncols:", ncol(merged_df3_o))
} else {
  # Nothing is written on a mismatch; report both directions so the cause
  # (extra data columns vs. stale ordering entries) is visible.
  cat("\nColumn mismatch for merged_df3: output file NOT written",
      "\nColumns in merged_df3 missing from the column order:",
      setdiff(colnames(merged_df3), colnames_order_pnca),
      "\nColumns in the column order missing from merged_df3:",
      setdiff(colnames_order_pnca, colnames(merged_df3)),
      "\n")
}
# Write merged_df2 to <outdir>/<gene>_merged_df2.csv with its columns arranged
# in the order given by colnames_order_pnca (the vector read from
# colnames_order.csv at the top of this script).
mdf2_outName <- paste0(outdir, "/", tolower(gene), "_merged_df2.csv")
mdf2_outName

# Reorder only when the ordering covers exactly the columns of merged_df2:
# same count, and every ordered name present in the data.
if (length(colnames_order_pnca) == ncol(merged_df2) &&
    all(colnames_order_pnca %in% colnames(merged_df2))) {
  cat("\nProceeding with rearranging columns in merged_df2")
  merged_df2_o <- merged_df2[, colnames_order_pnca]

  cat("\nWriting output file:", mdf2_outName)
  write.csv(merged_df2_o, mdf2_outName, row.names = FALSE)

  cat("\nnrows:", nrow(merged_df2_o),
      "\nncols:", ncol(merged_df2_o))
} else {
  # Previously a mismatch was skipped silently; say so, and show which names
  # disagree in each direction.
  cat("\nColumn mismatch for merged_df2: output file NOT written",
      "\nColumns in merged_df2 missing from the column order:",
      setdiff(colnames(merged_df2), colnames_order_pnca),
      "\nColumns in the column order missing from merged_df2:",
      setdiff(colnames_order_pnca, colnames(merged_df2)),
      "\n")
}