reran to generate merged_df3 with correct dst for dst muts. modified combining_dfs_plotting.R
This commit is contained in:
parent
289c8913d0
commit
8079dd7b6c
6 changed files with 148 additions and 211 deletions
|
@ -4,7 +4,7 @@
|
||||||
#source("~/git/LSHTM_analysis/config/embb.R")
|
#source("~/git/LSHTM_analysis/config/embb.R")
|
||||||
#source("~/git/LSHTM_analysis/config/gid.R")
|
#source("~/git/LSHTM_analysis/config/gid.R")
|
||||||
#source("~/git/LSHTM_analysis/config/katg.R")
|
#source("~/git/LSHTM_analysis/config/katg.R")
|
||||||
source("~/git/LSHTM_analysis/config/pnca.R")
|
#source("~/git/LSHTM_analysis/config/pnca.R")
|
||||||
source("~/git/LSHTM_analysis/config/rpob.R")
|
source("~/git/LSHTM_analysis/config/rpob.R")
|
||||||
|
|
||||||
#############################
|
#############################
|
||||||
|
@ -16,19 +16,19 @@ source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
|
||||||
# Output files: merged data
|
# Output files: merged data
|
||||||
#############################
|
#############################
|
||||||
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
||||||
outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
#outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
||||||
|
|
||||||
################################################
|
################################################
|
||||||
# Add acticve site indication
|
# Add acticve site indication
|
||||||
###############################################
|
###############################################
|
||||||
merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos)
|
merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos)
|
||||||
merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
|
#merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
|
||||||
|
|
||||||
merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
|
merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos)
|
||||||
merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
|
#merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
|
||||||
|
|
||||||
# check
|
# check
|
||||||
cols_sel = c('mutationinformation', 'dst', 'mutation_info_labels', 'dm_om_numeric', 'dst_mode')
|
cols_sel = c('mutationinformation', 'mutation_info_labels', 'dm_om_numeric', 'dst', 'dst_mode')
|
||||||
|
|
||||||
check_mdf2 = merged_df2[, cols_sel]
|
check_mdf2 = merged_df2[, cols_sel]
|
||||||
check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
|
check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
|
||||||
|
@ -57,6 +57,8 @@ if (check12) {
|
||||||
stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
|
stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
|
||||||
}
|
}
|
||||||
|
|
||||||
|
table(is.na(merged_df3$dst))
|
||||||
|
|
||||||
#==========================
|
#==========================
|
||||||
# CHECK: active site labels
|
# CHECK: active site labels
|
||||||
#==========================
|
#==========================
|
||||||
|
@ -137,7 +139,6 @@ if (all(a2 && b2)){
|
||||||
stop("FAIL: could not add drtype mode labels to merged_df3")
|
stop("FAIL: could not add drtype mode labels to merged_df3")
|
||||||
##quit()
|
##quit()
|
||||||
}
|
}
|
||||||
##############################################
|
|
||||||
#===============
|
#===============
|
||||||
# CHECK: lineage
|
# CHECK: lineage
|
||||||
#===============
|
#===============
|
||||||
|
@ -179,10 +180,11 @@ if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2
|
||||||
, "\nGene:", gene)
|
, "\nGene:", gene)
|
||||||
|
|
||||||
write.csv(merged_df3, outfile_merged_df3)
|
write.csv(merged_df3, outfile_merged_df3)
|
||||||
write.csv(merged_df2, outfile_merged_df2)
|
#write.csv(merged_df2, outfile_merged_df2)
|
||||||
|
|
||||||
cat(paste("\nmerged df3 filename:", outfile_merged_df3
|
cat(paste("\nmerged df3 filename:", outfile_merged_df3
|
||||||
, "\nmerged df2 filename:", outfile_merged_df2))
|
#, "\nmerged df2 filename:", outfile_merged_df2)
|
||||||
|
))
|
||||||
|
|
||||||
} else{
|
} else{
|
||||||
stop("FAIL: Not able to write merged dfs. Please check numbers!")
|
stop("FAIL: Not able to write merged dfs. Please check numbers!")
|
||||||
|
@ -207,15 +209,6 @@ a = merged_df3[, sel]
|
||||||
str(a)
|
str(a)
|
||||||
|
|
||||||
|
|
||||||
# write file
|
|
||||||
# outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
|
|
||||||
# outfile_merged_df3
|
|
||||||
# write.csv(merged_df3, outfile_merged_df3)
|
|
||||||
#
|
|
||||||
# outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
|
|
||||||
# outfile_merged_df2
|
|
||||||
# write.csv(merged_df2, outfile_merged_df2)
|
|
||||||
|
|
||||||
###################################################
|
###################################################
|
||||||
###################################################
|
###################################################
|
||||||
###################################################
|
###################################################
|
||||||
|
|
|
@ -204,6 +204,9 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
merged_df2$lineage_labels = merged_df2$lineage
|
merged_df2$lineage_labels = merged_df2$lineage
|
||||||
#merged_df2$lineage_labels = as.factor(merged_df2$lineage_labels)
|
#merged_df2$lineage_labels = as.factor(merged_df2$lineage_labels)
|
||||||
#merged_df2$lineage_labels = factor(merged_df2$lineage_labels)
|
#merged_df2$lineage_labels = factor(merged_df2$lineage_labels)
|
||||||
|
table(merged_df2$mutation_info_labels_orig) # original
|
||||||
|
table(merged_df2$mutation_info_labels_v1) # intermediate
|
||||||
|
table(merged_df2$mutation_info_labels) # revised, corresponding to dst_mode
|
||||||
|
|
||||||
#=================================================================
|
#=================================================================
|
||||||
# Merge 2: merged_df3
|
# Merge 2: merged_df3
|
||||||
|
@ -214,16 +217,122 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
# but this should be good for the numerical corr plots
|
# but this should be good for the numerical corr plots
|
||||||
#==================================================================
|
#==================================================================
|
||||||
# remove duplicated mutations
|
# remove duplicated mutations
|
||||||
cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
|
# cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
|
||||||
,"\nCannot trust lineage info from this"
|
# ,"\nCannot trust lineage info from this"
|
||||||
,"\nlinking col: mutationinforamtion"
|
# ,"\nlinking col: mutationinforamtion"
|
||||||
,"\nfilename: merged_df3")
|
# ,"\nfilename: merged_df3")
|
||||||
|
#
|
||||||
|
# merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),]
|
||||||
|
#
|
||||||
|
#
|
||||||
|
|
||||||
|
# head(merged_df3$position); tail(merged_df3$position) # should be sorted
|
||||||
|
#
|
||||||
|
# # sanity check
|
||||||
|
# cat("\nChecking nrows in merged_df3")
|
||||||
|
#
|
||||||
|
# if( nrow(my_df_u) == nrow(merged_df3) ){
|
||||||
|
# cat("\nPASS: No. of rows match with my_df"
|
||||||
|
# ,"\nExpected no. of rows: ", nrow(my_df_u)
|
||||||
|
# ,"\nGot no. of rows: ", nrow(merged_df3))
|
||||||
|
# } else {
|
||||||
|
# cat("\nFAIL: No. of rows mismatch"
|
||||||
|
# , "\nNo. of rows my_df: ", nrow(my_df_u)
|
||||||
|
# , "\nNo. of rows merged_df3: ", nrow(merged_df3))
|
||||||
|
# quit()
|
||||||
|
# }
|
||||||
|
#
|
||||||
|
# counting NAs in AF, OR cols in merged_df3
|
||||||
|
# this is because mcsm has no AF, OR cols,
|
||||||
|
# so you cannot count NAs
|
||||||
|
# if (identical(sum(is.na(merged_df3$or_kin))
|
||||||
|
# , sum(is.na(merged_df3$pwald_kin))
|
||||||
|
# , sum(is.na(merged_df3$af_kin)))){
|
||||||
|
# cat("\nPASS: NA count match for OR, pvalue and AF\n")
|
||||||
|
# na_count_df3 = sum(is.na(merged_df3$af_kin))
|
||||||
|
# cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
|
||||||
|
# } else{
|
||||||
|
# cat("\nFAIL: NA count mismatch"
|
||||||
|
# , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
|
||||||
|
# , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
|
||||||
|
# , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
|
||||||
|
# }
|
||||||
|
#
|
||||||
|
# ===================================
|
||||||
|
# Revised way to generate merged_df3
|
||||||
|
# ===================================
|
||||||
|
#%% Getting merged_df3: VERY important and careful subsetting merging
|
||||||
|
# dst mode column as carefully curated dst based on knowledge based approach.
|
||||||
|
# so now we want to get the
|
||||||
|
na_muts = merged_df2[is.na(merged_df2$dst),]
|
||||||
|
no_na_muts = merged_df2[!is.na(merged_df2$dst),]
|
||||||
|
|
||||||
|
muts_na_U = na_muts[!duplicated(na_muts[c('mutationinformation')]), ]
|
||||||
|
muts_no_na_U = no_na_muts[!duplicated(no_na_muts[c('mutationinformation')]), ]
|
||||||
|
|
||||||
|
# get muts from no_na that are NOT present in muts with na from dplyr
|
||||||
|
dst_muts = dplyr::anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
|
||||||
|
#dst_muts = anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
|
||||||
|
|
||||||
|
# ALL good muts are NOT in na muts unique i.e dst muts should NOT exist in na_muts
|
||||||
|
if (all(dst_muts$mutationinformation%in%muts_na_U$mutationinformation) == FALSE){
|
||||||
|
cat("\nPASS: checked length for dst tested muts"
|
||||||
|
, "\nNo. of dst testetd muts:", nrow(dst_muts))
|
||||||
|
}else{
|
||||||
|
stop("Dst muts are not correctly identified")
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( class(dst_muts) != "data.frame" ){
|
||||||
|
dst_muts = as.data.frame(dst_muts)
|
||||||
|
} else{
|
||||||
|
cat("\ndst_muts is a df")
|
||||||
|
}
|
||||||
|
|
||||||
|
# ALL bad muts are in na muts unique
|
||||||
|
bad_muts = dplyr::semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
|
||||||
|
#bad_muts = semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
|
||||||
|
|
||||||
|
|
||||||
|
if (all(bad_muts$mutationinformation%in%muts_na_U$mutationinformation) == TRUE){
|
||||||
|
cat("\nPASS: checked length of NOT-dst tested muts"
|
||||||
|
, "\nNo. of NOT dst-tested_muts:", nrow(bad_muts))
|
||||||
|
}else{
|
||||||
|
stop("Non-dst muts are not correctly identified")
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( class(bad_muts) != "data.frame" ){
|
||||||
|
bad_muts = as.data.frame(bad_muts)
|
||||||
|
} else{
|
||||||
|
cat("\nbad_muts is a df")
|
||||||
|
}
|
||||||
|
|
||||||
|
cat("\nNo. of muts with dst:", nrow(dst_muts)
|
||||||
|
, "\nNo. of muts without dst:", nrow(muts_na_U) - nrow(dst_muts) )
|
||||||
|
|
||||||
|
# now merge
|
||||||
|
if ( all(colnames(muts_na_U) == colnames(dst_muts)) ){
|
||||||
|
cat("\nPASS: rowbind to get merged_df3")
|
||||||
|
merged_df3 = dplyr::bind_rows(muts_na_U, dst_muts)
|
||||||
|
#merged_df3 = bind_rows(muts_na_U, dst_muts)
|
||||||
|
|
||||||
|
} else{
|
||||||
|
stop("Quitting: merged_df3 could not be generated")
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( nrow(merged_df3) == length(unique(merged_df2$mutationinformation)) ){
|
||||||
|
cat("\nPASS: merged_df3 sucessfully generated..."
|
||||||
|
, "\nnrow merged_df3:", nrow(merged_df3)
|
||||||
|
, "\nncol merged_df3:", ncol(merged_df3))
|
||||||
|
}else{
|
||||||
|
stop("Cannot generate merged_df3")
|
||||||
|
}
|
||||||
|
##################################################################
|
||||||
|
|
||||||
merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),]
|
|
||||||
head(merged_df3$position); tail(merged_df3$position) # should be sorted
|
head(merged_df3$position); tail(merged_df3$position) # should be sorted
|
||||||
|
|
||||||
# sanity check
|
# sanity check
|
||||||
cat("\nChecking nrows in merged_df3")
|
cat("\nChecking nrows in merged_df3")
|
||||||
|
|
||||||
if( nrow(my_df_u) == nrow(merged_df3) ){
|
if( nrow(my_df_u) == nrow(merged_df3) ){
|
||||||
cat("\nPASS: No. of rows match with my_df"
|
cat("\nPASS: No. of rows match with my_df"
|
||||||
,"\nExpected no. of rows: ", nrow(my_df_u)
|
,"\nExpected no. of rows: ", nrow(my_df_u)
|
||||||
|
@ -234,166 +343,9 @@ combining_dfs_plotting <- function( my_df_u
|
||||||
, "\nNo. of rows merged_df3: ", nrow(merged_df3))
|
, "\nNo. of rows merged_df3: ", nrow(merged_df3))
|
||||||
quit()
|
quit()
|
||||||
}
|
}
|
||||||
|
|
||||||
# counting NAs in AF, OR cols in merged_df3
|
|
||||||
# this is because mcsm has no AF, OR cols,
|
|
||||||
# so you cannot count NAs
|
|
||||||
if (identical(sum(is.na(merged_df3$or_kin))
|
|
||||||
, sum(is.na(merged_df3$pwald_kin))
|
|
||||||
, sum(is.na(merged_df3$af_kin)))){
|
|
||||||
cat("\nPASS: NA count match for OR, pvalue and AF\n")
|
|
||||||
na_count_df3 = sum(is.na(merged_df3$af_kin))
|
|
||||||
cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
|
|
||||||
} else{
|
|
||||||
cat("\nFAIL: NA count mismatch"
|
|
||||||
, "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
|
|
||||||
, "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
|
|
||||||
, "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
|
|
||||||
}
|
|
||||||
|
|
||||||
#===================================================
|
|
||||||
# Merge3: merged_df2_comp
|
|
||||||
# same as merge 1 but excluding NAs from ORs, etc.
|
|
||||||
#====================================================
|
|
||||||
cat("\nMerging dfs without any NAs: big df (1-many relationship b/w id & mut)"
|
|
||||||
,"\nfilename: merged_df2_comp")
|
|
||||||
|
|
||||||
na_count_df2 = sum(is.na(merged_df2$af))
|
|
||||||
merged_df2_comp = merged_df2[!is.na(merged_df2$af),]
|
|
||||||
|
|
||||||
# sanity check: no +-1 gymnastics
|
|
||||||
cat("\nChecking nrows in merged_df2_comp")
|
|
||||||
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
|
|
||||||
cat("\nPASS: No. of rows match"
|
|
||||||
,"\nDim of merged_df2_comp: "
|
|
||||||
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
|
|
||||||
, "\nNo. of rows: ", nrow(merged_df2_comp)
|
|
||||||
, "\nNo. of cols: ", ncol(merged_df2_comp))
|
|
||||||
}else{
|
|
||||||
cat("\nFAIL: No. of rows mismatch"
|
|
||||||
,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
|
|
||||||
,"\nGot no. of rows: ", nrow(merged_df2_comp))
|
|
||||||
}
|
|
||||||
|
|
||||||
#======================================================
|
|
||||||
# Merge4: merged_df3_comp
|
|
||||||
# same as merge 2 but excluding NAs from ORs, etc or
|
|
||||||
# remove duplicate mutation information
|
|
||||||
#=======================================================
|
|
||||||
na_count_df3 = sum(is.na(merged_df3$af))
|
|
||||||
#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
|
|
||||||
|
|
||||||
merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
|
|
||||||
cat("\nChecking nrows in merged_df3_comp")
|
|
||||||
|
|
||||||
if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
|
|
||||||
cat("\nPASS: No. of rows match"
|
|
||||||
,"\nDim of merged_df3_comp: "
|
|
||||||
,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
|
|
||||||
, "\nNo. of rows: ", nrow(merged_df3_comp)
|
|
||||||
, "\nNo. of cols: ", ncol(merged_df3_comp))
|
|
||||||
}else{
|
|
||||||
cat("\nFAIL: No. of rows mismatch"
|
|
||||||
,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
|
|
||||||
,"\nGot no. of rows: ", nrow(merged_df3_comp))
|
|
||||||
}
|
|
||||||
|
|
||||||
# alternate way of deriving merged_df3_comp
|
|
||||||
foo = merged_df3[!is.na(merged_df3$af),]
|
|
||||||
bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
|
|
||||||
# compare dfs: foo and merged_df3_com
|
|
||||||
all.equal(foo, bar)
|
|
||||||
#summary(comparedf(foo, bar))
|
|
||||||
cat("\n------------------------"
|
|
||||||
, "\nSummary of created dfs:"
|
|
||||||
, "\n------------------------"
|
|
||||||
, "\n1) Dim of merged_df2: " , nrow(merged_df2), "," , ncol(merged_df2)
|
|
||||||
, "\n2) Dim of merged_df2_comp: " , nrow(merged_df2_comp), "," , ncol(merged_df2_comp)
|
|
||||||
, "\n3) Dim of merged_df3: " , nrow(merged_df3), "," , ncol(merged_df3)
|
|
||||||
, "\n4) Dim of merged_df3_comp: " , nrow(merged_df3_comp), "," , ncol(merged_df3_comp))
|
|
||||||
|
|
||||||
#####################################################################
|
|
||||||
# Combining: LIG
|
|
||||||
#####################################################################
|
|
||||||
|
|
||||||
#============
|
|
||||||
# Merges 5-8
|
|
||||||
#============
|
|
||||||
cat("\n=========================================="
|
|
||||||
, "\nStarting filtering for mcsm ligand df"
|
|
||||||
, "\n===========================================")
|
|
||||||
|
|
||||||
if (lig_dist_colname%in%names(my_df_u)){
|
|
||||||
cat("\nFiltering column: ", lig_dist_colname
|
|
||||||
, "\nCut off criteria: ", lig_dist_cutoff, "Angstroms")
|
|
||||||
df_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
|
|
||||||
|
|
||||||
#merged_df2_lig = merged_df2[merged_df2$ligand_distance<lig_dist_cutoff,]
|
|
||||||
merged_df2_lig = merged_df2[merged_df2[[lig_dist_colname]] < lig_dist_cutoff,]
|
|
||||||
dim(merged_df2_lig)
|
|
||||||
|
|
||||||
merged_df2_comp_lig = merged_df2_comp[merged_df2_comp[[lig_dist_colname]] < lig_dist_cutoff,]
|
|
||||||
|
|
||||||
merged_df3_lig = merged_df3[merged_df3[[lig_dist_colname]] < lig_dist_cutoff,]
|
|
||||||
merged_df3_comp_lig = merged_df3_comp[merged_df3_comp[[lig_dist_colname]] < lig_dist_cutoff,]
|
|
||||||
|
|
||||||
cat("\n------------------------"
|
|
||||||
, "\nSummary of created ligand dfs:"
|
|
||||||
, "\n------------------------"
|
|
||||||
, "\n1) Dim of merged_df2_lig: " , nrow(merged_df2_lig), "," , ncol(merged_df2_lig)
|
|
||||||
, "\n2) Dim of merged_df2_comp_lig: " , nrow(merged_df2_comp_lig), "," , ncol(merged_df2_comp_lig)
|
|
||||||
, "\n3) Dim of merged_df3_lig: " , nrow(merged_df3_lig), "," , ncol(merged_df3_lig)
|
|
||||||
, "\n4) Dim of merged_df3_comp_lig: " , nrow(merged_df3_comp_lig), "," , ncol(merged_df3_comp_lig))
|
|
||||||
} else {
|
|
||||||
cat("\nFiltering column: ", lig_dist_colname, " not found\n")
|
|
||||||
}
|
|
||||||
#quit()
|
|
||||||
|
|
||||||
# sanity check
|
|
||||||
if (nrow(merged_df3_lig) == nrow(df_lig)){
|
|
||||||
print("\nPASS: verified merged_df3_lig")
|
|
||||||
}else{
|
|
||||||
cat(paste0("\nFAIL: nrow mismatch for merged_df3_lig"
|
|
||||||
, "\nExpected:", nrow(df_lig)
|
|
||||||
, "\nGot:", nrow(merged_df3_lig)))
|
|
||||||
}
|
|
||||||
|
|
||||||
#==============================================================
|
|
||||||
|
|
||||||
############################################
|
|
||||||
# OPTIONAL: write output files in one go
|
|
||||||
############################################
|
|
||||||
#outvars = c(#"merged_df2",
|
|
||||||
#"merged_df2_comp",
|
|
||||||
#"merged_df2_lig",
|
|
||||||
#"merged_df2_comp_lig",
|
|
||||||
|
|
||||||
#"meregd_df3_comp"
|
|
||||||
#"merged_df3_comp_lig",
|
|
||||||
#"merged_df3",
|
|
||||||
#"merged_df3_lig")
|
|
||||||
|
|
||||||
#cat("Writing output files: "
|
|
||||||
#, "\nPath:", outdir)
|
|
||||||
|
|
||||||
#for (i in outvars){
|
|
||||||
#out_filename = paste0(i, ".csv")
|
|
||||||
#outfile = paste0(outdir, "/", out_filename)
|
|
||||||
#cat("Writing output file:"
|
|
||||||
# ,"\nFilename: ", out_filename,"\n")
|
|
||||||
#write.csv(get(i), outfile, row.names = FALSE)
|
|
||||||
#cat("Finished writing: ", outfile
|
|
||||||
# , "\nNo. of rows: ", nrow(get(i))
|
|
||||||
# , "\nNo. of cols: ", ncol(get(i)), "\n")
|
|
||||||
#}
|
|
||||||
|
|
||||||
return(list( merged_df2
|
return(list( merged_df2
|
||||||
, merged_df3
|
, merged_df3
|
||||||
, merged_df2_comp
|
))
|
||||||
, merged_df3_comp
|
|
||||||
, merged_df2_lig
|
|
||||||
, merged_df3_lig
|
|
||||||
, merged_df2_comp_lig
|
|
||||||
, merged_df3_comp_lig))
|
|
||||||
|
|
||||||
|
cat("\nEnd of combining_dfs_plotting.R script")
|
||||||
}
|
}
|
|
@ -239,8 +239,8 @@ def MultModelsCl(input_df, target
|
||||||
# , ('Gaussian NB' , GaussianNB() )
|
# , ('Gaussian NB' , GaussianNB() )
|
||||||
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
# , ('Gaussian Process' , GaussianProcessClassifier(**rs) )
|
||||||
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
# , ('K-Nearest Neighbors' , KNeighborsClassifier() )
|
||||||
# , ('LDA' , LinearDiscriminantAnalysis() )
|
, ('LDA' , LinearDiscriminantAnalysis() )
|
||||||
# , ('Logistic Regression' , LogisticRegression(**rs) )
|
, ('Logistic Regression' , LogisticRegression(**rs) )
|
||||||
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
# , ('Logistic RegressionCV' , LogisticRegressionCV(cv = 3, **rs))
|
||||||
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
# , ('MLP' , MLPClassifier(max_iter = 500, **rs) )
|
||||||
#, ('Multinomial' , MultinomialNB() )
|
#, ('Multinomial' , MultinomialNB() )
|
||||||
|
@ -259,7 +259,7 @@ def MultModelsCl(input_df, target
|
||||||
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
# , ('Ridge ClassifierCV' , RidgeClassifierCV(cv = 3) )
|
||||||
# , ('SVC' , SVC(**rs) )
|
# , ('SVC' , SVC(**rs) )
|
||||||
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
# , ('Stochastic GDescent' , SGDClassifier(**rs, **njobs) )
|
||||||
# , ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
|
, ('XGBoost' , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
|
||||||
#
|
#
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -14,23 +14,8 @@ sys.path
|
||||||
# import
|
# import
|
||||||
from GetMLData import *
|
from GetMLData import *
|
||||||
from SplitTTS import *
|
from SplitTTS import *
|
||||||
#from MultClfs_fi import *
|
#from MultClfs import *
|
||||||
from MultClfs import *
|
from MultClfs_SIMPLE import *
|
||||||
|
|
||||||
#%%
|
|
||||||
# X,y = load_boston(return_X_y=True)
|
|
||||||
# features = load_boston()['feature_names']
|
|
||||||
|
|
||||||
# X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
|
|
||||||
|
|
||||||
# rf = RandomForestRegressor(random_state=0)
|
|
||||||
# rf.fit(X_train,y_train)
|
|
||||||
|
|
||||||
|
|
||||||
# f_i = list(zip(features,rf.feature_importances_))
|
|
||||||
# f_i.sort(key = lambda x : x[1])
|
|
||||||
# plt.barh([x[0] for x in f_i],[x[1] for x in f_i])
|
|
||||||
# plt.show()
|
|
||||||
|
|
||||||
#%%
|
#%%
|
||||||
|
|
||||||
|
@ -54,9 +39,8 @@ df = getmldata('katG', 'isoniazid' , **gene_model_paramD)
|
||||||
df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
|
df = getmldata('rpoB', 'rifampicin' , **gene_model_paramD)
|
||||||
df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
|
df = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
|
||||||
#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD)
|
#df = getmldata('alr' , 'cycloserine' , **combined_model_paramD)
|
||||||
|
|
||||||
all(df.columns.isin(['gene_name'])) # should be False
|
all(df.columns.isin(['gene_name'])) # should be False
|
||||||
|
|
||||||
|
|
||||||
spl_type = '70_30'
|
spl_type = '70_30'
|
||||||
#spl_type = '80_20'
|
#spl_type = '80_20'
|
||||||
#spl_type = 'sl'
|
#spl_type = 'sl'
|
||||||
|
@ -73,6 +57,16 @@ df2 = split_tts(df
|
||||||
|
|
||||||
all(df2['X'].columns.isin(['gene_name'])) # should be False
|
all(df2['X'].columns.isin(['gene_name'])) # should be False
|
||||||
|
|
||||||
|
df['dst'].value_counts()
|
||||||
|
df['dst'].isna().sum()
|
||||||
|
df['dst_mode'].value_counts()
|
||||||
|
|
||||||
|
len(df)
|
||||||
|
|
||||||
|
Counter(df2['y'])
|
||||||
|
Counter(df2['y_bts'])
|
||||||
|
|
||||||
|
|
||||||
fooD = MultModelsCl(input_df = df2['X']
|
fooD = MultModelsCl(input_df = df2['X']
|
||||||
, target = df2['y']
|
, target = df2['y']
|
||||||
, sel_cv = skf_cv
|
, sel_cv = skf_cv
|
||||||
|
|
|
@ -7,11 +7,9 @@
|
||||||
getwd()
|
getwd()
|
||||||
source("~/git/LSHTM_analysis/scripts/Header_TT.R")
|
source("~/git/LSHTM_analysis/scripts/Header_TT.R")
|
||||||
|
|
||||||
#********************
|
|
||||||
# cmd args passed
|
# cmd args passed
|
||||||
# in from other scripts
|
# in from other scripts
|
||||||
# to call this
|
# to call this
|
||||||
#********************
|
|
||||||
# set drug and gene name
|
# set drug and gene name
|
||||||
|
|
||||||
#==========================================
|
#==========================================
|
||||||
|
@ -85,8 +83,8 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
|
||||||
|
|
||||||
merged_df2 = all_plot_dfs[[1]]
|
merged_df2 = all_plot_dfs[[1]]
|
||||||
merged_df3 = all_plot_dfs[[2]]
|
merged_df3 = all_plot_dfs[[2]]
|
||||||
merged_df2_comp = all_plot_dfs[[3]]
|
#merged_df2_comp = all_plot_dfs[[3]]
|
||||||
merged_df3_comp = all_plot_dfs[[4]]
|
#merged_df3_comp = all_plot_dfs[[4]]
|
||||||
|
|
||||||
#======================================================================
|
#======================================================================
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue