diff --git a/scripts/count_vars_ML.R b/scripts/count_vars_ML.R index ec13f53..65a2a77 100644 --- a/scripts/count_vars_ML.R +++ b/scripts/count_vars_ML.R @@ -4,7 +4,7 @@ #source("~/git/LSHTM_analysis/config/embb.R") #source("~/git/LSHTM_analysis/config/gid.R") #source("~/git/LSHTM_analysis/config/katg.R") -source("~/git/LSHTM_analysis/config/pnca.R") +#source("~/git/LSHTM_analysis/config/pnca.R") source("~/git/LSHTM_analysis/config/rpob.R") ############################# @@ -16,19 +16,19 @@ source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R") # Output files: merged data ############################# outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') -outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') +#outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') ################################################ # Add acticve site indication ############################################### merged_df2$active_site = as.integer(merged_df2$position %in% active_aa_pos) -merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos) +#merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos) merged_df3$active_site = as.integer(merged_df3$position %in% active_aa_pos) -merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos) +#merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos) # check -cols_sel = c('mutationinformation', 'dst', 'mutation_info_labels', 'dm_om_numeric', 'dst_mode') +cols_sel = c('mutationinformation', 'mutation_info_labels', 'dm_om_numeric', 'dst', 'dst_mode') check_mdf2 = merged_df2[, cols_sel] check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode) @@ -57,6 +57,8 @@ if (check12) { stop('FAIL: Something is wrong with the dst_mode column. Quitting!') } +table(is.na(merged_df3$dst)) + #========================== # CHECK: active site labels #========================== @@ -137,7 +139,6 @@ if (all(a2 && b2)){ stop("FAIL: could not add drtype mode labels to merged_df3") ##quit() } -############################################## #=============== # CHECK: lineage #=============== @@ -179,10 +180,11 @@ if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2 , "\nGene:", gene) write.csv(merged_df3, outfile_merged_df3) - write.csv(merged_df2, outfile_merged_df2) + #write.csv(merged_df2, outfile_merged_df2) cat(paste("\nmerged df3 filename:", outfile_merged_df3 - , "\nmerged df2 filename:", outfile_merged_df2)) + #, "\nmerged df2 filename:", outfile_merged_df2) + )) } else{ stop("FAIL: Not able to write merged dfs. Please check numbers!") @@ -207,15 +209,6 @@ a = merged_df3[, sel] str(a) -# write file -# outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv') -# outfile_merged_df3 -# write.csv(merged_df3, outfile_merged_df3) -# -# outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv') -# outfile_merged_df2 -# write.csv(merged_df2, outfile_merged_df2) - ################################################### ################################################### ################################################### diff --git a/scripts/functions/combining_dfs_plotting.R b/scripts/functions/combining_dfs_plotting.R index 26b214f..749a7ff 100644 --- a/scripts/functions/combining_dfs_plotting.R +++ b/scripts/functions/combining_dfs_plotting.R @@ -204,6 +204,9 @@ combining_dfs_plotting <- function( my_df_u merged_df2$lineage_labels = merged_df2$lineage #merged_df2$lineage_labels = as.factor(merged_df2$lineage_labels) #merged_df2$lineage_labels = factor(merged_df2$lineage_labels) + table(merged_df2$mutation_info_labels_orig) # original + table(merged_df2$mutation_info_labels_v1) # intermediate + table(merged_df2$mutation_info_labels) # revised, corresponding to dst_mode #================================================================= # Merge 2: merged_df3 @@ -214,17 +217,123 @@ combining_dfs_plotting <- function( my_df_u # but this should be good for the numerical corr plots #================================================================== # remove duplicated mutations - cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)" - ,"\nCannot trust lineage info from this" - ,"\nlinking col: mutationinforamtion" - ,"\nfilename: merged_df3") + # cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)" + # ,"\nCannot trust lineage info from this" + # ,"\nlinking col: mutationinforamtion" + # ,"\nfilename: merged_df3") + # + # merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] + # + # + + # head(merged_df3$position); tail(merged_df3$position) # should be sorted + # + # # sanity check + # cat("\nChecking nrows in merged_df3") + # + # if( nrow(my_df_u) == nrow(merged_df3) ){ + # cat("\nPASS: No. of rows match with my_df" + # ,"\nExpected no. of rows: ", nrow(my_df_u) + # ,"\nGot no. of rows: ", nrow(merged_df3)) + # } else { + # cat("\nFAIL: No. of rows mismatch" + # , "\nNo. of rows my_df: ", nrow(my_df_u) + # , "\nNo. of rows merged_df3: ", nrow(merged_df3)) + # quit() + # } + # + # counting NAs in AF, OR cols in merged_df3 + # this is because mcsm has no AF, OR cols, + # so you cannot count NAs + # if (identical(sum(is.na(merged_df3$or_kin)) + # , sum(is.na(merged_df3$pwald_kin)) + # , sum(is.na(merged_df3$af_kin)))){ + # cat("\nPASS: NA count match for OR, pvalue and AF\n") + # na_count_df3 = sum(is.na(merged_df3$af_kin)) + # cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin))) + # } else{ + # cat("\nFAIL: NA count mismatch" + # , "\nNA in OR: ", sum(is.na(merged_df3$or_kin)) + # , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin)) + # , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) + # } + # + # =================================== + # Revised way to generate merged_df3 + # =================================== + #%% Getting merged_df3: VERY important and careful subsetting merging + # dst mode column as carefully curated dst based on knowledge based approach. + # so now we want to get the + na_muts = merged_df2[is.na(merged_df2$dst),] + no_na_muts = merged_df2[!is.na(merged_df2$dst),] + + muts_na_U = na_muts[!duplicated(na_muts[c('mutationinformation')]), ] + muts_no_na_U = no_na_muts[!duplicated(no_na_muts[c('mutationinformation')]), ] + + # get muts from no_na that are NOT present in muts with na from dplyr + dst_muts = dplyr::anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation') + #dst_muts = anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation') + + # ALL good muts are NOT in na muts unique i.e dst muts should NOT exist in na_muts + if (all(dst_muts$mutationinformation%in%muts_na_U$mutationinformation) == FALSE){ + cat("\nPASS: checked length for dst tested muts" + , "\nNo. of dst testetd muts:", nrow(dst_muts)) + }else{ + stop("Dst muts are not correctly identified") + } + + if ( class(dst_muts) != "data.frame" ){ + dst_muts = as.data.frame(dst_muts) + } else{ + cat("\ndst_muts is a df") + } + + # ALL bad muts are in na muts unique + bad_muts = dplyr::semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation") + #bad_muts = semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation") + + + if (all(bad_muts$mutationinformation%in%muts_na_U$mutationinformation) == TRUE){ + cat("\nPASS: checked length of NOT-dst tested muts" + , "\nNo. of NOT dst-tested_muts:", nrow(bad_muts)) + }else{ + stop("Non-dst muts are not correctly identified") + } + + if ( class(bad_muts) != "data.frame" ){ + bad_muts = as.data.frame(bad_muts) + } else{ + cat("\nbad_muts is a df") + } + + cat("\nNo. of muts with dst:", nrow(dst_muts) + , "\nNo. of muts without dst:", nrow(muts_na_U) - nrow(dst_muts) ) + + # now merge + if ( all(colnames(muts_na_U) == colnames(dst_muts)) ){ + cat("\nPASS: rowbind to get merged_df3") + merged_df3 = dplyr::bind_rows(muts_na_U, dst_muts) + #merged_df3 = bind_rows(muts_na_U, dst_muts) + + } else{ + stop("Quitting: merged_df3 could not be generated") + } + + if ( nrow(merged_df3) == length(unique(merged_df2$mutationinformation)) ){ + cat("\nPASS: merged_df3 sucessfully generated..." + , "\nnrow merged_df3:", nrow(merged_df3) + , "\nncol merged_df3:", ncol(merged_df3)) + }else{ + stop("Cannot generate merged_df3") + } +################################################################## - merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] head(merged_df3$position); tail(merged_df3$position) # should be sorted # sanity check cat("\nChecking nrows in merged_df3") - if(nrow(my_df_u) == nrow(merged_df3)){ + + if( nrow(my_df_u) == nrow(merged_df3) ){ cat("\nPASS: No. of rows match with my_df" ,"\nExpected no. of rows: ", nrow(my_df_u) ,"\nGot no. of rows: ", nrow(merged_df3)) @@ -234,166 +343,9 @@ combining_dfs_plotting <- function( my_df_u , "\nNo. of rows merged_df3: ", nrow(merged_df3)) quit() } - - # counting NAs in AF, OR cols in merged_df3 - # this is because mcsm has no AF, OR cols, - # so you cannot count NAs - if (identical(sum(is.na(merged_df3$or_kin)) - , sum(is.na(merged_df3$pwald_kin)) - , sum(is.na(merged_df3$af_kin)))){ - cat("\nPASS: NA count match for OR, pvalue and AF\n") - na_count_df3 = sum(is.na(merged_df3$af_kin)) - cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin))) - } else{ - cat("\nFAIL: NA count mismatch" - , "\nNA in OR: ", sum(is.na(merged_df3$or_kin)) - , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin)) - , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) - } - - #=================================================== - # Merge3: merged_df2_comp - # same as merge 1 but excluding NAs from ORs, etc. - #==================================================== - cat("\nMerging dfs without any NAs: big df (1-many relationship b/w id & mut)" - ,"\nfilename: merged_df2_comp") - - na_count_df2 = sum(is.na(merged_df2$af)) - merged_df2_comp = merged_df2[!is.na(merged_df2$af),] - - # sanity check: no +-1 gymnastics - cat("\nChecking nrows in merged_df2_comp") - if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df2_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - , "\nNo. of rows: ", nrow(merged_df2_comp) - , "\nNo. of cols: ", ncol(merged_df2_comp)) - }else{ - cat("\nFAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - ,"\nGot no. of rows: ", nrow(merged_df2_comp)) - } - - #====================================================== - # Merge4: merged_df3_comp - # same as merge 2 but excluding NAs from ORs, etc or - # remove duplicate mutation information - #======================================================= - na_count_df3 = sum(is.na(merged_df3$af)) - #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way - - merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way - cat("\nChecking nrows in merged_df3_comp") - - if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df3_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - , "\nNo. of rows: ", nrow(merged_df3_comp) - , "\nNo. of cols: ", ncol(merged_df3_comp)) - }else{ - cat("\nFAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - ,"\nGot no. of rows: ", nrow(merged_df3_comp)) - } - - # alternate way of deriving merged_df3_comp - foo = merged_df3[!is.na(merged_df3$af),] - bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] - # compare dfs: foo and merged_df3_com - all.equal(foo, bar) - #summary(comparedf(foo, bar)) - cat("\n------------------------" - , "\nSummary of created dfs:" - , "\n------------------------" - , "\n1) Dim of merged_df2: " , nrow(merged_df2), "," , ncol(merged_df2) - , "\n2) Dim of merged_df2_comp: " , nrow(merged_df2_comp), "," , ncol(merged_df2_comp) - , "\n3) Dim of merged_df3: " , nrow(merged_df3), "," , ncol(merged_df3) - , "\n4) Dim of merged_df3_comp: " , nrow(merged_df3_comp), "," , ncol(merged_df3_comp)) - - ##################################################################### - # Combining: LIG - ##################################################################### - - #============ - # Merges 5-8 - #============ - cat("\n==========================================" - , "\nStarting filtering for mcsm ligand df" - , "\n===========================================") - - if (lig_dist_colname%in%names(my_df_u)){ - cat("\nFiltering column: ", lig_dist_colname - , "\nCut off criteria: ", lig_dist_cutoff, "Angstroms") - df_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,] - - #merged_df2_lig = merged_df2[merged_df2$ligand_distance