From 31b98fb3d37eb31b484168e353096372f3237327 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 9 Sep 2020 12:00:42 +0100 Subject: [PATCH] plotting script with resolved gene metadata --- scripts/plotting/combining_dfs_plotting.R | 186 ++++++---------------- 1 file changed, 48 insertions(+), 138 deletions(-) diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 5f5d284..7549de6 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -36,18 +36,22 @@ source("plotting_data.R") # my_df_u_lig # dup_muts -cat(paste0("Directories imported:" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir)) +cat("Directories imported:" + , "\n====================" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) + +cat("Variables imported:" + , "\n=====================" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\ndr_muts_col:", dr_muts_col + , "\nother_muts_col:", other_muts_col + , "\ndrtype_col:", resistance_col) -cat(paste0("Variables imported:" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nLength of upos:", length(upos) - , "\nAngstrom symbol:", angstroms_symbol)) # clear excess variable rm(my_df, upos, dup_muts) @@ -58,25 +62,6 @@ rm(my_df, upos, dup_muts) #in_file1: output of plotting_data.R # my_df_u -# quick checks -head(my_df_u[, c("mutation")]) - -cols_to_extract = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin") -foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract] - - -table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))) - -baz = read.csv(file.choose()) - -baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq) -baz = as.data.frame(baz) -colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or") -sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq)) - -cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq)) - ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin))) - # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") in_filename_gene_metadata = paste0(tolower(gene), "_metadata.csv") @@ -113,6 +98,8 @@ gene_metadata <- read.csv(infile_gene_metadata , header = T) cat("Dim:", dim(gene_metadata)) +table(gene_metadata$mutation_info) + # counting NAs in AF, OR cols # or_mychisq @@ -145,66 +132,6 @@ if (identical(sum(is.na(my_df_u$or_kin)) str(gene_metadata) -# change category of ambiguos mutations -table(gene_metadata$mutation_info) - -cols_to_extract2 = c("mutationinformation", "mutation", "mutation_info") -foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2] - -dr_muts = foo2[foo2$mutation_info == dr_muts_col,] -other_muts = foo2[foo2$mutation_info == other_muts_col,] - -common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,] -#write.csv(common_muts, 'common_muts.csv') -rm(common_muts) - -# FIXME read properly -# "ambiguous_mut_names.csv" -#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe" -ambiguous_muts = read.csv(file.choose()) -ambiguous_muts_names = ambiguous_muts$mutation - -common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,] - -if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){ - print('change me') -} - -# make a copy -gene_metadata2 = gene_metadata -table(gene_metadata$mutation_info) -count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info))) -#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass") -table(count_check$checks) - - -poo = c("V180F", "G132A", "D49G") -poo2 = count_check[rownames(count_check)%in%poo,] -poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0 -poo2$checks = ifelse(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0, "ambi", "pass") - -# remove common_muts_all -ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids) -gene_metadata_unambiguous = gene_metadata2[!ids,] - -# sanity checks: should be true -table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous) -nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata) - -# correct common muts -table(common_muts_all$mutation_info) -common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info) - -# change the other_muts to dr_muts -common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col - -table(common_muts_all$mutation_info) -common_muts_all$mutation_info = factor(common_muts_all$mutation_info) -table(common_muts_all$mutation_info) - -# add it back to -gene_meta_data - ################################################################### # combining: PS ################################################################### @@ -307,15 +234,6 @@ if (identical(sum(is.na(merged_df3$or_kin)) , "\nNA in AF:", sum(is.na(merged_df3$af_kin))) } -# check if the same or and afs are missing for -if ( identical( which(is.na(merged_df2$or_mychisq)), which(is.na(merged_df2$or_kin))) - && identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) - && identical( which(is.na(merged_df2$pval_fisher)), which(is.na(merged_df2$pwald_kin))) ){ - cat("PASS: Indices match for mychisq and kin ors missing values") -} else{ - cat("Index mismatch: mychisq and kin ors missing indices match") - quit() -} #========================= # Merge3: merged_df2_comp @@ -325,25 +243,21 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" ,"\nlinking col: Mutationinforamtion" ,"\nfilename: merged_df2_comp") -if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ - print("mychisq and kin ors missing indices match. Procedding with omitting NAs") - na_count_df2 = sum(is.na(merged_df2$af)) - merged_df2_comp = merged_df2[!is.na(merged_df2$af),] - # sanity check: no +-1 gymnastics - cat("Checking nrows in merged_df2_comp") - if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df2_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - , "\nNo. of rows: ", nrow(merged_df2_comp) - , "\nNo. of cols: ", ncol(merged_df2_comp)) - }else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - ,"\nGot no. of rows: ", nrow(merged_df2_comp)) - } +na_count_df2 = sum(is.na(merged_df2$af)) +merged_df2_comp = merged_df2[!is.na(merged_df2$af),] + +# sanity check: no +-1 gymnastics +cat("Checking nrows in merged_df2_comp") +if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df2_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + , "\nNo. of rows: ", nrow(merged_df2_comp) + , "\nNo. of cols: ", ncol(merged_df2_comp)) }else{ - print("Index mismatch for mychisq and kin ors. Aborting NA ommission") + cat("FAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + ,"\nGot no. of rows: ", nrow(merged_df2_comp)) } #========================= @@ -351,28 +265,24 @@ if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ # same as merge 2 but excluding NAs from ORs, etc or # remove duplicate mutation information #========================= +na_count_df3 = sum(is.na(merged_df3$af)) +#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way + +merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way +cat("Checking nrows in merged_df3_comp") + +if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df3_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + , "\nNo. of rows: ", nrow(merged_df3_comp) + , "\nNo. of cols: ", ncol(merged_df3_comp)) +}else{ + cat("FAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + ,"\nGot no. of rows: ", nrow(merged_df3_comp)) +} -if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ - print("mychisq and kin ors missing indices match. Procedding with omitting NAs") - na_count_df3 = sum(is.na(merged_df3$af)) - #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way - merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way - cat("Checking nrows in merged_df3_comp") - if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df3_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - , "\nNo. of rows: ", nrow(merged_df3_comp) - , "\nNo. of cols: ", ncol(merged_df3_comp)) - }else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - ,"\nGot no. of rows: ", nrow(merged_df3_comp)) - } -} else{ - print("Index mismatch for mychisq and kin ors. Aborting NA ommission") -} - # alternate way of deriving merged_df3_comp foo = merged_df3[!is.na(merged_df3$af),] bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]