diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R index 0a4c303..2b4750c 100644 --- a/scripts/plotting/combining_dfs_plotting.R +++ b/scripts/plotting/combining_dfs_plotting.R @@ -6,7 +6,6 @@ # 2) _meta_data.csv # Output: -# 1) muts with opposite effects on stability # 2) large combined df including NAs for AF, OR,etc # Dim: same no. of rows as gene associated meta_data_with_AFandOR # 3) small combined df including NAs for AF, OR, etc. @@ -36,18 +35,18 @@ source("plotting_data.R") # my_df_u_lig # dup_muts -cat(paste0("Directories imported:" - , "\ndatadir:", datadir - , "\nindir:", indir - , "\noutdir:", outdir - , "\nplotdir:", plotdir)) +cat("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir) -cat(paste0("Variables imported:" - , "\ndrug:", drug - , "\ngene:", gene - , "\ngene_match:", gene_match - , "\nLength of upos:", length(upos) - , "\nAngstrom symbol:", angstroms_symbol)) +cat("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nLength of upos:", length(upos) + , "\nAngstrom symbol:", angstroms_symbol) # clear excess variable rm(my_df, upos, dup_muts) @@ -58,24 +57,6 @@ rm(my_df, upos, dup_muts) #in_file1: output of plotting_data.R # my_df_u -# quick checks -head(my_df_u[, c("mutation")]) - -cols_to_extract = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin") -foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract] - - -table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))) - -baz = read.csv(file.choose()) - -baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq) -baz = as.data.frame(baz) -colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or") -sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq)) - -cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq)) - ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin))) # infile 2: gene associated meta data #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv") @@ -145,67 +126,6 @@ if (identical(sum(is.na(my_df_u$or_kin)) str(gene_metadata) -# change category of ambiguos mutations -table(gene_metadata$mutation_info) - -cols_to_extract2 = c("mutationinformation", "mutation", "mutation_info") -foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2] - -dr_muts = foo2[foo2$mutation_info == dr_muts_col,] -other_muts = foo2[foo2$mutation_info == other_muts_col,] - -common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,] -#write.csv(common_muts, 'common_muts.csv') -rm(common_muts) - -# FIXME read properly -# "ambiguous_mut_names.csv" -#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe" -ambiguous_muts = read.csv(file.choose()) -ambiguous_muts_names = ambiguous_muts$mutation - -common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,] - -if (gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col){ - print('change me') -} - -# make a copy -gene_metadata2 = gene_metadata -table(gene_metadata$mutation_info) -count_check = as.data.frame(cbind(table(gene_metadata$mutationinformation, gene_metadata$mutation_info))) -#count_check$checks = ifelse(count_check$dr_mutations_pyrazinamide&&count_check$other_mutations_pyrazinamide>0, "ambi", "pass") -table(count_check$checks) - - -poo = c("V180F", "G132A", "D49G") -poo2 = count_check[rownames(count_check)%in%poo,] -poo2[[dr_muts_col]]&& poo2[[other_muts_col]]>0 -poo2$checks = ifelse(all(poo2$checkspoo2[[dr_muts_col]]&& poo2[[other_muts_col]])>0, "ambi", "pass") - -# remove common_muts_all -ids = gene_metadata$mutation%in%common_muts_all$mutation; table(ids) -gene_metadata_unambiguous = gene_metadata2[!ids,] - -# sanity checks: should be true -table(gene_metadata_unambiguous$mutation%in%common_muts_all$mutation)[[1]] == nrow(gene_metadata_unambiguous) -nrow(gene_metadata_unambiguous) + nrow(common_muts_all) == nrow(gene_metadata) - -# correct common muts -table(common_muts_all$mutation_info) -common_muts_all$mutation_info = as.factor(common_muts_all$mutation_info) - -# change the other_muts to dr_muts -common_muts_all$mutation_info[common_muts_all$mutation_info==other_muts_col] <- dr_muts_col - -table(common_muts_all$mutation_info) -common_muts_all$mutation_info = factor(common_muts_all$mutation_info) -table(common_muts_all$mutation_info) - -# add it back to -gene_metadata2 = rbind(gene_metadata_unambiguous, common_muts_all) -nrow(gene_metadata2) == nrow(gene_metadata) - ################################################################### # combining: PS ################################################################### @@ -326,53 +246,43 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)" ,"\nlinking col: Mutationinforamtion" ,"\nfilename: merged_df2_comp") -if ( identical( which(is.na(merged_df2$af)), which(is.na(merged_df2$af_kin))) ){ - print("mychisq and kin ors missing indices match. Procedding with omitting NAs") - na_count_df2 = sum(is.na(merged_df2$af)) - merged_df2_comp = merged_df2[!is.na(merged_df2$af),] - # sanity check: no +-1 gymnastics - cat("Checking nrows in merged_df2_comp") - if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df2_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 - , "\nNo. of rows: ", nrow(merged_df2_comp) - , "\nNo. of cols: ", ncol(merged_df2_comp)) - }else{ +na_count_df2 = sum(is.na(merged_df2$af)) +merged_df2_comp = merged_df2[!is.na(merged_df2$af),] +# sanity check: no +-1 gymnastics +cat("Checking nrows in merged_df2_comp") +if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df2_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 + , "\nNo. of rows: ", nrow(merged_df2_comp) + , "\nNo. of cols: ", ncol(merged_df2_comp)) +}else{ cat("FAIL: No. of rows mismatch" ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2 ,"\nGot no. of rows: ", nrow(merged_df2_comp)) - } -}else{ - print("Index mismatch for mychisq and kin ors. Aborting NA ommission") } - #========================= # Merge4: merged_df3_comp # same as merge 2 but excluding NAs from ORs, etc or # remove duplicate mutation information #========================= -if ( identical( which(is.na(merged_df3$af)), which(is.na(merged_df3$af_kin))) ){ - print("mychisq and kin ors missing indices match. Procedding with omitting NAs") - na_count_df3 = sum(is.na(merged_df3$af)) - #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way - merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way - cat("Checking nrows in merged_df3_comp") - if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ - cat("\nPASS: No. of rows match" - ,"\nDim of merged_df3_comp: " - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - , "\nNo. of rows: ", nrow(merged_df3_comp) - , "\nNo. of cols: ", ncol(merged_df3_comp)) - }else{ - cat("FAIL: No. of rows mismatch" - ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 - ,"\nGot no. of rows: ", nrow(merged_df3_comp)) - } -} else{ - print("Index mismatch for mychisq and kin ors. Aborting NA ommission") -} +na_count_df3 = sum(is.na(merged_df3$af)) +#merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way +merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way +cat("Checking nrows in merged_df3_comp") +if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){ + cat("\nPASS: No. of rows match" + ,"\nDim of merged_df3_comp: " + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + , "\nNo. of rows: ", nrow(merged_df3_comp) + , "\nNo. of cols: ", ncol(merged_df3_comp)) +}else{ + cat("FAIL: No. of rows mismatch" + ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3 + , "\nGot no. of rows: ", nrow(merged_df3_comp)) +} + # alternate way of deriving merged_df3_comp foo = merged_df3[!is.na(merged_df3$af),] @@ -408,8 +318,7 @@ all.equal(foo, bar) # clear variables rm(foo, bar, gene_metadata , in_filename_params, infile_params, merging_cols - , in_filename_gene_metadata, infile_gene_metadata - , merged_df2v2, merged_df2v3) + , in_filename_gene_metadata, infile_gene_metadata) #************************* ##################################################################### # Combining: LIG