Merge branch 'master' of github.com:tgttunstall/LSHTM_analysis

2020-09-10 16:14:46 +01:00 · 2020-09-10 16:14:46 +01:00 · 5102bbea1b
commit 5102bbea1b
parent f415b0b239 fc47c58f91
21 changed files with 2132 additions and 243 deletions
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -6,6 +6,7 @@
 # 2) <gene>_meta_data.csv

 # Output: 
+# 1) muts with opposite effects on stability
 # 2) large combined df including NAs for AF, OR,etc
 # 		Dim: same no. of rows as gene associated meta_data_with_AFandOR
 # 3) small combined df including NAs for AF, OR, etc.
@@ -36,17 +37,23 @@ source("plotting_data.R")
 # dup_muts

 cat("Directories imported:"
+    , "\n===================="
    , "\ndatadir:", datadir
    , "\nindir:", indir
    , "\noutdir:", outdir
    , "\nplotdir:", plotdir)

 cat("Variables imported:"
+    , "\n====================="
    , "\ndrug:", drug
    , "\ngene:", gene
    , "\ngene_match:", gene_match
-    , "\nLength of upos:", length(upos)
-    , "\nAngstrom symbol:", angstroms_symbol)     
+    , "\nAngstrom symbol:", angstroms_symbol
+    , "\nNo. of duplicated muts:", dup_muts_nu
+    , "\ndr_muts_col:", dr_muts_col
+    , "\nother_muts_col:", other_muts_col
+    , "\ndrtype_col:", resistance_col)
+

 # clear excess variable
 rm(my_df, upos, dup_muts)
@@ -57,7 +64,6 @@ rm(my_df, upos, dup_muts)
 #in_file1: output of plotting_data.R
 # my_df_u

-
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
 in_filename_gene_metadata = paste0(tolower(gene),  "_metadata.csv")
@@ -94,6 +100,8 @@ gene_metadata <- read.csv(infile_gene_metadata
                      , header = T)
 cat("Dim:", dim(gene_metadata))

+table(gene_metadata$mutation_info)
+

 # counting NAs in AF, OR cols
 # or_mychisq
@@ -124,6 +132,7 @@ if (identical(sum(is.na(my_df_u$or_kin))
      , "\nNA in AF:", sum(is.na(my_df_u$af_kin)))
 }

+str(gene_metadata)

 ###################################################################
 #                           combining: PS
@@ -145,7 +154,7 @@ merging_cols = intersect(colnames(my_df_u), colnames(gene_metadata))

 cat(paste0("Merging dfs with NAs: big df (1-many relationship b/w id & mut)"
           , "\nNo. of merging cols:", length(merging_cols)
-           , "\nMerging columns identified:\n"))
+           , "\nMerging columns identified:"))
 print(merging_cols)

 # important checks!
@@ -160,7 +169,7 @@ merged_df2 = merge(x = gene_metadata
                  , by = merging_cols
                  , all.y = T)

-cat("Dim of merged_df2: ", dim(merged_df2), "\n")
+cat("Dim of merged_df2: ", dim(merged_df2))
 head(merged_df2$position)

 # sanity check 
@@ -170,10 +179,10 @@ if(nrow(gene_metadata) == nrow(merged_df2)){
      ,"\nExpected no. of rows: ",nrow(gene_metadata) 
      ,"\nGot no. of rows: ", nrow(merged_df2))
 } else{
-  cat("\nFAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
+  cat("FAIL: nrow(merged_df2)!= nrow(gene associated gene_metadata)"
      , "\nExpected no. of rows after merge: ", nrow(gene_metadata)
      , "\nGot no. of rows: ", nrow(merged_df2)
-      , "\nFinding discrepancy\n")
+      , "\nFinding discrepancy")
  merged_muts_u = unique(merged_df2$mutationinformation)
  meta_muts_u = unique(gene_metadata$mutationinformation)
    # find the index where it differs
@@ -227,6 +236,7 @@ if (identical(sum(is.na(merged_df3$or_kin))
      , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
 }

+
 #=========================
 # Merge3: merged_df2_comp
 # same as merge 1 but excluding NAs from ORs, etc.
@@ -237,6 +247,7 @@ cat("Merging dfs without any NAs: big df (1-many relationship b/w id & mut)"

 na_count_df2 = sum(is.na(merged_df2$af))
 merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
+
 # sanity check: no +-1 gymnastics
 cat("Checking nrows in merged_df2_comp")
 if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
@@ -246,20 +257,22 @@ if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
      , "\nNo. of rows: ", nrow(merged_df2_comp)
      , "\nNo. of cols: ", ncol(merged_df2_comp))
 }else{
-    cat("FAIL: No. of rows mismatch"
-        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
-        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
+  cat("FAIL: No. of rows mismatch"
+      ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
+      ,"\nGot no. of rows: ", nrow(merged_df2_comp))
 }
+
 #=========================
 # Merge4: merged_df3_comp
 # same as merge 2 but excluding NAs from ORs, etc or 
 # remove duplicate mutation information
 #=========================
-
 na_count_df3 = sum(is.na(merged_df3$af))
 #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
+
 merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
-cat("\nChecking nrows in merged_df3_comp")
+cat("Checking nrows in merged_df3_comp")
+
 if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
  cat("\nPASS: No. of rows match"
      ,"\nDim of merged_df3_comp: "
@@ -267,12 +280,11 @@ if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
      , "\nNo. of rows: ", nrow(merged_df3_comp)
      , "\nNo. of cols: ", ncol(merged_df3_comp))
 }else{
-  cat("\nFAIL: No. of rows mismatch"
+  cat("FAIL: No. of rows mismatch"
      ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
-      , "\nGot no. of rows: ", nrow(merged_df3_comp))
+      ,"\nGot no. of rows: ", nrow(merged_df3_comp))
 }

-  
 # alternate way of deriving merged_df3_comp
 foo = merged_df3[!is.na(merged_df3$af),]
 bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
@@ -307,7 +319,8 @@ all.equal(foo, bar)
 # clear variables
 rm(foo, bar, gene_metadata
   , in_filename_params, infile_params, merging_cols
-   , in_filename_gene_metadata, infile_gene_metadata)
+   , in_filename_gene_metadata, infile_gene_metadata
+   , merged_df2v2, merged_df2v3)
 #*************************
 #####################################################################
 #                       Combining: LIG
@@ -334,4 +347,4 @@ if (nrow(merged_df3_lig) == nrow(my_df_u_lig)){

 #==========================================================================
 # end of script
-##=========================================================================
+##==========================================================================