reran to generate merged_df3 with correct dst for dst muts. modified combining_dfs_plotting.R

2022-07-08 21:33:57 +01:00 · 2022-07-08 21:33:57 +01:00 · 8079dd7b6c
commit 8079dd7b6c
parent 289c8913d0
6 changed files with 148 additions and 211 deletions
--- a/scripts/count_vars_ML.R
+++ b/scripts/count_vars_ML.R
@ -4,7 +4,7 @@
 #source("~/git/LSHTM_analysis/config/embb.R")
 #source("~/git/LSHTM_analysis/config/gid.R")
 #source("~/git/LSHTM_analysis/config/katg.R")
-source("~/git/LSHTM_analysis/config/pnca.R")
+#source("~/git/LSHTM_analysis/config/pnca.R")
 source("~/git/LSHTM_analysis/config/rpob.R")
 #############################
@ -16,19 +16,19 @@ source("~/git/LSHTM_analysis/scripts/plotting/get_plotting_dfs.R")
 # Output files: merged data
 #############################
 outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
-outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
+#outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
 ################################################
 # Add acticve site indication
 ###############################################
 merged_df2$active_site      = as.integer(merged_df2$position %in% active_aa_pos)
-merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
+#merged_df2_comp$active_site = as.integer(merged_df2_comp$position %in% active_aa_pos)
 merged_df3$active_site      = as.integer(merged_df3$position %in% active_aa_pos)
-merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
+#merged_df3_comp$active_site = as.integer(merged_df3_comp$position %in% active_aa_pos)
 # check 
-cols_sel = c('mutationinformation', 'dst', 'mutation_info_labels', 'dm_om_numeric', 'dst_mode')
+cols_sel = c('mutationinformation', 'mutation_info_labels', 'dm_om_numeric', 'dst', 'dst_mode')
 check_mdf2 = merged_df2[, cols_sel]
 check_mdf2T = table(check_mdf2$mutationinformation, check_mdf2$dst_mode)
@ -57,6 +57,8 @@ if (check12) {
  stop('FAIL: Something is wrong with the dst_mode column. Quitting!')
 }
 table(is.na(merged_df3$dst))
 #==========================
 # CHECK: active site labels
 #==========================
@ -137,7 +139,6 @@ if  (all(a2 && b2)){
  stop("FAIL: could not add drtype mode labels to merged_df3")
  ##quit()
 }
 ##############################################
 #===============
 # CHECK: lineage
 #===============
@ -179,10 +180,11 @@ if ( all( check12 && aa_check1 && aa_check2 && a1 && b1 && a2 && b2 && l1 && l2
    , "\nGene:", gene)
   write.csv(merged_df3, outfile_merged_df3)
-   write.csv(merged_df2, outfile_merged_df2)
+   #write.csv(merged_df2, outfile_merged_df2)
   cat(paste("\nmerged df3 filename:", outfile_merged_df3
-             , "\nmerged df2 filename:", outfile_merged_df2))
+             #, "\nmerged df2 filename:", outfile_merged_df2)
             ))
 } else{
    stop("FAIL: Not able to write merged dfs. Please check numbers!")
@ -207,15 +209,6 @@ a = merged_df3[, sel]
 str(a)
 # write file
 # outfile_merged_df3 = paste0(outdir, '/', tolower(gene), '_merged_df3.csv')
 # outfile_merged_df3
 # write.csv(merged_df3, outfile_merged_df3)
 # 
 # outfile_merged_df2 = paste0(outdir, '/', tolower(gene), '_merged_df2.csv')
 # outfile_merged_df2
 # write.csv(merged_df2, outfile_merged_df2)
 ###################################################
 ###################################################
 ###################################################
--- a/scripts/functions/combining_dfs_plotting.R
+++ b/scripts/functions/combining_dfs_plotting.R
@ -204,6 +204,9 @@ combining_dfs_plotting <- function(  my_df_u
  merged_df2$lineage_labels = merged_df2$lineage
  #merged_df2$lineage_labels = as.factor(merged_df2$lineage_labels)
  #merged_df2$lineage_labels = factor(merged_df2$lineage_labels)
  table(merged_df2$mutation_info_labels_orig) # original
  table(merged_df2$mutation_info_labels_v1) # intermediate
  table(merged_df2$mutation_info_labels) # revised, corresponding to dst_mode
  #=================================================================
  # Merge 2: merged_df3
@ -214,16 +217,122 @@ combining_dfs_plotting <- function(  my_df_u
  # but this should be good for the numerical corr plots
  #==================================================================
  # remove duplicated mutations
-  cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
+  # cat("\nMerging dfs without NAs: small df (removing muts with no AF|OR associated)"
-      ,"\nCannot trust lineage info from this"
+  #     ,"\nCannot trust lineage info from this"
-      ,"\nlinking col: mutationinforamtion"
+  #     ,"\nlinking col: mutationinforamtion"
-      ,"\nfilename: merged_df3")
+  #     ,"\nfilename: merged_df3")
  # 
  # merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] 
  # 
  # 
  # head(merged_df3$position); tail(merged_df3$position) # should be sorted
  # 
  # # sanity check
  # cat("\nChecking nrows in merged_df3")
  # 
  # if( nrow(my_df_u) == nrow(merged_df3) ){
  #   cat("\nPASS: No. of rows match with my_df"
  #       ,"\nExpected no. of rows: ", nrow(my_df_u)
  #       ,"\nGot no. of rows: ", nrow(merged_df3))
  # } else {
  #   cat("\nFAIL: No. of rows mismatch"
  #       , "\nNo. of rows my_df: ", nrow(my_df_u)
  #       , "\nNo. of rows merged_df3: ", nrow(merged_df3))
  #   quit()
  #   }
  # 
  # counting NAs in AF, OR cols in merged_df3
  # this is because mcsm has no AF, OR cols,
  # so you cannot count NAs
  # if (identical(sum(is.na(merged_df3$or_kin))
  #               , sum(is.na(merged_df3$pwald_kin))
  #               , sum(is.na(merged_df3$af_kin)))){
  #   cat("\nPASS: NA count match for OR, pvalue and AF\n")
  #   na_count_df3 = sum(is.na(merged_df3$af_kin))
  #   cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
  # } else{
  #   cat("\nFAIL: NA count mismatch"
  #       , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
  #       , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
  #       , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
  # }
  # 
  # ===================================
  # Revised way to generate merged_df3
  # ===================================
  #%% Getting merged_df3: VERY important and careful subsetting merging
  # dst mode column as carefully curated dst based on knowledge based approach.
  # so now we want to get the 
  na_muts = merged_df2[is.na(merged_df2$dst),]
  no_na_muts = merged_df2[!is.na(merged_df2$dst),]
  muts_na_U = na_muts[!duplicated(na_muts[c('mutationinformation')]), ]
  muts_no_na_U = no_na_muts[!duplicated(no_na_muts[c('mutationinformation')]), ]
  # get muts from no_na that are NOT present in muts with na from dplyr
  dst_muts = dplyr::anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
  #dst_muts = anti_join(muts_no_na_U, muts_na_U, by = 'mutationinformation')
  # ALL good muts are NOT in na muts unique i.e dst muts should NOT exist in na_muts
  if (all(dst_muts$mutationinformation%in%muts_na_U$mutationinformation) == FALSE){
    cat("\nPASS: checked length for dst tested muts"
        , "\nNo. of dst testetd muts:", nrow(dst_muts))
  }else{
    stop("Dst muts are not correctly identified")
  }
  if ( class(dst_muts) != "data.frame" ){
    dst_muts = as.data.frame(dst_muts)
  } else{
    cat("\ndst_muts is a df")
  }
  # ALL bad muts are in na muts unique
  bad_muts = dplyr::semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
  #bad_muts = semi_join(muts_no_na_U, muts_na_U, by = "mutationinformation")
  if (all(bad_muts$mutationinformation%in%muts_na_U$mutationinformation) == TRUE){
    cat("\nPASS: checked length of NOT-dst tested muts"
        , "\nNo. of NOT dst-tested_muts:", nrow(bad_muts))
  }else{
    stop("Non-dst muts are not correctly identified")
  }
  if ( class(bad_muts) != "data.frame" ){
    bad_muts = as.data.frame(bad_muts)
  } else{
    cat("\nbad_muts is a df")
  }
  cat("\nNo. of muts with dst:", nrow(dst_muts)
      , "\nNo. of muts without dst:",  nrow(muts_na_U) - nrow(dst_muts) )
  # now merge 
  if ( all(colnames(muts_na_U) == colnames(dst_muts)) ){
    cat("\nPASS: rowbind to get merged_df3")
    merged_df3 = dplyr::bind_rows(muts_na_U, dst_muts)
    #merged_df3 = bind_rows(muts_na_U, dst_muts)
  } else{
    stop("Quitting: merged_df3 could not be generated")
  }
  if ( nrow(merged_df3) == length(unique(merged_df2$mutationinformation)) ){
    cat("\nPASS: merged_df3 sucessfully generated..."
        , "\nnrow merged_df3:", nrow(merged_df3)
        , "\nncol merged_df3:", ncol(merged_df3))
  }else{
    stop("Cannot generate merged_df3")
  }
 ##################################################################
  merged_df3 = merged_df2[!duplicated(merged_df2$mutationinformation),] 
  head(merged_df3$position); tail(merged_df3$position) # should be sorted
  # sanity check
  cat("\nChecking nrows in merged_df3")
  if( nrow(my_df_u) == nrow(merged_df3) ){
    cat("\nPASS: No. of rows match with my_df"
        ,"\nExpected no. of rows: ", nrow(my_df_u)
@ -234,166 +343,9 @@ combining_dfs_plotting <- function(  my_df_u
        , "\nNo. of rows merged_df3: ", nrow(merged_df3))
    quit()
  }
  # counting NAs in AF, OR cols in merged_df3
  # this is because mcsm has no AF, OR cols,
  # so you cannot count NAs
  if (identical(sum(is.na(merged_df3$or_kin))
                , sum(is.na(merged_df3$pwald_kin))
                , sum(is.na(merged_df3$af_kin)))){
    cat("\nPASS: NA count match for OR, pvalue and AF\n")
    na_count_df3 = sum(is.na(merged_df3$af_kin))
    cat("\nNo. of NAs: ", sum(is.na(merged_df3$or_kin)))
  } else{
    cat("\nFAIL: NA count mismatch"
        , "\nNA in OR: ", sum(is.na(merged_df3$or_kin))
        , "\nNA in pvalue: ", sum(is.na(merged_df3$pwald_kin))
        , "\nNA in AF:", sum(is.na(merged_df3$af_kin)))
  }
  #===================================================
  # Merge3: merged_df2_comp
  # same as merge 1 but excluding NAs from ORs, etc.
  #====================================================
  cat("\nMerging dfs without any NAs: big df (1-many relationship b/w id & mut)"
      ,"\nfilename: merged_df2_comp")
  na_count_df2 = sum(is.na(merged_df2$af))
  merged_df2_comp = merged_df2[!is.na(merged_df2$af),] 
  # sanity check: no +-1 gymnastics
  cat("\nChecking nrows in merged_df2_comp")
  if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
    cat("\nPASS: No. of rows match"
        ,"\nDim of merged_df2_comp: "
        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
        , "\nNo. of rows: ", nrow(merged_df2_comp)
        , "\nNo. of cols: ", ncol(merged_df2_comp))
  }else{
    cat("\nFAIL: No. of rows mismatch"
        ,"\nExpected no. of rows: ", nrow(merged_df2) - na_count_df2
        ,"\nGot no. of rows: ", nrow(merged_df2_comp))
  }
  #======================================================
  # Merge4: merged_df3_comp
  # same as merge 2 but excluding NAs from ORs, etc or 
  # remove duplicate mutation information
  #=======================================================
  na_count_df3 = sum(is.na(merged_df3$af))
  #merged_df3_comp = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),] # a way
  merged_df3_comp = merged_df3[!is.na(merged_df3$af),] # another way
  cat("\nChecking nrows in merged_df3_comp")
  if(nrow(merged_df3_comp) == (nrow(merged_df3) - na_count_df3)){
    cat("\nPASS: No. of rows match"
        ,"\nDim of merged_df3_comp: "
        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
        , "\nNo. of rows: ", nrow(merged_df3_comp)
        , "\nNo. of cols: ", ncol(merged_df3_comp))
  }else{
    cat("\nFAIL: No. of rows mismatch"
        ,"\nExpected no. of rows: ", nrow(merged_df3) - na_count_df3
        ,"\nGot no. of rows: ", nrow(merged_df3_comp))
  }
  # alternate way of deriving merged_df3_comp
  foo = merged_df3[!is.na(merged_df3$af),]
  bar = merged_df3_comp[!duplicated(merged_df3_comp$mutationinformation),]
  # compare dfs: foo and merged_df3_com
  all.equal(foo, bar)
  #summary(comparedf(foo, bar))
  cat("\n------------------------"
      , "\nSummary of created dfs:"
      , "\n------------------------"
      , "\n1) Dim of merged_df2: " , nrow(merged_df2), "," , ncol(merged_df2)
      , "\n2) Dim of merged_df2_comp: " , nrow(merged_df2_comp), "," , ncol(merged_df2_comp)
      , "\n3) Dim of merged_df3: " , nrow(merged_df3), "," , ncol(merged_df3)
      , "\n4) Dim of merged_df3_comp: " , nrow(merged_df3_comp), "," , ncol(merged_df3_comp))
  #####################################################################
  #                       Combining: LIG
  #####################################################################
  #============
  # Merges 5-8
  #============
  cat("\n=========================================="
      , "\nStarting filtering for mcsm ligand df"
      , "\n===========================================")
  if (lig_dist_colname%in%names(my_df_u)){
    cat("\nFiltering column: ", lig_dist_colname
        , "\nCut off criteria: ", lig_dist_cutoff, "Angstroms")
    df_lig = my_df_u[my_df_u[[lig_dist_colname]] < lig_dist_cutoff,]
    #merged_df2_lig = merged_df2[merged_df2$ligand_distance<lig_dist_cutoff,]
    merged_df2_lig = merged_df2[merged_df2[[lig_dist_colname]] < lig_dist_cutoff,]
    dim(merged_df2_lig)
    merged_df2_comp_lig = merged_df2_comp[merged_df2_comp[[lig_dist_colname]] < lig_dist_cutoff,]
    merged_df3_lig = merged_df3[merged_df3[[lig_dist_colname]] < lig_dist_cutoff,]
    merged_df3_comp_lig = merged_df3_comp[merged_df3_comp[[lig_dist_colname]] < lig_dist_cutoff,]
    cat("\n------------------------"
        , "\nSummary of created ligand dfs:"
        , "\n------------------------"
        , "\n1) Dim of merged_df2_lig: " , nrow(merged_df2_lig), "," , ncol(merged_df2_lig)
        , "\n2) Dim of merged_df2_comp_lig: " , nrow(merged_df2_comp_lig), "," , ncol(merged_df2_comp_lig)
        , "\n3) Dim of merged_df3_lig: " , nrow(merged_df3_lig), "," , ncol(merged_df3_lig)
        , "\n4) Dim of merged_df3_comp_lig: " , nrow(merged_df3_comp_lig), "," , ncol(merged_df3_comp_lig))
 } else {
    cat("\nFiltering column: ", lig_dist_colname, " not found\n")
  }
  #quit()
  # sanity check
  if (nrow(merged_df3_lig) == nrow(df_lig)){
    print("\nPASS: verified merged_df3_lig")
  }else{
    cat(paste0("\nFAIL: nrow mismatch for merged_df3_lig"
               , "\nExpected:", nrow(df_lig)
               , "\nGot:", nrow(merged_df3_lig)))
  }
  #==============================================================
  ############################################
  # OPTIONAL: write output files in one go
  ############################################
  #outvars = c(#"merged_df2",
  #"merged_df2_comp",
  #"merged_df2_lig",
  #"merged_df2_comp_lig",
  #"meregd_df3_comp"
  #"merged_df3_comp_lig",
  #"merged_df3",
  #"merged_df3_lig")
  #cat("Writing output files: "
  #, "\nPath:", outdir)
  #for (i in outvars){
  #out_filename = paste0(i, ".csv")
  #outfile = paste0(outdir, "/", out_filename)
  #cat("Writing output file:"
  #    ,"\nFilename: ", out_filename,"\n")
  #write.csv(get(i), outfile, row.names = FALSE)
  #cat("Finished writing: ", outfile
  #    , "\nNo. of rows: ", nrow(get(i))
  #    , "\nNo. of cols: ", ncol(get(i)), "\n")
  #}
  return(list(  merged_df2
                , merged_df3
-              , merged_df2_comp
+  ))
              , merged_df3_comp
              , merged_df2_lig
              , merged_df3_lig
              , merged_df2_comp_lig
              , merged_df3_comp_lig))
 cat("\nEnd of combining_dfs_plotting.R script")
 }
--- a/scripts/ml/ml_functions/MultClfs_SIMPLE.py
+++ b/scripts/ml/ml_functions/MultClfs_SIMPLE.py
@ -239,8 +239,8 @@ def MultModelsCl(input_df, target
               #  , ('Gaussian NB'               , GaussianNB() )
               #  , ('Gaussian Process'          , GaussianProcessClassifier(**rs) )
               #  , ('K-Nearest Neighbors'       , KNeighborsClassifier() ) 
-               #  , ('LDA'                       , LinearDiscriminantAnalysis() )
+                 , ('LDA'                       , LinearDiscriminantAnalysis() )
-               # , ('Logistic Regression'       , LogisticRegression(**rs) )
+                 , ('Logistic Regression'       , LogisticRegression(**rs) )
               # , ('Logistic RegressionCV'     , LogisticRegressionCV(cv = 3, **rs))
               # , ('MLP'                       , MLPClassifier(max_iter = 500, **rs) ) 
                #, ('Multinomial'               , MultinomialNB() )
@ -259,7 +259,7 @@ def MultModelsCl(input_df, target
             #     , ('Ridge ClassifierCV'        , RidgeClassifierCV(cv = 3) )          
             #     , ('SVC'                       , SVC(**rs) ) 
             #     , ('Stochastic GDescent'       , SGDClassifier(**rs, **njobs) )
-             #     , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
+                  , ('XGBoost'                   , XGBClassifier(**rs, verbosity = 0, use_label_encoder =False, **njobs) )
             # 
             ]
--- a/scripts/ml/ml_functions/test_func_singlegene.py
+++ b/scripts/ml/ml_functions/test_func_singlegene.py
@ -14,23 +14,8 @@ sys.path
 # import
 from GetMLData import *
 from SplitTTS import *
-#from MultClfs_fi import *
+#from MultClfs import *
-from MultClfs import *
+from MultClfs_SIMPLE import *
 #%%
 # X,y = load_boston(return_X_y=True) 
 # features = load_boston()['feature_names']
 # X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)
 # rf = RandomForestRegressor(random_state=0) 
 # rf.fit(X_train,y_train)
 # f_i = list(zip(features,rf.feature_importances_)) 
 # f_i.sort(key = lambda x : x[1])
 # plt.barh([x[0] for x in f_i],[x[1] for x in f_i]) 
 # plt.show()
 #%%
@ -54,9 +39,8 @@ df = getmldata('katG', 'isoniazid'    , **gene_model_paramD)
 df = getmldata('rpoB', 'rifampicin'   , **gene_model_paramD)
 df  = getmldata('gid' , 'streptomycin' , **gene_model_paramD)
 #df  = getmldata('alr' , 'cycloserine'  , **combined_model_paramD)
 all(df.columns.isin(['gene_name'])) # should be False
 spl_type = '70_30'
 #spl_type = '80_20'
 #spl_type = 'sl'
@ -73,6 +57,16 @@ df2 = split_tts(df
 all(df2['X'].columns.isin(['gene_name'])) # should be False
 df['dst'].value_counts()
 df['dst'].isna().sum()
 df['dst_mode'].value_counts()
 len(df)
 Counter(df2['y'])
 Counter(df2['y_bts'])
 fooD = MultModelsCl(input_df = df2['X']
                , target = df2['y']
                , sel_cv = skf_cv
--- a/scripts/plotting/get_plotting_dfs.R
+++ b/scripts/plotting/get_plotting_dfs.R
@ -7,11 +7,9 @@
 getwd()
 source("~/git/LSHTM_analysis/scripts/Header_TT.R")
 #********************
 # cmd args passed 
 # in from other scripts
 # to call this
 #********************
 # set drug and gene name
 #==========================================
@ -85,8 +83,8 @@ all_plot_dfs = combining_dfs_plotting(my_df_u
 merged_df2      = all_plot_dfs[[1]]
 merged_df3      = all_plot_dfs[[2]]
-merged_df2_comp = all_plot_dfs[[3]]
+#merged_df2_comp = all_plot_dfs[[3]]
-merged_df3_comp = all_plot_dfs[[4]]
+#merged_df3_comp = all_plot_dfs[[4]]
 #======================================================================