changing category of ambiguous muts

2020-09-08 18:51:03 +01:00 · 2020-09-08 18:51:03 +01:00 · f10f8f6d2a
commit f10f8f6d2a
parent e980085294
3 changed files with 55 additions and 4 deletions
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@ -552,6 +552,14 @@ else:
 output_cols = combined_df_all.columns

 #%% IMPORTANT result info
+if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df):
+    print('PASS: No. of NA in or_mychisq matches expected length'
+          , '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum() 
+          , '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum())
+else:
+    print('FAIL: No. of NA in or_mychisq does not match expected length')
+
+
 if combined_df_all.shape[0] == outdf_expected_rows:
    print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
          , '\n=============================================================')
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@ -59,18 +59,23 @@ rm(my_df, upos, dup_muts)
 # my_df_u

 # quick checks
-head(my_df_u[, c("mutation", "mutation2")])
+head(my_df_u[, c("mutation")])

 cols_to_extract  = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
 foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]


-which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))
+table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))

+baz = read.csv(file.choose())  # NOTE(review): baz is overwritten by the cbind() two lines below, which reads 'bar' -- confirm whether this was meant to assign bar

 baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
+baz = as.data.frame(baz)
 colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
+sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))

+cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
+    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))

 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene),  "_meta_data_with_AFandOR.csv")
@ -109,7 +114,8 @@ gene_metadata <- read.csv(infile_gene_metadata
 cat("Dim:", dim(gene_metadata))


-# counting NAs in AF, OR cols: 
+# counting NAs in AF, OR cols
+# or_mychisq
 if (identical(sum(is.na(my_df_u$or_mychisq))
              , sum(is.na(my_df_u$pval_fisher))
              , sum(is.na(my_df_u$af)))){
@ -123,7 +129,7 @@ if (identical(sum(is.na(my_df_u$or_mychisq))
      , "\nNA in AF:", sum(is.na(my_df_u$af)))
 }

-
+# or kin
 if (identical(sum(is.na(my_df_u$or_kin))
              , sum(is.na(my_df_u$pwald_kin))
              , sum(is.na(my_df_u$af_kin)))){
@ -139,6 +145,31 @@ if (identical(sum(is.na(my_df_u$or_kin))

 str(gene_metadata)

+# change category of ambiguous mutations
+table(gene_metadata$mutation_info)
+
+cols_to_extract2  = c("mutationinformation", "mutation", "mutation_info")
+foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
+
+dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
+other_muts = foo2[foo2$mutation_info == other_muts_col,]
+
+common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
+#write.csv(common_muts, 'common_muts.csv')
+
+# FIXME read properly
+# "ambiguous_mut_names.csv"
+#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
+ambiguous_muts = read.csv(file.choose())
+ambiguous_muts_names = ambiguous_muts$mutation
+
+common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
+
+gene_metadata2 = gene_metadata
+# if() needs a single TRUE/FALSE: the subset has one element per matching row, so collapse it with any(); %in% (unlike ==) never yields NA
+if (any(gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] %in% other_muts_col)){
+  print('change me')
+}
 ###################################################################
 #                           combining: PS
 ###################################################################
--- a/scripts/plotting/plotting_data.R
+++ b/scripts/plotting/plotting_data.R
@ -52,6 +52,18 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv")
 infile_params = paste0(outdir, "/", in_filename_params)
 cat(paste0("Input file 1:", infile_params) )

+# category labels for this drug; both names are printed by the cat() call below
+dr_muts_col = paste0('dr_mutations_', drug)
+other_muts_col = paste0('other_mutations_', drug)  # was assigned to dr_muts_col, clobbering it and leaving other_muts_col undefined
+
+cat('Extracting columns based on variables:\n'
+      , drug
+      , '\n'
+      , dr_muts_col
+      , '\n'
+      , other_muts_col
+      , '\n===============================================================')
+
 #%%===============================================================
 ###########################
 # Read file: struct params