Reviewer notes (text before the first "diff --git" header is ignored by git apply/patch).
  * This patch was reconstructed from a copy whose newlines had been collapsed.
    Indentation and the position of blank context lines are a best-effort
    reconstruction sized to the hunk headers -- NOTE(review): regenerate with
    `git diff` against the real tree before applying.
  * plotting_data.R: the second assignment now defines other_muts_col; it used to
    overwrite dr_muts_col, leaving other_muts_col undefined for the cat() below
    and for combining_dfs_plotting.R.
  * combining_dfs_plotting.R: the chosen file is read into `bar` (the object the
    following cbind() reads) instead of into `baz`, which was overwritten at once.
    NOTE(review): confirm `bar` is not meant to come from elsewhere.
  * combining_dfs_plotting.R: the vector comparison used as an `if` condition is
    wrapped in any(); a condition of length > 1 is an error in R >= 4.2.0.
  * Context lines (including the existing 'INFORMARIONAL' spelling) are left
    untouched so they still match the target files.

diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py
index af8be26..8b8e556 100755
--- a/scripts/combining_dfs.py
+++ b/scripts/combining_dfs.py
@@ -552,6 +552,14 @@ else:
     output_cols = combined_df_all.columns
 
 #%% IMPORTANT result info
+if combined_df_all['or_mychisq'].isna().sum() == len(combined_df) - len(afor_df):
+    print('PASS: No. of NA in or_mychisq matches expected length'
+          , '\nNo. of with NA in or_mychisq:', combined_df_all['or_mychisq'].isna().sum()
+          , '\nNo. of NA in or_kin:', combined_df_all['or_kin'].isna().sum())
+else:
+    print('FAIL: No. of NA in or_mychisq does not match expected length')
+
+
 if combined_df_all.shape[0] == outdf_expected_rows:
     print('\nINFORMARIONAL ONLY: combined_df_all has duplicate muts present but with unique ref and alt allele'
           , '\n=============================================================')
diff --git a/scripts/plotting/combining_dfs_plotting.R b/scripts/plotting/combining_dfs_plotting.R
index 38474d9..bba0e00 100644
--- a/scripts/plotting/combining_dfs_plotting.R
+++ b/scripts/plotting/combining_dfs_plotting.R
@@ -59,18 +59,23 @@ rm(my_df, upos, dup_muts)
 # my_df_u
 
 # quick checks
-head(my_df_u[, c("mutation", "mutation2")])
+head(my_df_u[, c("mutation")])
 
 cols_to_extract = c("mutationinformation", "mutation", "or_mychisq", "or_kin", "af", "af_kin")
 
 foo = my_df_u[, colnames(my_df_u)%in% cols_to_extract]
 
-which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af))
+table(which(is.na(my_df_u$af_kin)) == which(is.na(my_df_u$af)))
 
+bar = read.csv(file.choose())
 baz = cbind(my_df_u$mutation, my_df_u$or_mychisq, bar$mutation, bar$or_mychisq)
+baz = as.data.frame(baz)
 
 colnames(baz) = c("my_df_u_muts", "my_df_u_or", "real_muts", "real_or")
+sum(is.na(baz$my_df_u_or)) == sum(is.na(my_df_u$or_mychisq))
 
+cat("\nNo. of with NA in or_mychisq:", sum(is.na(my_df_u$or_mychisq))
+    ,"\nNo. of NA in or_kin:" , sum(is.na(my_df_u$or_kin)))
 
 # infile 2: gene associated meta data
 #in_filename_gene_metadata = paste0(tolower(gene), "_meta_data_with_AFandOR.csv")
@@ -109,7 +114,8 @@ gene_metadata <- read.csv(infile_gene_metadata
 
 cat("Dim:", dim(gene_metadata))
 
-# counting NAs in AF, OR cols:
+# counting NAs in AF, OR cols
+# or_mychisq
 if (identical(sum(is.na(my_df_u$or_mychisq))
               , sum(is.na(my_df_u$pval_fisher))
               , sum(is.na(my_df_u$af)))){
@@ -123,7 +129,7 @@ if (identical(sum(is.na(my_df_u$or_mychisq))
       , "\nNA in AF:", sum(is.na(my_df_u$af)))
 }
 
-
+# or kin
 if (identical(sum(is.na(my_df_u$or_kin))
               , sum(is.na(my_df_u$pwald_kin))
               , sum(is.na(my_df_u$af_kin)))){
@@ -139,6 +145,31 @@ if (identical(sum(is.na(my_df_u$or_kin))
 
 str(gene_metadata)
 
+# change category of ambiguous mutations
+table(gene_metadata$mutation_info)
+
+cols_to_extract2 = c("mutationinformation", "mutation", "mutation_info")
+foo2 = gene_metadata[, colnames(gene_metadata)%in% cols_to_extract2]
+
+dr_muts = foo2[foo2$mutation_info == dr_muts_col,]
+other_muts = foo2[foo2$mutation_info == other_muts_col,]
+
+common_muts = dr_muts[dr_muts$mutation%in%other_muts$mutation,]
+#write.csv(common_muts, 'common_muts.csv')
+
+# FIXME read properly
+# "ambiguous_mut_names.csv"
+#"pnca_p.gly108arg", "pnca_p.gly132ala", "pnca_p.val180phe"
+ambiguous_muts = read.csv(file.choose())
+ambiguous_muts_names = ambiguous_muts$mutation
+
+common_muts_all = gene_metadata[gene_metadata$mutation%in%ambiguous_muts_names,]
+
+gene_metadata2 = gene_metadata
+
+if (any(gene_metadata$mutation_info[gene_metadata$mutation%in%ambiguous_muts_names] == other_muts_col)){
+  print('change me')
+}
 ###################################################################
 # combining: PS
 ###################################################################
diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R
index 291579e..7cfe0d8 100755
--- a/scripts/plotting/plotting_data.R
+++ b/scripts/plotting/plotting_data.R
@@ -52,6 +52,18 @@ in_filename_params = paste0(tolower(gene), "_all_params.csv")
 infile_params = paste0(outdir, "/", in_filename_params)
 cat(paste0("Input file 1:", infile_params) )
 
+
+dr_muts_col = paste0('dr_mutations_', drug)
+other_muts_col = paste0('other_mutations_', drug)
+
+cat('Extracting columns based on variables:\n'
+    , drug
+    , '\n'
+    , dr_muts_col
+    , '\n'
+    , other_muts_col
+    , '\n===============================================================')
+
 #%%===============================================================
 ###########################
 # Read file: struct params