From 7242b3516bd67d5f5836a2b2ca2f34b188653be6 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Fri, 4 Jun 2021 14:36:16 +0100 Subject: [PATCH] adpated combining_dfs.py and plotting.R for gid and attempting to make it generic --- scripts/combining_dfs.py | 66 +++++++++++++++++++++------- scripts/plotting/dirs.R | 74 +++++++++----------------------- scripts/plotting/plotting_data.R | 32 ++++++++------ 3 files changed, 91 insertions(+), 81 deletions(-) diff --git a/scripts/combining_dfs.py b/scripts/combining_dfs.py index 8b8e556..1e66328 100755 --- a/scripts/combining_dfs.py +++ b/scripts/combining_dfs.py @@ -57,10 +57,10 @@ from combining_FIXME import detect_common_cols from reference_dict import oneletter_aa_dict # CHECK DIR STRUC THERE! from reference_dict import low_3letter_dict # CHECK DIR STRUC THERE! #======================================================================= -#%% command line args +#%% command line args: case sensitive arg_parser = argparse.ArgumentParser() -arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide') -arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive +arg_parser.add_argument('-d', '--drug', help='drug name', default = '') +arg_parser.add_argument('-g', '--gene', help='gene name', default = '') arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') @@ -99,32 +99,37 @@ print('position regex:', pos_regex) # directories #============== if not datadir: - datadir = homedir + '/' + 'git/Data' + datadir = homedir + '/git/Data/' if not indir: - indir = datadir + '/' + drug + '/input' + indir = datadir + drug + '/input/' if not outdir: - outdir = datadir + '/' + drug + '/output' + outdir = datadir + drug + '/output/' #======= # input #======= -in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' +#in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv' +in_filename_mcsm = gene.lower() + '_complex_mcsm_norm_SAM.csv' # gidb in_filename_foldx = gene.lower() + '_foldx.csv' in_filename_dssp = gene.lower() + '_dssp.csv' in_filename_kd = gene.lower() + '_kd.csv' in_filename_rd = gene.lower() + '_rd.csv' -in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' +#in_filename_deepddg = gene.lower() + '_complex_ddg_results.txt' # change to decent filename and put it in the correct dir + +in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info_f.csv' # gwas f info in_filename_afor = gene.lower() + '_af_or.csv' in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv' -infile_mcsm = outdir + '/' + in_filename_mcsm -infile_foldx = outdir + '/' + in_filename_foldx -infile_dssp = outdir + '/' + in_filename_dssp -infile_kd = outdir + '/' + in_filename_kd -infile_rd = outdir + '/' + in_filename_rd +infile_mcsm = outdir + in_filename_mcsm +infile_foldx = outdir + in_filename_foldx +infile_dssp = outdir + in_filename_dssp +infile_kd = outdir + in_filename_kd +infile_rd = outdir + in_filename_rd +#infile_deepddg = outdir + in_filename_deepddg + infile_snpinfo = outdir + '/' + in_filename_snpinfo infile_afor = outdir + '/' + in_filename_afor infile_afor_kin = outdir + '/' + in_filename_afor_kin @@ -135,7 +140,9 @@ print('\nInput path:', indir , '\nInput filename foldx:', infile_foldx, '\n' , '\nInput filename dssp:', infile_dssp , '\nInput filename kd:', infile_kd - , '\nInput filename rd', infile_rd , '\n' + , '\nInput filename rd', infile_rd +# , '\nInput filename rd', infile_deepddg , '\n' + , '\nInput filename snp info:', infile_snpinfo, '\n' , '\nInput filename af or:', infile_afor , '\nInput filename afor kinship:', infile_afor_kin @@ -225,10 +232,39 @@ print('\nResult of Fourth merge:', combined_df.shape , '\n===================================================================') combined_df[merging_cols_m4].apply(len) combined_df[merging_cols_m4].apply(len) == len(combined_df) + #%%============================================================================ -# OR merges: TEDIOUSSSS!!!! +#deepddg_df = pd.read_csv(infile_deepddg, sep = ' ') + + + + +#%%============================================================================ +# Output columns + +out_filename_stab_struc = gene.lower() + '_comb_stab_struc_params.csv' +outfile_stab_struc = outdir + '/' + out_filename_stab_struc +print('Output filename:', outfile_stab_struc + , '\n===================================================================') + +# write csv +print('Writing file: combined stability and structural parameters') +combined_df.to_csv(outfile_stab_struc, index = False) +print('\nFinished writing file:' + , '\nNo. of rows:', combined_df.shape[0] + , '\nNo. of cols:', combined_df.shape[1]) + + + + + + + +#%%============================================================================ +# OR merges: TEDIOUSSSS!!!! +#[ DELETE ] del(mcsm_df, foldx_df, mcsm_foldx_dfs, dssp_kd_dfs, dssp_kd_rd_dfs,rd_df, kd_df, infile_mcsm, infile_foldx, infile_dssp, infile_kd) del(merging_cols_m1, merging_cols_m2, merging_cols_m3, merging_cols_m4) del(in_filename_dssp, in_filename_foldx, in_filename_kd, in_filename_mcsm, in_filename_rd) diff --git a/scripts/plotting/dirs.R b/scripts/plotting/dirs.R index 54fb0bf..aa1b9be 100644 --- a/scripts/plotting/dirs.R +++ b/scripts/plotting/dirs.R @@ -1,56 +1,24 @@ #!/usr/bin/env Rscript ######################################################### -# TASK: formatting data that will be used for various plots - -# useful links -#https://stackoverflow.com/questions/38851592/r-append-column-in-a-dataframe-with-frequency-count-based-on-two-columns +# TASK: importing dir str +# create a function that takes 'drug' and 'gene' as args, +# This script is sourced by plotting.R to import dir str +# for various plots, etc. ######################################################### -# working dir and loading libraries -getwd() -setwd("~/git/LSHTM_analysis/scripts/plotting") -getwd() - -#source("Header_TT.R") -library(ggplot2) -library(data.table) -library(dplyr) - -require("getopt", quietly = TRUE) #cmd parse arguments -#======================================================== -# command line args -#spec = matrix(c( -# "drug" , "d", 1, "character", -# "gene" , "g", 1, "character" -#), byrow = TRUE, ncol = 4) - -#opt = getopt(spec) - -#drug = opt$druggene = opt$gene - -#if(is.null(drug)|is.null(gene)) { -# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") -#} -#======================================================== -# FIXME: change to cmd -#%% variable assignment: input and output paths & filenames -drug = "pyrazinamide" -gene = "pncA" -gene_match = paste0(gene,"_p.") -cat(gene_match) - -#============= -# directories and variables -#============= -datadir = paste0("~/git/Data") -indir = paste0(datadir, "/", drug, "/input") -outdir = paste0("~/git/Data", "/", drug, "/output") -plotdir = paste0("~/git/Data", "/", drug, "/output/plots") - -dr_muts_col = paste0('dr_mutations_', drug) -other_muts_col = paste0('other_mutations_', drug) -resistance_col = "drtype" - -#%%=============================================================== - - - +import_dirs <- function(drug, gene) { + gene_match = paste0(gene,"_p.") + cat(gene_match) + + #============= + # directories and variables + #============= + datadir <<- paste0("~/git/Data") + indir <<- paste0(datadir, "/", drug, "/input") + outdir <<- paste0("~/git/Data", "/", drug, "/output") + plotdir <<- paste0("~/git/Data", "/", drug, "/output/plots") + + dr_muts_col <<- paste0('dr_mutations_', drug) + other_muts_col <<- paste0('other_mutations_', drug) + resistance_col <<- "drtype" + +} diff --git a/scripts/plotting/plotting_data.R b/scripts/plotting/plotting_data.R index 2df3ef4..e4ee4ff 100755 --- a/scripts/plotting/plotting_data.R +++ b/scripts/plotting/plotting_data.R @@ -14,31 +14,37 @@ getwd() library(ggplot2) library(data.table) library(dplyr) - -source("dirs.R") - require("getopt", quietly = TRUE) #cmd parse arguments +source("dirs.R") #======================================================== # command line args -#spec = matrix(c( -# "drug" , "d", 1, "character", -# "gene" , "g", 1, "character" -#), byrow = TRUE, ncol = 4) +spec = matrix(c( + "drug" , "d", 1, "character", + "gene" , "g", 1, "character" +), byrow = TRUE, ncol = 4) -#opt = getopt(spec) +opt = getopt(spec) -#drug = opt$druggene = opt$gene +#FIXME: detect if script running from cmd, then set these +#drug = opt$drug +#gene = opt$gene + +# hardcoding when not using cmd +drug = "streptomycin" +gene = "gid" + +if(is.null(drug)|is.null(gene)) { + stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") +} -#if(is.null(drug)|is.null(gene)) { -# stop("Missing arguments: --drug and --gene must both be specified (case-sensitive)") -#} #======================================================== #====== # input #====== #in_filename = "mcsm_complex1_normalised.csv" -in_filename_params = paste0(tolower(gene), "_all_params.csv") +#in_filename_params = paste0(tolower(gene), "_all_params.csv") +in_filename_params = paste0(tolower(gene), "_comb_stab_struc_params.csv") # part combined infile_params = paste0(outdir, "/", in_filename_params) cat(paste0("Input file 1:", infile_params) )