diff --git a/dynamut/katg_mcsm_formatted_snps_chain.csv b/dynamut/katg_mcsm_formatted_snps_chain.csv new file mode 100644 index 0000000..f42a434 --- /dev/null +++ b/dynamut/katg_mcsm_formatted_snps_chain.csv @@ -0,0 +1,817 @@ +A G24V +A K27I +A K27E +A Y28L +A Y28H +A P29S +A V30A +A G32S +A G33S +A G34V +A G34A +A Q36P +A Q36H +A D37G +A P40T +A L43R +A L43P +A K46N +A V47I +A L48P +A L48R +A P52S +A D56H +A P57S +A A61S +A F62L +A D63G +A Y64C +A A65T +A A66T +A V68G +A I71F +A I71S +A V73A +A V73G +A A75P +A L76P +A T77R +A R78P +A R78G +A E81V +A E82D +A V83L +A V83G +A M84I +A M84T +A M84L +A T85A +A T85P +A T86P +A T86N +A S87L +A Q88P +A Q88E +A P89D +A W90R +A W90C +A W91G +A W91R +A W91L +A W91S +A P92T +A A93G +A A93D +A A93T +A D94N +A Y95F +A Y95S +A H97N +A H97P +A H97S +A Y98C +A Y98D +A Y98N +A G99R +A G99E +A P100T +A L101F +A L101M +A F102M +A F102S +A F102I +A I103N +A I103V +A I103T +A R104Q +A R104W +A M105I +A A106S +A A106V +A A106T +A A106R +A A106G +A A109T +A A109V +A A109S +A A109D +A A110V +A A110T +A G111D +A T112I +A Y113C +A I115V +A I115S +A I115T +A H116T +A H116E +A H116L +A H116G +A H116A +A H116Q +A H116F +A H116S +A H116P +A D117E +A G120S +A G121A +A G121S +A A122G +A A122D +A A122T +A A122V +A G123R +A G123E +A G124A +A G124Q +A G124D +A G124S +A G124H +A G124E +A G124R +A G124T +A G125D +A G125S +A M126Q +A M126I +A M126A +A M126L +A M126S +A Q127P +A R128Q +A R128L +A R128G +A R128W +A F129S +A A130E +A P131Q +A P131A +A P131L +A P131S +A L132R +A N133S +A N133D +A S134R +A W135S +A P136L +A N138S +A N138H +A N138D +A A139V +A A139P +A A139G +A S140N +A S140G +A S140I +A L141S +A L141F +A L141I +A L141V +A D142G +A D142N +A K143N +A K143E +A A144T +A A144V +A R145H +A R145C +A R145S +A R146L +A L148I +A W149R +A W149L +A W149G +A W149C +A V151L +A V151I +A K152E +A K152T +A K153Q +A Y155C +A Y155S +A Y155H +A G156D +A G156S +A K157N +A K157R +A K157Q +A K158S +A K158N +A L159I +A L159F +A L159P +A W161C +A W161R +A A162V +A 
A162E +A A162T +A D163N +A D163A +A L164R +A I165M +A I165L +A I165Y +A I165T +A V166I +A V166T +A F167S +A F167L +A F167C +A A168V +A A168T +A A168G +A G169S +A N170K +A C171V +A C171G +A A172T +A A172V +A L173R +A M176T +A M176I +A F178I +A F178S +A K179E +A T180M +A T180K +A G182R +A G182E +A F183L +A F183S +A G184D +A G184A +A G184C +A G186A +A G186S +A G186D +A R187P +A D189N +A D189G +A D189A +A D189Y +A W191R +A W191G +A E192A +A E192D +A D194N +A E195K +A V196G +A Y197D +A W204S +A L205R +A G206R +A E208K +A R209C +A S211N +A S211T +A K213E +A K213N +A R214L +A D215H +A D215E +A N218S +A P219L +A A222T +A Q224R +A M225V +A I228L +A N231K +A P232S +A P232R +A P232T +A P232A +A E233G +A E233Q +A G234R +A N236D +A G237A +A G237D +A P241H +A M242V +A M242T +A M242I +A A243T +A A244G +A V246R +A V246G +A I248T +A R249G +A R249C +A R249H +A T251K +A T251M +A F252L +A R253G +A R253W +A R254S +A R254C +A R254H +A R254L +A A256T +A A256V +A A256G +A M257I +A M257T +A M257V +A D259G +A D259E +A D259Y +A V260I +A V260E +A T262P +A A264V +A A264T +A V267A +A G268S +A G269S +A G269D +A T271P +A T271S +A T271I +A T271A +A F272L +A F272S +A F272V +A G273R +A G273C +A T275P +A T275A +A H276Q +A G277S +A G279D +A P280S +A P280Q +A A281V +A A281G +A A281T +A D282G +A G285C +A G285S +A G285V +A G285D +A G285A +A P286L +A P288H +A P288L +A E289A +A E289K +A A290V +A A290P +A A291D +A P292A +A Q295A +A Q295P +A Q295E +A M296V +A M296T +A G297V +A G297L +A L298S +A G299S +A G299C +A G299V +A G299A +A G299D +A W300S +A W300G +A W300R +A W300C +A S302R +A S302T +A G305C +A G305A +A T306A +A T306S +A G307R +A T308P +A T308S +A T308K +A T308A +A T308V +A T308I +A D311G +A A312P +A A312E +A A312V +A T314S +A T314N +A T314A +A S315T +A S315N +A S315I +A S315G +A S315R +A I317L +A I317V +A I317T +A E318K +A V320L +A V320A +A T322A +A T322M +A N323P +A N323S +A N323H +A T324N +A T324P +A T324S +A T324L +A P325S +A P325T +A T326P +A T326M +A K327T +A W328L +A W328S +A W328R +A W328C +A 
D329A +A D329E +A D329H +A S331T +A S331I +A S331R +A L333F +A L333C +A E334K +A I335V +A I335T +A I335N +A L336M +A Y337C +A Y337H +A Y337F +A Y337S +A G338S +A Y339N +A Y339C +A Y339S +A E340D +A E342G +A T344L +A T344K +A T344S +A T344M +A A348V +A A348G +A G349D +A Q352Y +A Y353H +A Y353F +A T354I +A D357H +A I364N +A D366N +A P367L +A F368L +A S374A +A S374P +A L378P +A L378M +A A379V +A A379T +A T380S +A T380P +A T380I +A T380A +A T380N +A D381A +A L382I +A L382R +A S383W +A S383A +A L384R +A R385P +A V386M +A V386E +A D387N +A Y390C +A R392W +A T394P +A T394M +A T394A +A R395C +A L398R +A E399D +A E399K +A H400Y +A H400P +A E402A +A E402K +A L404W +A D406A +A D406E +A D406G +A E407A +A E407K +A F408Y +A F408S +A F408L +A F408V +A A411D +A Y413C +A Y413F +A Y413H +A Y413S +A K414R +A I416M +A I416T +A I416L +A I416V +A D419H +A D419G +A D419Y +A D419V +A P422H +A P422L +A V423I +A A424V +A A424G +A R425K +A L427P +A L427R +A L427F +A L430A +A P432L +A P432T +A K433T +A Q434P +A L437R +A W438G +A Q439K +A Q439H +A Q439R +A Q439T +A D440G +A P441L +A V442L +A V442A +A V445I +A S446N +A D448A +A D448E +A V450I +A V450A +A G451D +A E452Q +A I455L +A L458H +A K459T +A S460N +A Q461P +A Q461R +A Q461E +A I462S +A R463L +A R463W +A S465P +A T468P +A V469L +A V469I +A Q471R +A V473L +A V473F +A S474Q +A T475I +A T475A +A A476E +A A476V +A A478R +A A479P +A A479G +A A479V +A A479Q +A A480Q +A A480S +A S481A +A S481L +A S482T +A F483L +A R484H +A R484G +A K488E +A R489C +A G490D +A G490C +A G490S +A G491S +A A492V +A A492D +A N493K +A G494S +A G494A +A G495S +A G495A +A G495C +A R496L +A R496C +A R498S +A P501S +A V503A +A V503S +A W505L +A V507I +A N508D +A D509E +A D509N +A P510A +A D511N +A D513N +A L514P +A L514V +A R515H +A K516R +A R519H +A T520A +A L521P +A E522K +A E523D +A Q525P +A Q525A +A Q525K +A Q525S +A E526D +A S527L +A N529T +A A532P +A A532V +A P533L +A G534A +A G534R +A K537E +A V538A +A F540S +A A541T +A D542E +A L546F +A C549S +A A550D +A A551S +A 
A555P +A A556S +A K557N +A G560R +A G560A +A G560S +A H561R +A N562H +A V565G +A P566L +A F567S +A F567L +A F567V +A T568P +A P569L +A G570F +A R571L +A A574V +A T579A +A T579S +A S583P +A F584V +A V586M +A L587R +A L587P +A E588G +A A591T +A G593C +A F594I +A F594L +A N596S +A Y597H +A Y597S +A Y597D +A L598F +A L598R +A G599R +A K600Q +A N602D +A P603L +A P605S +A A606P +A A606T +A E607D +A Y608D +A M609T +A L611R +A D612G +A A614T +A A614G +A A614E +A L616S +A T618M +A S620T +A A621T +A A621D +A M624V +A M624K +A M624I +A T625A +A T625K +A L627P +A V628I +A G629D +A G629C +A G630R +A G630V +A V633A +A V633I +A L634I +A A636T +A N637D +A N637H +A N637K +A Y638C +A Y638H +A G644D +A G644S +A G644V +A E648D +A A649T +A A649G +A S650F +A S650P +A E651D +A L653Q +A T654S +A N655D +A F657S +A F657L +A N660D +A L661M +A L662V +A D663G +A D663Y +A I666V +A T667P +A T667I +A W668C +A W668L +A A673V +A D675Y +A D675G +A D675H +A T677P +A Y678C +A Q679E +A Q679Y +A G680D +A K681Q +A K681T +A S684R +A K686E +A W689G +A W689R +A T690I +A T690P +A G691D +A S692R +A R693C +A R693H +A D695A +A L696Q +A L696P +A V697A +A F698V +A G699E +A G699V +A S700P +A S700F +A E703Q +A L704W +A L704S +A R705L +A R705G +A R705W +A L707R +A L707F +A E709A +A E709G +A V710I +A V710A +A Y711D +A A713S +A D714E +A D714N +A D714G +A P718S +A F720S +A D723N +A D723A +A A726T +A A727S +A A727T +A W728R +A D729N +A D729V +A D729G +A D729T +A V731M +A V731A +A N733S +A L734R +A D735A +A R736K +A R736S +A V739M +A R740S diff --git a/scripts/functions/test_aa_prop_bp.R b/scripts/functions/test_aa_prop_bp.R new file mode 100644 index 0000000..fa1a218 --- /dev/null +++ b/scripts/functions/test_aa_prop_bp.R @@ -0,0 +1,63 @@ +#!/usr/bin/env Rscript +library(ggplot2) +library(tidyverse) +library(data.table) + +setwd("~/git/LSHTM_analysis/scripts/functions/") +getwd() +############################################################# +#=========================================== +# load functions, data, dirs, 
hardcoded vars +# that will be used in testing the functions +#=========================================== +source("plotting_data.R") +infile = "/home/tanu/git/Data/streptomycin/output/" +pd_df = plotting_data(infile) +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] +my_df_u_lig = pd_df[[3]] +dup_muts = pd_df[[4]] + +source("../plotting_globals.R") +drug = "streptomycin" +gene = "gid" + +import_dirs(drug, gene) + +#===================== +# functions to test +#===================== +source("stability_count_bp.R") +source("position_count_bp.R") +################################################################# +############################################## +# read a sample file (picked interactively via file.choose()) containing muts and prop; pos_count = number of rows per position +############################################### +df<- read.csv(file.choose()) + +setDT(df)[, pos_count := .N, by = .(position)] +foo = data.frame(df$position, df$pos_count) + +#snpsBYpos_df <- df %>% +# group_by(position) %>% +# summarize(snpsBYpos = mean(pos_count)) + +# subset df without duplicates for position +df2 = df[!duplicated(df$position)] +################################################################## +# --------------------------------------- +# barplot for nssnps, coloured by aa prop +# --------------------------------------- +pos_colname = "position" +aa_prop_colname = "mut_prop_water" +aa_prop_colours = c("black", "blue") +my_legname = "aa_prop: water" + +# call function with the colour vector defined above (NOTE(review): aa_prop_bp() must be defined by one of the files sourced earlier - confirm) +aa_prop_bp(plotdf = df + , position_colname = pos_colname + , fill_colname = aa_prop_colname + , fill_colours = aa_prop_colours + , leg_name = my_legname) + +#=============================================================== diff --git a/scripts/functions/test_af_or_calcs.R b/scripts/functions/test_af_or_calcs.R new file mode 100644 index 0000000..0532f83 --- /dev/null +++ b/scripts/functions/test_af_or_calcs.R @@ -0,0 +1,59 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: To calculate Allele Frequency and +# Odds Ratio from master data 
+######################################################### +# load libraries +#source("Header_TT.R") +require("getopt", quietly = TRUE) # cmd parse arguments + +# set working dir to the functions directory so the relative source() calls below resolve +getwd() +setwd("~/git/LSHTM_analysis/scripts/functions/") +getwd() + +# load functions +source("plotting_globals.R") +source("mychisq_or.R") +source("myaf_or_calcs.R") + +# drug and gene are hardcoded below; no command-line options are parsed in this script +drug = "streptomycin" +gene = "gid" + +# call import_dirs(); datadir and outdir used below are not assigned in this script (presumably set by import_dirs - confirm) +import_dirs(drug, gene) + +# input file 1: master data (NOTE(review): joined to datadir without a '/' separator, unlike the outdir paths below - confirm datadir ends with '/') +#in_filename_master = 'original_tanushree_data_v2.csv' #19K +in_filename_master = 'mtb_gwas_meta_v6.csv' #35k +infile_master = paste0(datadir, in_filename_master) +cat(paste0('Reading infile1: raw data', ' ', infile_master) ) + +# input file 2: gene associated meta data file to extract valid snps and add calcs to. +# This is outfile_metadata from data_extraction.py +in_filename_metadata = paste0(tolower(gene), '_metadata.csv') +infile_metadata = paste0(outdir, '/', in_filename_metadata) +cat(paste0('Reading input file 2 i.e gene associated metadata:', infile_metadata)) + +# out_filename_af_or = paste0(tolower(gene), '_meta_data_with_AF_OR.csv') +out_filename_af_or = paste0(tolower(gene), '_af_or.csv') +outfile_af_or = paste0(outdir, '/', out_filename_af_or) +cat(paste0('Output file with full path:', outfile_af_or)) + +cat("master data:", infile_master) +cat("gene data:", infile_metadata) + +dr_muts_col # comes from global (dr_mutations_) +other_muts_col # comes from global (other_mutations_) +################################################# +my_afor ( infile_master + , infile_metadata + , outfile = outfile_af_or + #, outfile = "FOO_TEST.csv" + , drug + , gene + , idcol = "id" + , dr_muts_col + , other_muts_col + ) diff --git a/scripts/functions/test_bp.R b/scripts/functions/test_bp.R new file mode 100644 index 0000000..bdb48ca --- /dev/null +++ b/scripts/functions/test_bp.R @@ -0,0 +1,113 @@ +#!/usr/bin/env Rscript 
+setwd("~/git/LSHTM_analysis/scripts/functions/") +getwd() +############################################################# +#=========================================== +# load functions, data, dirs, hardcoded vars +# that will be used in testing the functions +#=========================================== +drug = "streptomycin" +gene = "gid" + +source("plotting_data.R") + +infile = paste0("~/git/Data/", drug, "/output/", gene, "_comb_stab_struc_params.csv") +infile_df = read.csv(infile) + +lig_dist = 5 +pd_df = plotting_data(infile_df + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = lig_dist) + +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] +my_df_u_lig = pd_df[[3]] +dup_muts = pd_df[[4]] + +#===================== +# functions to test +#===================== +source("stability_count_bp.R") +source("position_count_bp.R") + +################################################################## +# ------------------------------ +# barplot for mcsm stability (NOTE(review): plotdir is not assigned in this script and import_dirs() is not called here - confirm where it is defined) +# ------------------------------ +basic_bp_duet = paste0(tolower(gene), "_basic_barplot_PS.svg") +plot_basic_bp_duet = paste0(plotdir,"/", basic_bp_duet) + +svg(plot_basic_bp_duet) +print(paste0("plot filename:", basic_bp_duet)) + +# function only +stability_count_bp(plotdf = my_df_u + , df_colname = "duet_outcome" + , leg_title = "DUET outcome" + , label_categories = c("Destabilising", "Stabilising") + , leg_position = "top") + +dev.off() + +# ------------------------------ +# barplot for ligand affinity +# ------------------------------ +basic_bp_ligand = paste0(tolower(gene), "_basic_barplot_LIG.svg") +plot_basic_bp_ligand = paste0(plotdir, "/", basic_bp_ligand) + +svg(plot_basic_bp_ligand) +print(paste0("plot filename:", basic_bp_ligand)) + +# function only; NOTE(review): plotting_data() above was called with lig_dist_cutoff = 5, lig_dist is reassigned here and only feeds the axis label below - confirm the intended cutoff +lig_dist = 10 +stability_count_bp(plotdf = my_df_u_lig + , df_colname = "ligand_outcome" + , leg_title = "Ligand outcome" + , yaxis_title = paste0("Number of nsSNPs\nLigand dist: <", lig_dist, "\u212b") + #, bp_plot_title = "Sites < 10 Ang of ligand" 
+ ) + +dev.off() +# ------------------------------ +# barplot for foldX +# ------------------------------ +basic_bp_foldx = paste0(tolower(gene), "_basic_barplot_foldx.svg") +plot_basic_bp_foldx = paste0(plotdir,"/", basic_bp_foldx) + +svg(plot_basic_bp_foldx) +print(paste0("plot filename:", plot_basic_bp_foldx)) + +stability_count_bp(plotdf = my_df_u + , df_colname = "foldx_outcome" + , leg_title = "FoldX outcome") +dev.off() +#=============================================================== +# ------------------------------ +# barplot for nssnp site count: all +# ------------------------------ +pos_count_duet = paste0(tolower(gene), "_position_count_PS.svg") +plot_pos_count_duet = paste0(plotdir, "/", pos_count_duet) + +svg(plot_pos_count_duet) +print(paste0("plot filename:", plot_pos_count_duet)) + +# function only +site_snp_count_bp(plotdf = my_df_u + , df_colname = "position") + +dev.off() +# ------------------------------ +# barplot for nssnp site count: within 10 Ang (NOTE(review): plotting_data() is called with lig_dist_cutoff = 5 earlier in this script - confirm which cutoff my_df_u_lig reflects) +# ------------------------------ +pos_count_ligand = paste0(tolower(gene), "_position_count_LIG.svg") +plot_pos_count_ligand = paste0(plotdir, "/", pos_count_ligand) + +svg(plot_pos_count_ligand) +print(paste0("plot filename:", plot_pos_count_ligand)) + +# function only +site_snp_count_bp(plotdf = my_df_u_lig + , df_colname = "position") + +dev.off() +#=============================================================== diff --git a/scripts/functions/test_combining_dfs_plotting.R b/scripts/functions/test_combining_dfs_plotting.R new file mode 100644 index 0000000..87a1929 --- /dev/null +++ b/scripts/functions/test_combining_dfs_plotting.R @@ -0,0 +1,100 @@ +#!/usr/bin/env Rscript + +# set working dir to the functions directory +getwd() +setwd("~/git/LSHTM_analysis/scripts/functions/") +getwd() + +# infile_params = paste0(outdir, "/" , tolower(gene), "_comb_afor.csv") +# infile_metadata = paste0(outdir, "/", tolower(gene), "_metadata") +# +# +# source("combining_dfs_plotting_func.R") +# 
+#################################################################### +# in_file_params = "~/git/Data/streptomycin/output/gid_comb_afor.csv" +# in_file_metadata = "~/git/Data/streptomycin/output/gid_metadata.csv" +# +# all_plot_dfs = combining_dfs_plotting(df1_mcsm_comb = infile_params +# , df2_gene_metadata = infile_metadata +# , lig_dist_colname = 'ligand_distance' +# , lig_dist_cutoff = 10) +# +# merged_df2 = all_plot_dfs[[1]] +# merged_df3 = all_plot_dfs[[2]] +# merged_df2_comp = all_plot_dfs[[3]] +# merged_df3_comp = all_plot_dfs[[4]] +# merged_df2_lig = all_plot_dfs[[5]] +# merged_df3_lig = all_plot_dfs[[6]] +# +# bar_colnames = data.frame(colnames(merged_df2)) +########################################################### +source("plotting_globals.R") +source("plotting_data.R") +source("combining_dfs_plotting.R") + +#--------------------- +# call: import_dirs() +#--------------------- +gene = 'gid' +drug = 'streptomycin' + +import_dirs(drug_name = drug, gene_name = gene) + + +#============================ +# Input 1: plotting_data() (infile_params is only derived from gene when it does not already exist) +#============================ +if (!exists("infile_params") && exists("gene")){ +#if (!is.character(infile_params) && exists("gene")){ + #in_filename_params = paste0(tolower(gene), "_all_params.csv") + in_filename_params = paste0(tolower(gene), "_comb_afor.csv") # part combined for gid + infile_params = paste0(outdir, "/", in_filename_params) + cat("\nInput file for mcsm comb data not specified, assuming filename: ", infile_params, "\n") +} + +mcsm_comb_data = read.csv(infile_params, header = T) + +#------------------------------- +# call function: plotting_data(); only element 2 of the returned list is used below +#------------------------------- +pd_df = plotting_data(df = mcsm_comb_data + , ligand_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10) +my_df_u = pd_df[[2]] + +#====================================== +# Input 2: read _meta data.csv +#====================================== +if (!exists("infile_metadata") && exists("gene")){ +#if (!is.character(infile_params) && 
exists("gene")){{ + in_filename_metadata = paste0(tolower(gene), "_metadata.csv") # default metadata filename derived from gene + infile_metadata = paste0(outdir, "/", in_filename_metadata) + cat("\nInput file for gene metadata not specified, assuming filename: ", infile_metadata, "\n") +} + +cat("\nReading meta data file:", infile_metadata) + +gene_metadata <- read.csv(infile_metadata + , stringsAsFactors = F + , header = T) + +#----------------------------------------- +# test function: combining_dfs_plotting(); elements 1-8 of the returned list are unpacked below +#----------------------------------------- +all_plot_dfs = combining_dfs_plotting(my_df_u + , gene_metadata + , lig_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10) + +merged_df2 = all_plot_dfs[[1]] +merged_df3 = all_plot_dfs[[2]] +merged_df2_comp = all_plot_dfs[[3]] +merged_df3_comp = all_plot_dfs[[4]] +merged_df2_lig = all_plot_dfs[[5]] +merged_df3_lig = all_plot_dfs[[6]] +merged_df2_comp_lig = all_plot_dfs[[7]] +merged_df3_comp_lig = all_plot_dfs[[8]] +######################################################################## +# End of script +######################################################################## diff --git a/scripts/functions/test_plotting_data.R b/scripts/functions/test_plotting_data.R new file mode 100644 index 0000000..bb5ea3c --- /dev/null +++ b/scripts/functions/test_plotting_data.R @@ -0,0 +1,35 @@ +#!/usr/bin/env Rscript +getwd() +setwd("~/git/LSHTM_analysis/scripts/functions/") +getwd() +############################################################# +#=========================================== +# load functions, data, dirs, hardcoded vars +# that will be used in testing the functions +#=========================================== +source("plotting_globals.R") + +drug = "streptomycin" +gene = "gid" + +import_dirs(drug_name = drug, gene_name = gene) + +#------------------------------- +# test function: plotting_data(); the four elements of the returned list are unpacked below +#------------------------------- +source("plotting_data.R") + +infile_params = 
"/home/tanu/git/Data/streptomycin/output/gid_comb_stab_struc_params.csv" +mcsm_comb_data = read.csv(infile_params, header = T) + +pd_df = plotting_data(df = mcsm_comb_data + , ligand_dist_colname = 'ligand_distance' + , lig_dist_cutoff = 10) + +my_df = pd_df[[1]] +my_df_u = pd_df[[2]] +my_df_u_lig = pd_df[[3]] +dup_muts = pd_df[[4]] +######################################################################## +# End of script +######################################################################## \ No newline at end of file