From 648be02665dbc9102707393a0cdf3944ec784ac7 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 7 Sep 2020 15:27:53 +0100 Subject: [PATCH] ks test script added --- scripts/ks_test_PS.R | 211 +++++++++++++++++++++++++++++ scripts/plotting/autoviz.py | 45 ------ scripts/plotting/combined_or.R | 1 - scripts/plotting/lineage_count.txt | 71 ---------- 4 files changed, 211 insertions(+), 117 deletions(-) create mode 100644 scripts/ks_test_PS.R delete mode 100644 scripts/plotting/autoviz.py delete mode 100644 scripts/plotting/lineage_count.txt diff --git a/scripts/ks_test_PS.R b/scripts/ks_test_PS.R new file mode 100644 index 0000000..bc11957 --- /dev/null +++ b/scripts/ks_test_PS.R @@ -0,0 +1,211 @@ +#!/usr/bin/env Rscript +######################################################### +# TASK: KS test for PS/DUET lineage distributions +#======================================================================= +#======================================================================= +# working dir and loading libraries +getwd() +setwd("~/git/LSHTM_analysis/scripts/") +getwd() + +#source("/plotting/Header_TT.R") +#source("../barplot_colour_function.R") +#require(data.table) +source("plotting/combining_dfs_plotting.R") +# should return the following dfs, directories and variables + +# PS combined: +# 1) merged_df2 +# 2) merged_df2_comp +# 3) merged_df3 +# 4) merged_df3_comp + +# LIG combined: +# 5) merged_df2_lig +# 6) merged_df2_comp_lig +# 7) merged_df3_lig +# 8) merged_df3_comp_lig + +# 9) my_df_u +# 10) my_df_u_lig + +cat(paste0("Directories imported:" + , "\ndatadir:", datadir + , "\nindir:", indir + , "\noutdir:", outdir + , "\nplotdir:", plotdir)) + +cat(paste0("Variables imported:" + , "\ndrug:", drug + , "\ngene:", gene + , "\ngene_match:", gene_match + , "\nAngstrom symbol:", angstroms_symbol + , "\nNo. of duplicated muts:", dup_muts_nu + , "\nNA count for ORs:", na_count + , "\nNA count in df2:", na_count_df2 + , "\nNA count in df3:", na_count_df3)) + +########################### +# Data for stats +# you need merged_df2 or merged_df2_comp +# since this is one-many relationship +# i.e the same SNP can belong to multiple lineages +# using the _comp dataset means +# we lose some muts and at this level, we should use +# as much info as available, hence use df with NA +########################### + +# REASSIGNMENT +my_df = merged_df2 + +# delete variables not required +rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp) + +# quick checks +colnames(my_df) +str(my_df) + +# Ensure correct data type in columns to plot: need to be factor +is.factor(my_df$lineage) +my_df$lineage = as.factor(my_df$lineage) +is.factor(my_df$lineage) + +table(my_df$mutation_info); str(my_df$mutation_info) + +# subset df with dr muts only +my_df_dr = subset(my_df, mutation_info == "dr_mutations_pyrazinamide") +table(my_df_dr$mutation_info) + +# stats for all muts and dr_muts +# 1) for all muts +# 2) for dr_muts +#=========================== +table(my_df$lineage); str(my_df$lineage) +table(my_df_dr$lineage); str(my_df_dr$lineage) + +# subset only lineages1-4 +sel_lineages = c("lineage1" + , "lineage2" + , "lineage3" + , "lineage4") + +# subset and refactor: all muts +df_lin = subset(my_df, subset = lineage %in% sel_lineages) +df_lin$lineage = factor(df_lin$lineage) + +# subset and refactor: dr muts +df_lin_dr = subset(my_df_dr, subset = lineage %in% sel_lineages) +df_lin_dr$lineage = factor(df_lin_dr$lineage) + + +#{RESULT: No of samples within lineage} +table(df_lin$lineage) +table(df_lin_dr$lineage) + +#{Result: No. of unique mutations the 4 lineages contribute to} +length(unique(df_lin$mutationinformation)) +length(unique(df_lin_dr$mutationinformation)) + + +# COMPARING DISTRIBUTIONS +#================ +# ALL mutations +#================= +head(df_lin$lineage) +df_lin$lineage = as.character(df_lin$lineage) + +lin1 = df_lin[df_lin$lineage == "lineage1",]$duet_scaled +lin2 = df_lin[df_lin$lineage == "lineage2",]$duet_scaled +lin3 = df_lin[df_lin$lineage == "lineage3",]$duet_scaled +lin4 = df_lin[df_lin$lineage == "lineage4",]$duet_scaled + +# ks test +lin12 = ks.test(lin1,lin2) +lin12_df = as.data.frame(cbind(lin12$data.name, lin12$p.value)) + +lin13 = ks.test(lin1,lin3) +lin13_df = as.data.frame(cbind(lin13$data.name, lin13$p.value)) + +lin14 = ks.test(lin1,lin4) +lin14_df = as.data.frame(cbind(lin14$data.name, lin14$p.value)) + +lin23 = ks.test(lin2,lin3) +lin23_df = as.data.frame(cbind(lin23$data.name, lin23$p.value)) + +lin24 = ks.test(lin2,lin4) +lin24_df = as.data.frame(cbind(lin24$data.name, lin24$p.value)) + +lin34 = ks.test(lin3,lin4) +lin34_df = as.data.frame(cbind(lin34$data.name, lin34$p.value)) + +ks_results_all = rbind(lin12_df + , lin13_df + , lin14_df + , lin23_df + , lin24_df + , lin34_df) + +#p-value < 2.2e-16 +rm(lin12, lin12_df + , lin13, lin13_df + , lin14, lin14_df + , lin23, lin23_df + , lin24, lin24_df + , lin34, lin34_df) + +#================ +# DRUG mutations +#================= +head(df_lin_dr$lineage) +df_lin_dr$lineage = as.character(df_lin_dr$lineage) + +lin1_dr = df_lin_dr[df_lin_dr$lineage == "lineage1",]$duet_scaled +lin2_dr = df_lin_dr[df_lin_dr$lineage == "lineage2",]$duet_scaled +lin3_dr = df_lin_dr[df_lin_dr$lineage == "lineage3",]$duet_scaled +lin4_dr = df_lin_dr[df_lin_dr$lineage == "lineage4",]$duet_scaled + +# ks test: dr muts +lin12_dr = ks.test(lin1_dr,lin2_dr) +lin12_df_dr = as.data.frame(cbind(lin12_dr$data.name, lin12_dr$p.value)) + +lin13_dr = ks.test(lin1_dr,lin3_dr) +lin13_df_dr = as.data.frame(cbind(lin13_dr$data.name, lin13_dr$p.value)) + +lin14_dr = ks.test(lin1_dr,lin4_dr) +lin14_df_dr = as.data.frame(cbind(lin14_dr$data.name, lin14_dr$p.value)) + +lin23_dr = ks.test(lin2_dr,lin3_dr) +lin23_df_dr = as.data.frame(cbind(lin23_dr$data.name, lin23_dr$p.value)) + +lin24_dr = ks.test(lin2_dr,lin4_dr) +lin24_df_dr = as.data.frame(cbind(lin24_dr$data.name, lin24_dr$p.value)) + +lin34_dr = ks.test(lin3_dr,lin4_dr) +lin34_df_dr = as.data.frame(cbind(lin34_dr$data.name, lin34_dr$p.value)) + +ks_results_dr = rbind(lin12_df_dr + , lin13_df_dr + , lin14_df_dr + , lin23_df_dr + , lin24_df_dr + , lin34_df_dr) + +ks_results_combined = cbind(ks_results_all, ks_results_dr) + +my_colnames = c("Lineage_comparisons" + , paste0("All_mutations n=", nrow(df_lin)) + , paste0("Drug_associated_mutations n=", nrow(df_lin_dr))) +my_colnames + +# select the output columns +ks_results_combined_f = ks_results_combined[,c(1,2,4)] + +colnames(ks_results_combined_f) = my_colnames +ks_results_combined_f + +#============= +# write output file +#============= +ks_results = paste0(outdir,"/results/ks_results.csv") +write.csv(ks_results_combined_f, ks_results, row.names = F) + diff --git a/scripts/plotting/autoviz.py b/scripts/plotting/autoviz.py deleted file mode 100644 index 8dc6405..0000000 --- a/scripts/plotting/autoviz.py +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env python3 -#======================================================================= -#%% useful links -#https://towardsdatascience.com/autoviz-automatically-visualize-any-dataset-ba2691a8b55a -#https://pypi.org/project/autoviz/ -#======================================================================= -import os, sys -import pandas as pd -import numpy as np -import re -import argparse -from autoviz.AutoViz_Class import AutoViz_Class - -homedir = os.path.expanduser('~') -os.chdir(homedir + '/git/LSHTM_analysis/scripts') -#%%============================================================================ -# variables -gene = 'pncA' -drug = 'pyrazinamide' - -#%%============================================================================ -#============== -# directories -#============== -datadir = homedir + '/' + 'git/Data' - -indir = datadir + '/' + drug + '/input' - -outdir = datadir + '/' + drug + '/output' - -#======= -# input -#======= -in_filename_plotting = 'car_design.csv' -in_filename_plotting = gene.lower() + '_all_params.csv' -infile_plotting = outdir + '/' + in_filename_plotting -print('plotting file: ', infile_plotting - , '\n============================================================') -#======================================================================= -plotting_df = pd.read_csv(infile_plotting, sep = ',') -#Instantiate the AutoViz class -AV = AutoViz_Class() -df = AV.AutoViz(infile_plotting) -#df2 = AV.AutoViz(plotting_df) -plotting_df.columns[~plotting_df.columns.isin(df.columns)] diff --git a/scripts/plotting/combined_or.R b/scripts/plotting/combined_or.R index 4bf2b80..1c167a8 100644 --- a/scripts/plotting/combined_or.R +++ b/scripts/plotting/combined_or.R @@ -1,4 +1,3 @@ - #!/usr/bin/env Rscript ######################################################### # TASK: Basic lineage barplot showing numbers diff --git a/scripts/plotting/lineage_count.txt b/scripts/plotting/lineage_count.txt deleted file mode 100644 index fe6a87d..0000000 --- a/scripts/plotting/lineage_count.txt +++ /dev/null @@ -1,71 +0,0 @@ -#============= -# merged_df2 -#============= ----------------- -# no. of samples ----------------- - Var1 Freq -1 8 -2 lineage1 144 -3 lineage1;lineage2 3 -4 lineage1;lineage4 4 -5 lineage2 1886 -6 lineage2;lineage4 19 -7 lineage3 190 -8 lineage3;lineage4 11 -9 lineage4 2213 -10 lineage4;lineage6 1 -11 lineage4;lineage7 1 -12 lineage4;lineageBOV 1 -13 lineage5 31 -14 lineage6 9 -15 lineage7 3 -16 lineageBOV 392 - ----------------- -# no. of nsSNPs ----------------- - - sel_lineages num_snps_u total_samples -1 lineage1 74 144 -2 lineage2 277 1886 -3 lineage3 104 190 -4 lineage4 311 2213 -5 lineage5 18 31 -6 lineage6 8 9 -7 lineage7 1 3 - - -#============= -# merged_df2_comp -#============= ----------------- -# no. of samples ----------------- - - Var1 Freq -1 3 -2 lineage1 108 -3 lineage1;lineage2 2 -4 lineage1;lineage4 2 -5 lineage2 1497 -6 lineage2;lineage4 13 -7 lineage3 154 -8 lineage3;lineage4 3 -9 lineage4 1846 -10 lineage4;lineageBOV 1 -11 lineage5 12 -12 lineage6 2 -13 lineageBOV 36 - ----------------- -# no. of nsSNPs ----------------- - sel_lineages num_snps_u total_samples -1 lineage1 42 108 -2 lineage2 141 1497 -3 lineage3 75 154 -4 lineage4 148 1846 -5 lineage5 9 12 -6 lineage6 2 2 -7 lineage7 0 0