157 lines
3.9 KiB
R
157 lines
3.9 KiB
R
# Show the current working directory, move to the plotting scripts directory,
# then show the directory again to confirm the change.
getwd()
setwd(dir = "~/git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/plotting")
getwd()

########################################################################
# Installing and loading required packages                             #
########################################################################

# Header script, located one level above the directory set by setwd().
# NOTE(review): presumably this installs/loads the packages, per the banner
# above -- confirm against Header_TT.R.
source(file = "../Header_TT.R")
#source("../barplot_colour_function.R")
#require(data.table)

########################################################################
# Read file: call script for combining df for PS                       #
########################################################################

# Script that builds the merged data frames used further down.
source(file = "../combining_two_df.R")
|
|
|
|
#---------------------- PAY ATTENTION
# Sourcing combining_two_df.R changes the working directory to:
#[1] "git/LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts"
#---------------------- PAY ATTENTION
|
|
|
|
#==========================
|
|
# This will return:
|
|
|
|
# df with NA for pyrazinamide:
|
|
# merged_df2
|
|
# merged_df3
|
|
|
|
# df without NA for pyrazinamide:
|
|
# merged_df2_comp
|
|
# merged_df3_comp
|
|
#===========================
|
|
|
|
###########################
# Data for plots
# Use merged_df2 or merged_df2_comp, since this is a one-to-many
# relationship, i.e. the same SNP can belong to multiple lineages.
# Using the _comp dataset means we lose some mutations; at this level
# we should use as much information as is available, hence use the
# df with NA.
###########################
|
|
|
|
# uncomment as necessary
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
# Working copy of the merged data; swap in the commented alternative to use
# the _comp data frame instead.
my_df <- merged_df2
#my_df <- merged_df2_comp
#%%%%%%%%%%%%%%%%%%%%%%%%%

# the merged data frames are no longer needed once my_df holds the data
rm(list = c("merged_df2", "merged_df2_comp", "merged_df3", "merged_df3_comp"))

# quick look at the column names and structure
colnames(my_df)
str(my_df)

# the plotted column must be a factor: check, convert, check again
is.factor(my_df$lineage)
my_df$lineage <- as.factor(my_df$lineage)
is.factor(my_df$lineage)

# counts and structure of the mutation_info column
table(my_df$mutation_info)
str(my_df$mutation_info)

# keep only the rows labelled as dr mutations
my_df_dr <- subset(my_df, subset = mutation_info == "dr_mutations_pyrazinamide")
table(my_df_dr$mutation_info)
|
|
|
|
########################################################################
|
|
# end of data extraction and cleaning for plots #
|
|
########################################################################
|
|
|
|
#==========================
|
|
# Run two times:
|
|
# uncomment as necessary
|
|
# 1) for all muts
|
|
# 2) for dr_muts
|
|
#===========================
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
# Keep exactly one of the two assignments below active per run.

# ---- all muts ----
#plot_df <- my_df

# ---- dr muts ONLY ----
plot_df <- my_df_dr

#%%%%%%%%%%%%%%%%%%%%%%%%
|
|
#============================
|
|
# Plot: Lineage Distribution
|
|
# x = mcsm_values, y = dist
|
|
# fill = stability
|
|
#============================
|
|
|
|
# counts per lineage and structure of the column before filtering
table(plot_df$lineage)
str(plot_df$lineage)

# lineages to keep: "lineage1" .. "lineage4"
sel_lineages <- paste0("lineage", 1:4)

# uncomment as necessary
df_lin <- subset(plot_df, lineage %in% sel_lineages)

# rebuild the factor so that only the retained lineages remain as levels
df_lin$lineage <- factor(df_lin$lineage)

table(df_lin$lineage) #{RESULT: No of samples within lineage}
#lineage1 lineage2 lineage3 lineage4

length(unique(df_lin$Mutationinformation))
#{Result: No. of unique mutations the 4 lineages contribute to}
|
|
|
|
# sanity check: the number of rows kept in df_lin must equal the combined
# count of the selected lineages in plot_df.
#
# The selected lineages are located in the count table by NAME. The previous
# hard-coded positions (2:5) were only right when a blank/missing-lineage
# level came first (the merged_df2 case); with merged_df2_comp or any other
# level ordering they picked the wrong cells. r1 still holds the positional
# index, now derived from the data.
lineage_counts <- table(plot_df$lineage)
r1 <- which(names(lineage_counts) %in% sel_lineages)

if (sum(lineage_counts[r1]) == nrow(df_lin)) {
  print("sanity check passed: numbers match")
} else {
  # best-effort check: report the mismatch but let the script continue
  print("Error!: check your numbers")
}
|
|
|
|
#%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT
df <- df_lin
#%%%%%%%%%%%%%%%%%%%%%%%%%%

rm(list = "df_lin")

# COMPARING DISTRIBUTIONS
head(df$lineage)

# plain character labels for the equality filters below
df$lineage <- as.character(df$lineage)

# ratioDUET values for each of the four lineages
lin1 <- df$ratioDUET[df$lineage == "lineage1"]
lin2 <- df$ratioDUET[df$lineage == "lineage2"]
lin3 <- df$ratioDUET[df$lineage == "lineage3"]
lin4 <- df$ratioDUET[df$lineage == "lineage4"]

# ks test: every pairwise combination of the four lineages
# lineage1 against the rest
ks.test(lin1, lin2)
ks.test(lin1, lin3)
ks.test(lin1, lin4)

# lineage2 against the remaining two
ks.test(lin2, lin3)
ks.test(lin2, lin4)

# lineage3 against lineage4
ks.test(lin3, lin4)
|
|
|
|
|
|
|