foramtting and adding or

This commit is contained in:
Tanushree Tunstall 2020-06-18 13:55:45 +01:00
parent fdba990b80
commit 010ef133dd

View file

@ -8,6 +8,9 @@ getwd()
setwd('~/git/LSHTM_analysis/scripts') setwd('~/git/LSHTM_analysis/scripts')
getwd() getwd()
options(scipen = 999) #disabling scientific notation in R.
options(scipen = 4)
#%% variable assignment: input and output paths & filenames #%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide' drug = 'pyrazinamide'
gene = 'pncA' gene = 'pncA'
@ -154,13 +157,15 @@ cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_
#====================================== #======================================
# TEST FOR ONE # TEST FOR ONE
i = "pnca_p.ala134gly" # has a NA, should NOT exist i = "pnca_p.ala134gly" # has a NA, should NOT exist
table(grepl(i,raw_data$all_muts_gene)) table(grepl(i,raw_data$all_muts_gene))
i = "pnca_p.trp68gly" i = "pnca_p.trp68gly"
table(grepl(i,raw_data$all_muts_gene)) table(grepl(i,raw_data$all_muts_gene))
i = "pnca_p.his51asp"
table(grepl(i,raw_data$all_muts_gene))
# IV # IV
mut = grepl(i,raw_data$all_muts_gene) mut = grepl(i,raw_data$all_muts_gene)
@ -174,8 +179,13 @@ table(mut, dst)
#1) chisq : noy accurate for low counts #1) chisq : noy accurate for low counts
chisq.test(table(mut,dst)) chisq.test(table(mut,dst))
attributes(chisq.test(table(mut,dst)))
chisq.test(table(mut,dst))$p.value chisq.test(table(mut,dst))$p.value
chisq.test(table(mut,dst))$statistic
t = chisq.test(table(mut,dst))$statistic; print(t)
names(t)
# remove suffix
#names(t2) = gsub(".X-squared", "", names(t))
#2) modified chisq OR: custom function #2) modified chisq OR: custom function
#x = as.numeric(mut) #x = as.numeric(mut)
@ -197,7 +207,7 @@ my_chisq_or(mut, dst)
#3) fisher #3) fisher
fisher.test(table(mut, dst)) fisher.test(table(mut, dst))
or_fisher = fisher.test(table(mut, dst))$estimate; print(or_fisher) or_fisher = fisher.test(table(mut, dst))$estimate; print(or_fisher); cat(names(or_fisher))
pval_fisher = fisher.test(table(mut, dst))$p.value; print(pval_fisher) # the same one to be used for custom chisq_or pval_fisher = fisher.test(table(mut, dst))$p.value; print(pval_fisher) # the same one to be used for custom chisq_or
ci_lb_fisher = fisher.test(table(mut, dst))$conf.int[1]; print(ci_lb_fisher) ci_lb_fisher = fisher.test(table(mut, dst))$conf.int[1]; print(ci_lb_fisher)
ci_ub_fisher = fisher.test(table(mut, dst))$conf.int[2]; print(ci_ub_fisher) ci_ub_fisher = fisher.test(table(mut, dst))$conf.int[2]; print(ci_ub_fisher)
@ -231,7 +241,8 @@ pval_logistic = summary(model)$coefficients[2,4]; print(pval_logistic)
###################### ######################
# AF and OR for all muts: sapply # AF and OR for all muts: sapply
###################### ######################
print(table(dst)); print(table(mut)) print(table(dst)); print(table(mut)) # must exist
#dst = raw_data[[drug]] #or raw_data[,drug]
# af # af
afs = sapply(gene_snps_unique,function(m){ afs = sapply(gene_snps_unique,function(m){
@ -239,17 +250,32 @@ afs = sapply(gene_snps_unique,function(m){
mean(mut) mean(mut)
}) })
afs #afs
head(afs) head(afs)
#1) chi square: original #1) chi square: original
stat_chi = sapply(gene_snps_unique,function(m){ statistic_chi = sapply(gene_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_gene) mut = grepl(m,raw_data$all_muts_gene)
chisq.test(mut,dst)$statistic chisq.test(mut,dst)$statistic
}) })
stat_chi # statistic_chi: has suffix added of '.X-squared'
head(stat_chi) head(statistic_chi)
# remove suffix
names(stat_chi) = gsub(".X-squared", "", names(statistic_chi))
if (names(stat_chi)!= names(statistic_chi) && stat_chi == statistic_chi){
cat('Sanity check passed: suffix removed correctly\n\n'
, 'names with suffix:', head(names(statistic_chi)), '\n\n'
, 'names without suffix:', head(names(stat_chi)), '\n\n'
, 'values in var with suffix:', head(statistic_chi),'\n'
, 'values in var without suffix:', head(stat_chi)
)
}else{
print('FAIL: suffix removal unsuccessful! Please Debug')
}
## pval ## pval
pvals_chi = sapply(gene_snps_unique,function(m){ pvals_chi = sapply(gene_snps_unique,function(m){
@ -257,7 +283,7 @@ pvals_chi = sapply(gene_snps_unique,function(m){
chisq.test(mut,dst)$p.value chisq.test(mut,dst)$p.value
}) })
pvals_chi #pvals_chi
head(pvals_chi) head(pvals_chi)
#2) chi square: custom #2) chi square: custom
@ -266,7 +292,7 @@ ors_chi_cus = sapply(gene_snps_unique,function(m){
my_chisq_or(mut,dst) my_chisq_or(mut,dst)
}) })
ors_chi_cus #ors_chi_cus
head(ors_chi_cus) head(ors_chi_cus)
## pval:fisher (use the same one for custom chi sqaure) ## pval:fisher (use the same one for custom chi sqaure)
@ -275,17 +301,36 @@ pvals_fisher = sapply(gene_snps_unique,function(m){
fisher.test(mut,dst)$p.value fisher.test(mut,dst)$p.value
}) })
pvals_fisher #pvals_fisher
head(pvals_fisher) head(pvals_fisher)
#3) fisher #3) fisher
ors_fisher = sapply(gene_snps_unique,function(m){ odds_ratio_fisher = sapply(gene_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_gene) mut = grepl(m,raw_data$all_muts_gene)
fisher.test(mut,dst)$estimate; fisher.test(mut,dst)$estimate;
}) })
ors_fisher #ors_fisher
head(ors_fisher) head(odds_ratio_fisher)
# statistic_chi: has suffix added of '.X-squared'
head(odds_ratio_fisher)
# remove suffix
ors_fisher = odds_ratio_fisher
names(ors_fisher) = gsub(".odds ratio", "", names(odds_ratio_fisher))
if (names(ors_fisher)!= names(odds_ratio_fisher) && ors_fisher == odds_ratio_fisher){
cat('Sanity check passed: suffix removed correctly\n\n'
, 'names with suffix:', head(names(odds_ratio_fisher)), '\n\n'
, 'names without suffix:', head(names(ors_fisher)), '\n\n'
, 'values in var with suffix:', head(odds_ratio_fisher),'\n'
, 'values in var without suffix:', head(ors_fisher)
)
}else{
print('FAIL: suffix removal unsuccessful! Please Debug')
}
## fisher ci (lower) ## fisher ci (lower)
ci_lb_fisher = sapply(gene_snps_unique, function(m){ ci_lb_fisher = sapply(gene_snps_unique, function(m){
@ -294,7 +339,7 @@ ci_lb_fisher = sapply(gene_snps_unique, function(m){
}) })
ci_lb_fisher #ci_lb_fisher
head(ci_lb_fisher) head(ci_lb_fisher)
## fisher ci (upper) ## fisher ci (upper)
@ -304,7 +349,7 @@ ci_ub_fisher = sapply(gene_snps_unique, function(m){
}) })
ci_ub_fisher #ci_ub_fisher
head(ci_ub_fisher) head(ci_ub_fisher)
#4) logistic or #4) logistic or
@ -316,7 +361,7 @@ ors_logistic = sapply(gene_snps_unique,function(m){
#pval_logistic = summary(model)$coefficients[2,4] #pval_logistic = summary(model)$coefficients[2,4]
}) })
ors_logistic #ors_logistic
head(ors_logistic) head(ors_logistic)
## logistic p-value ## logistic p-value
@ -328,60 +373,83 @@ pvals_logistic = sapply(gene_snps_unique,function(m){
pval_logistic = summary(model)$coefficients[2,4] pval_logistic = summary(model)$coefficients[2,4]
}) })
pvals_logistic #pvals_logistic
head(pvals_logistic) head(pvals_logistic)
#============================================= #=============================================
# check ..hmmm # check ..(hmmm) perhaps separate script)
afs['pnca_p.trp68gly'] afs['pnca_p.trp68gly']
afs['pnca_p.gln10pro'] afs['pnca_p.gln10pro']
afs['pnca_p.leu4ser'] afs['pnca_p.leu4ser']
plot(density(log(ors))) plot(density(log(ors_logistic)))
plot(-log10(pvals)) plot(-log10(pvals))
hist(log(ors) hist(log(ors)
, breaks = 100) , breaks = 100)
# sanity check # sanity check: if names are equal (just for 3 vars)
my_vars = c(afs
, stat_chi
, pvals_chi
, ors_chi_cus
, pvals_fisher
, ors_fisher
, ci_lb_fisher
, ci_ub_fisher
, ors_logistic
, pvals_logistic)
all(sapply(list(names(afs)
, names(pvals_chi)
, names(statistic_chi) # should return False
, names(ors_chi_cus)), function (x) x == names(ors_logistic)))
my_vars = c('afs', 'pvals_chi', 'ors_chi_cus') compare(names(afs)
, names(pvals_chi)
, names(statistic_chi) #TEST: should return False, but DOESN'T
, names(ors_chi_cus)
, names(stat_chi))$result
names(get('afs')) #=============== Now with all vars
# check if names are equal # sanity check: names for all vars
#c = compare( names(afs)
# , names(stat_chi)
# , names(statistic_chi) #TEST: should return False, but DOESN'T
# , names(pvals_chi)
# , names(ors_chi_cus)
# , names(pvals_fisher)
# , names(ors_fisher)
# , names(ci_lb_fisher)
# , names(ci_ub_fisher)
# , names(ors_logistic)
# , names(pvals_logistic))$result; c
if (all( sapply( list(names(afs)
if ( all(sapply(list(names(afs), names(pvals_chi), names(ors_chi_cus)), function (x) x == names(ors_logistic))) , names(stat_chi)
& compare(names(afs), names(pvals_chi), names(ors_chi_cus), names(stat_chi)) ){ #, names(statistic_chi) # TEST should return FALSE if included
cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df') , names(pvals_chi)
} else{ , names(ors_chi_cus)
cat('FAIL: names of ors, pvals and afs mismatch') , names(pvals_fisher)
} , names(ors_fisher)
, names(ci_lb_fisher)
, names(ci_ub_fisher)
# FROM HERE , names(pvals_logistic) ), function (x) x == names(ors_logistic)))
if (table(names(ors_fisher) == names(pvals_fisher)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){ ){
cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df') cat('PASS: names match: proceed with combining into a single df')
} else{ } else {
cat('FAIL: names of ors, pvals and afs mismatch') cat('FAIL: names mismatch')
} }
# combine ors, pvals and afs # combine ors, pvals and afs
cat('Combining calculated params into a df: ors, pvals and afs') cat('Combining calculated params into a df: ors, pvals and afs')
comb_AF_and_OR = data.frame(ors, pvals, afs) comb_AF_and_OR = data.frame(afs
, stat_chi
, pvals_chi
, ors_chi_cus
, pvals_fisher
, ors_fisher
, ci_lb_fisher
, ci_ub_fisher
, pvals_logistic
, ors_logistic)
cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR) cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR)
, '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR)) , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR))