import commit
This commit is contained in:
commit
bccfe68192
39 changed files with 6837 additions and 0 deletions
512
meta_data_analysis/.Rhistory
Normal file
512
meta_data_analysis/.Rhistory
Normal file
|
@ -0,0 +1,512 @@
|
|||
, stringsAsFactors = F)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$Mutationinformation == i]
|
||||
my_logor
|
||||
pnca_snps_or$Mutationinformation == i
|
||||
View(pnca_snps_or)
|
||||
#===============
|
||||
# Step 4: Calculate for one snp
|
||||
# using i, when you run the loop, it is easy
|
||||
#===============
|
||||
i = "pnca_p.trp68gly"
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
# uncomment as necessary
|
||||
pnca_snps_or = pnca_snps_or[1:5,]
|
||||
pnca_snps_or = pnca_snps_or[c(1:5),]
|
||||
#===============
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
pnca_snps_or = pnca_snps_or[1:5,]
|
||||
pnca_snps_or = pnca_snps_or[c(1:5),]
|
||||
pnca_snps_or = pnca_snps_or[1:5]
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
pnca_snps_or = pnca_snps_or[1:5]
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
foo = pnca_snps_or[c(1:5,)]
|
||||
foo = pnca_snps_or[c(1:5),]
|
||||
foo = as.data.frame(pnca_snps_or[c(1:5),])
|
||||
View(foo)
|
||||
# create an empty dataframe
|
||||
pnca_snps_or = as.data.frame(pnca_snps_or[c(1:5),])
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR, but only for one snp
|
||||
# this is test before you apply it all others
|
||||
#===============
|
||||
pnca_snps_or$mutation == i
|
||||
View(pnca_snps_or)
|
||||
# create an empty dataframe
|
||||
pnca_snps_or = data.frame(mutation = i)
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
View(pnca_snps_or_copy)
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR, but only for one snp
|
||||
# this is test before you apply it all others
|
||||
#===============
|
||||
#reset original df so you don't make a mistake
|
||||
pnca_snps_or = pnca_snps_or_copy
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
}
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut")
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
|
||||
#........................................
|
||||
# create an empty dataframe : uncomment as necessary
|
||||
pnca_snps_or = data.frame(mutation = c(i, "blank_mut"))
|
||||
View(pnca_snps_or)
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
View(pnca_snps_or)
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
warnings()
|
||||
View(pnca_snps_or)
|
||||
View(pnca_snps_or_copy)
|
||||
#sanity check
|
||||
pnca_snps_or$mutation == i1
|
||||
#sanity check
|
||||
pnca_snps_or[pnca_snps_or$mutation == i1]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2,]
|
||||
pnca_snps_or1 = unique(pnca_snps_or)
|
||||
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
|
||||
# you only need it for the unique mutations
|
||||
pnca_snps_or = unique(pnca_snps_or) #2133, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
View(pnca_snps_or)
|
||||
2.290256e+01
|
||||
1.561132e+06
|
||||
3.242285e-04
|
||||
#sanity check
|
||||
pnca_snps_or[pnca_snps_or$mutation == i1]
|
||||
pnca_snps_or[pnca_snps_or$mutation == i2,]
|
||||
write.csv(pnca_snps_or1, "../Data_original/valid_pnca_snps_with_OR.csv")
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
View(my_data)
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
snps_all = unique(my_data$mutation)# 337
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or = as.data.frame(snps_all)
|
||||
View(pnca_snps_or)
|
||||
snps_all[-"true_wt"]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
View(pnca_snps_or)
|
||||
snps_all = as.data.frame(snps_all)
|
||||
View(snps_all)
|
||||
#remove true_wt entry
|
||||
w1 = which(rownames(snps_all) == "true_wt")
|
||||
View(snps_all)
|
||||
#remove true_wt entry
|
||||
w1 = which(snps_all$snps_all == "true_wt")
|
||||
rm(pnca_snps_or)
|
||||
pnca_snps_or = snps_all[-w1]
|
||||
pnca_snps_or = snps_all[,-w1]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
#remove true_wt entry
|
||||
w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
snps_all = unique(my_data$mutation)# 337
|
||||
snps_all = as.data.frame(snps_all)
|
||||
snps_all[-c(1,1)]
|
||||
pnca_snps_or = as.data.frame(snps_all[-c(1,1)])
|
||||
pnca_snps_or = as.data.frame(snps_all[, -c(1,1)])
|
||||
#remove true_wt entry
|
||||
#w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or = pnca_snps_or_copy
|
||||
#remove true_wt entry
|
||||
#w1 = which(snps_all) == "true_wt"
|
||||
pnca_snps_or = snps_all
|
||||
pnca_snps_or -> pnca_snps_or_copy
|
||||
#===============
|
||||
# Step 4: Iterate through this unique list
|
||||
# and calculate OR for each snp
|
||||
# and assign to the pnca_snps_or df that has
|
||||
# each row as a unique snp
|
||||
#===============
|
||||
# reset original df so you don't make a mistake: IMPORTANT
|
||||
pnca_snps_or = pnca_snps_or_copy #2133, 1
|
||||
# you only need it for the unique mutations
|
||||
pnca_snps_or = unique(pnca_snps_or) #337, 1
|
||||
for (i in pnca_snps_unique){
|
||||
print(i)
|
||||
#*************
|
||||
# start logistic regression model building
|
||||
#*************
|
||||
# set the IV and DV for the logistic regression model
|
||||
# IV: corresponds to each unique snp (extracted using grep)
|
||||
x = as.numeric(grepl(i,raw_data$all_muts_pza))
|
||||
# DV: pyrazinamide 0 or 1
|
||||
y = as.numeric(raw_data$pyrazinamide)
|
||||
table(y,x)
|
||||
# run glm model
|
||||
model = glm(y ~ x, family = binomial)
|
||||
#model = glm(y ~ x, family = binomial(link = "logit"))
|
||||
summary(model)
|
||||
#**********
|
||||
# extract relevant model output
|
||||
#**********
|
||||
# extract log OR i.e the Beta estimate of the logistic model for a given snp
|
||||
my_logor = summary(model)$coefficients[2,1]
|
||||
print(paste0('Beta:', my_logor))
|
||||
# extract SE of the logistic model for a given snp
|
||||
my_se = summary(model)$coefficients[2,2]
|
||||
print(paste0('SE:', my_se))
|
||||
# extract Z of the logistic model for a given snp
|
||||
my_zval = summary(model)$coefficients[2,3]
|
||||
print(paste0('Z-value:', my_zval))
|
||||
# Dervive OR i.e exp(my_logor) from the logistic model for a given snp
|
||||
#my_or = round(exp(summary(model)$coefficients[2,1]), roundto)
|
||||
my_or = exp(summary(model)$coefficients[2,1])
|
||||
print(paste0('OR:', my_or))
|
||||
# sanity check : should be True
|
||||
log(my_or) == my_logor
|
||||
# extract P-value of the logistic model for a given snp
|
||||
my_pval = summary(model)$coefficients[2,4]
|
||||
print(paste0('P-value:', my_pval))
|
||||
# extract confint interval of snp (2 steps, since the output is a named number)
|
||||
ci_mod = exp(confint(model))[2,]
|
||||
my_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]])
|
||||
print(paste0('CI:', my_ci))
|
||||
#*************
|
||||
# Assign the regression output in the original df
|
||||
# you can use ('=' or '<-/->')
|
||||
#*************
|
||||
#pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i] = my_logor
|
||||
my_logor -> pnca_snps_or$logistic_logOR[pnca_snps_or$mutation == i]
|
||||
my_or -> pnca_snps_or$OR[pnca_snps_or$mutation == i]
|
||||
my_pval -> pnca_snps_or$pvalue[pnca_snps_or$mutation == i]
|
||||
my_zval -> pnca_snps_or$zvalue[pnca_snps_or$mutation == i]
|
||||
my_se -> pnca_snps_or$logistic_se[pnca_snps_or$mutation == i]
|
||||
my_ci -> pnca_snps_or$ci[pnca_snps_or$mutation == i]
|
||||
}
|
||||
getwd()
|
||||
#setwd("~/Documents/git/LSHTM_Y1_PNCA/meta_data_analysis") # work
|
||||
setwd("~/git/LSHTM_Y1_PNCA/meta_data_analysis") # thinkpad
|
||||
#setwd("/Users/tanu/git/LSHTM_Y1_PNCA/meta_data_analysis") # mac
|
||||
getwd()
|
||||
#===============
|
||||
# Step 1: read raw data
|
||||
#===============
|
||||
raw_data<-read.csv("../Data_original/original_tanushree_data_v2.csv"
|
||||
,stringsAsFactors = F)[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]#19265, 4
|
||||
raw_data<-raw_data[!is.na(raw_data$pyrazinamide),]#12511, 4
|
||||
# combine the two mutation columns
|
||||
raw_data$all_mutations_pyrazinamide<-paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)#12511, 5
|
||||
head(raw_data$all_mutations_pyrazinamide)
|
||||
# create yet another column that contains all the mutations but in lower case
|
||||
raw_data$all_muts_pza = tolower(raw_data$all_mutations_pyrazinamide) #12511, 6
|
||||
table(grepl("pnca_p",raw_data$all_muts_pza))
|
||||
#FALSE TRUE
|
||||
#10603 1908
|
||||
pnca_snps_or = read.csv("../Data_original/pnca_snps_for_or_calcs.csv"
|
||||
, stringsAsFactors = F
|
||||
, header = T) #2133
|
||||
# subset a snall section to test
|
||||
#pnca_snps_or_copy = pnca_snps_or
|
||||
#pnca_snps_or = pnca_snps_or_copy
|
||||
pnca_snps_unique = unique(pnca_snps_or$mutation) #293
|
||||
i2 = "pnca_p.trp68gly" # Should exist
|
||||
grep(i2, pnca_snps_unique)
|
||||
my_data = read.csv("../Data_original/meta_pza_with_AF.csv"
|
||||
, stringsAsFactors = FALSE) #11374, 19
|
||||
# remove the first column
|
||||
my_data = my_data[-1] #11374, 18
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
# sanity check
|
||||
head(my_data$mutation)
|
||||
my_data = unique(my_data$mutation)
|
||||
my_data[!duplicated(my_data$mutation)]
|
||||
my_data_unique = my_data[!duplicated(my_data$mutation),]
|
||||
my_data[!duplicated('mutation'),]
|
||||
my_data_unique = my_data[!duplicated(my_data[,'mutation']),]
|
||||
my_data_unique = my_data[!duplicated(my_data['mutation']),]
|
||||
getwd()
|
||||
setwd("/git/LSHTM_analysis/meta_data_analysis")
|
||||
getwd()
|
||||
getwd()
|
||||
setwd("/git/github/LSHTM_analysis/meta_data_analysis")
|
||||
getwd()
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv("../Data_original", file.choose(), stringsAsFactors = F))
|
||||
c = file.choose()
|
||||
c = file.choose(../Data_original)
|
||||
c = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F))
|
||||
c = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
||||
raw_data = infile[,c("id","pyrazinamide","dr_mutations_pyrazinamide","other_mutations_pyrazinamide")]
|
||||
outdir = paste0("../mcsm_analysis",drug,"/Data/")
|
||||
# define output variables
|
||||
drug = 'pyrazinamide'
|
||||
outdir = paste0("../mcsm_analysis",drug,"/Data/")
|
||||
outdir = paste0("../mcsm_analysis/",drug,"/Data/")
|
||||
outFile = "meta_data_with_AFandOR.csv"
|
||||
output_filename = paste0(outdir, outFile)
|
||||
output_filename
|
BIN
meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc
Normal file
BIN
meta_data_analysis/__pycache__/reference_dict.cpython-37.pyc
Normal file
Binary file not shown.
7
meta_data_analysis/init_data_dirs.py
Executable file
7
meta_data_analysis/init_data_dirs.py
Executable file
|
@ -0,0 +1,7 @@
|
|||
#!/usr/bin/python3
|
||||
# Initialise a blank 'Data' directory and drug subdirs etc.
|
||||
# TODO:
|
||||
# - Read base dir from config file
|
||||
# - Create eg: '~/git/Data/{original,processed}
|
||||
# - Create eg: '~/git/Data/processed/' + drug (for each drug)
|
||||
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
|
241
meta_data_analysis/pnca_AF_and_OR_calcs.R
Normal file
241
meta_data_analysis/pnca_AF_and_OR_calcs.R
Normal file
|
@ -0,0 +1,241 @@
|
|||
getwd()
|
||||
setwd("/git/github/git/LSHTM_analysis/meta_data_analysis")
|
||||
getwd()
|
||||
|
||||
#===============
|
||||
# Step 1: read GWAS raw data stored in Data_original/
|
||||
#===============
|
||||
infile = read.csv(file.choose(), stringsAsFactors = F)
|
||||
|
||||
raw_data = infile[,c("id"
|
||||
, "pyrazinamide"
|
||||
, "dr_mutations_pyrazinamide"
|
||||
, "other_mutations_pyrazinamide")]
|
||||
|
||||
#####
|
||||
# 1a: exclude na
|
||||
#####
|
||||
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
|
||||
|
||||
total_samples = length(unique(raw_data$id))
|
||||
print(total_samples)
|
||||
|
||||
# sanity check: should be true
|
||||
is.numeric(total_samples)
|
||||
|
||||
#####
|
||||
# 1b: combine the two mutation columns
|
||||
#####
|
||||
raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
|
||||
, raw_data$other_mutations_pyrazinamide)
|
||||
head(raw_data$all_mutations_pyrazinamide)
|
||||
|
||||
#####
|
||||
# 1c: create yet another column that contains all the mutations but in lower case
|
||||
#####
|
||||
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)
|
||||
|
||||
# sanity checks
|
||||
table(grepl("pnca_p",raw_data$all_muts_pnca))
|
||||
|
||||
# sanity check: should be TRUE
|
||||
sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
|
||||
|
||||
# set up variables: can be used for logistic regression as well
|
||||
i = "pnca_p.ala134gly" # has a NA, should NOT exist
|
||||
table(grepl(i,raw_data$all_muts_pnca))
|
||||
|
||||
i = "pnca_p.trp68gly"
|
||||
table(grepl(i,raw_data$all_muts_pnca))
|
||||
|
||||
mut = grepl(i,raw_data$all_muts_pnca)
|
||||
dst = raw_data$pyrazinamide
|
||||
table(mut, dst)
|
||||
|
||||
#chisq.test(table(mut,dst))
|
||||
#fisher.test(table(mut, dst))
|
||||
#table(mut)
|
||||
|
||||
###### read list of muts to calculate OR for (fname3 from pnca_data_extraction.py)
|
||||
pnca_snps_or = read.csv(file.choose()
|
||||
, stringsAsFactors = F
|
||||
, header = T)
|
||||
|
||||
# extract unique snps to iterate over for AF and OR calcs
|
||||
# total no of unique snps
|
||||
# AF and OR calculations
|
||||
|
||||
pnca_snps_unique = unique(pnca_snps_or$mutation)
|
||||
|
||||
# Define OR function
|
||||
x = as.numeric(mut)
|
||||
y = dst
|
||||
or = function(x,y){
|
||||
tab = as.matrix(table(x,y))
|
||||
a = tab[2,2]
|
||||
if (a==0){ a<-0.5}
|
||||
b = tab[2,1]
|
||||
if (b==0){ b<-0.5}
|
||||
c = tab[1,2]
|
||||
if (c==0){ c<-0.5}
|
||||
d = tab[1,1]
|
||||
if (d==0){ d<-0.5}
|
||||
(a/b)/(c/d)
|
||||
|
||||
}
|
||||
|
||||
dst = raw_data$pyrazinamide
|
||||
ors = sapply(pnca_snps_unique,function(m){
|
||||
mut = grepl(m,raw_data$all_muts_pnca)
|
||||
or(mut,dst)
|
||||
})
|
||||
|
||||
ors
|
||||
|
||||
pvals = sapply(pnca_snps_unique,function(m){
|
||||
mut = grepl(m,raw_data$all_muts_pnca)
|
||||
fisher.test(mut,dst)$p.value
|
||||
})
|
||||
|
||||
pvals
|
||||
|
||||
afs = sapply(pnca_snps_unique,function(m){
|
||||
mut = grepl(m,raw_data$all_muts_pnca)
|
||||
mean(mut)
|
||||
})
|
||||
|
||||
afs
|
||||
|
||||
# check ..hmmm
|
||||
afs['pnca_p.trp68gly']
|
||||
afs['pnca_p.gln10pro']
|
||||
afs['pnca_p.leu4ser']
|
||||
|
||||
#plot(density(log(ors)))
|
||||
#plot(-log10(pvals))
|
||||
#hist(log(ors)
|
||||
# ,breaks = 100
|
||||
# )
|
||||
|
||||
# subset df cols to add to the calc param df
|
||||
pnca_snps_cols = pnca_snps_or[c('mutation_info', 'mutation', 'Mutationinformation')]
|
||||
pnca_snps_cols = pnca_snps_cols[!duplicated(pnca_snps_cols$mutation),]
|
||||
|
||||
rownames(pnca_snps_cols) = pnca_snps_cols$mutation
|
||||
head(rownames(pnca_snps_cols))
|
||||
#snps_with_AF_and_OR
|
||||
|
||||
# combine
|
||||
comb_AF_and_OR = data.frame(ors, pvals, afs)
|
||||
head(rownames(comb_AF_and_OR))
|
||||
|
||||
# sanity checks: should be the same
|
||||
dim(comb_AF_and_OR); dim(pnca_snps_cols)
|
||||
table(rownames(comb_AF_and_OR)%in%rownames(pnca_snps_cols))
|
||||
|
||||
table(rownames(pnca_snps_cols)%in%rownames(comb_AF_and_OR))
|
||||
|
||||
# merge the above two df whose dim you checked
|
||||
snps_with_AF_and_OR = merge(comb_AF_and_OR, pnca_snps_cols
|
||||
, by = "row.names"
|
||||
# , all.x = T
|
||||
)
|
||||
|
||||
#rm(pnca_snps_cols, pnca_snps_or, raw_data)
|
||||
|
||||
#===============
|
||||
# Step 3: Read data file where you will add the calculated OR
|
||||
# Note: this is the big file with one-many relationship between snps and lineages
|
||||
# i.e fname4 from 'pnca_extraction.py'
|
||||
#===============
|
||||
my_data = read.csv(file.choose()
|
||||
, row.names = 1
|
||||
, stringsAsFactors = FALSE)
|
||||
|
||||
head(my_data)
|
||||
length(unique(my_data$id))
|
||||
|
||||
# check if first col is 'id': should be TRUE
|
||||
colnames(my_data)[1] == 'id'
|
||||
|
||||
# sanity check
|
||||
head(my_data$mutation)
|
||||
|
||||
# FILES TO MERGE:
|
||||
# comb_AF_and_OR: file containing OR
|
||||
# my_data = big meta data file
|
||||
# linking column: mutation
|
||||
|
||||
head(my_data)
|
||||
merged_df = merge(my_data # big file
|
||||
, snps_with_AF_and_OR # small (afor file)
|
||||
, by = "mutation"
|
||||
, all.x = T) # because you want all the entries of the meta data
|
||||
|
||||
# sanity checks: should be True
|
||||
# FIXME: I have checked this manually, but make it so it is a pass or a fail!
|
||||
comb_AF_and_OR[rownames(comb_AF_and_OR) == "pnca_p.gln10pro",]$ors
|
||||
merged_df[merged_df$Mutationinformation.x == "Q10P",]$ors
|
||||
|
||||
merged_df[merged_df$Mutationinformation.x == "Q10P",]
|
||||
|
||||
# sanity check: very important!
|
||||
colnames(merged_df)
|
||||
|
||||
table(merged_df$mutation_info.x == merged_df$mutation_info.y)
|
||||
|
||||
#FIXME: what happened to other 7 and FALSE
|
||||
table(merged_df$Mutationinformation.x == merged_df$Mutationinformation.y)
|
||||
|
||||
# problem
|
||||
identical(merged_df$Mutationinformation.x, merged_df$Mutationinformation.y)
|
||||
|
||||
#merged_df[merged_df$Mutationinformation.x != merged_df$Mutationinformation.y,]
|
||||
|
||||
#throw away the y because that is a smaller df
|
||||
d1 = which(colnames(merged_df) == "mutation_info.y") #21
|
||||
d2 = which(colnames(merged_df) == "Mutationinformation.y") #22
|
||||
|
||||
merged_df2 = merged_df[-c(d1, d2)] #3093 20
|
||||
colnames(merged_df2)
|
||||
|
||||
# rename cols
|
||||
colnames(merged_df2)[colnames(merged_df2)== "mutation_info.x"] <- "mutation_info"
|
||||
colnames(merged_df2)[colnames(merged_df2)== "Mutationinformation.x"] <- "Mutationinformation"
|
||||
|
||||
colnames(merged_df2)
|
||||
|
||||
# should be 0
|
||||
sum(is.na(merged_df2$Mutationinformation))
|
||||
|
||||
# count na in each column
|
||||
na_count = sapply(merged_df2, function(y) sum(length(which(is.na(y))))); na_count
|
||||
# only some or and Af should be NA
|
||||
#Row.names ors pvals afs
|
||||
#81 81 81 81
|
||||
|
||||
|
||||
colnames(merged_df2)[colnames(merged_df2)== "ors"] <- "OR"
|
||||
colnames(merged_df2)[colnames(merged_df2)== "afs"] <- "AF"
|
||||
colnames(merged_df2)[colnames(merged_df2)== "pvals"] <- "pvalue"
|
||||
|
||||
colnames(merged_df2)
|
||||
|
||||
# add log OR and neglog pvalue
|
||||
merged_df2$logor = log(merged_df2$OR)
|
||||
is.numeric(merged_df2$logor)
|
||||
|
||||
merged_df2$neglog10pvalue = -log10(merged_df2$pvalue)
|
||||
is.numeric(merged_df2$neglog10pvalue)
|
||||
|
||||
# write file out
|
||||
#write.csv(merged_df, "../Data/meta_data_with_AFandOR_JP_TT.csv")
|
||||
|
||||
# define output variables
|
||||
drug = 'pyrazinamide'
|
||||
out_dir = paste0("../mcsm_analysis/",drug,"/Data/")
|
||||
outFile = "meta_data_with_AFandOR.csv"
|
||||
output_filename = paste0(outdir, outFile)
|
||||
|
||||
write.csv(merged_df2, output_filename
|
||||
, row.names = F)
|
626
meta_data_analysis/pnca_data_extraction.py
Executable file
626
meta_data_analysis/pnca_data_extraction.py
Executable file
|
@ -0,0 +1,626 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 12:56:03 2019

@author: tanu
"""

# FIXME: include error checking to enure you only
# concentrate on positions that have structural info?

#%% load libraries
###################
# load libraries
import os, sys
import pandas as pd
#import numpy as np

#from pandas.api.types import is_string_dtype
#from pandas.api.types import is_numeric_dtype

# to create dir
#my_dir = os.path.expanduser('~/some_dir')
#make sure mcsm_analysis/ exists
#or specify the output directory

#%%
#========================================================
# TASK: extract ALL pncA mutations from GWAS data
#========================================================
#%%
####################
# my working dir
os.getcwd()
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#%%
from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
#%%
#NOTE: out_dir MUST exist
# User defined dir structure for pyrazinamide
drug = 'pyrazinamide'
gene = 'pnca'
out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
# = out_dir + drug
data_dir = homedir + '/git/Data'

if not os.path.exists(data_dir):
    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
    os.makedirs(data_dir)
    # BUG FIX: the original called die(), which is not defined anywhere in
    # this script (NameError). sys.exit() aborts as intended, leaving the
    # freshly created (empty) data dir for the user to populate.
    sys.exit()

if not os.path.exists(out_dir):
    print('Error!', out_dir, 'does not exist. Please create it')
    sys.exit()  # was exit(): sys.exit() is the reliable form in scripts

#if not os.path.exists(work_dir):
#    print('creating dir that does not exist', 'dir_name:', work_dir)
#    os.makedirs(work_dir)
else:
    print('Dir exists: Carrying on')
|
||||
|
||||
# now create sub dir structure within work_dir
# pyrazinamide/mcsm_analysis

# we need three dir
# Data
# Scripts
# Plotting
# Results
# Plots

# create a list of dir names
#dir_names = ['Data', 'Scripts', 'Results']

# NOTE: the directory-creation code below is currently disabled (commented out);
# the script relies on the dirs already existing (checked above).
#for i in dir_names:
#    this_dir = (work_dir + '/' + i)
#    if not os.path.exists(this_dir):
#        print('creating dir that does not exist:', this_dir)
#        os.makedirs(this_dir)
#    else:
#        print('Dir exists: Carrying on')

# Create sub dirs
# 1)
# Scripts
# Plotting
#subdir_plotting = work_dir + '/Scripts/Plotting'
#if not os.path.exists(subdir_plotting):
#    print('creating dir that does not exist:', subdir_plotting)
#    os.makedirs(subdir_plotting)
#else:
#    print('Dir exists: Carrying on')

# 2)
# Results
# Plots
#subdir_plots = work_dir + '/Results/Plots'
#if not os.path.exists(subdir_plots):
#    print('creating dir that does not exist:', subdir_plots)
#    os.makedirs(subdir_plots)
#else:
#    print('Dir exists: Carrying on')

# clear variables
#del(dir_names, drug, i, subdir_plots, subdir_plotting)

#exit()
#%%
#==============================================================================
############
# STEP 1: Read file original_tanushree_data_v2.csv
############
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
meta_data = pd.read_csv(data_file, sep = ',')

# column names
list(meta_data.columns)

# extract relevant columns from meta data related to pyrazinamide
meta_data = meta_data[['id'
                       ,'country'
                       ,'lineage'
                       ,'sublineage'
                       ,'drtype'
                       , 'pyrazinamide'
                       , 'dr_mutations_pyrazinamide'
                       , 'other_mutations_pyrazinamide'
                       ]]

# checks
total_samples = meta_data['id'].nunique() # 19265

# counts NA per column
meta_data.isna().sum()

# glance
meta_data.head()

# equivalent of table in R
# pyrazinamide counts
meta_data.pyrazinamide.value_counts()
|
||||
|
||||
#%%
############
# STEP 2: extract entries containing selected genes:
# pyrazinamide: pnca_p.
# in the dr_mutations and other mutations"
# as we are interested in the mutations in the protein coding region only
# (corresponding to a structure)
# and drop the entries with NA
#############
# NOTE(review): the second assignment below overwrites the first, so only the
# other_mutations filter result survives; both are discarded by del() anyway,
# so these two lines are throwaway inspection only -- confirm that was intended.
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]

del(meta_pza)

##########################
# pyrazinamide: pnca_p.
##########################
meta_data_pnca = meta_data[['id'
                            ,'country'
                            ,'lineage'
                            ,'sublineage'
                            ,'drtype'
                            , 'pyrazinamide'
                            , 'dr_mutations_pyrazinamide'
                            , 'other_mutations_pyrazinamide'
                            ]]

del(meta_data)

# sanity checks

# dr_mutations only
meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pnca_dr['id'].nunique()
del(meta_pnca_dr)

# other mutations
meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
meta_pnca_other['id'].nunique()
del(meta_pnca_other)

# Now extract "all" mutations: rows with a pncA_p. entry in either column
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]

meta_pnca_all['id'].nunique()
pnca_samples = len(meta_pnca_all)
# samples with pncA mutations but no phenotype -- excluded from OR calcs later
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
comp_pnca_samples = pnca_samples - pnca_na

#=#=#=#=#=#=#
# COMMENT: use it later to check number of complete samples from LF data
#=#=#=#=#=#=#

# sanity checks
meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
meta_pnca_all.other_mutations_pyrazinamide.value_counts()

# more sanity checks
# !CAUTION!: muts will change depending on your gene

# dr muts : insert
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] #
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]

meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows

# other_muts
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]

#=#=#=#=#=#=#=#=#=#
# FIXME
# COMMENTS: both mutations columns are separated by ;
# CHECK if there are mutations that exist both in dr and other_muts!
# doesn't make any sense for the same mut to exist in both, I would have thought!
#=#=#=#=#=#=#=#=#=#

# remove not required variables
del(meta_data_pnca)
|
||||
|
||||
############
# STEP 3: split the columns of
# a) dr_mutation_... (;) as
# the column has snps related to multiple genes.
# useful links
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
# this one works beautifully
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
############

# sanity check: counts NA per column after subsetting: i.e in meta_pnca_all (with pncA_p. extracted mutations)
meta_pnca_all.isna().sum()

#=#=#=#=#=#=#=#=#=#
# COMMENT: no NA's in dr_mutations/other_mutations columns
#=#=#=#=#=#=#=#=#=#
|
||||
# define the split function
def tidy_split(df, column, sep='|', keep=False):
    """
    Explode one column of *df* on *sep* so each split value gets its own row.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row (only when the
        value actually splits into more than one piece)

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`; all other columns
        are duplicated for every piece produced from a row.
    """
    row_positions = []
    exploded_values = []
    #df = df.dropna(subset=[column])#<<<<<<-----see this incase you need to uncomment based on use case
    for pos, original in enumerate(df[column].astype(str)):
        pieces = original.split(sep)
        # optionally keep the unsplit value as an extra row
        if keep and len(pieces) > 1:
            row_positions.append(pos)
            exploded_values.append(original)
        for piece in pieces:
            row_positions.append(pos)
            exploded_values.append(piece)
    result = df.iloc[row_positions, :].copy()
    result[column] = exploded_values
    return result
|
||||
|
||||
########
# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
########
meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';')

# remove leading white space else these are counted as distinct mutations as well
meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip()

########
# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
########
meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';')

# remove the leading white spaces in the column
# NOTE(review): .strip() here also trims trailing whitespace, unlike the
# .lstrip() used in 3a -- presumably harmless, but confirm the asymmetry is intended.
meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip()

##########
# Step 4: Reshape data so that all mutations are in one column and the
# annotations for the mutation reflect the column name
# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/

# data frame "df" is passed to melt() function
# id_vars is the variable which need to be left unaltered
# var_name are the column names so we named it as 'mutation_info'
# value_name are its values so we named it as 'mutation'
##########
meta_pnca_WF1.columns

meta_pnca_LF0 = pd.melt(meta_pnca_WF1
                        , id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
                        , var_name = 'mutation_info'
                        , value_name = 'mutation')

# sanity check: should be true
# (the two melted value columns double the row count)
if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
    print('sanity check passed: Long format df has the expected length')
else:
    print("Sanity check failed: Debug please!")
|
||||
|
||||
###########
# Step 5: This is still dirty data. Filter LF data so that you only have
# mutations corresponding to pnca_p.
# this will be your list you run OR calcs
###########
# NOTE: str.contains() treats the pattern as a regex; the unescaped '.'
# matches any character, but real entries have the literal form
# 'pncA_p.<mut>' so the practical result is unchanged.
# BUG FIX: .copy() added -- the original kept a view/slice of meta_pnca_LF0
# and later assigned new columns to it, triggering SettingWithCopyWarning
# and risking silently lost writes.
meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')].copy()

# sanity checks
# unique samples
meta_pnca_LF1['id'].nunique()
if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
    print("Sanity check passed: No of samples with pncA mutations match")
else:
    print("Sanity check failed: Debug please!")

# count if all the mutations are indeed in the protein coding region
# i.e begin with pnca_p
meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093

# should be true.
# and check against the length of the df, which should match
if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
    print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
else:
    print("Sanity check failed: Debug please!")

###########
# Step 6: Filter dataframe with "na" in the drug column
# This is because for OR, you can't use the snps that have the
# NA in the specified drug column
# it creates problems when performing calcs in R inside the loop
# so best to filter it here
###########
# NOT NEEDED FOR all snps, only for extracting valid OR snps
del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)

###########
# Step 7: count unique pncA_p mutations (all and comp cases)
###########
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('mutation_info').nunique()

meta_pnca_LF1['id'].nunique()
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('id').nunique()

###########
# Step 8: convert all snps only (IN LOWERCASE)
# because my mcsm file integrated has lowercase
###########
# convert mutation to lower case as it needs to exactly match the dict key;
# direct assignment is safe now that meta_pnca_LF1 owns its data (.copy() above)
meta_pnca_LF1['mutation'] = meta_pnca_LF1['mutation'].str.lower()
|
||||
|
||||
###########
# Step 9 : Split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map the three-letter code to one-letter code
# (and amino-acid properties) from the imported reference_dict. The mutation
# column was lower-cased in Step 8, so entries match the dict keys.
###########
def _aa_lookup(prop_key):
    """Build a {three_letter_code: property_value} lookup from my_aa_dict for one property."""
    return {code: props[prop_key] for code, props in my_aa_dict.items()}

# Extract the three-letter wild-type and mutant codes once (the original
# repeated these identical extracts for steps 9a/9b/9c). .squeeze() turns the
# single-column extract result into a Series so that .map() works on it.
# Raw strings used for the regexes; '\.' makes the literal dot explicit
# (the original's unescaped '.' matched any character).
wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p\.(\w{3})').squeeze()
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()

#=======
# Step 9a: three-letter code -> one-letter code, for wild type and mutant
#=======
meta_pnca_LF1['wild_type'] = wt.map(_aa_lookup('one_letter_code'))
meta_pnca_LF1['mutant_type'] = mut.map(_aa_lookup('one_letter_code'))

# extract position info from mutation column separately using regex
meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)')

#=========
# Step 9b: three-letter code -> aa_prop_water, for wild type and mutant
#=========
meta_pnca_LF1['wt_prop_water'] = wt.map(_aa_lookup('aa_prop_water'))
meta_pnca_LF1['mut_prop_water'] = mut.map(_aa_lookup('aa_prop_water'))

#=========
# Step 9c: three-letter code -> aa_prop_polarity, for wild type and mutant
#=========
meta_pnca_LF1['wt_prop_polarity'] = wt.map(_aa_lookup('aa_prop_polarity'))
meta_pnca_LF1['mut_prop_polarity'] = mut.map(_aa_lookup('aa_prop_polarity'))

# added six annotation columns in total (same names/order as before)

# clear variables
del(wt, mut)
|
||||
|
||||
########
# Step 10: combine the wild_type+poistion+mutant_type columns to generate
# Mutationinformation (matches mCSM output field)
# Remember to use .map(str) for int col types to allow string concatenation
#########
# e.g. wild_type 'Q' + position '10' + mutant_type 'P' -> 'Q10P'
meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
|
||||
|
||||
#=#=#=#=#=#=#
# Step 11:
# COMMENT: there is more processing in the older version of this script
# consult if necessary
# possibly due to the presence of true_wt
# since this file doesn't contain any true_wt, we won't need it(hopefully!)
#=#=#=#=#=#=#

#%%
###########
# Step 12: Output files for only SNPs to run mCSM
###########

#=========
# Step 12a: all SNPs to run mCSM
#=========
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())

# assign meaningful colnames
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
#list(snps_only.columns)
snps_only.isna().sum() # should be 0

# output csv: all SNPS for mCSM analysis
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_snps_'
nrows = len(snps_only)

#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
#output_file_path = work_dir + '/Data/'
output_file_path = data_dir + '/input/processed/' + drug + '/'

if not os.path.exists(output_file_path):
    print(output_file_path, 'does not exist. Creating')
    os.makedirs(output_file_path)
    # BUG FIX: the original called exit() here, aborting the script right
    # after creating the output directory, so the csv below was never
    # written on a first run. The directory now exists, so carry on.

output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check

# write to csv: without column or row names
# Bad practice: numbers at the start of a filename
snps_only.to_csv(output_filename, header = False, index = False)
|
||||
|
||||
#=========
# Step 12b: all snps with annotation
# NOTE: this step is currently disabled -- every line below is commented out.
#=========
# all snps, selected cols
#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
#                               , 'drtype', 'pyrazinamide'
#                               , 'mutation_info', 'mutation', 'Mutationinformation']]

#len(pnca_snps_ALL)

# sanity check
#meta_pnca_LF1['mutation'].nunique()

# output csv: WITH column but WITHOUT row names(all snps with meta data)
# specify variable name for output file
#gene = 'pnca'
#drug = 'pyrazinamide'
#my_fname2 = '_snps_with_metadata_'
#nrows = len(pnca_snps_ALL)

#output_file_path = work_dir + '/Data/'
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
#print(output_filename) #<<<- check

# write out file
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
|
||||
|
||||
#=========
# Step 12c: comp snps for OR calcs with annotation
#=========
# remove all NA's from pyrazinamide column from LF1

# counts NA per column
meta_pnca_LF1.isna().sum()

# remove NA: keep only samples with a phenotype ("complete" cases)
meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])

# sanity checks
# should be True
len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()

# unique counts
meta_pnca_LF2['mutation'].nunique()

meta_pnca_LF2.groupby('mutation_info').nunique()

# sanity check
meta_pnca_LF2['id'].nunique()

# should be True: comp_pnca_samples was computed back in STEP 2
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
    print ('sanity check passed: complete numbers match')
else:
    print('Error: Please Debug!')

# value counts
meta_pnca_LF2.mutation.value_counts()
#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()

# valid/comp snps
# uncomment as necessary
pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
len(pnca_snps_COMP)

# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
# specify variable name for output file
# NOTE: the actual write below is currently commented out.

gene = 'pnca'
#drug = 'pyrazinamide'
my_fname3 = '_comp_snps_with_metadata_'
nrows = len(pnca_snps_COMP)

#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
#print(output_filename) #<<<-check

# write out file
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)

#=========
# Step 12d: comp snps only
#=========
# output csv: comp SNPS for info (i.e snps for which OR exist)
# specify variable name for output file

snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())

gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_comp_snps_'
nrows = len(snps_only)

output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename) #<<<- check

# write to csv: without column or row names
snps_only.to_csv(output_filename, header = False, index = False)
|
||||
|
||||
|
||||
#=#=#=#=#=#=#=#
|
||||
# COMMENT: LF1 is the file to extract all unique snps for mcsm
|
||||
# but you have that already in file called pnca_snps...
|
||||
# LF2: is the file for extracting snps tested for DS and hence OR calcs
|
||||
#=#=#=#=#=#=#=#
|
||||
|
||||
###########
|
||||
# Step 13 : Output the whole df i.e
|
||||
# file for meta_data which is now formatted with
|
||||
# each row as a unique snp rather than the original version where
|
||||
# each row is a unique id
|
||||
# ***** This is the file you will ADD the AF and OR calculations to *****
|
||||
###########
|
||||
|
||||
# output csv: the entire DF
|
||||
# specify variable name for output file
|
||||
gene = 'pnca'
|
||||
#drug = 'pyrazinamide'
|
||||
my_fname4 = '_metadata'
|
||||
#nrows = len(meta_pnca_LF1)
|
||||
output_filename = output_file_path + gene + my_fname4 + '.csv'
|
||||
print(output_filename) #<<<-check
|
||||
|
||||
# write out file
|
||||
meta_pnca_LF1.to_csv(output_filename)
|
121
meta_data_analysis/reference_dict.py
Executable file
121
meta_data_analysis/reference_dict.py
Executable file
|
@ -0,0 +1,121 @@
|
|||
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 18 11:32:28 2019

@author: tanushree
"""
############################################
#load libraries
import pandas as pd
import os
#############################################

#!#########################!
# REQUIREMENTS:
# Data_original/ must exist
# containing GWAS data
#!#########################!

print(os.getcwd())
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/Data/input/original')
print(os.getcwd())
#==========
#read file
#==========
my_aa = pd.read_csv('aa_codes.csv') #20, 6
#assign the one_letter code as the row names so that it is easier to create a dict of dicts using index
#my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to do it since it is the first column
my_aa = my_aa.set_index('three_letter_code_lower') #20, 5

#=========================================================
#convert file to dict of dicts
#=========================================================
#convert each row into a dict of dicts so that there are 20 aa and 5 keys within
#with your choice of column name that you have assigned to index as the "primary key".
#using 'index' creates a dict of dicts
#using 'records' creates a list of dicts
my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys
|
||||
|
||||
#================================================
# Dictionaries of amino acids grouped by physico-chemical property.
# Keys are tuples of one-letter codes; values are the property label.
# This is defined twice (see comment in original) -- kept as-is.
#================================================
#7 categories: no overlap
qualities1 = { ('R', 'H', 'K'): 'Basic'
             , ('D', 'E'): 'Acidic'
             , ('N', 'Q'): 'Amidic'
             , ('G', 'A', 'V', 'L', 'I', 'P'): 'Hydrophobic'
             , ('S', 'T'): 'Hydroxylic'
             , ('F', 'W', 'Y'): 'Aromatic'
             , ('C', 'M'): 'Sulphur'
             }

#9 categories: allowing for overlap
qualities2 = { ('R', 'H', 'K'): 'Basic'
             # BUG FIX: label was misspelled 'Acidc' (qualities1 spells the
             # same group 'Acidic'); fixed for consistency.
             , ('D', 'E'): 'Acidic'
             , ('S', 'T', 'N', 'Q'): 'Polar'
             , ('V', 'I', 'L', 'M', 'F', 'Y', 'W'): 'Hydrophobic'
             , ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic'
             , ('S', 'G', 'A', 'P'): 'Small'
             , ('F', 'W', 'Y', 'H'): 'Aromatic'
             , ('V', 'I', 'L', 'M'): 'Aliphatic'
             , ('C', 'G', 'P'): 'Special'
             }

qualities_taylor = { ('R', 'H', 'K'): 'Basic'
                   # BUG FIX: label was misspelled 'Acidc'; fixed for consistency.
                   , ('D', 'E'): 'Acidic'
                   , ('S', 'T', 'N', 'Q', 'C', 'Y', 'W', 'H', 'K', 'R', 'D', 'E'): 'Polar'
                   , ('V', 'I', 'L', 'M', 'F', 'Y', 'W', 'C', 'A', 'G', 'T', 'H'): 'Hydrophobic'
                   #, ('S', 'T', 'H', 'N', 'Q', 'E', 'D', 'K', 'R'): 'Hydrophilic', #C, W, y MISSING FROM POLAR!
                   , ('S', 'G', 'A', 'P', 'C', 'T', 'N', 'D', 'V'): 'Small'
                   , ('F', 'W', 'Y', 'H'): 'Aromatic'
                   , ('V', 'I', 'L', 'M'): 'Aliphatic' #although M is not strictly in the circle!
                   , ('C', 'G', 'P'): 'Special'
                   }

qualities_water = { ('D', 'E', 'N', 'P', 'Q', 'R', 'S'): 'hydrophilic'
                  , ('A', 'C', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'T', 'V', 'W', 'X', 'Y'): 'hydrophobic'
                  }

qualities_polarity = { ('D', 'E'): 'acidic'
                     , ('H', 'K', 'R'): 'basic'
                     , ('C', 'G', 'N', 'Q', 'S', 'T', 'Y'): 'neutral'
                     , ('A', 'F', 'I', 'L', 'M', 'P', 'V', 'W'): 'non-polar'
                     }
|
||||
|
||||
#==============================================================================
# adding amino acid properties to my dict of dicts:
# for each amino acid entry in my_aa_dict, attach its property labels by
# scanning the grouping dicts above for its one-letter code.
for k, v in my_aa_dict.items():
    #print (k,v)
    v['aa_prop1'] = str() #initialise keys
    v['aa_prop2'] = list() #initialise keys (allows for overalpping properties)
    v['aa_taylor'] = list() #initialise keys (allows for overalpping properties)
    v['aa_prop_water'] = str() #initialise keys
    v['aa_prop_polarity'] = str() #initialise keys

    # non-overlapping scheme: at most one group matches, so string concat is safe
    for group in qualities1:
        if v['one_letter_code'] in group:
            v['aa_prop1']+= qualities1[group] # += for str concat

    # overlapping schemes collect every matching label in a list
    for group in qualities2:
        if v['one_letter_code'] in group:
            v['aa_prop2'].append(qualities2[group]) # append to list

    for group in qualities_taylor:
        if v['one_letter_code'] in group:
            v['aa_taylor'].append(qualities_taylor[group]) # append to list

    # water/polarity groups are disjoint, so string concat yields a single label
    for group in qualities_water:
        if v['one_letter_code'] in group:
            v['aa_prop_water']+= qualities_water[group] # += for str concat

    for group in qualities_polarity:
        if v['one_letter_code'] in group:
            v['aa_prop_polarity']+= qualities_polarity[group] # += for str concat

#COMMENT:VOILA!!! my_aa_dict is now a dict of dicts containing all associated properties for each aa
#==============================================================================
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue