updated gitignore to tidyup
This commit is contained in:
parent 1cf1f4e70e
commit 2c013124ad
14 changed files with 1 addition and 2677 deletions
.gitignore (vendored): 1 change
@@ -6,6 +6,7 @@
 __pycache__
 */__pycache__
 mcsm_analysis_fixme
+meta_data_analysis
 del
 examples
 example
@@ -1,398 +0,0 @@
#########################################################
# TASK: To calculate Allele Frequency and
# Odds Ratio from master data
# and add the calculated params to meta_data extracted from
# data_extraction.py
#########################################################
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis')
getwd()

#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene, '_p.')
cat(gene_match)

#===========
# input
#===========
# infile1: Raw data
#indir = 'git/Data/pyrazinamide/input/original'
indir = paste0('~/git/Data')
in_filename = 'original_tanushree_data_v2.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: raw data ', infile))

# infile2: gene associated meta data file to extract valid snps and add calcs to.
# This is outfile3 from data_extraction.py
indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output')
in_filename_metadata = 'pnca_metadata.csv'
infile_metadata = paste0(indir_metadata, '/', in_filename_metadata)
cat(paste0('Reading infile2: gene associated metadata: ', infile_metadata))

#===========
# output
#===========
# outdir = 'git/Data/pyrazinamide/output'
outdir = paste0('~/git/Data', '/', drug, '/', 'output')
out_filename = paste0(tolower(gene), '_', 'meta_data_with_AFandOR.csv')
outfile = paste0(outdir, '/', out_filename)
cat(paste0('Output file with full path: ', outfile))
#%% end of variable assignment for input and output files
#########################################################
# 1: Read master/raw data stored in Data/
#########################################################
raw_data_all = read.csv(infile, stringsAsFactors = F)

raw_data = raw_data_all[, c("id"
                            , "pyrazinamide"
                            , "dr_mutations_pyrazinamide"
                            , "other_mutations_pyrazinamide")]
rm(raw_data_all)

rm(indir, in_filename, infile)

#===========
# 1a: exclude na
#===========
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]

total_samples = length(unique(raw_data$id))
cat(paste0('Total samples without NA in ', drug, ' is: ', total_samples))

# sanity check: should be true
is.numeric(total_samples)
#===========
# 1b: combine the two mutation columns
#===========
raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
                                            , raw_data$other_mutations_pyrazinamide)
head(raw_data$all_mutations_pyrazinamide)

#===========
# 1c: create yet another column that contains all the mutations, but in lower case
#===========
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)

# sanity checks
#table(grepl("pnca_p",raw_data$all_muts_pnca))
cat(paste0('converting gene match: ', gene_match, ' to lowercase'))
gene_match = tolower(gene_match)

table(grepl(gene_match, raw_data$all_muts_pnca))

# sanity check: should be TRUE
#sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
sum(table(grepl(gene_match, raw_data$all_muts_pnca))) == total_samples
# set up variables: can be used for logistic regression as well
i = "pnca_p.ala134gly" # has a NA, should NOT exist
table(grepl(i, raw_data$all_muts_pnca))

i = "pnca_p.trp68gly"
table(grepl(i, raw_data$all_muts_pnca))

mut = grepl(i, raw_data$all_muts_pnca)
dst = raw_data$pyrazinamide
table(mut, dst)

#chisq.test(table(mut,dst))
#fisher.test(table(mut, dst))
#table(mut)
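# NB (sketch): grepl() treats its pattern as a regex, so the '.' in
# "pnca_p.trp68gly" matches any single character, not just a literal dot.
# If a strictly literal match is wanted, fixed = TRUE does that:
# table(grepl("pnca_p.trp68gly", raw_data$all_muts_pnca, fixed = TRUE))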
#########################################################
# 2: Read valid snps for which OR
# can be calculated (infile_comp_snps.csv)
#########################################################
cat(paste0('Reading metadata infile: ', infile_metadata))

pnca_metadata = read.csv(infile_metadata
                         # , file.choose()
                         , stringsAsFactors = F
                         , header = T)

# clear variables (indir, in_filename and infile were already removed above)
rm(indir_metadata, in_filename_metadata, infile_metadata)
# count na in pyrazinamide column
tot_pza_na = sum(is.na(pnca_metadata$pyrazinamide))
expected_rows = nrow(pnca_metadata) - tot_pza_na

# drop na from the pyrazinamide column
pnca_snps_or = pnca_metadata[!is.na(pnca_metadata$pyrazinamide),]

# sanity check
if(nrow(pnca_snps_or) == expected_rows){
  cat('PASS: no. of rows match with expected_rows')
} else{
  cat('FAIL: nrows mismatch.')
}

# extract unique snps to iterate over for AF and OR calcs
pnca_snps_unique = unique(pnca_snps_or$mutation)

cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(pnca_snps_unique)))
# Define OR function
x = as.numeric(mut)
y = dst
or = function(x,y){
  tab = as.matrix(table(x,y))
  a = tab[2,2]
  if (a==0){ a<-0.5}
  b = tab[2,1]
  if (b==0){ b<-0.5}
  c = tab[1,2]
  if (c==0){ c<-0.5}
  d = tab[1,1]
  if (d==0){ d<-0.5}
  (a/b)/(c/d)
}
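# How this works (sketch, assuming dst is coded 0/1 with 1 = resistant):
# the 2x2 table of x (mut) by y (dst) gives
#   a = tab[2,2]: mut & dst=1,  b = tab[2,1]: mut & dst=0
#   c = tab[1,2]: wt  & dst=1,  d = tab[1,1]: wt  & dst=0
# so the function returns the cross-product odds ratio (a/b)/(c/d) = ad/bc.
# Zero cells are replaced with 0.5 (a variant of the Haldane-Anscombe
# correction, which normally adds 0.5 to every cell) to keep the OR finite.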
dst = raw_data$pyrazinamide
ors = sapply(pnca_snps_unique, function(m){
  mut = grepl(m, raw_data$all_muts_pnca)
  or(mut, dst)
})

ors

pvals = sapply(pnca_snps_unique, function(m){
  mut = grepl(m, raw_data$all_muts_pnca)
  fisher.test(mut, dst)$p.value
})

pvals

afs = sapply(pnca_snps_unique, function(m){
  mut = grepl(m, raw_data$all_muts_pnca)
  mean(mut)
})

afs
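# NB (sketch): mean() of a logical vector is the proportion of TRUEs, so afs
# is the fraction of samples carrying each mutation, i.e. the allele frequency:
# mean(c(TRUE, FALSE, FALSE, TRUE))  # 0.5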
# check ..hmmm
afs['pnca_p.trp68gly']
afs['pnca_p.gln10pro']
afs['pnca_p.leu4ser']

plot(density(log(ors)))
plot(-log10(pvals))
hist(log(ors)
     , breaks = 100
)

# sanity check
if (all(names(ors) == names(pvals)) & all(names(ors) == names(afs)) & length(afs) == length(pnca_snps_unique)){
  cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df')
} else{
  cat('FAIL: names of ors, pvals and afs mismatch')
}
# combine ors, pvals and afs
cat('Combining calculated params into a df: ors, pvals and afs')

comb_AF_and_OR = data.frame(ors, pvals, afs)
cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR)
    , '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR))

cat('Rownames == mutation: ', head(rownames(comb_AF_and_OR)))

# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column
comb_AF_and_OR$mutation = rownames(comb_AF_and_OR)

# sanity check
if (all(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){
  cat('PASS: rownames and mutation col values match')
}else{
  cat('FAIL: rownames and mutation col values mismatch')
}
#########################################################
# 3: Merge meta data file + calculated num params
#########################################################
df1 = pnca_metadata
df2 = comb_AF_and_OR

cat('checking common col of the two dfs before merging:'
    , '\ndf1:', head(df1$mutation)
    , '\ndf2:', head(df2$mutation))

cat(paste0('merging two dfs: '
           , '\ndf1 (big df i.e. meta data) nrows: ', nrow(df1)
           , '\ndf2 (small df i.e. af, or, pval) nrows: ', nrow(df2)
           , '\nexpected rows in merged df: ', nrow(df1)
           , '\nexpected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1)))

merged_df = merge(df1 # big file
                  , df2 # small (afor file)
                  , by = "mutation"
                  , all.x = T) # because you want all the entries of the meta data
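# (sketch) all.x = T is a left join: every metadata row survives, and rows
# whose mutation has no AF/OR entry get NA in the df2 columns, e.g.
# merge(data.frame(mutation = c('m1','m2')),
#       data.frame(mutation = 'm1', ors = 2.1),
#       by = 'mutation', all.x = TRUE)   # the m2 row has ors = NA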
# sanity check
if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){
  cat(paste0('PASS: no. of cols is as expected: ', ncol(merged_df)))
} else{
  cat('FAIL: no. of cols mismatch')
}

# quick check
i = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc.
merged_df[merged_df$mutation == i,]

# count na in each column
na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count

# check last three cols: should be NA
# NB: identical() compares only its first two arguments, so the check is chained
if (identical(na_count[[length(na_count)]], na_count[[length(na_count)-1]]) &
    identical(na_count[[length(na_count)-1]], na_count[[length(na_count)-2]])){
  cat('PASS: No. of NAs for OR, AF and Pvals are equal as expected',
      '\nNo. of NA: ', na_count[[length(na_count)]])
} else {
  cat('FAIL: No. of NAs for OR, AF and Pvals mismatch')
}
# reassign custom colnames
cat('Assigning custom colnames for the calculated params...')
colnames(merged_df)[colnames(merged_df) == "ors"] <- "OR"
colnames(merged_df)[colnames(merged_df) == "pvals"] <- "pvalue"
colnames(merged_df)[colnames(merged_df) == "afs"] <- "AF"

colnames(merged_df)

# add 3 more cols: log OR, neglog pvalue and AF_percent cols
merged_df$logor = log(merged_df$OR)
is.numeric(merged_df$logor)

merged_df$neglog10pvalue = -log10(merged_df$pvalue)
is.numeric(merged_df$neglog10pvalue)

merged_df$AF_percent = merged_df$AF*100
is.numeric(merged_df$AF_percent)
# check AFs
#i = 'pnca_p.trp68gly'
i = 'pnca_p.gln10pro'
#i = 'pnca_p.leu4ser'
merged_df[merged_df$mutation == i,]

# FIXME: hardcoding (beware!), NOT FATAL though!
ncol_added = 3

cat(paste0('Added ', ncol_added, ' more cols to merged_df:'
           , '\ncols added: logor, neglog10pvalue and AF_percent:'
           , '\nno. of cols in merged_df now: ', ncol(merged_df)))
#%% write file out: pnca_meta_data_with_AFandOR
#*********************************************
cat(paste0('writing output file: '
           , '\nFilename: ', out_filename
           , '\nPath: ', outdir))

write.csv(merged_df, outfile
          , row.names = F)

cat(paste0('Finished writing: '
           , out_filename
           , '\nNo. of rows: ', nrow(merged_df)
           , '\nNo. of cols: ', ncol(merged_df)))
#************************************************
cat('======================================================================')
rm(out_filename)
cat('End of script: calculated AF, OR, pvalues and saved file')
# End of script
#%%
# sanity check: count NA in these cols.
# However these need to be numeric, else the NA counts
# will be misleading (i.e. return 0)
# NB: this block expects meta_with_afor (read in by the combining script)
#is.numeric(meta_with_afor$OR)
na_var = c('AF', 'OR', 'pvalue', 'logor', 'neglog10pvalue')

# loop through these vars and check if these are numeric;
# if not, then convert to numeric
check_all = NULL

for (i in na_var){
  # cat(i)
  check0 = is.numeric(meta_with_afor[,i])
  if (check0) {
    check_all = c(check0, check_all)
    cat('These are all numeric cols')
  } else{
    cat('First converting to numeric')
    # assign the converted col back to the df, then record the check
    meta_with_afor[,i] = as.numeric(meta_with_afor[,i])
    check_all = c(is.numeric(meta_with_afor[,i]), check_all)
  }
}
# count na now that the respective cols are numeric
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)

# extract how many NAs:
# should be all TRUE
# should be a single number since
# all the cols should have 'equal' and 'same' no. of NAs
# compare if the no. of 'NA' are the same for all these cols
na_len = NULL
for (i in na_var){
  temp = na_count[[i]]
  na_len = c(na_len, temp)
}

cat('Checking how many NAs and if these are identical for the selected cols:')
require(compare) # for compare()
my_nrows = NULL
for ( i in 1:(length(na_len)-1) ){
  # cat(compare(na_len[i], na_len[i+1]))
  c = compare(na_len[i], na_len[i+1])
  if ( c$result ) {
    cat('PASS: No. of NAs in selected cols are identical')
    my_nrows = na_len[i]
  } else {
    cat('FAIL: No. of NAs in selected cols mismatch')
  }
}

cat('No. of NAs in each of the selected cols: ', my_nrows)
# yet more sanity checks:
cat('Check whether the ', my_nrows, ' indices are indeed the same')

#which(is.na(meta_with_afor$OR))

# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
                                       # , length = length(na_var)
                                       )
                                , nrow = my_nrows
                                # , ncol = length(na_var)
                                ))

# populate the df with the indices of the cols that are NA
for (i in na_var){
  cat(i)
  na_i = which(is.na(meta_with_afor[i]))
  na_count_df = cbind(na_count_df, na_i)
  colnames(na_count_df)[which(na_var == i)] <- i
}

# Now compare these indices to ensure these are the same
check2 = NULL
for ( i in 1:( length(na_count_df)-1 ) ) {
  # cat(na_count_df[i] == na_count_df[i+1])
  check_all = identical(na_count_df[[i]], na_count_df[[i+1]])
  check2 = c(check_all, check2)
  if ( all(check2) ) {
    cat('PASS: The indices for AF, OR, etc are all the same\n')
  } else {
    cat('FAIL: Please check indices which are NA')
  }
}
@@ -1,248 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug  6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 4 (mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
#=============================================================================
# Task: combine 4 dfs with aa position as linking column
# This is done in 2 steps:
# merge 1: of 3 dfs (filenames in lowercase)
# <gene.lower()>_dssp.csv
# <gene.lower()>_kd.csv
# <gene.lower()>_rd.csv

# merge 2: of 2 dfs
# pnca_complex_mcsm_norm.csv (!fix name in mcsm script)
# output df from merge 1

# Input: 4 dfs
# <gene.lower()>_dssp.csv
# <gene.lower()>_kd.csv
# <gene.lower()>_rd.csv
# pnca_complex_mcsm_norm.csv (!fix name in mcsm script)

# Output: .csv of all 4 dfs combined

# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=============================================================================
#%% load packages
import sys, os
import pandas as pd
#import numpy as np
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'

drug = args.drug
gene = args.gene
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'output'
in_filename1 = 'pnca_dssp.csv'
in_filename2 = 'pnca_kd.csv'
in_filename3 = 'pnca_rd.csv'
#in_filename4 = 'mcsm_complex1_normalised.csv' # Fix name in mcsm script
in_filename4 = 'pnca_complex_mcsm_norm.csv' # manually changed temporarily

infile1 = indir + '/' + in_filename1
infile2 = indir + '/' + in_filename2
infile3 = indir + '/' + in_filename3
infile4 = indir + '/' + in_filename4

print('\nInput path:', indir
      , '\nInput filename1:', in_filename1
      , '\nInput filename2:', in_filename2
      , '\nInput filename3:', in_filename3
      , '\nInput filename4:', in_filename4
      , '\n===================================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_struct_params.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n===================================================================')

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files
dssp_df = pd.read_csv(infile1, sep = ',')
kd_df = pd.read_csv(infile2, sep = ',')
rd_df = pd.read_csv(infile3, sep = ',')
mcsm_df = pd.read_csv(infile4, sep = ',')

print('Reading input files:'
      , '\ndssp file:', infile1
      , '\nNo. of rows:', len(dssp_df)
      , '\nNo. of cols:', len(dssp_df.columns)
      , '\nColumn names:', dssp_df.columns
      , '\n==================================================================='
      , '\nkd file:', infile2
      , '\nNo. of rows:', len(kd_df)
      , '\nNo. of cols:', len(kd_df.columns)
      , '\nColumn names:', kd_df.columns
      , '\n==================================================================='
      , '\nrd file:', infile3
      , '\nNo. of rows:', len(rd_df)
      , '\nNo. of cols:', len(rd_df.columns)
      , '\nColumn names:', rd_df.columns
      , '\n==================================================================='
      , '\nmcsm file:', infile4
      , '\nNo. of rows:', len(mcsm_df)
      , '\nNo. of cols:', len(mcsm_df.columns)
      , '\nColumn names:', mcsm_df.columns
      , '\n===================================================================')
#%% Begin combining dfs
#===================
# concatenating df1 (3 dfs): dssp_df + kd_df + rd_df
#===================
print('starting first merge...\n')

# checking no. of rows
print('Checking if no. of rows of the 3 dfs are equal:\n'
      , len(dssp_df) == len(kd_df) == len(rd_df)
      , '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
      , '\n===================================================================')

# variables for sanity checks
expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df))
# beware of hardcoding! used for sanity check
ndfs = 3
ncol_merge = 1
offset = ndfs - ncol_merge
expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset
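# Worked example (sketch): with one shared key col ('position') kept once in
# the result, the expected col count is
#   expected_cols = c_dssp + c_kd + c_rd - (ndfs - ncol_merge)
# e.g. for, say, 4 + 3 + 3 input cols -> 10 - 2 = 8 cols after the outer join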
print('Merge 1:'
      , '\ncombining 3 dfs by common col: position'
      , '\nExpected nrows in combined_df:', expected_rows_df1
      , '\nExpected ncols in combined_df:', expected_cols_df1
      , '\nResetting the common col as the index'
      , '\n===================================================================')

#dssp_df.set_index('position', inplace = True)
#kd_df.set_index('position', inplace = True)
#rd_df.set_index('position', inplace = True)

#combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
#combined_df.rename(columns = {'index':'position'})

combined_df1 = pd.concat(
    (my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df])
    , axis = 1, join = 'outer').reset_index()
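# (sketch) axis = 1 with join = 'outer' aligns the three frames on their
# 'position' index and keeps every position seen in any frame, filling the
# missing entries with NaN; reset_index() then turns the index back into a
# plain 'position' column, ready for the second merge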
# sanity check
print('Checking dimensions of concatenated df1...')
if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1:
    print('PASS: combined df has expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df1)
          , '\nNo. of cols in combined df:', len(combined_df1.columns)
          , '\n===============================================================')
else:
    print('FAIL: combined df does not have expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df1)
          , '\nNo. of cols in combined df:', len(combined_df1.columns)
          , '\n===============================================================')

#===================
# merging df2 (2 dfs): mcsm_df + combined_df1
# sort sorts the cols
#===================
print('starting second merge...\n')
# rename col 'Position' in mcsm_df to lowercase 'position'
# as it matches the combined_df1 colname to perform merge
#mcsm_df.columns
#mcsm_df.rename(columns = {'Position':'position'}) # not working! (rename returns a copy unless inplace = True)
# copy 'Position' column with the correct colname
print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging'
      , '\nNo. of cols before copying: ', len(mcsm_df.columns))

mcsm_df['position'] = mcsm_df['Position']
print('No. of cols after copying: ', len(mcsm_df.columns))

# sanity check
if mcsm_df['position'].equals(mcsm_df['Position']):
    print('PASS: Copying worked correctly'
          , '\ncopied col matches original column'
          , '\n===============================================================')
else:
    print('FAIL: copied col does not match original column'
          , '\n================================================================')
# variables for sanity checks
expected_rows_df2 = len(mcsm_df)
# beware of hardcoding! used for sanity check
ndfs = 2
ncol_merge = 1
offset = ndfs - ncol_merge
expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset

print('Merge 2:'
      , '\ncombining 2 dfs by common col: position'
      , '\nExpected nrows in combined_df:', expected_rows_df2
      , '\nExpected ncols in combined_df:', expected_cols_df2
      , '\n===================================================================')

combined_df2 = mcsm_df.merge(combined_df1, on = 'position')
# sanity check
print('Checking dimensions of concatenated df2...')
if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2:
    print('PASS: combined df2 has expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df2)
          , '\nNo. of cols in combined df:', len(combined_df2.columns)
          , '\n===============================================================')
else:
    print('FAIL: combined df2 does not have expected dimensions'
          , '\nNo. of rows in combined df:', len(combined_df2)
          , '\nNo. of cols in combined df:', len(combined_df2.columns)
          , '\n===============================================================')

#%% write file
print('Writing file:'
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n===================================================================')

combined_df2.to_csv(outfile, header = True, index = False)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(combined_df2)
      , '\nNo. of cols:', len(combined_df2.columns)
      , '\n===================================================================')

#%% end of script
#==============================================================================
@@ -1,397 +0,0 @@
#=======================================================================
# TASK: To combine mcsm and meta data with af and or
# by filtering for distance to ligand (<10Ang).
# This script doesn't output anything.
# This script is sourced from other .R scripts for plotting ligand plots

# Input csv files:
# 1) mcsm normalised and struct params
# 2) gene associated meta_data_with_AFandOR
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis/')
getwd()
#=======================================================================
#%% load packages
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)

# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene, '_p.')
cat(gene_match)
#===========
# data dir
#===========
datadir = '~/git/Data'

#===========
# input
#===========
# infile1: mCSM data
#indir = '~/git/Data/pyrazinamide/input/processed/'
indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename = 'pnca_mcsm_struct_params.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: mCSM output file ', infile))

# infile2: gene associated meta data combined with AF and OR
#indir: same as above
in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv')
infile_comb = paste0(indir, '/', in_filename_comb)
cat(paste0('Reading infile2: gene associated combined metadata: ', infile_comb))

#===========
# output
#===========
# Uncomment if and when required to output
outdir = paste0(datadir, '/', drug, '/', 'output') # same as indir
cat('Output dir: ', outdir, '\n')
#out_filename = paste0(tolower(gene), 'XXX')
#outfile = paste0(outdir, '/', out_filename)
#cat(paste0('Output file with full path:', outfile))

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files

#####################################
# input file 1: mcsm normalised data
# output of step 4 mcsm_pipeline
#####################################

cat('Reading mcsm_data:'
    , '\nindir: ', indir
    , '\ninfile: ', in_filename)

mcsm_data = read.csv(infile
                     # , row.names = 1
                     , stringsAsFactors = F
                     , header = T)

cat('Read mcsm_data file:'
    , '\nNo. of rows: ', nrow(mcsm_data)
    , '\nNo. of cols: ', ncol(mcsm_data))

# clear variables
rm(in_filename, infile)

str(mcsm_data)

table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome))

# spelling correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'

# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome))
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)

# spelling correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome))

mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'

# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome))
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)

# muts with opposing effects on protomer and ligand stability
# excluded from here as it is redundant;
# check 'combining_two_df.R' to refer if required.
#=======================================================================
#%% !!! Filter data only for mcsm lig !!!

###########################
# Filter/subset data
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################

# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)

# count
table(mcsm_data$Dis_lig_Ang < 10)

# subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)

# sanity checks
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)

# count no. of unique positions
length(unique(mcsm_data2$Position))

# count no. of unique mutations
length(unique(mcsm_data2$Mutationinformation))

# count destabilising and stabilising
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}

#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################

if (max(mcsm_data2$Dis_lig_Ang) < 10){
  print("Sanity check passed: lig data is <10Ang")
}else{
  print("Error: data should be filtered to be within 10Ang")
}
#!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT: so as not to alter the script
mcsm_data = mcsm_data2
#!!!!!!!!!!!!!!!!!!!!!
#=======================================================================
# clear variables
rm(mcsm_data2)

# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count

# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)

orig_col = ncol(mcsm_data)
# get freq count of positions and add to the df
# (mcsm data uses the capitalised 'Position' col, as in combining_two_df.R)
setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(Position)]

cat('Added col: position frequency to see which posn has how many muts'
    , '\nNo. of cols now: ', ncol(mcsm_data)
    , '\nNo. of cols before: ', orig_col)

mut_pos_occurrence = data.frame(mcsm_data$Mutationinformation, mcsm_data$Position, mcsm_data$mut_pos_occurrence)
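# (sketch, with data.table loaded via my_header.R) setDT() converts the df to
# a data.table by reference and ':=' adds a grouped count col, .N being the
# no. of rows per group, e.g.
# dt = data.table(Position = c(4, 4, 68)); dt[, n := .N, by = .(Position)]
# gives n = 2, 2, 1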
######################################
# input file 2: meta data with AFandOR
######################################
cat('Reading combined meta data and AFandOR file:'
    , '\nindir: ', indir
    , '\ninfile_comb: ', in_filename_comb)

meta_with_afor <- read.csv(infile_comb
                           , stringsAsFactors = F
                           , header = T)

cat('Read meta_with_afor file:'
    , '\nNo. of rows: ', nrow(meta_with_afor)
    , '\nNo. of cols: ', ncol(meta_with_afor))

# clear variables
rm(in_filename_comb, infile_comb)

str(meta_with_afor)

# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)

orig_col2 = ncol(meta_with_afor)

# get freq count of positions and add to the df
# (metadata uses the lowercase 'position' col, as in combining_two_df.R)
setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(position)]

cat('Added col: position frequency of samples to check'
    , 'how many samples correspond to a particular posn associated with muts'
    , '\nNo. of cols now: ', ncol(meta_with_afor)
    , '\nNo. of cols before: ', orig_col2)

sample_pos_occurrence = data.frame(meta_with_afor$id, meta_with_afor$position, meta_with_afor$sample_pos_occurrence)
#=======================================================================
cat('Begin merging dfs with NAs'
    , '\n===============================================================')

###########################
# merging two dfs: with NA
###########################
# link col name = 'Mutationinformation'
cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df2')

head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)

#########
# a) merged_df2: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
                   , y = mcsm_data
                   , by = 'Mutationinformation'
                   , all.y = T)

cat('Dim of merged_df2: '
    , '\nNo. of rows: ', nrow(merged_df2)
    , '\nNo. of cols: ', ncol(merged_df2))
head(merged_df2$Position)

if(nrow(meta_with_afor) == nrow(merged_df2)){
  cat('nrow(merged_df2) == nrow(gene associated metadata)'
      , '\nExpected no. of rows: ', nrow(meta_with_afor)
      , '\nGot no. of rows: ', nrow(merged_df2))
} else{
  cat('nrow(merged_df2) != nrow(gene associated metadata)'
      , '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
      , '\nGot no. of rows: ', nrow(merged_df2)
      , '\nFinding discrepancy')
  merged_muts_u = unique(merged_df2$Mutationinformation)
  meta_muts_u = unique(meta_with_afor$Mutationinformation)
  # find the index where it differs
  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
}
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)

merged_df2v2 = merge(x = meta_with_afor
                     , y = mcsm_data
                     , by = 'Mutationinformation'
                     , all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: merged_df2 (above) used all.y since position 186 is not part of the
# structure, hence doesn't have a mcsm value,
# but 186 is associated with mutation
#!=!=!=!=!=!=!=!

# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position %in% merged_df2v2$Position)

rm(merged_df2v2)
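# (sketch) all.y = T keeps every mcsm row, so a mutation absent from the
# metadata would get NA in the meta cols; all.x = T instead keeps metadata
# rows (e.g. position 186) that have no mcsm value, e.g.
# merge(data.frame(Mutationinformation = c('a','b'), AF = c(0.1, 0.2)),
#       data.frame(Mutationinformation = 'a', DUET = -1.2),
#       by = 'Mutationinformation', all.y = TRUE)   # only row 'a' survives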
#########
# b) merged_df3: remove duplicate mutation information
#########
cat('Merging dfs with NAs: small df (removing duplicate muts)'
    , '\nCannot trust lineage info from this'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df3')

#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages,
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted

# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
  cat('PASS: No. of rows match with mcsm_data'
      , '\nExpected no. of rows: ', nrow(mcsm_data)
      , '\nGot no. of rows: ', nrow(merged_df3))
} else {
  cat('FAIL: No. of rows mismatch'
      , '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
      , '\nNo. of rows merged_df3: ', nrow(merged_df3))
}
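# (sketch) !duplicated() keeps only the first occurrence of each mutation,
# which is why the 1-row-per-mutation df loses the per-sample fields:
# duplicated(c('m1','m1','m2'))  # FALSE TRUE FALSE -> rows 1 and 3 kept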
# count NAs in AF, OR cols in merged_df3
# (mcsm data itself has no AF, OR cols, so NAs cannot be counted there)
if (identical(sum(is.na(merged_df3$OR))
              , sum(is.na(merged_df3$pvalue)))
    & identical(sum(is.na(merged_df3$pvalue))
                , sum(is.na(merged_df3$AF)))){
  cat('PASS: NA count match for OR, pvalue and AF\n')
  na_count_df3 = sum(is.na(merged_df3$AF))
  cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
} else{
  cat('FAIL: NA count mismatch'
      , '\nNA in OR: ', sum(is.na(merged_df3$OR))
      , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
      , '\nNA in AF: ', sum(is.na(merged_df3$AF)))
}
#=======================================================================
#%% merging without NAs

cat('Begin merging dfs without NAs'
    , '\n===============================================================')

cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df2_comp')

#########
# c) merged_df2_comp: merging two dfs without NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),]

# sanity check
# no. of rows dropped should equal the no. of NAs in the AF col of merged_df2
na_count_df2 = sum(is.na(merged_df2$AF))
cat('Checking nrows in merged_df2_comp')
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count_df2)){
  cat('PASS: No. of rows match'
      , '\nDim of merged_df2_comp: '
      , '\nExpected no. of rows: ', nrow(merged_df2) - na_count_df2
      , '\nNo. of rows: ', nrow(merged_df2_comp)
      , '\nNo. of cols: ', ncol(merged_df2_comp))
}else{
  cat('FAIL: No. of rows mismatch'
      , '\nExpected no. of rows: ', nrow(merged_df2) - na_count_df2
      , '\nGot no. of rows: ', nrow(merged_df2_comp))
}
#########
# d) merged_df3_comp: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]

cat('Dim of merged_df3_comp: '
    , '\nNo. of rows: ', nrow(merged_df3_comp)
    , '\nNo. of cols: ', ncol(merged_df3_comp))

# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)

summary(comparedf(foo, merged_df3_comp))

# sanity check
cat('Checking nrows in merged_df3_comp')
if(nrow(merged_df3_comp) == nrow(merged_df3)){
  cat('NO NAs detected in merged_df3 in AF|OR cols'
      , '\nNo. of rows are identical: ', nrow(merged_df3))
} else{
  if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
    cat('PASS: NAs detected in merged_df3 in AF|OR cols'
        , '\nNo. of NAs: ', na_count_df3
        , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
        , '\nGot no. of rows: ', nrow(merged_df3_comp))
  }
}

#=======================================================================
# write output files
# Not required as this is a subset of combining_two_df.R
#*************************
# clear variables
rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
rm(mut_pos_occurrence)
#%% end of script
#=======================================================================
@@ -1,461 +0,0 @@
#=======================================================================
# TASK: To combine mcsm and meta data with af and or files
# Input csv files:
# 1) mcsm normalised and struct params
# 2) gene associated meta_data_with_AFandOR

# Output:
# 1) muts with opposite effects on stability
# 2) large combined df including NAs for AF, OR, etc.
#    Dim: same no. of rows as gene associated meta_data_with_AFandOR
# 3) small combined df including NAs for AF, OR, etc.
#    Dim: same as mcsm data
# 4) large combined df excluding NAs
#    Dim: dim(#2) - no. of NAs (AF|OR) + 1
# 5) small combined df excluding NAs
#    Dim: dim(#3) - no. of unique NAs - 1
# This script is sourced from other .R scripts for plotting
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis/')
getwd()
#=======================================================================
#%% load packages
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)

# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene, '_p.')
cat(gene_match)
#===========
# data dir
#===========
datadir = '~/git/Data'

#===========
# input
#===========
# infile1: mCSM data
#indir = '~/git/Data/pyrazinamide/input/processed/'
indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename = 'pnca_mcsm_struct_params.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: mCSM output file ', infile, '\n'))

# infile2: gene associated meta data combined with AF and OR
#indir: same as above
in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv')
infile_comb = paste0(indir, '/', in_filename_comb)
cat(paste0('Reading infile2: gene associated combined metadata: ', infile_comb, '\n'))

#===========
# output
#===========
# Uncomment if and when required to output
outdir = paste0(datadir, '/', drug, '/', 'output') # same as indir
cat('Output dir: ', outdir, '\n')
#out_filename = paste0(tolower(gene), 'XXX')
#outfile = paste0(outdir, '/', out_filename)
#cat(paste0('Output file with full path:', outfile))

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files

#####################################
# input file 1: mcsm normalised data
# output of step 4 mcsm_pipeline
#####################################
cat('Reading mcsm_data:'
    , '\nindir: ', indir
    , '\ninfile: ', in_filename)

mcsm_data = read.csv(infile
                     # , row.names = 1
                     , stringsAsFactors = F
                     , header = T)

cat('Read mcsm_data file:'
    , '\nNo. of rows: ', nrow(mcsm_data)
    , '\nNo. of cols: ', ncol(mcsm_data))

# clear variables
rm(in_filename, infile)

str(mcsm_data)

table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome))

# spelling correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'

# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome))
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)

# spelling correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome))

mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'

# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome))
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# muts with opposing effects on protomer and ligand stability
table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),]

# sanity check: redundant, but uber cautious!
dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome)

cat('Identifying muts with opposite stability effects')
if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i, ld_i)) {
  cat('PASS: muts with opposite effects on stability and affinity correctly identified'
      , '\nNo. of such muts: ', nrow(changes))
}else {
  cat('FAIL: unsuccessful in extracting muts with changed stability effects')
}
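# NB (sketch): table(...)[[2]] assumes both FALSE and TRUE occur; if no muts
# had opposing effects, the '[[2]]' index would not exist, so
# sum(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome) is a safer count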
#***************************
# write file: changed muts
out_filename = 'muts_opp_effects.csv'
outfile = paste0(outdir, '/', out_filename)
cat('Writing file for muts with opp effects:'
    , '\nFilename: ', outfile
    , '\nPath: ', outdir)

write.csv(changes, outfile)
#****************************
# clear variables
rm(out_filename, outfile)
rm(changes, dl_i, ld_i)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count

# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)

orig_col = ncol(mcsm_data)

# get freq count of positions and add to the df
setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(Position)]

cat('Added col: position frequency of muts to see which posn has how many muts'
    , '\nNo. of cols now: ', ncol(mcsm_data)
    , '\nNo. of cols before: ', orig_col)

mut_pos_occurrence = data.frame(mcsm_data$Mutationinformation
                                , mcsm_data$Position
                                , mcsm_data$mut_pos_occurrence)

colnames(mut_pos_occurrence) = c('Mutationinformation', 'position', 'mut_pos_occurrence')
#######################################
# input file 2: meta data with AFandOR
#######################################
cat('Reading combined meta data and AFandOR file:'
    , '\nindir: ', indir
    , '\ninfile_comb: ', in_filename_comb)

meta_with_afor <- read.csv(infile_comb
                           , stringsAsFactors = F
                           , header = T)

cat('Read meta_with_afor file:'
    , '\nNo. of rows: ', nrow(meta_with_afor)
    , '\nNo. of cols: ', ncol(meta_with_afor))

# counting NAs in AF, OR cols
if (identical(sum(is.na(meta_with_afor$OR))
              , sum(is.na(meta_with_afor$pvalue)))
    & identical(sum(is.na(meta_with_afor$pvalue))
                , sum(is.na(meta_with_afor$AF)))){
  cat('PASS: NA count match for OR, pvalue and AF\n')
  na_count = sum(is.na(meta_with_afor$AF))
  cat('No. of NAs: ', sum(is.na(meta_with_afor$OR)))
} else{
  cat('FAIL: NA count mismatch'
      , '\nNA in OR: ', sum(is.na(meta_with_afor$OR))
      , '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue))
      , '\nNA in AF: ', sum(is.na(meta_with_afor$AF)))
}
# clear variables
rm(in_filename_comb, infile_comb)

str(meta_with_afor)

# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)

orig_col2 = ncol(meta_with_afor)

# get freq count of positions and add to the df
setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(position)]

cat('Added col: position frequency of samples to check'
    , 'how many samples correspond to a particular posn associated with muts'
    , '\nNo. of cols now: ', ncol(meta_with_afor)
    , '\nNo. of cols before: ', orig_col2)

sample_pos_occurrence = data.frame(meta_with_afor$id
                                   , meta_with_afor$mutation
                                   , meta_with_afor$Mutationinformation
                                   , meta_with_afor$position
                                   , meta_with_afor$sample_pos_occurrence)
colnames(sample_pos_occurrence) = c('id', 'mutation', 'Mutationinformation', 'position', 'sample_pos_occurrence')
#=======================================================================
cat('Begin merging dfs with NAs'
    , '\n===============================================================')

###########################
# merging two dfs: with NA
###########################
# link col name = 'Mutationinformation'
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)

cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df2')

#########
# a) merged_df2: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
                   , y = mcsm_data
                   , by = 'Mutationinformation'
                   , all.y = T)

cat('Dim of merged_df2: '
    , '\nNo. of rows: ', nrow(merged_df2)
    , '\nNo. of cols: ', ncol(merged_df2))
head(merged_df2$Position)

# sanity check
cat('Checking nrows in merged_df2')
if(nrow(meta_with_afor) == nrow(merged_df2)){
  cat('nrow(merged_df2) == nrow(gene associated metadata)'
      , '\nExpected no. of rows: ', nrow(meta_with_afor)
      , '\nGot no. of rows: ', nrow(merged_df2))
} else{
  cat('nrow(merged_df2) != nrow(gene associated metadata)'
      , '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
      , '\nGot no. of rows: ', nrow(merged_df2)
      , '\nFinding discrepancy')
  merged_muts_u = unique(merged_df2$Mutationinformation)
  meta_muts_u = unique(meta_with_afor$Mutationinformation)
  # find the index where it differs
  unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
}
# sort by Position
|
|
||||||
head(merged_df2$Position)
|
|
||||||
merged_df2 = merged_df2[order(merged_df2$Position),]
|
|
||||||
head(merged_df2$Position)
|
|
||||||
|
|
||||||
merged_df2v2 = merge(x = meta_with_afor
|
|
||||||
,y = mcsm_data
|
|
||||||
, by = 'Mutationinformation'
|
|
||||||
, all.x = T)
|
|
||||||
#!=!=!=!=!=!=!=!
|
|
||||||
# COMMENT: used all.y since position 186 is not part of the struc,
|
|
||||||
# hence doesn't have a mcsm value
|
|
||||||
# but 186 is associated with mutation
|
|
||||||
#!=!=!=!=!=!=!=!
|
|
||||||
|
|
||||||
# should be False
|
|
||||||
identical(merged_df2, merged_df2v2)
|
|
||||||
table(merged_df2$Position%in%merged_df2v2$Position)
|
|
||||||
|
|
||||||
rm(merged_df2v2)
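
# Toy sketch of all.x vs all.y (illustration only, not from the original):
# x = data.frame(k = c('a','b'), vx = 1:2)
# y = data.frame(k = c('b','c'), vy = 3:4)
# merge(x, y, by = 'k', all.y = T)  # rows 'b','c'; vx is NA for 'c'
# merge(x, y, by = 'k', all.x = T)  # rows 'a','b'; vy is NA for 'a'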

#########
# b) merged_df3: remove duplicate mutation information
#########
cat('Deduplicating merged_df2: small df (one row per mutation)'
    , '\nCannot trust lineage info from this'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df3')

#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
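
# Toy sketch (illustration only): !duplicated() keeps the first row per key,
# e.g. with Mutationinformation = c('m1','m1','m2') rows 1 and 3 survive;
# the per-sample cols (id, lineage, country) of the dropped rows are lost,
# which is why lineage can't be trusted in merged_df3.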

# sanity check
cat('Checking nrows in merged_df3')
if(nrow(mcsm_data) == nrow(merged_df3)){
  cat('PASS: No. of rows match with mcsm_data'
      , '\nExpected no. of rows: ', nrow(mcsm_data)
      , '\nGot no. of rows: ', nrow(merged_df3))
} else {
  cat('FAIL: No. of rows mismatch'
      , '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
      , '\nNo. of rows merged_df3: ', nrow(merged_df3))
}

# counting NAs in AF, OR cols in merged_df3
# (mcsm_data has no AF|OR cols, so the NA count can only happen post-merge)
if (identical(sum(is.na(merged_df3$OR))
              , sum(is.na(merged_df3$pvalue))
              , sum(is.na(merged_df3$AF)))){
  cat('PASS: NA count match for OR, pvalue and AF\n')
  na_count_df3 = sum(is.na(merged_df3$AF))
  cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
} else{
  cat('FAIL: NA count mismatch'
      , '\nNA in OR: ', sum(is.na(merged_df3$OR))
      , '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
      , '\nNA in AF:', sum(is.na(merged_df3$AF)))
}
#=======================================================================
#%% merging without NAs

cat('Begin merging dfs without NAs'
    , '\n===============================================================')

cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
    , '\nlinking col: Mutationinformation'
    , '\nfilename: merged_df2_comp')

#########
# c) merged_df2_comp: same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),]

# sanity check
cat('Checking nrows in merged_df2_comp')
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){
  cat('PASS: No. of rows match'
      , '\nDim of merged_df2_comp: '
      , '\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
      , '\nNo. of rows: ', nrow(merged_df2_comp)
      , '\nNo. of cols: ', ncol(merged_df2_comp))
}else{
  cat('FAIL: No. of rows mismatch'
      , '\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
      , '\nGot no. of rows: ', nrow(merged_df2_comp))
}

#########
# d) merged_df3_comp: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]

cat('Dim of merged_df3_comp: '
    , '\nNo. of rows: ', nrow(merged_df3_comp)
    , '\nNo. of cols: ', ncol(merged_df3_comp))

# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_comp
all.equal(foo, merged_df3_comp)

require(arsenal) # comparedf() comes from the arsenal package
summary(comparedf(foo, merged_df3_comp))

# sanity check
cat('Checking nrows in merged_df3_comp')
if(nrow(merged_df3_comp) == nrow(merged_df3)){
  cat('NO NAs detected in merged_df3 in AF|OR cols'
      , '\nNo. of rows are identical: ', nrow(merged_df3))
} else{
  if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
    cat('PASS: NAs detected in merged_df3 in AF|OR cols'
        , '\nNo. of NAs: ', na_count_df3
        , '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
        , '\nGot no. of rows: ', nrow(merged_df3_comp))
  }
}
#=======================================================================
#*********************
# writing 1 file in the style of a loop: merged_df3
# print(output dir)
#i = 'merged_df3'
#out_filename = paste0(i, '.csv')
#outfile = paste0(outdir, '/', out_filename)

#cat('Writing output file: '
#    ,'\nFilename: ', out_filename
#    ,'\nPath: ', outdir)

#template: write.csv(merged_df3, 'merged_df3.csv')
#write.csv(get(i), outfile, row.names = FALSE)
#cat('Finished writing: ', outfile
#    , '\nNo. of rows: ', nrow(get(i))
#    , '\nNo. of cols: ', ncol(get(i)))

#%% write_output files; all 4 files:
outvars = c('merged_df2'
            , 'merged_df3'
            , 'merged_df2_comp'
            , 'merged_df3_comp')

cat('Writing output files: '
    , '\nPath:', outdir)

for (i in outvars){
  # cat(i, '\n')
  out_filename = paste0(i, '.csv')
  # cat(out_filename, '\n')
  # cat('getting value of variable: ', get(i))
  outfile = paste0(outdir, '/', out_filename)
  # cat('Full output path: ', outfile, '\n')
  cat('Writing output file:'
      , '\nFilename: ', out_filename, '\n')
  write.csv(get(i), outfile, row.names = FALSE)
  cat('Finished writing: ', outfile
      , '\nNo. of rows: ', nrow(get(i))
      , '\nNo. of cols: ', ncol(get(i)), '\n')
}

# alternate way to replace with implicit loop
# FIXME
#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv')))
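
# A working sketch of that implicit loop (assumes outvars and outdir as above;
# not part of the original script):
# invisible(lapply(outvars, function(v)
#   write.csv(get(v), paste0(outdir, '/', v, '.csv'), row.names = FALSE)))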

#=======================================================================
#%% merging mut_pos_occurrence and sample_pos_occurrence
# FIXME
#cat('Merging dfs with positional frequency from mcsm and meta_data'
#    , '\nNcol in mut_pos_occurrence:', ncol(mut_pos_occurrence)
#    , '\nncol in sample_pos_occurrence:', ncol(sample_pos_occurrence)
#    ,'\nlinking col:', intersect(colnames(sample_pos_occurrence), colnames(mut_pos_occurrence))
#    ,'\nfilename: merged_df4')

#merged_df4 = merge(sample_pos_occurrence, mut_pos_occurrence
#                   , by = 'position'
#                   , all = T)

#out_filename4 = 'mut_and_sample_freq.csv'
#outfile4 = paste0(outdir, '/', out_filename4)

#*************************
# clear variables
rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
rm(mut_pos_occurrence, sample_pos_occurrence)
#rm(merged_df4)
#%% end of script
#=======================================================================
@ -1,170 +0,0 @@
#!/home/tanu/anaconda3/envs/ContactMap/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 18 10:10:12 2020

@author: tanu
"""
#=======================================================================
# Task: Read a DSSP file into a data frame and output to a csv file

# Input: '.dssp' i.e. gene-associated .dssp file (output from run_dssp.sh)

# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc.
# based on Tien et al. 2013 (theor.) values

# useful links:
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
# https://en.wikipedia.org/wiki/Relative_accessible_surface_area
#=======================================================================
#%% load packages
import sys, os
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import pprint as pp
import dms_tools2
import dms_tools2.dssp

#=======================================================================#
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'

drug = 'cycloserine'
gene = 'alr'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input from outdir
#=======
indir = datadir + '/' + drug + '/' + 'input'
#1) pdb file
in_filename_pdb = gene.lower() + '_complex' + '.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb filename:', in_filename_pdb
      , '\npath:', indir
      , '\n============================================================')

#2) dssp file
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() + '.dssp'
infile = outdir + '/' + in_filename
print('Input dssp filename:', in_filename
      , '\npath:', outdir
      , '\n============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_dssp.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\npath:', outdir
      , '\nOutfile: ', outfile
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% specify pdb chain as a list. Handy when more than 1 pdb chain
my_chains = ['A']
# my_chains = ['A', 'B'] # for cycloserine

# generate my_chains from dssp
p = PDBParser()
structure = p.get_structure(in_filename_pdb, infile_pdb)
model = structure[0]
dssp = DSSP(model, infile_pdb)

#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])

dssp_chains = []
for num_aa in range(0, len(dssp)):
    # print(num_aa)
    # extract the chain id only and append to a list
    dssp_chains.append(dssp.keys()[num_aa][0])
chainsL = list(set(dssp_chains))

print(chainsL)
# sort the list for sanity (since sets are not ordered)
# this will be required for dssp_df
my_chains = sorted(chainsL)

print('dssp output for'
      , in_filename, 'contains:', len(my_chains)
      , 'chains:\n', my_chains)
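
# Equivalent sketch (illustration only, not from the original): the chain ids
# can also be collected with a set comprehension over the dssp keys:
# my_chains = sorted({k[0] for k in dssp.keys()})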
#%% ====================================================================
# Process dssp output and extract into df (single chain)

#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains)
#dssp_df['chain_dssp'] = my_chains
#pp.pprint(dssp_df)
#=======================================================================
# in case the pdb has > 1 chain and you need to run it for all chains

# initialise an empty df
dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(my_chains))
for chain_id in my_chains:
    print('Chain id:', chain_id)
    dssp_cur = pd.DataFrame()
    dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id)
    #!!!Important!!!
    dssp_cur['chain_id'] = chain_id
    dssp_df = dssp_df.append(dssp_cur)
pp.pprint(dssp_df)

#=====================
# Renaming amino-acid
# and site columns
#=====================
# Rename column (amino_acid) as 'wild_type_dssp' and (site) as 'position'
# to be the same names as used in the file required for merging later.
dssp_df.columns
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
dssp_df.columns

# sanity check
if len(dssp_df) == len(dssp):
    print('PASS: dssp_df has the expected length')
else:
    print('FAIL: length mismatch for dssp_df'
          , '\nexpected length:', len(dssp)
          , '\nGot length:', len(dssp_df)
          , 'Debug please!')

#%% Write output csv file
print('Writing file:', outfile
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n=============================================================')

# write to csv
dssp_df.to_csv(outfile, header = True, index = False)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(dssp_df)
      , '\nNo. of cols:', len(dssp_df.columns)
      , '\n==============================================================')
#%% end of script
#=======================================================================
#%% run dssp to extract chain ids to later process the dssp output into a df
@ -1,36 +0,0 @@
#!/usr/bin/python3
# Initialise a blank 'Data' directory and drug subdirs etc.
# TODO:
# - Read base dir from config file
# - Create eg: '~/git/Data/{original,processed}'
# - Create eg: '~/git/Data/processed/' + drug (for each drug)
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
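
# A minimal sketch of the TODO above (assumptions: the base dir and drug list
# below are illustrative, not part of the original script):
# import os
# base = os.path.expanduser('~/git/Data')
# for d in ['pyrazinamide']:
#     for sub in ['input', 'output/plots', 'output/structure']:
#         os.makedirs(os.path.join(base, d, sub), exist_ok = True)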

#%% specify homedir as python doesn't recognise tilde
import os # needed for os.path.expanduser below
homedir = os.path.expanduser('~')

#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'

#============
# output dir
#============
# several output files
outdir = datadir + '/' + drug + '/' + 'output'

#%% end of variable assignment for input and output files
#==============================================================================
@ -1,193 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for amino acid sequence using the
# Kyte-Doolittle scale.
# Same output as using the expasy server https://web.expasy.org/protscale/
# Input: fasta file

# Output: csv file with the wild-type aa sequence and kd values per position

# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#=======================================================================
#%% load packages
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
#import pprint as pp
import pandas as pd
import numpy as np
import sys, os
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
#indir = 'git/Data/pyrazinamide/input/original'
indir = datadir + '/' + drug + '/' + 'input'
in_filename = '3pl1.fasta.txt'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path:', indir
      , '\n============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_kd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#===================
# calculate KD values: same as the expasy server
#===================
#%% specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html
my_window = 3
offset = round((my_window/2) - 0.5)

fh = open(infile)

for record in SeqIO.parse(fh, 'fasta'):
    id = record.id
    seq = record.seq
    num_residues = len(seq)
fh.close()

sequence = str(seq)

X = ProteinAnalysis(sequence)

kd_values = (X.protein_scale(ProtParamData.kd, window = my_window)) # edge weight is set to default (100%)

# sanity checks
print('Sequence Length:', num_residues)
print('kd_values Length:', len(kd_values))
print('Window Length:', my_window)
print('Window Offset:', offset)
print('=================================================================')
print('Checking: len(kd values) is as expected for the given window size & offset...')
expected_length = num_residues - (my_window - offset)
if len(kd_values) == expected_length:
    print('PASS: expected and actual length of kd values match')
else:
    print('FAIL: length mismatch'
          , '\nExpected length:', expected_length
          , '\nActual length:', len(kd_values)
          , '\n=========================================================')
#===================
# creating two dfs
#===================
#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df
# which will allow easy merging of the two dfs.

# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
# Name the wt column 'wild_type_kd' to be the same name used in the file required for merging later.
dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive

# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
dfVals = pd.DataFrame({'kd_values':kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
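
# Worked example (illustration only, not from the original): with my_window = 3
# and offset = 1, a 10-residue sequence yields 10 - 3 + 1 = 8 kd values,
# indexed here as positions 2..9; positions 1 and 10 carry no value and
# become NaN after the concat below.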

# sanity checks
max(dfVals['kd_values'])
min(dfVals['kd_values'])

#===================
# concatenating dfs
#===================
# Merge the two on index
# (as these are now reflective of the aa position numbers): df1 and df2
# This will introduce NaN where there are missing values. In our case this
# will be 2 (the first and last ones, based on window size and offset).
# For pnca: the last position is not part of the struc, so no info loss.
# Needless to say, this will vary for other targets.

kd_df = pd.concat([dfSeq, dfVals], axis = 1)

#============================
# renaming index to position
#============================
kd_df = kd_df.rename_axis('position')
kd_df.head()
print('=================================================================')

print('position col i.e. index should be numeric'
      , '\n===============================================================')

if kd_df.index.dtype == 'int64':
    print('PASS: position col is numeric'
          , '\ndtype is:', kd_df.index.dtype)
else:
    print('FAIL: position col is not numeric'
          , '\nConverting to numeric')
    kd_df.index = kd_df.index.astype('int64')
    print('Checking dtype after conversion:\n'
          , '\ndtype is:', kd_df.index.dtype
          , '\n=========================================================')
#===============
# writing file
#===============
print('Writing file:', outfile
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n=============================================================')

kd_df.to_csv(outfile, header = True, index = True)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(kd_df)
      , '\nNo. of cols:', len(kd_df.columns)
      , '\n=============================================================')

#===============
# plot: optional!
#===============
#%% plot
# http://www.dalkescientific.com/writings/NBN/plotting.html

# FIXME: save fig
# extract just the pdb id from 'id' to pass to the plot title
# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
plot(kd_values, linewidth = 1.0)
#axis(xmin = 1, xmax = num_residues)
xlabel('Residue Number')
ylabel('Hydrophobicity')
title('K&D Hydrophobicity for ' + id)
show()
print('======================================================================')
#%% end of script
#=======================================================================
@ -1,167 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''

# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK: calculate how many mutations result in
# electrostatic changes wrt wt

# Input: mcsm and AF_OR file

# Output: mut_elec_changes.txt
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
# merged_df3.csv lives in the output dir (written by the combining script)
indir = datadir + '/' + drug + '/' + 'output'
in_filename = 'merged_df3.csv'
infile = indir + '/' + in_filename
print('Input filename: ', in_filename
      , '\nInput path: ', indir
      , '\n============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
# specify output file
out_filename = 'mut_elec_changes.txt'
outfile = outdir + '/' + out_filename
print('Output filename: ', out_filename
      , '\nOutput path: ', outdir
      , '\n============================================================')

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files
print('Reading input file (merged file):', infile)

comb_df = pd.read_csv(infile, sep = ',')

print('Input filename: ', in_filename
      , '\nPath :', indir
      , '\nNo. of rows: ', len(comb_df)
      , '\nNo. of cols: ', len(comb_df.columns)
      , '\n============================================================')

# column names
list(comb_df.columns)

# clear variables
del(in_filename, infile)

#%% subset unique mutations
df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first')

total_muts = df.Mutationinformation.nunique()
#df.Mutationinformation.count()
print('Total mutations associated with structure: ', total_muts
      , '\n===============================================================')

#%% combine aa_calcprop cols so that you can count the changes as value_counts
# check if all muts have been categorised
print('Checking if all muts have been categorised: ')
if (df['wt_calcprop'].isna().sum() == 0) and (df['mut_calcprop'].isna().sum() == 0):
    print('PASS: no NAs detected i.e. all muts have an aa prop associated'
          , '\n===============================================================')
else:
    print('FAIL: NAs detected i.e. some muts remain unclassified'
          , '\n===============================================================')

df['wt_calcprop'].head()
df['mut_calcprop'].head()

print('Combining wt_calcprop and mut_calcprop...')
#df['aa_calcprop_combined'] = df['wt_calcprop'] + '->' + df['mut_calcprop']
df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->')
df['aa_calcprop_combined'].head()

mut_categ = df["aa_calcprop_combined"].unique()
print('Total no. of aa_calc properties: ', len(mut_categ))
print('Categories are: ', mut_categ)

# counting no. of muts in each mut categ

# way1: count values within each combination
df.groupby('aa_calcprop_combined').size()
#df.groupby('aa_calcprop_combined').count()

# way2: count values within each combination
df['aa_calcprop_combined'].value_counts()

# comment: the two ways should be identical
# groupby result order is similar to pivot table order,
# I prefer the value_counts look
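
# Sanity sketch (illustration only, not from the original): the two counts
# should agree irrespective of ordering, since .eq() aligns on the index:
# df.groupby('aa_calcprop_combined').size().eq(
#     df['aa_calcprop_combined'].value_counts()).all()   # expect True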

# assign to variable: count values within each combination
all_prop = df['aa_calcprop_combined'].value_counts()

# convert to a df from Series
ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values})

# subset df to contain only the changes in prop
all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg', 'non-polar->non-polar', 'polar->polar', 'pos->pos']) == False]

elec_count = all_prop_change.mut_count.sum()
print('Total no. of muts with elec changes: ', elec_count)

# calculate percentage of electrostatic changes
elec_changes = (elec_count/total_muts) * 100

print('Total number of electrostatic changes resulting from mutation is (%):', elec_changes)

# check no change muts
no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg', 'non-polar->non-polar', 'polar->polar', 'pos->pos']) == True]

no_change_muts.mut_count.sum()

#%% output from console
#sys.stdout = open(file, 'w')
sys.stdout = open(outfile, 'w')

print('======================\n'
      , 'Unchanged muts'
      , '\n=====================\n'
      , no_change_muts
      , '\n=============================\n'
      , 'Muts with changed prop:'
      , '\n============================\n'
      , all_prop_change)

print('================================================================='
      , '\nTotal number of electrostatic changes resulting from mutation is (%):', elec_changes
      , '\nTotal no. of muts: ', total_muts
      , '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
      , '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum()
      , '\n===================================================================')
#%% end of script
#=======================================================================
@ -1,179 +0,0 @@
#!/usr/bin/python
from __future__ import print_function
from Bio import SeqIO
from Bio.Seq import Seq
from collections import OrderedDict
import sys
import argparse
# https://github.com/jrjhealey/bioinfo-tools/blob/master/Mutate.py
# https://www.biostars.org/p/336891/

# TODO:
# - create some logic to 'group' mutations that will be applied to the same sequence, to
#   make all switches at once (see the sketch below)
# - This will also probably break the verbose transversion output so the maths will need replacing
# - Create the ability to support INDELS (will also require pairwise alignment so that
#   hamming distances remain meaningful).
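
# A possible grouping sketch for the first TODO (an assumption, not implemented
# here): collect all changes per sequence id so they can be applied in one
# pass over a single mutable sequence, e.g.
#   from collections import defaultdict
#   grouped = defaultdict(list)
#   for k, v in mut_dict.items():
#       grouped[k.name].append(v)
#   # then apply each list of changes to one MutableSeq before writing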


def get_args():
    """Parse command line arguments"""
    desc = "Mutate fasta sequences based on a file of sequence mappings."
    epi = (
        "This script takes a mapfile of the form:\n"
        "  SequenceID,A123B\n"
        "  SequenceID,X456Y\n"
        "And performs substitutions/mutations. At present it only does one SNP per sequence.\n"
    )

    try:
        parser = argparse.ArgumentParser(
            description=desc, epilog=epi, formatter_class=argparse.RawTextHelpFormatter
        )
        parser.add_argument(
            "mutation_file",
            action="store",
            help='File of mutation mappings like so: "SeqID,X123Y"',
        )
        parser.add_argument(
            "sequences",
            action="store",
            help="File of sequences to be mutated (fasta only).",
        )
        parser.add_argument(
            "-v",
            "--verbose",
            action="store_true",
            help="Verbose behaviour, printing parameters of the script.",
        )
        parser.add_argument(
            "-o",
            "--outfile",
            action="store",
            help="Output file for mutated sequences (default STDOUT).",
        )
        if len(sys.argv) == 1:
            parser.print_help(sys.stderr)
            exit(1)
    except:
        sys.stderr.write(
            "An exception occurred with argument parsing. Check your provided options.\n"
        )

    return parser.parse_args()


class Mutation(object):
    """A class wrapper for sequence IDs so that duplicate IDs can be used in a dictionary"""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return "'" + self.name + "'"

    def __str__(self):
        return self.name


def parse_mapfile(mapfile):
    """Return a dict of mapped mutations.
    File should resemble:
      SequenceID,A123B
      SequenceID2,X234Y
    Sequence IDs should exactly match the fasta headers, as parsed by BioPython.
    (">" symbols are optional)
    """

    with open(mapfile, "r") as handle:
        mut_dict = OrderedDict()
        for line in handle:
            id, change = line.lstrip(">").rstrip("\n").split(",")
            mut_dict[Mutation(id)] = change

    for k, v in mut_dict.items():
        assert v[0].isalpha(), (
            "First character of mutation map is not a valid letter. Got: %s" % v[0]
        )
        assert v[-1].isalpha(), (
            "Last character of mutation map is not a valid letter. Got: %s" % v[-1]
        )
        assert v[1:-1].isdigit(), (
            "Location string of mutation map is not a valid number. Got: %s" % v[1:-1]
        )

    return mut_dict


def morph(orig, loc, new, mutableseq, verbose):
    """Perform actual sequence change (polymorphism only at present)"""
    # Shift location to offset 0-based index
    loc = loc - 1
    assert mutableseq[loc] == orig, (
        "Sequence does not match the mutation file for pre-existing residue. Expected %s , got %s "
        % (orig, mutableseq[loc])
    )
    if verbose is True:
        print(
            "Performing change: {} -> {}, at location: {} (0 based)".format(
                orig, new, loc
            )
        )
    mutableseq[loc] = new
    return mutableseq


def hamming_distance(s1, s2):
    """Return the Hamming distance between equal-length sequences"""
    if len(s1) != len(s2):
        raise ValueError("Undefined for sequences of unequal length")
    return sum(ch1 != ch2 for ch1, ch2 in zip(s1.upper(), s2.upper()))


def main():
    args = get_args()
    if args.outfile is not None:
        ofh = open(args.outfile, "w")

    # Parse the mutation file (get mutations by sequence)
    mutations = parse_mapfile(args.mutation_file)
    if args.verbose is True:
        print("Got mutations:")
        print(mutations)
    # Iterate all sequences and make any substitutions necessary
    for record in SeqIO.parse(args.sequences, "fasta"):
        for k, v in mutations.items():
            mutable = record.seq.upper().tomutable()
            # mutable = record.seq.tomutable()
            # print("MO:", mutable)
            if k.name == record.id[0:4]:  # BEWARE HARDCODING
                # print("k.name:", k.name, "record.id:", record.id)
                orig = v[0]
                print("orig:", orig)
                new = v[-1]
                loc = int(v[1:-1])
                if args.verbose:
                    print(record.id)
                newseq = morph(orig, loc, new, mutable, args.verbose)
                # print("NS is:", newseq)
                if args.verbose is True:
                    print("Original: " + record.seq.upper())
                    print(
                        str((" " * int(loc - 2 + 11))) + "V"
                    )  # Padded string to show where the switch happened (not sure how it'll deal with line wrapping)
                    print("New: " + newseq)
                    print(
                        "Distance: "
                        + str(hamming_distance(str(record.seq), str(newseq)))
                    )
                if args.outfile is not None:
                    ofh.write(">%s_%s\n%s\n" % (record.id, v, newseq))
                if args.verbose is False:
                    print(">%s_%s\n%s\n" % (record.id, v, newseq))

    if args.outfile is not None:
        ofh.close()


if __name__ == "__main__":
    main()
@ -1,135 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=============================================================================
# Task: Residue depth (rd) processing to generate a df with residue_depth (rd)
# values

# FIXME
# Input: '.tsv' i.e. residue depth txt file (output from .zip file manually
# downloaded from the website).
# This should be integrated into the pipeline

# Output: .csv with 3 cols i.e. position, rd_values & 3-letter wt aa code (caps)
#=============================================================================
#%% load packages
import sys, os
import pandas as pd
#import numpy as np
#=============================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=============================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
#indir = 'git/Data/pyrazinamide/input/original'
indir = datadir + '/' + drug + '/' + 'output'
in_filename = '3pl1_rd.tsv'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path:', indir
      , '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_rd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')

#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input file
rd_data = pd.read_csv(infile, sep = '\t')
print('Reading input file:', infile
      , '\nNo. of rows:', len(rd_data)
      , '\nNo. of cols:', len(rd_data.columns))

print('Column names:', rd_data.columns
      , '\n===============================================================')
#========================
# creating position col
#========================
# Extracting residue number from index and assigning
# the values to a column [position]. Then convert the position col to numeric.
rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values
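
# Illustration (an assumption about the index format, not from the original):
# an index entry such as 'A:SER1' (chain:residue) yields '1' here, since the
# regex '([0-9]+)' pulls out the first run of digits; it is converted to a
# numeric column just below.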

# converting position to numeric
rd_data['position'] = pd.to_numeric(rd_data['position'])
rd_data['position'].dtype

print('Extracted residue num from index and assigned as a column:'
      , '\ncolumn name: position'
      , '\ntotal no. of cols now:', len(rd_data.columns)
      , '\n=============================================================')

#========================
# Renaming amino-acid
# and all-atom cols
#========================
print('Renaming columns:'
      , '\ncolname==> # chain:residue: wt_3letter_caps'
      , '\nYES... the column name *actually* contains a # ..!'
      , '\ncolname==> all-atom: rd_values'
      , '\n=============================================================')

rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
print('Column names:', rd_data.columns)

#========================
# extracting df with the
# desired columns
#========================
print('Extracting relevant columns for writing df as csv')

rd_df = rd_data[['position','rd_values','wt_3letter_caps']]

if len(rd_df) == len(rd_data):
    print('PASS: extracted df has expected no. of rows'
          , '\nExtracted df dim:'
          , '\nNo. of rows:', len(rd_df)
          , '\nNo. of cols:', len(rd_df.columns))
else:
    print('FAIL: no. of rows mismatch'
          , '\nExpected no. of rows:', len(rd_data)
          , '\nGot no. of rows:', len(rd_df)
          , '\n=========================================================')

#%% write file
print('Writing file:'
      , '\nFilename:', out_filename
      , '\nPath:', outdir
      , '\n=============================================================')

rd_df.to_csv(outfile, header = True, index = False)

print('Finished writing:', out_filename
      , '\nNo. of rows:', len(rd_df)
      , '\nNo. of cols:', len(rd_df.columns)
      , '\n=============================================================')

#%% end of script
#=======================================================================
@ -1,18 +0,0 @@
#!/bin/bash

#https://www.biostars.org/p/336891/
#python Mutate.py -v -o /path/to/output.fasta mutation_map_file.csv input.fasta

# pnca_all_muts_msa_FIXME: This should be formatted like this from the python script
# change to a cmd script that takes this "prefix" as an input
for i in $(cat pnca_all_muts_msa_FIXME.csv); do echo "3PL1,${i}"; done > pnca_copy.txt

# make sure there is no new line at the end of the mutation file (snps.csv)
#python3 Mutate.py -v -o /home/tanu/git/Data/pyrazinamide/input/output.fasta mut_map.csv 3pl1.fasta.txt
python3 mutate.py -v -o /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt /home/tanu/git/Data/pyrazinamide/output/pnca_all_muts_msa.csv /home/tanu/git/Data/pyrazinamide/input/pnca_fasta.txt

# remove fasta style header lines in the output i.e.
# lines beginning with '>' so the file is just the mutated seqs
sed -i '/^>.*$/d' /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
printf 'No. of lines after cleaning: '
wc -l < /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
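
# Optional sanity sketch (an assumption, not part of the original workflow):
# each mapped mutation should yield one mutated sequence, so these two counts
# are expected to match:
# wc -l < /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
# wc -l < /home/tanu/git/Data/pyrazinamide/output/pnca_all_muts_msa.csv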
@ -1,54 +0,0 @@
#!/bin/bash
#=======================================================================
# Task: read a pdb file and generate a dssp output file
# Input:
# pdb file

# Output:
# pdb_code.dssp
# needs dssp exe on linux
# more efficient to run dssp exe locally

# note: double quotes for variable interpolation
#=======================================================================
#%% specify variables for input and output paths and filenames
drug='pyrazinamide'
gene='pncA'
# convert to lowercase for consistency in filenames
gene_l=$(printf $gene | awk '{print tolower($0)}')
gene_match="${gene}_p."
#printf "${gene_match}\n"

#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
#datadir="~git/Data"
datadir="${HOME}/git/Data"

#=======
# input
#=======
indir=${datadir}'/'${drug}'/input'
printf "Input dir: ${indir}\n"
in_filename='3pl1.pdb'
#infile=${basedir}${inpath}${in_filename}
infile=${indir}'/'${in_filename}
printf "Input file: ${infile}\n"

#=======
# output
#=======
outdir=${datadir}'/'${drug}'/output'
printf "Output dir: ${outdir}\n"
#out_filename='3pl1.dssp'
out_filename="${gene_l}.dssp"
outfile=${outdir}'/'${out_filename}
printf "Output file: ${outfile}\n"

#%% end of variable assignment for input and output files
#================================================================
# command line call to run dssp and create the output file
dssp -i ${infile} -o ${outfile}
printf "Finished writing: ${outfile}\n"
#printf "Filename: ${out_filename}\nlocated in: ${outdir}\n"
@ -1,221 +0,0 @@
#=======================================================================
# Task: To generate a logo plot or bar plot coloured by
# aa properties.
# step1: read mcsm file and OR file
# step2: plot wild type positions
# step3: plot mutants per position coloured by aa properties
# step4: make the size of the letters/bars prop to OR if you can!

# useful links
# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
# https://omarwagih.github.io/ggseqlogo/
# https://kkdey.github.io/Logolas-pages/workflow.html
# A new sequence logo plot to highlight enrichment and depletion.
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/

# very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/plotting_test/')
getwd()
#=======================================================================
#%% load packages

# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)

#===========
# data dir
#===========
datadir = paste0('~/git/Data')

#===========
# input
#===========
# source R script 'combining_df_ps.R'
#indir = paste0(datadir, '/', drug, '/', 'output') # reading files
indir = '../meta_data_analysis' # sourcing R script
in_filename = 'combining_df_ps.R'
infile = paste0(indir, '/', in_filename)
cat(paste0('Input is an R script: ', '\'', infile, '\'')
    , '\n========================================================')

#===========
# output
#===========
# 1) lineage dist of all muts
outdir = paste0('~/git/Data', '/', drug, '/', 'output/plots') #same as indir
#cat('Output dir: ', outdir, '\n')
#file_type = '.svg'
#out_filename1 = paste0(tolower(gene), '_lineage_dist_ps', file_type)
#outfile1 = paste0(outdir, '/', out_filename1)
#cat(paste0('Output plot1 :', outfile1)
#    , '\n========================================================')

#%% end of variable assignment for input and output files
#=======================================================================
##%% read input file
cat('Reading input file (sourcing R script):', in_filename)

source(infile)

#==========================
# This will return:

# df with NA for pyrazinamide:
# merged_df2
# merged_df3

# df without NA for pyrazinamide:
# merged_df2_comp
# merged_df3_comp
#===========================

###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is a one-many relationship
# i.e. the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA

# This will be the first plotting df
# Then subset this to extract dr muts only (second plotting df)
###########################

#%%%%%%%%%%%%%%%%%%%%%%%%%
# uncomment as necessary
# REASSIGNMENT
#my_data = merged_df2
#my_data = merged_df2_comp
#my_data = merged_df3
my_data = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%%%

# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)

# quick checks
colnames(my_data)
str(my_data)

c1 = unique(my_data$Position)
nrow(my_data)
cat('No. of rows in my_data:', nrow(my_data)
    , '\nDistinct positions corresponding to snps:', length(c1)
    , '\n===========================================================')

#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME: Think and decide what you want to remove
# mut_pos_occurrence < 1 or sample_pos_occurrence < 1

# get freq count of positions so you can subset freq < 1
require(data.table)
#setDT(my_data)[, mut_pos_occurrence := .N, by = .(Position)] #265, 14

#extract freq_pos > 1
#my_data_snp = my_data[my_data$occurrence!=1,]

#u = unique(my_data_snp$Position) #73
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT to prevent changing code
my_data_snp = my_data
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#=======================================================================
#%% logo plots from dataframe

#############
# PLOTS: ggseqlogo with custom height
# https://omarwagih.github.io/ggseqlogo/
#############
#require(ggplot2)
require(tidyverse) # for %>% and spread() below; may already come via my_header.R
library(ggseqlogo)

foo = my_data_snp[, c("Position", "Mutant_type", "ratioDUET", "OR"
                      , "mut_prop_polarity", "mut_prop_water")]

# log10OR
# FIXME: at the source script (when calculating AFandOR)
my_data_snp$log10or = log10(my_data_snp$OR)
my_data_snp$logor = log(my_data_snp$OR) # natural log, so 'bar' below has both
bar = my_data_snp[, c('Position', 'Mutant_type', 'OR', 'logor', 'log10or')]

bar_or = my_data_snp[, c('Position', 'Mutant_type', 'OR')]
wide_df_or <- bar_or %>% spread(Position, OR, fill = 0)
wide_df_or = as.matrix(wide_df_or)
rownames(wide_df_or) = wide_df_or[,1]
wide_df_or = wide_df_or[,-1]
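
# Toy sketch of the reshape above (illustration only, not from the original):
# spread() turns the long (Position, Mutant_type, OR) table into one column
# per Position with Mutant_type as rows and OR as cell values (0 where a
# mutant was not seen), i.e. the matrix layout ggseqlogo's method = 'custom'
# expects, with amino acid letters as rownames.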

# custom height (OR) logo plot: yayy works
ggseqlogo(wide_df_or, method='custom', seq_type='aa') + ylab('my custom height') +
  theme(legend.position = "bottom"
        , axis.text.x = element_text(size = 11
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
        , axis.text.y = element_text(size = 15
                                     , angle = 0
                                     , hjust = 1
                                     , vjust = 0)) +
  labs(title = "AA logo plot"
       , x = "Wild-type Position"
       , y = "OR")
#%% end of logo plot with OR as height
#=======================================================================
# extracting data with log10OR
bar_logor = my_data_snp[, c('Position', 'Mutant_type', 'log10or')]
wide_df_logor <- bar_logor %>% spread(Position, log10or, fill = 0)

wide_df_logor = as.matrix(wide_df_logor)
rownames(wide_df_logor) = wide_df_logor[,1]
wide_df_logor = wide_df_logor[,-1]

# custom height (log10OR) logo plot: yayy works
ggseqlogo(wide_df_logor, method='custom', seq_type='aa') + ylab('my custom height') +
  theme(legend.position = "bottom"
        , axis.text.x = element_text(size = 11
                                     , angle = 90
                                     , hjust = 1
                                     , vjust = 0.4)
        , axis.text.y = element_text(size = 15
                                     , angle = 0
                                     , hjust = 1
                                     , vjust = 0)) +
  labs(title = "AA logo plot"
       , x = "Wild-type Position"
       , y = "Log10(OR)")

#=======================================================================
#%% logo plot from sequence

#################
# Plot: LOGOLAS (ED plots)
# link: https://github.com/kkdey/Logolas
# on all pncA samples: output of mutate.py
################
library(Logolas)

seqs = read.csv('~/git/Data/pyrazinamide/output/pnca_msa.txt'
                , header = FALSE
                , stringsAsFactors = FALSE)$V1

# my_data: useful!
logomaker(seqs, type = "EDLogo", color_type = 'per_symbol'
          , return_heights = TRUE)
logomaker(seqs, type = "Logo", color_type = 'per_symbol')

#%% end of script
#=======================================================================