updated gitignore to tidyup

This commit is contained in:
Tanushree Tunstall 2020-09-21 17:54:54 +01:00
parent 1cf1f4e70e
commit 2c013124ad
14 changed files with 1 additions and 2677 deletions

1
.gitignore vendored
View file

@ -6,6 +6,7 @@
__pycache__
*/__pycache__
mcsm_analysis_fixme
meta_data_analysis
del
examples
example

View file

@ -1,398 +0,0 @@
#########################################################
# TASK: To calculate Allele Frequency and
# Odds Ratio from master data
# and add the calculated params to meta_data extracted from
# data_extraction.py
#########################################################
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis')
getwd()
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#===========
# input
#===========
# infile1: Raw data
#indir = 'git/Data/pyrazinamide/input/original'
indir = paste0('~/git/Data')
in_filename = 'original_tanushree_data_v2.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: raw data', ' ', infile) )
# infile2: gene associated meta data file to extract valid snps and add calcs to.
# This is outfile3 from data_extraction.py
indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output')
in_filename_metadata = 'pnca_metadata.csv'
infile_metadata = paste0(indir_metadata, '/', in_filename_metadata)
cat(paste0('Reading infile2: gene associated metadata:', infile_metadata))
#===========
# output
#===========
# outdir = 'git/Data/pyrazinamide/output'
outdir = paste0('~/git/Data', '/', drug, '/', 'output')
out_filename = paste0(tolower(gene),'_', 'meta_data_with_AFandOR.csv')
outfile = paste0(outdir, '/', out_filename)
cat(paste0('Output file with full path:', outfile))
#%% end of variable assignment for input and output files
#########################################################
# 1: Read master/raw data stored in Data/
#########################################################
raw_data_all = read.csv(infile, stringsAsFactors = F)
raw_data = raw_data_all[,c("id"
, "pyrazinamide"
, "dr_mutations_pyrazinamide"
, "other_mutations_pyrazinamide")]
rm(raw_data_all)
rm(indir, in_filename, infile)
#===========
# 1a: exclude na
#===========
raw_data = raw_data[!is.na(raw_data$pyrazinamide),]
total_samples = length(unique(raw_data$id))
cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples))
# sanity check: should be true
is.numeric(total_samples)
#===========
# 1b: combine the two mutation columns
#===========
raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide
, raw_data$other_mutations_pyrazinamide)
head(raw_data$all_mutations_pyrazinamide)
#===========
# 1c: create yet another column that contains all the mutations but in lower case
#===========
raw_data$all_muts_pnca = tolower(raw_data$all_mutations_pyrazinamide)
# sanity checks
#table(grepl("pnca_p",raw_data$all_muts_pnca))
cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase'))
gene_match = tolower(gene_match)
table(grepl(gene_match,raw_data$all_muts_pnca))
# sanity check: should be TRUE
#sum(table(grepl("pnca_p",raw_data$all_muts_pnca))) == total_samples
sum(table(grepl(gene_match,raw_data$all_muts_pnca))) == total_samples
# set up variables: can be used for logistic regression as well
i = "pnca_p.ala134gly" # has a NA, should NOT exist
table(grepl(i,raw_data$all_muts_pnca))
i = "pnca_p.trp68gly"
table(grepl(i,raw_data$all_muts_pnca))
mut = grepl(i,raw_data$all_muts_pnca)
dst = raw_data$pyrazinamide
table(mut, dst)
#chisq.test(table(mut,dst))
#fisher.test(table(mut, dst))
#table(mut)
#########################################################
# 2: Read valid snps for which OR
# can be calculated (infile_comp_snps.csv)
#########################################################
cat(paste0('Reading metadata infile:', infile_metadata))
pnca_metadata = read.csv(infile_metadata
# , file.choose()
, stringsAsFactors = F
, header = T)
# clear variables
rm(indir, in_filename, infile)
rm(indir_metadata, in_filename_metadata, infile_metadata)
# count na in pyrazinamide column
tot_pza_na = sum(is.na(pnca_metadata$pyrazinamide))
expected_rows = nrow(pnca_metadata) - tot_pza_na
# drop na from the pyrazinamide colum
pnca_snps_or = pnca_metadata[!is.na(pnca_metadata$pyrazinamide),]
# sanity check
if(nrow(pnca_snps_or) == expected_rows){
cat('PASS: no. of rows match with expected_rows')
} else{
cat('FAIL: nrows mismatch.')
}
# extract unique snps to iterate over for AF and OR calcs
pnca_snps_unique = unique(pnca_snps_or$mutation)
cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(pnca_snps_unique)))
# Define OR function
x = as.numeric(mut)
y = dst
or = function(x,y){
tab = as.matrix(table(x,y))
a = tab[2,2]
if (a==0){ a<-0.5}
b = tab[2,1]
if (b==0){ b<-0.5}
c = tab[1,2]
if (c==0){ c<-0.5}
d = tab[1,1]
if (d==0){ d<-0.5}
(a/b)/(c/d)
}
dst = raw_data$pyrazinamide
ors = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
or(mut,dst)
})
ors
pvals = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
fisher.test(mut,dst)$p.value
})
pvals
afs = sapply(pnca_snps_unique,function(m){
mut = grepl(m,raw_data$all_muts_pnca)
mean(mut)
})
afs
# check ..hmmm
afs['pnca_p.trp68gly']
afs['pnca_p.gln10pro']
afs['pnca_p.leu4ser']
plot(density(log(ors)))
plot(-log10(pvals))
hist(log(ors)
, breaks = 100
)
# sanity check
if (table(names(ors) == names(pvals)) & table(names(ors) == names(afs)) & table(names(pvals) == names(afs)) == length(pnca_snps_unique)){
cat('PASS: names of ors, pvals and afs match: proceed with combining into a single df')
} else{
cat('FAIL: names of ors, pvals and afs mismatch')
}
# combine ors, pvals and afs
cat('Combining calculated params into a df: ors, pvals and afs')
comb_AF_and_OR = data.frame(ors, pvals, afs)
cat('No. of rows in comb_AF_and_OR: ', nrow(comb_AF_and_OR)
, '\nNo. of cols in comb_AF_and_OR: ', ncol(comb_AF_and_OR))
cat('Rownames == mutation: ', head(rownames(comb_AF_and_OR)))
# add rownames of comb_AF_and_OR as an extra column 'mutation' to allow merging based on this column
comb_AF_and_OR$mutation = rownames(comb_AF_and_OR)
# sanity check
if (table(rownames(comb_AF_and_OR) == comb_AF_and_OR$mutation)){
cat('PASS: rownames and mutaion col values match')
}else{
cat('FAIL: rownames and mutation col values mismatch')
}
#########################################################
# 3: Merge meta data file + calculated num params
#########################################################
df1 = pnca_metadata
df2 = comb_AF_and_OR
cat('checking commom col of the two dfs before merging:'
,'\ndf1:', head(df1$mutation)
, '\ndf2:', head(df2$mutation))
cat(paste0('merging two dfs: '
,'\ndf1 (big df i.e. meta data) nrows: ', nrow(df1)
,'\ndf2 (small df i.e af, or, pval) nrows: ', nrow(df2)
,'\nexpected rows in merged df: ', nrow(df1)
,'\nexpected cols in merged_df: ', (ncol(df1) + ncol(df2) - 1)))
merged_df = merge(df1 # big file
, df2 # small (afor file)
, by = "mutation"
, all.x = T) # because you want all the entries of the meta data
# sanity check
if(ncol(merged_df) == (ncol(df1) + ncol(df2) - 1)){
cat(paste0('PASS: no. of cols is as expected: ', ncol(merged_df)))
} else{
cat('FAIL: no.of cols mistmatch')
}
# quick check
i = "pnca_p.ala134gly" # has all NAs in pyrazinamide, should be NA in ors, etc.
merged_df[merged_df$mutation == i,]
# count na in each column
na_count = sapply(merged_df, function(y) sum(length(which(is.na(y))))); na_count
# check last three cols: should be NA
if ( identical(na_count[[length(na_count)]], na_count[[length(na_count)-1]], na_count[[length(na_count)-2]])){
cat('PASS: No. of NAs for OR, AF and Pvals are equal as expected',
'\nNo. of NA: ', na_count[[length(na_count)]])
} else {
cat('FAIL: No. of NAs for OR, AF and Pvals mismatch')
}
# reassign custom colnames
cat('Assigning custom colnames for the calculated params...')
colnames(merged_df)[colnames(merged_df)== "ors"] <- "OR"
colnames(merged_df)[colnames(merged_df)== "pvals"] <- "pvalue"
colnames(merged_df)[colnames(merged_df)== "afs"] <- "AF"
colnames(merged_df)
# add 3 more cols: log OR, neglog pvalue and AF_percent cols
merged_df$logor = log(merged_df$OR)
is.numeric(merged_df$logor)
merged_df$neglog10pvalue = -log10(merged_df$pvalue)
is.numeric(merged_df$neglog10pvalue)
merged_df$AF_percent = merged_df$AF*100
is.numeric(merged_df$AF_percent)
# check AFs
#i = 'pnca_p.trp68gly'
i = 'pnca_p.gln10pro'
#i = 'pnca_p.leu4ser'
merged_df[merged_df$mutation == i,]
# FIXME: harcoding (beware!), NOT FATAL though!
ncol_added = 3
cat(paste0('Added', ' ', ncol_added, ' more cols to merged_df:'
, '\ncols added: logor, neglog10pvalue and AF_percent:'
, '\nno. of cols in merged_df now: ', ncol(merged_df)))
#%% write file out: pnca_meta_data_with_AFandOR
#*********************************************
cat(paste0('writing output file: '
, '\nFilename: ', out_filename
, '\nPath:', outdir))
write.csv(merged_df, outfile
, row.names = F)
cat(paste0('Finished writing:'
, out_filename
, '\nNo. of rows: ', nrow(merged_df)
, '\nNo. of cols: ', ncol(merged_df)))
#************************************************
cat('======================================================================')
rm(out_filename)
cat('End of script: calculated AF, OR, pvalues and saved file')
# End of script
#%%
# sanity check: Count NA in these four cols.
# However these need to be numeric else these
# will be misleading when counting NAs (i.e retrun 0)
#is.numeric(meta_with_afor$OR)
na_var = c('AF', 'OR', 'pvalue', 'logor', 'neglog10pvalue')
# loop through these vars and check if these are numeric.
# if not, then convert to numeric
check_all = NULL
for (i in na_var){
# cat(i)
check0 = is.numeric(meta_with_afor[,i])
if (check0) {
check_all = c(check0, check_all)
cat('These are all numeric cols')
} else{
cat('First converting to numeric')
check0 = as.numeric(meta_with_afor[,i])
check_all = c(check0, check_all)
}
}
# count na now that the respective cols are numeric
na_count = sapply(meta_with_afor, function(y) sum(length(which(is.na(y))))); na_count
str(na_count)
# extract how many NAs:
# should be all TRUE
# should be a single number since
# all the cols should have 'equal' and 'same' no. of NAs
# compare if the No of 'NA' are the same for all these cols
na_len = NULL
for (i in na_var){
temp = na_count[[i]]
na_len = c(na_len, temp)
}
cat('Checking how many NAs and if these are identical for the selected cols:')
my_nrows = NULL
for ( i in 1: (length(na_len)-1) ){
# cat(compare(na_len[i]), na_len[i+1])
c = compare(na_len[i], na_len[i+1])
if ( c$result ) {
cat('PASS: No. of NAa in selected cols are identical')
my_nrows = na_len[i] }
else {
cat('FAIL: No. of NAa in selected cols mismatch')
}
}
cat('No. of NAs in each of the selected cols: ', my_nrows)
# yet more sanity checks:
cat('Check whether the ', my_nrows, 'indices are indeed the same')
#which(is.na(meta_with_afor$OR))
# initialise an empty df with nrows as extracted above
na_count_df = data.frame(matrix(vector(mode = 'numeric'
# , length = length(na_var)
)
, nrow = my_nrows
# , ncol = length(na_var)
))
# populate the df with the indices of the cols that are NA
for (i in na_var){
cat(i)
na_i = which(is.na(meta_with_afor[i]))
na_count_df = cbind(na_count_df, na_i)
colnames(na_count_df)[which(na_var == i)] <- i
}
# Now compare these indices to ensure these are the same
check2 = NULL
for ( i in 1: ( length(na_count_df)-1 ) ) {
# cat(na_count_df[i] == na_count_df[i+1])
check_all = identical(na_count_df[[i]], na_count_df[[i+1]])
check2 = c(check_all, check2)
if ( all(check2) ) {
cat('PASS: The indices for AF, OR, etc are all the same\n')
} else {
cat ('FAIL: Please check indices which are NA')
}
}

View file

@ -1,248 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
# FIXME: change filename 4 (mcsm normalised data)
# to be consistent like (pnca_complex_mcsm_norm.csv) : changed manually, but ensure this is done in the mcsm pipeline
#=============================================================================
# Task: combine 4 dfs with aa position as linking column
# This is done in 2 steps:
# merge 1: of 3 dfs (filenames in lowercase)
# <gene.lower()>_dssp.csv
# <gene.lower()>_kd.csv
# <gene.lower()>_pnca_rd.csv
# merge 2: of 2 dfs
# pnca_complex_mcsm_norm.csv (!fix name in mcsm script)
# output df from merge1
# Input: 3 dfs
# <gene.lower()>_dssp.csv
# <gene.lower()>_kd.csv
# <gene.lower()>_pnca_rd.csv
# pnca_complex_mcsm_norm.csv (!fix name in mcsm script)
# Output: .csv of all 4 dfs combined
# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=============================================================================
#%% load packages
import sys, os
import pandas as pd
#import numpy as np
import argparse
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazin')
arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pn') # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
drug = args.drug
gene = args.gene
#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'output'
in_filename1 = 'pnca_dssp.csv'
in_filename2 = 'pnca_kd.csv'
in_filename3 = 'pnca_rd.csv'
#in_filename4 = 'mcsm_complex1_normalised.csv' # Fix name in mcsm script
in_filename4 = 'pnca_complex_mcsm_norm.csv' # manually changed temporarily
infile1 = indir + '/' + in_filename1
infile2 = indir + '/' + in_filename2
infile3 = indir + '/' + in_filename3
infile4 = indir + '/' + in_filename4
print('\nInput path:', indir
, '\nInput filename1:', in_filename1
, '\nInput filename2:', in_filename2
, '\nInput filename3:', in_filename3
, '\nInput filename4:', in_filename4
, '\n===================================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_mcsm_struct_params.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n===================================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input file
dssp_df = pd.read_csv(infile1, sep = ',')
kd_df = pd.read_csv(infile2, sep = ',')
rd_df = pd.read_csv(infile3, sep = ',')
mcsm_df = pd.read_csv(infile4, sep = ',')
print('Reading input files:'
, '\ndssp file:', infile1
, '\nNo. of rows:', len(dssp_df)
, '\nNo. of cols:', len(dssp_df.columns)
, '\nColumn names:', dssp_df.columns
, '\n==================================================================='
, '\nkd file:', infile2
, '\nNo. of rows:', len(kd_df)
, '\nNo. of cols:', len(kd_df.columns)
, '\nColumn names:', kd_df.columns
, '\n==================================================================='
, '\nrd file:', infile3
, '\nNo. of rows:', len(rd_df)
, '\nNo. of cols:', len(rd_df.columns)
, '\nColumn names:', rd_df.columns
, '\n==================================================================='
, '\nrd file:', infile4
, '\nNo. of rows:', len(mcsm_df)
, '\nNo. of cols:', len(mcsm_df.columns)
, '\nColumn names:', mcsm_df.columns
, '\n===================================================================')
#%% Begin combining dfs
#===================
# concatenating df1 (3dfs): dssp_df + kd_df+ rd_df
#===================
print('starting first merge...\n')
# checking no. of rows
print('Checking if no. of rows of the 3 dfs are equal:\n'
, len(dssp_df) == len(kd_df) == len(rd_df)
, '\nReason: fasta files and pdb files vary since not all pos are part of the structure'
, '\n===================================================================')
# variables for sanity checks
expected_rows_df1 = max(len(dssp_df), len(kd_df), len(rd_df))
# beware of harcoding! used for sanity check
ndfs = 3
ncol_merge = 1
offset = ndfs- ncol_merge
expected_cols_df1 = len(dssp_df.columns) + len(kd_df.columns) + len(rd_df.columns) - offset
print('Merge 1:'
, '\ncombining 3dfs by commom col: position'
, '\nExpected nrows in combined_df:', expected_rows_df1
, '\nExpected ncols in combined_df:', expected_cols_df1
, '\nResetting the common col as the index'
, '\n===================================================================')
#dssp_df.set_index('position', inplace = True)
#kd_df.set_index('position', inplace = True)
#rd_df.set_index('position', inplace =True)
#combined_df = pd.concat([dssp_df, kd_df, rd_df], axis = 1, sort = False).reset_index()
#combined_df.rename(columns = {'index':'position'})
combined_df1 = pd.concat(
(my_index.set_index('position') for my_index in [dssp_df, kd_df, rd_df])
, axis = 1, join = 'outer').reset_index()
# sanity check
print('Checking dimensions of concatenated df1...')
if len(combined_df1) == expected_rows_df1 and len(combined_df1.columns) == expected_cols_df1:
print('PASS: combined df has expected dimensions'
, '\nNo. of rows in combined df:', len(combined_df1)
, '\nNo. of cols in combined df:', len(combined_df1.columns)
, '\n===============================================================')
else:
print('FAIL: combined df does not have expected dimensions'
, '\nNo. of rows in combined df:', len(combined_df1)
, '\nNo. of cols in combined df:', len(combined_df1.columns)
, '\n===============================================================')
#===================
# concatenating df2 (2dfs): mcsm_df + combined_df1
# sort sorts the cols
#===================
print('starting second merge...\n')
# rename col 'Position' in mcsm_df to lowercase 'position'
# as it matches the combined_df1 colname to perfom merge
#mcsm_df.columns
#mcsm_df.rename(columns = {'Position':'position'}) # not working!
# copy 'Position' column with the correct colname
print('Firstly, copying \'Position\' col and renaming \'position\' to allow merging'
, '\nNo. of cols before copying: ', len(mcsm_df.columns))
mcsm_df['position'] = mcsm_df['Position']
print('No. of cols after copying: ', len(mcsm_df.columns))
# sanity check
if mcsm_df['position'].equals(mcsm_df['Position']):
print('PASS: Copying worked correctly'
, '\ncopied col matches original column'
, '\n===============================================================')
else:
print('FAIL: copied col does not match original column'
, '\n================================================================')
# variables for sanity checks
expected_rows_df2 = len(mcsm_df)
# beware of harcoding! used for sanity check
ndfs = 2
ncol_merge = 1
offset = ndfs - ncol_merge
expected_cols_df2 = len(mcsm_df.columns) + len(combined_df1.columns) - offset
print('Merge 2:'
, '\ncombining 2dfs by commom col: position'
, '\nExpected nrows in combined_df:', expected_rows_df2
, '\nExpected ncols in combined_df:', expected_cols_df2
, '\n===================================================================')
combined_df2 = mcsm_df.merge(combined_df1, on = 'position')
# sanity check
print('Checking dimensions of concatenated df2...')
if len(combined_df2) == expected_rows_df2 and len(combined_df2.columns) == expected_cols_df2:
print('PASS: combined df2 has expected dimensions'
, '\nNo. of rows in combined df:', len(combined_df2)
, '\nNo. of cols in combined df:', len(combined_df2.columns)
, '\n===============================================================')
else:
print('FAIL: combined df2 does not have expected dimensions'
, '\nNo. of rows in combined df:', len(combined_df2)
, '\nNo. of cols in combined df:', len(combined_df2.columns)
, '\n===============================================================')
#%% write file
print('Writing file:'
, '\nFilename:', out_filename
, '\nPath:', outdir
, '\n===================================================================')
combined_df2.to_csv(outfile, header = True, index = False)
print('Finished writing:', out_filename
, '\nNo. of rows:', len(combined_df2)
, '\nNo. of cols:', len(combined_df2.columns)
, '\n===================================================================')
#%% end of script
#==============================================================================

View file

@ -1,397 +0,0 @@
#=======================================================================
# TASK: To combine mcsm and meta data with af and or
# by filtering for distance to ligand (<10Ang).
# This script doesn't output anything.
# This script is sourced from other .R scripts for plotting ligand plots
# Input csv files:
# 1) mcsm normalised and struct params
# 2) gene associated meta_data_with_AFandOR
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis/')
getwd()
#=======================================================================
#%% load packages
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#%% variable assignment: input and output paths & filenames
#=======================================================================
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#===========
# data dir
#===========
datadir = '~/git/Data'
#===========
# input
#===========
# infile1: mCSM data
#indir = '~/git/Data/pyrazinamide/input/processed/'
indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename = 'pnca_mcsm_struct_params.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: mCSM output file', ' ', infile) )
# infile2: gene associated meta data combined with AF and OR
#indir: same as above
in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv')
infile_comb = paste0(indir, '/', in_filename_comb)
cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb))
#===========
# output
#===========
# Uncomment if and when required to output
outdir = paste0(datadir, '/', drug, '/', 'output') #same as indir
cat('Output dir: ', outdir, '\n')
#out_filename = paste0(tolower(gene), 'XXX')
#outfile = paste0(outdir, '/', out_filename)
#cat(paste0('Output file with full path:', outfile))
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files
#####################################
# input file 1: mcsm normalised data
# output of step 4 mcsm_pipeline
#####################################
cat('Reading mcsm_data:'
, '\nindir: ', indir
, '\ninfile_comb: ', in_filename)
mcsm_data = read.csv(infile
# , row.names = 1
, stringsAsFactors = F
, header = T)
cat('Read mcsm_data file:'
, '\nNo.of rows: ', nrow(mcsm_data)
, '\nNo. of cols:', ncol(mcsm_data))
# clear variables
rm(in_filename, infile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# muts with opposing effects on protomer and ligand stability
# excluded from here as it is redundant.
# check 'combining_two_df.R' to refer if required.
#=======================================================================
#%% !!!Filter data only for mcsm lig!!!
###########################
# Filter/subset data
# Lig plots < 10Ang
# Filter the lig plots for Dis_to_lig < 10Ang
###########################
# check range of distances
max(mcsm_data$Dis_lig_Ang)
min(mcsm_data$Dis_lig_Ang)
# count
table(mcsm_data$Dis_lig_Ang<10)
# subset data to have only values less than 10 Ang
mcsm_data2 = subset(mcsm_data, mcsm_data$Dis_lig_Ang < 10)
# sanity checks
max(mcsm_data2$Dis_lig_Ang)
min(mcsm_data2$Dis_lig_Ang)
# count no of unique positions
length(unique(mcsm_data2$Position))
# count no of unique mutations
length(unique(mcsm_data2$Mutationinformation))
# count Destabilisinga and stabilising
table(mcsm_data2$Lig_outcome) #{RESULT: no of mutations within 10Ang}
#############################
# Extra sanity check:
# for mcsm_lig ONLY
# Dis_lig_Ang should be <10
#############################
if (max(mcsm_data2$Dis_lig_Ang) < 10){
print ("Sanity check passed: lig data is <10Ang")
}else{
print ("Error: data should be filtered to be within 10Ang")
}
#!!!!!!!!!!!!!!!!!!!!!
# REASSIGNMENT: so as not to alter the script
mcsm_data = mcsm_data2
#!!!!!!!!!!!!!!!!!!!!!
#=======================================================================
# clear variables
rm(mcsm_data2)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
orig_col = ncol(mcsm_data)
# get freq count of positions and add to the df
setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(position)]
cat('Added col: position frequency to see which posn has how many muts'
, '\nNo. of cols now', ncol(mcsm_data)
, '\nNo. of cols before: ', orig_col)
mut_pos_occurrence = data.frame(mcsm_data$id, mcsm_data$Position, mcsm_data$mut_pos_occurrence)
######################################
# input file2 meta data with AFandOR
######################################
cat('Reading combined meta data and AFandOR file:'
, '\nindir: ', indir
, '\ninfile_comb: ', in_filename_comb)
meta_with_afor <- read.csv(infile_comb
, stringsAsFactors = F
, header = T)
cat('Read mcsm_data file:'
, '\nNo.of rows: ', nrow(meta_with_afor)
, '\nNo. of cols:', ncol(meta_with_afor))
# clear variables
rm(in_filename_comb, infile_comb)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
orig_col2 = ncol(meta_with_afor)
# get freq count of positions and add to the df
setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(Position)]
cat('Added col: position frequency of samples to check'
,'how many samples correspond to a partiulcar posn associated with muts'
, '\nNo. of cols now', ncol(meta_with_afor)
, '\nNo. of cols before: ', orig_col2)
sample_pos_occurrence = data.frame(meta_with_afor$id, meta_with_afor$position, meta_with_afor$sample_pos_occurrence)
#=======================================================================
cat('Begin merging dfs with NAs'
, '\n===============================================================')
###########################
# merging two dfs: with NA
###########################
# link col name = 'Mutationinforamtion'
cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
,'\nlinking col: Mutationinforamtion'
,'\nfilename: merged_df2')
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
#########
# a) merged_df2: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
, y = mcsm_data
, by = 'Mutationinformation'
, all.y = T)
cat('Dim of merged_df2: '
, '\nNo. of rows: ', nrow(merged_df2)
, '\nNo. of cols: ', ncol(merged_df2))
head(merged_df2$Position)
if(nrow(meta_with_afor) == nrow(merged_df2)){
cat('nrow(merged_df2) = nrow (gene associated metadata)'
,'\nExpected no. of rows: ',nrow(meta_with_afor)
,'\nGot no. of rows: ', nrow(merged_df2))
} else{
cat('nrow(merged_df2)!= nrow(gene associated metadata)'
, '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
, '\nGot no. of rows: ', nrow(merged_df2)
, '\nFinding discrepancy')
merged_muts_u = unique(merged_df2$Mutationinformation)
meta_muts_u = unique(meta_with_afor$Mutationinformation)
# find the index where it differs
unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
}
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
, y = mcsm_data
, by = 'Mutationinformation'
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the struc,
# hence doesn't have a mcsm value
# but 186 is associated with mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# b) merged_df3: remove duplicate mutation information
#########
cat('Merging dfs with NAs: small df (removing duplicate muts)'
,'\nCannot trust lineage info from this'
,'\nlinking col: Mutationinforamtion'
,'\nfilename: merged_df3')
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
# sanity checks
# nrows of merged_df3 should be the same as the nrows of mcsm_data
if(nrow(mcsm_data) == nrow(merged_df3)){
cat('PASS: No. of rows match with mcsm_data'
,'\nExpected no. of rows: ', nrow(mcsm_data)
,'\nGot no. of rows: ', nrow(merged_df3))
} else {
cat('FAIL: No. of rows mismatch'
, '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
, '\nNo. of rows merged_df3: ', nrow(merged_df3))
}
# counting NAs in AF, OR cols in merged_df3
# this is becuase mcsm has no AF, OR cols,
# so you cannot count NAs
if (identical(sum(is.na(merged_df3$OR))
, sum(is.na(merged_df3$pvalue))
, sum(is.na(merged_df3$AF)))){
cat('PASS: NA count match for OR, pvalue and AF\n')
na_count_df3 = sum(is.na(merged_df3$AF))
cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
} else{
cat('FAIL: NA count mismatch'
, '\nNA in OR: ', sum(is.na(merged_df3$OR))
, '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
, '\nNA in AF:', sum(is.na(merged_df3$AF)))
}
#=======================================================================
#%% merging without NAs
cat('Begin merging dfs without NAs'
, '\n===============================================================')
cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
,'\nlinking col: Mutationinforamtion'
,'\nfilename: merged_df2_comp')
#########
# c) merged_df2_comp: merging two dfs without NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),]
# sanity check
cat('Checking nrows in merged_df2_comp')
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){
cat('PASS: No. of rows match'
,'\nDim of merged_df2_comp: '
,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
, '\nNo. of rows: ', nrow(merged_df2_comp)
, '\nNo. of cols: ', ncol(merged_df2_comp))
}else{
cat('FAIL: No. of rows mismatch'
,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
,'\nGot no. of rows: ', nrow(merged_df2_comp))
}
#########
# d) merged_df3_comp: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
cat('Dim of merged_df3_comp: '
, '\nNo. of rows: ', nrow(merged_df3_comp)
, '\nNo. of cols: ', ncol(merged_df3_comp))
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_com
all.equal(foo, merged_df3)
summary(comparedf(foo, merged_df3))
# sanity check
cat('Checking nrows in merged_df3_comp')
if(nrow(merged_df3_comp) == nrow(merged_df3)){
cat('NO NAs detected in merged_df3 in AF|OR cols'
,'\nNo. of rows are identical: ', nrow(merged_df3))
} else{
if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
cat('PASS: NAs detected in merged_df3 in AF|OR cols'
, '\nNo. of NAs: ', na_count_df3
, '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
, '\nGot no. of rows: ', nrow(merged_df3_comp))
}
}
#=======================================================================
# write_output files
# Not required as this is a subset of the combining_two_df.R
#*************************
# clear variables
rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
rm(mut_pos_occurrence)
#%% end of script
#=======================================================================

View file

@ -1,461 +0,0 @@
#=======================================================================
# TASK: To combine mcsm and meta data with af and or files
# Input csv files:
# 1) mcsm normalised and struct params
# 2) gene associated meta_data_with_AFandOR
# Output:
# 1) muts with opposite effects on stability
# 2) large combined df including NAs for AF, OR,etc
# Dim: same no. of rows as gene associated meta_data_with_AFandOR
# 3) small combined df including NAs for AF, OR, etc.
# Dim: same as mcsm data
# 4) large combined df excluding NAs
# Dim: dim(#1) - no. of NAs(AF|OR) + 1
# 5) small combined df excluding NAs
# Dim: dim(#2) - no. of unique NAs - 1
# This script is sourced from other .R scripts for plotting
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/meta_data_analysis/')
getwd()
#=======================================================================
#%% load packages
#require(data.table)
#require(arsenal)
#require(compare)
#library(tidyverse)
# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#===========
# data dir
#===========
datadir = '~/git/Data'
#===========
# input
#===========
# infile1: mCSM data
#indir = '~/git/Data/pyrazinamide/input/processed/'
indir = paste0(datadir, '/', drug, '/', 'output') # revised {TODO: change in mcsm pipeline}
#in_filename = 'mcsm_complex1_normalised.csv'
in_filename = 'pnca_mcsm_struct_params.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: mCSM output file', ' ', infile, '\n') )
# infile2: gene associated meta data combined with AF and OR
#indir: same as above
in_filename_comb = paste0(tolower(gene), '_meta_data_with_AFandOR.csv')
infile_comb = paste0(indir, '/', in_filename_comb)
cat(paste0('Reading infile2: gene associated combined metadata:', infile_comb, '\n'))
#===========
# output
#===========
# Uncomment if and when required to output
outdir = paste0(datadir, '/', drug, '/', 'output') #same as indir
cat('Output dir: ', outdir, '\n')
#out_filename = paste0(tolower(gene), 'XXX')
#outfile = paste0(outdir, '/', out_filename)
#cat(paste0('Output file with full path:', outfile))
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files
#####################################
# input file 1: mcsm normalised data
# output of step 4 mcsm_pipeline
#####################################
cat('Reading mcsm_data:'
, '\nindir: ', indir
, '\ninfile_comb: ', in_filename)
mcsm_data = read.csv(infile
# , row.names = 1
, stringsAsFactors = F
, header = T)
cat('Read mcsm_data file:'
, '\nNo.of rows: ', nrow(mcsm_data)
, '\nNo. of cols:', ncol(mcsm_data))
# clear variables
rm(in_filename, infile)
str(mcsm_data)
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
# spelling Correction 1: DUET
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$DUET_outcome[mcsm_data$DUET_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$DUET_outcome); sum(table(mcsm_data$DUET_outcome) )
head(mcsm_data$DUET_outcome); tail(mcsm_data$DUET_outcome)
# spelling Correction 2: Ligand
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Stabilizing'] <- 'Stabilising'
mcsm_data$Lig_outcome[mcsm_data$Lig_outcome=='Destabilizing'] <- 'Destabilising'
# checks: should be the same as above
table(mcsm_data$Lig_outcome); sum(table(mcsm_data$Lig_outcome) )
head(mcsm_data$Lig_outcome); tail(mcsm_data$Lig_outcome)
# muts with opposing effects on protomer and ligand stability
table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
changes = mcsm_data[which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome),]
# sanity check: redundant, but uber cautious!
dl_i = which(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)
ld_i = which(mcsm_data$Lig_outcome != mcsm_data$DUET_outcome)
cat('Identifying muts with opposite stability effects')
if(nrow(changes) == (table(mcsm_data$DUET_outcome != mcsm_data$Lig_outcome)[[2]]) & identical(dl_i,ld_i)) {
cat('PASS: muts with opposite effects on stability and affinity correctly identified'
, '\nNo. of such muts: ', nrow(changes))
}else {
cat('FAIL: unsuccessful in extracting muts with changed stability effects')
}
#***************************
# write file: changed muts
out_filename = 'muts_opp_effects.csv'
outfile = paste0(outdir, '/', out_filename)
cat('Writing file for muts with opp effects:'
, '\nFilename: ', outfile
, '\nPath: ', outdir)
write.csv(changes, outfile)
#****************************
# clear variables
rm(out_filename, outfile)
rm(changes, dl_i, ld_i)
# count na in each column
na_count = sapply(mcsm_data, function(y) sum(length(which(is.na(y))))); na_count
# sort by Mutationinformation
mcsm_data = mcsm_data[order(mcsm_data$Mutationinformation),]
head(mcsm_data$Mutationinformation)
orig_col = ncol(mcsm_data)
# get freq count of positions and add to the df
setDT(mcsm_data)[, mut_pos_occurrence := .N, by = .(Position)]
cat('Added col: position frequency of muts to see which posn has how many muts'
, '\nNo. of cols now', ncol(mcsm_data)
, '\nNo. of cols before: ', orig_col)
mut_pos_occurrence = data.frame(mcsm_data$Mutationinformation
, mcsm_data$Position
, mcsm_data$mut_pos_occurrence)
colnames(mut_pos_occurrence) = c('Mutationinformation', 'position', 'mut_pos_occurrence')
#######################################
# input file 2: meta data with AFandOR
#######################################
cat('Reading combined meta data and AFandOR file:'
, '\nindir: ', indir
, '\ninfile_comb: ', in_filename_comb)
meta_with_afor <- read.csv(infile_comb
, stringsAsFactors = F
, header = T)
cat('Read mcsm_data file:'
, '\nNo.of rows: ', nrow(meta_with_afor)
, '\nNo. of cols:', ncol(meta_with_afor))
# counting NAs in AF, OR cols
if (identical(sum(is.na(meta_with_afor$OR))
, sum(is.na(meta_with_afor$pvalue))
, sum(is.na(meta_with_afor$AF)))){
cat('PASS: NA count match for OR, pvalue and AF\n')
na_count = sum(is.na(meta_with_afor$AF))
cat('No. of NAs: ', sum(is.na(meta_with_afor$OR)))
} else{
cat('FAIL: NA count mismatch'
, '\nNA in OR: ', sum(is.na(meta_with_afor$OR))
, '\nNA in pvalue: ', sum(is.na(meta_with_afor$pvalue))
, '\nNA in AF:', sum(is.na(meta_with_afor$AF)))
}
# clear variables
rm(in_filename_comb, infile_comb)
str(meta_with_afor)
# sort by Mutationinformation
head(meta_with_afor$Mutationinformation)
meta_with_afor = meta_with_afor[order(meta_with_afor$Mutationinformation),]
head(meta_with_afor$Mutationinformation)
orig_col2 = ncol(meta_with_afor)
# get freq count of positions and add to the df
setDT(meta_with_afor)[, sample_pos_occurrence := .N, by = .(position)]
cat('Added col: position frequency of samples to check'
,'how many samples correspond to a partiulcar posn associated with muts'
, '\nNo. of cols now', ncol(meta_with_afor)
, '\nNo. of cols before: ', orig_col2)
sample_pos_occurrence = data.frame(meta_with_afor$id
, meta_with_afor$mutation
, meta_with_afor$Mutationinformation
, meta_with_afor$position
, meta_with_afor$sample_pos_occurrence)
colnames(sample_pos_occurrence) = c('id', 'mutation', 'Mutationinformation', 'position', 'sample_pos_occurrence')
#=======================================================================
cat('Begin merging dfs with NAs'
, '\n===============================================================')
###########################
# merging two dfs: with NA
###########################
# link col name = 'Mutationinforamtion'
head(mcsm_data$Mutationinformation)
head(meta_with_afor$Mutationinformation)
cat('Merging dfs with NAs: big df (1-many relationship b/w id & mut)'
,'\nlinking col: Mutationinforamtion'
,'\nfilename: merged_df2')
#########
# a) merged_df2: meta data with mcsm
#########
merged_df2 = merge(x = meta_with_afor
,y = mcsm_data
, by = 'Mutationinformation'
, all.y = T)
cat('Dim of merged_df2: '
, '\nNo. of rows: ', nrow(merged_df2)
, '\nNo. of cols: ', ncol(merged_df2))
head(merged_df2$Position)
# sanity check
cat('Checking nrows in merged_df2')
if(nrow(meta_with_afor) == nrow(merged_df2)){
cat('nrow(merged_df2) = nrow (gene associated metadata)'
,'\nExpected no. of rows: ',nrow(meta_with_afor)
,'\nGot no. of rows: ', nrow(merged_df2))
} else{
cat('nrow(merged_df2)!= nrow(gene associated metadata)'
, '\nExpected no. of rows after merge: ', nrow(meta_with_afor)
, '\nGot no. of rows: ', nrow(merged_df2)
, '\nFinding discrepancy')
merged_muts_u = unique(merged_df2$Mutationinformation)
meta_muts_u = unique(meta_with_afor$Mutationinformation)
# find the index where it differs
unique(meta_muts_u[! meta_muts_u %in% merged_muts_u])
}
# sort by Position
head(merged_df2$Position)
merged_df2 = merged_df2[order(merged_df2$Position),]
head(merged_df2$Position)
merged_df2v2 = merge(x = meta_with_afor
,y = mcsm_data
, by = 'Mutationinformation'
, all.x = T)
#!=!=!=!=!=!=!=!
# COMMENT: used all.y since position 186 is not part of the struc,
# hence doesn't have a mcsm value
# but 186 is associated with mutation
#!=!=!=!=!=!=!=!
# should be False
identical(merged_df2, merged_df2v2)
table(merged_df2$Position%in%merged_df2v2$Position)
rm(merged_df2v2)
#########
# b) merged_df3:remove duplicate mutation information
#########
cat('Merging dfs without NAs: small df (removing muts with no AF|OR associated)'
,'\nCannot trust lineage info from this'
,'\nlinking col: Mutationinforamtion'
,'\nfilename: merged_df3')
#==#=#=#=#=#=#
# Cannot trust lineage, country from this df as the same mutation
# can have many different lineages
# but this should be good for the numerical corr plots
#=#=#=#=#=#=#=
merged_df3 = merged_df2[!duplicated(merged_df2$Mutationinformation),]
head(merged_df3$Position); tail(merged_df3$Position) # should be sorted
# sanity check
cat('Checking nrows in merged_df3')
if(nrow(mcsm_data) == nrow(merged_df3)){
cat('PASS: No. of rows match with mcsm_data'
,'\nExpected no. of rows: ', nrow(mcsm_data)
,'\nGot no. of rows: ', nrow(merged_df3))
} else {
cat('FAIL: No. of rows mismatch'
, '\nNo. of rows mcsm_data: ', nrow(mcsm_data)
, '\nNo. of rows merged_df3: ', nrow(merged_df3))
}
# counting NAs in AF, OR cols in merged_df3
# this is becuase mcsm has no AF, OR cols,
# so you cannot count NAs
if (identical(sum(is.na(merged_df3$OR))
, sum(is.na(merged_df3$pvalue))
, sum(is.na(merged_df3$AF)))){
cat('PASS: NA count match for OR, pvalue and AF\n')
na_count_df3 = sum(is.na(merged_df3$AF))
cat('No. of NAs: ', sum(is.na(merged_df3$OR)))
} else{
cat('FAIL: NA count mismatch'
, '\nNA in OR: ', sum(is.na(merged_df3$OR))
, '\nNA in pvalue: ', sum(is.na(merged_df3$pvalue))
, '\nNA in AF:', sum(is.na(merged_df3$AF)))
}
#=======================================================================
#%% merging without NAs
cat('Begin merging dfs without NAs'
, '\n===============================================================')
cat('Merging dfs without any NAs: big df (1-many relationship b/w id & mut)'
,'\nlinking col: Mutationinforamtion'
,'\nfilename: merged_df2_comp')
#########
# c) merged_df2_comp: same as merge 1 but excluding NA
#########
merged_df2_comp = merged_df2[!is.na(merged_df2$AF),]
#merged_df2_comp = merged_df2[!duplicated(merged_df2$Mutationinformation),]
# sanity check
cat('Checking nrows in merged_df2_comp')
if(nrow(merged_df2_comp) == (nrow(merged_df2) - na_count + 1)){
cat('PASS: No. of rows match'
,'\nDim of merged_df2_comp: '
,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
, '\nNo. of rows: ', nrow(merged_df2_comp)
, '\nNo. of cols: ', ncol(merged_df2_comp))
}else{
cat('FAIL: No. of rows mismatch'
,'\nExpected no. of rows: ', nrow(merged_df2) - na_count + 1
,'\nGot no. of rows: ', nrow(merged_df2_comp))
}
#########
# d) merged_df3_comp: remove duplicate mutation information
#########
merged_df3_comp = merged_df2_comp[!duplicated(merged_df2_comp$Mutationinformation),]
cat('Dim of merged_df3_comp: '
, '\nNo. of rows: ', nrow(merged_df3_comp)
, '\nNo. of cols: ', ncol(merged_df3_comp))
# alternate way of deriving merged_df3_comp
foo = merged_df3[!is.na(merged_df3$AF),]
# compare dfs: foo and merged_df3_com
all.equal(foo, merged_df3)
summary(comparedf(foo, merged_df3))
# sanity check
cat('Checking nrows in merged_df3_comp')
if(nrow(merged_df3_comp) == nrow(merged_df3)){
cat('NO NAs detected in merged_df3 in AF|OR cols'
,'\nNo. of rows are identical: ', nrow(merged_df3))
} else{
if(nrow(merged_df3_comp) == nrow(merged_df3) - na_count_df3) {
cat('PASS: NAs detected in merged_df3 in AF|OR cols'
, '\nNo. of NAs: ', na_count_df3
, '\nExpected no. of rows in merged_df3_comp: ', nrow(merged_df3) - na_count_df3
, '\nGot no. of rows: ', nrow(merged_df3_comp))
}
}
#=======================================================================
#*********************
# writing 1 file in the style of a loop: merged_df3
# print(output dir)
#i = 'merged_df3'
#out_filename = paste0(i, '.csv')
#outfile = paste0(outdir, '/', out_filename)
#cat('Writing output file: '
# ,'\nFilename: ', out_filename
# ,'\nPath: ', outdir)
#template: write.csv(merged_df3, 'merged_df3.csv')
#write.csv(get(i), outfile, row.names = FALSE)
#cat('Finished writing: ', outfile
# , '\nNo. of rows: ', nrow(get(i))
# , '\nNo. of cols: ', ncol(get(i)))
#%% write_output files; all 4 files:
outvars = c('merged_df2'
, 'merged_df3'
, 'merged_df2_comp'
, 'merged_df3_comp')
cat('Writing output files: '
, '\nPath:', outdir)
for (i in outvars){
# cat(i, '\n')
out_filename = paste0(i, '.csv')
# cat(out_filename, '\n')
# cat('getting value of variable: ', get(i))
outfile = paste0(outdir, '/', out_filename)
# cat('Full output path: ', outfile, '\n')
cat('Writing output file:'
,'\nFilename: ', out_filename,'\n')
write.csv(get(i), outfile, row.names = FALSE)
cat('Finished writing: ', outfile
, '\nNo. of rows: ', nrow(get(i))
, '\nNo. of cols: ', ncol(get(i)), '\n')
}
# alternate way to replace with implicit loop
# FIXME
#sapply(outvars, function(x, y) write.csv(get(outvars), paste0(outdir, '/', outvars, '.csv')))
#=======================================================================
#%% merging mut_pos_occurrence and sample_pos_occurence
# FIXME
#cat('Merging dfs with positional frequency from mcsm and meta_data'
# , '\nNcol in mut_pos_occurrence:', ncol(mut_pos_occurrence)
# , '\nncol in sample_pos_occurence:', ncol(sample_pos_occurrence)
# ,'\nlinking col:', intersect(colnames(sample_pos_occurrence), colnames(mut_pos_occurrence))
# ,'\nfilename: merged_df4')
#merged_df4 = merge(sample_pos_occurrence, mut_pos_occurrence
# , by = 'position'
# , all = T)
#out_filename4 = 'mut_and_sample_freq.csv'
#outfile4 = paste0(outdir, '/', out_filename4)
#*************************
# clear variables
rm(mcsm_data, meta_with_afor, foo, drug, gene, gene_match, indir, merged_muts_u, meta_muts_u, na_count, orig_col, outdir)
rm(mut_pos_occurrence, sample_pos_occurrence)
#rm(merged_df4)
#%% end of script
#=======================================================================

View file

@ -1,170 +0,0 @@
#!/home/tanu/anaconda3/envs/ContactMap/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 18 10:10:12 2020
@author: tanu
"""
#=======================================================================
# Task: Read a DSSP file into a data frame and output to a csv file
# Input: '.dssp' i.e gene associated.dssp file (output from run_dssp.sh)
# Output: '.csv' file containing DSSP output as a df with ASA, RSA, etc.
# based on Tien at al 2013 (theor.) values
# useful links:
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
# https://en.wikipedia.org/wiki/Relative_accessible_surface_area
#=======================================================================
#%% load packages
import sys, os
import re
import pandas as pd
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
import pprint as pp
import dms_tools2
import dms_tools2.dssp
#=======================================================================#
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
#gene_match = gene + '_p.'
drug = 'cycloserine'
gene = 'alr'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input from outdir
#=======
indir = datadir + '/' + drug + '/' + 'input'
#1) pdb file
in_filename_pdb = gene.lower() + '_complex' + '.pdb'
infile_pdb = indir + '/' + in_filename_pdb
print('Input pdb filename:', in_filename_pdb
, '\npath:', indir
, '\n============================================================')
#2) dssp file
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = gene.lower() +'.dssp'
infile = outdir + '/' + in_filename
print('Input dssp filename:', in_filename
, '\npath:', outdir
, '\n============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_dssp.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\npath:', outdir
, '\nOutfile: ', outfile
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% specify pdb chain as a list. Handy when more than 1 pdb chain
my_chains = ['A']
# my_chains = ['A', 'B'] # for cycloserine
# generate my_chains from dssp
p = PDBParser()
structure = p.get_structure(in_filename_pdb, infile_pdb)
model = structure[0]
dssp = DSSP(model, infile_pdb)
#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])
dssp_chains = []
for num_aa in range(0, len(dssp)):
# print(num_aa)
# extract the chain id only and append to a list
dssp_chains.append(dssp.keys()[num_aa][0])
chainsL = list(set(dssp_chains))
print(chainsL)
# sort the list to make for sanity (since sets are not ordered)
# this will be required for dssp_df
my_chains = sorted(chainsL)
print('dssp output for'
, in_filename, 'contains:', len(my_chains)
, 'chains:\n', my_chains)
#%% ====================================================================
# Process dssp output and extract into df (single chain)
#dssp_df = dms_tools2.dssp.processDSSP(infile, chain = my_chains)
#dssp_df['chain_dssp'] = my_chains
#pp.pprint(dssp_df)
#=======================================================================
# incase pdb has > 1 chain and you need to run it for all chains
# initialise an empty df
dssp_df = pd.DataFrame()
print('Total no. of chains: ', len(my_chains))
for chain_id in my_chains:
print('Chain id:', chain_id)
dssp_cur = pd.DataFrame()
dssp_cur = dms_tools2.dssp.processDSSP(infile, chain = chain_id)
#!!!Important!!!
dssp_cur['chain_id'] = chain_id
dssp_df = dssp_df.append(dssp_cur)
pp.pprint(dssp_df)
#=====================
# Renaming amino-acid
# and site columns
#=====================
# Rename column (amino acid) as 'wild_type' and (site} as 'position'
# to be the same names as used in the file required for merging later.
dssp_df.columns
dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
dssp_df.columns
# sanity check
if len(dssp_df) == len(dssp):
print('PASS: length of dssp_df has correct length')
else:
print('FAIL: length mismatch for dssp_df'
, '\nexpected length:', len(dssp)
, '\nGot length:', len(dssp_df)
, 'Debug please!')
#%% Write ouput csv file
print('Writing file:', outfile
, '\nFilename:', out_filename
, '\nPath:', outdir
, '\n=============================================================')
# write to csv
dssp_df.to_csv(outfile, header=True, index = False)
print('Finished writing:', out_filename
, '\nNo. of rows:', len(dssp_df)
, '\nNo. of cols:', len(dssp_df.columns)
, '\n==============================================================')
#%% end of script
#=======================================================================
#%% run dssp to extract chain ids to later process the dssp output into a df

View file

@ -1,36 +0,0 @@
#!/usr/bin/python3
# Initialise a blank 'Data' directory and drug subdirs etc.
# TODO:
# - Read base dir from config file
# - Create eg: '~/git/Data/{original,processed}
# - Create eg: '~/git/Data/processed/' + drug (for each drug)
# - Create eg: '~/git/Data/output/' + drug + '{plots, structure}'
#%% specify homedir as python doesn't recognise tilde
homedir = os.path.expanduser('~')
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#==========
# input dir
#==========
indir = datadir + '/' + drug + '/' + 'input'
#============
# output dir
#============
# several output files
outdir = datadir + '/' + drug + '/' + 'output'
#%%end of variable assignment for input and output files
#==============================================================================

View file

@ -1,193 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for amino acid sequence using the
# Kyt&-Doolittle.
# Same output as using the expasy server https://web.expasy.org/protscale/
# Input: fasta file
# Output: csv file with
# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://jbloomlab.github.io/dms_tools2/dms_tools2.dssp.html
#=======================================================================
#%% load packages
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
#import pprint as pp
import pandas as pd
import numpy as np
import sys, os
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
#indir = 'git/Data/pyrazinamide/input/original'
indir = datadir + '/' + drug + '/' + 'input'
in_filename = '3pl1.fasta.txt'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir
, '\n============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_kd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#===================
#calculate KD values: same as the expasy server
#===================
#%%specify window size for hydropathy profile computation
# https://web.expasy.org/protscale/pscale/protscale_help.html
my_window = 3
offset = round((my_window/2)-0.5)
fh = open(infile)
for record in SeqIO.parse(fh, 'fasta'):
id = record.id
seq = record.seq
num_residues = len(seq)
fh.close()
sequence = str(seq)
X = ProteinAnalysis(sequence)
kd_values = (X.protein_scale(ProtParamData.kd , window = my_window)) # edge weight is set to default (100%)
# sanity checks
print('Sequence Length:', num_residues)
print('kd_values Length:',len(kd_values))
print('Window Length:', my_window)
print('Window Offset:', offset)
print('=================================================================')
print('Checking:len(kd values) is as expected for the given window size & offset...')
expected_length = num_residues - (my_window - offset)
if len(kd_values) == expected_length:
print('PASS: expected and actual length of kd values match')
else:
print('FAIL: length mismatch'
,'\nExpected length:', expected_length
,'\nActual length:', len(kd_values)
, '\n=========================================================')
#===================
# creating two dfs
#===================
#%% make 2 dfs; 1) aa sequence and 2) kd_values. Then reset index for each df
# which will allow easy merging of the two dfs.
# df1: df of aa seq with index reset to start from 1 (reflective of the actual aa position in a sequence)
# Name column of wt as 'wild_type' to be the same name used in the file required for merging later.
dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive
# df2: df of kd_values with index reset to start from offset + 1 and subsequent matched length of the kd_values
dfVals = pd.DataFrame({'kd_values':kd_values})
dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)
# sanity checks
max(dfVals['kd_values'])
min(dfVals['kd_values'])
#===================
# concatenating dfs
#===================
# Merge the two on index
# (as these are now reflective of the aa position numbers): df1 and df2
# This will introduce NaN where there is missing values. In our case this
# will be 2 (first and last ones based on window size and offset)
# In our case this will be 2 (first and last ones)
# For pnca: the last position is not part of the struc, so not info loss
# Needless to say that this will be variable for other targets.
kd_df = pd.concat([dfSeq, dfVals], axis = 1)
#============================
# renaming index to position
#============================
kd_df = kd_df.rename_axis('position')
kd_df.head
print('=================================================================')
print('position col i.e. index should be numeric
, '\n===============================================================')
if kd_df.index.dtype == 'int64':
print('PASS: position col is numeric'
, '\ndtype is:', kd_df.index.dtype)
else:
print('FAIL: position col is not numeric'
, '\nConverting to numeric')
kd_df.index.astype('int64')
print('Checking dtype for after conversion:\n'
, '\ndtype is:', kd_df.index.dtype
, '\n=========================================================')
#===============
# writing file
#===============
print('Writing file:', out_filename
, '\nFilename:', out_filename
, '\nPath:', outdir
, '\n=============================================================')
kd_df.to_csv(outfile, header = True, index = True)
print('Finished writing:', out_filename
, '\nNo. of rows:', len(kd_df)
, '\nNo. of cols:', len(kd_df.columns)
, '\n=============================================================')
#===============
# plot: optional!
#===============#%% plot
# http://www.dalkescientific.com/writings/NBN/plotting.html
# FIXME: save fig
# extract just pdb if from 'id' to pass to title of plot
# foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
plot(kd_values, linewidth = 1.0)
#axis(xmin = 1, xmax = num_residues)
xlabel('Residue Number')
ylabel('Hydrophobicity')
title('K&D Hydrophobicity for ' + id)
show()
print('======================================================================')
#%% end of script
#=======================================================================

View file

@ -1,167 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
# FIXME: import dirs.py to get the basic dir paths available
#=======================================================================
# TASK: calculate how many mutations result in
# electrostatic changes wrt wt
# Input: mcsm and AF_OR file
# Output: mut_elec_changes_results.txt
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'input'
in_filename = 'merged_df3.csv'
infile = outdir + '/' + in_filename
print('Input filename: ', in_filename
, '\nInput path: ', indir
, '\n============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
# specify output file
out_filename = 'mut_elec_changes.txt'
outfile = outdir + '/' + out_filename
print('Output filename: ', out_filename
, '\nOutput path: ', outdir
, '\n============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input files
print('Reading input file (merged file):', infile)
comb_df = pd.read_csv(infile, sep = ',')
print('Input filename: ', in_filename
, '\nPath :', outdir
, '\nNo. of rows: ', len(comb_df)
, '\nNo. of cols: ', infile
, '\n============================================================')
# column names
list(comb_df.columns)
# clear variables
del(in_filename, infile)
#%% subset unique mutations
df = comb_df.drop_duplicates(['Mutationinformation'], keep = 'first')
total_muts = df.Mutationinformation.nunique()
#df.Mutationinformation.count()
print('Total mutations associated with structure: ', total_muts
, '\n===============================================================')
#%% combine aa_calcprop cols so that you can count the changes as value_counts
# check if all muts have been categorised
print('Checking if all muts have been categorised: ')
if df['wt_calcprop'].isna().sum() == 0 & df['mut_calcprop'].isna().sum():
print('PASS: No. NA detected i.e all muts have aa prop associated'
, '\n===============================================================')
else:
print('FAIL: NAs detected i.e some muts remain unclassified'
, '\n===============================================================')
df['wt_calcprop'].head()
df['mut_calcprop'].head()
print('Combining wt_calcprop and mut_calcprop...')
#df['aa_calcprop_combined'] = df['wt_calcprop']+ '->' + df['mut_calcprop']
df['aa_calcprop_combined'] = df.wt_calcprop.str.cat(df.mut_calcprop, sep = '->')
df['aa_calcprop_combined'].head()
mut_categ = df["aa_calcprop_combined"].unique()
print('Total no. of aa_calc properties: ', len(mut_categ))
print('Categories are: ', mut_categ)
# counting no. of muts in each mut categ
# way1: count values within each combinaton
df.groupby('aa_calcprop_combined').size()
#df.groupby('aa_calcprop_combined').count()
# way2: count values within each combinaton
df['aa_calcprop_combined'].value_counts()
# comment: the two ways should be identical
# groupby result order is similar to pivot table order,
# I prefer the value_counts look
# assign to variable: count values within each combinaton
all_prop = df['aa_calcprop_combined'].value_counts()
# convert to a df from Series
ap_df = pd.DataFrame({'aa_calcprop': all_prop.index, 'mut_count': all_prop.values})
# subset df to contain only the changes in prop
all_prop_change = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == False]
elec_count = all_prop_change.mut_count.sum()
print('Total no.of muts with elec changes: ', elec_count)
# calculate percentage of electrostatic changes
elec_changes = (elec_count/total_muts) * 100
print('Total number of electrostatic changes resulting from Mutation is (%):', elec_changes)
# check no change muts
no_change_muts = ap_df[ap_df['aa_calcprop'].isin(['neg->neg','non-polar->non-polar','polar->polar', 'pos->pos']) == True]
no_change_muts.mut_count.sum()
#%% output from console
#sys.stdout = open(file, 'w')
sys.stdout = open(outfile, 'w')
print('======================\n'
,'Unchanged muts'
,'\n=====================\n'
, no_change_muts
,'\n=============================\n'
, 'Muts with changed prop:'
, '\n============================\n'
, all_prop_change)
print('================================================================='
, '\nTotal number of electrostatic changes resulting from Mtation is (%):', elec_changes
, '\nTotal no. of muts: ', total_muts
, '\nTotal no. of changed muts: ', all_prop_change.mut_count.sum()
, '\nTotal no. of unchanged muts: ', no_change_muts.mut_count.sum()
, '\n===================================================================')
#%% end of script
#=======================================================================

View file

@ -1,179 +0,0 @@
#!/usr/bin/python
from __future__ import print_function
from Bio import SeqIO
from Bio.Seq import Seq
from collections import OrderedDict
import sys
import argparse
# https://github.com/jrjhealey/bioinfo-tools/blob/master/Mutate.py
# https://www.biostars.org/p/336891/
# TODO:
# - create some logic to 'group' mutations that will be applied to the same sequence, to
# make all switches at once
# - This will also probably break the verbose transversion output so the maths will need replacing
# - Create the ability to support INDELS (will also require pairwise alignment so that
# hamming distances remain meaningful.
def get_args():
"""Parse command line arguments"""
desc = "Mutate fasta sequences based on a file of sequence mappings."
epi = (
"This script takes a mapfile of the form:\n"
" SequenceID,A123B\n"
" SequenceID,X456Y\n"
"And performs substitutions/mutations. At preset it only does one SNP per sequence.\n"
)
try:
parser = argparse.ArgumentParser(
description=desc, epilog=epi, formatter_class=argparse.RawTextHelpFormatter
)
parser.add_argument(
"mutation_file",
action="store",
help='File of mutation mappings like so: "SeqID,X123Y"',
)
parser.add_argument(
"sequences",
action="store",
help="File of sequences to be mutated (fasta only).",
)
parser.add_argument(
"-v",
"--verbose",
action="store_true",
help="Verbose behaviour, printing parameters of the script.",
)
parser.add_argument(
"-o",
"--outfile",
action="store",
help="Output file for mutated sequences (default STDOUT).",
)
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
exit(1)
except:
sys.stderr.write(
"An exception occurred with argument parsing. Check your provided options.\n"
)
return parser.parse_args()
class Mutation(object):
"""A class wrapper for sequence IDs so that duplicate IDs can be used in a dictionary"""
def __init__(self, name):
self.name = name
def __repr__(self):
return "'" + self.name + "'"
def __str__(self):
return self.name
def parse_mapfile(mapfile):
"""Return a dict of mapped mutations.
File should resemble:
SequenceID,A123B
SequenceID2,X234Y
Sequence IDs should exactly match the fasta headers, as parsed by BioPython.
(">" symbols are optional)
"""
with open(mapfile, "r") as handle:
mut_dict = OrderedDict()
for line in handle:
id, change = line.lstrip(">").rstrip("\n").split(",")
mut_dict[Mutation(id)] = change
for k, v in mut_dict.items():
assert v[0].isalpha(), (
"First character of mutation map is not a valid letter. Got: %s" % v[0]
)
assert v[-1].isalpha(), (
"Last character of mutation map is not a valid letter. Got: %s" % v[-1]
)
assert v[1:-1].isdigit(), (
"Location string of mutation map is not a valid number. Got: %s" % v[1:-1]
)
return mut_dict
def morph(orig, loc, new, mutableseq, verbose):
"""Perform actual sequence change (polymorphism only at present)"""
# Shift location to offset 0-based index
loc = loc - 1
assert mutableseq[loc] == orig, (
"Sequence does not match the mutation file for pre-exising residue. Expected %s , got %s "
% (orig, mutableseq[loc])
)
if verbose is True:
print(
"Performing change: {} -> {}, at location: {} (0 based)".format(
orig, new, loc
)
)
mutableseq[loc] = new
return mutableseq
def hamming_distance(s1, s2):
"""Return the Hamming distance between equal-length sequences"""
if len(s1) != len(s2):
raise ValueError("Undefined for sequences of unequal length")
return sum(ch1 != ch2 for ch1, ch2 in zip(s1.upper(), s2.upper()))
def main():
args = get_args()
if args.outfile is not None:
ofh = open(args.outfile, "w")
# Parse the mutation file (get mutations by sequence)
mutations = parse_mapfile(args.mutation_file)
if args.verbose is True:
print("Got mutations:")
print(mutations)
# Iterate all sequences and make any substitutions necessary
for record in SeqIO.parse(args.sequences, "fasta"):
for k, v in mutations.items():
mutable = record.seq.upper().tomutable()
# mutable = record.seq.tomutable()
# print("MO:", mutable)
if k.name == record.id[0:4]: # BEWARE HARDCODING
# print("k.name:", k.name, "record.id:", record.id)
orig = v[0]
print("orig:", orig)
new = v[-1]
loc = int(v[1:-1])
if args.verbose:
print(record.id)
newseq = morph(orig, loc, new, mutable, args.verbose)
# print("NS is:", newseq)
if args.verbose is True:
print("Original: " + record.seq.upper())
print(
str((" " * int(loc - 2 + 11))) + "V"
) # Padded string to show where switch happened (not sure how it'll deal with line wrapping
print("New: " + newseq)
print(
"Distance: "
+ str(hamming_distance(str(record.seq), str(newseq)))
)
if args.outfile is not None:
ofh.write(">%s_%s\n%s\n" % (record.id, v, newseq))
if args.verbose is False:
print(">%s_%s\n%s\n" % (record.id, v, newseq))
if args.outfile is not None:
ofh.close()
if __name__ == "__main__":
main()

View file

@ -1,135 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=============================================================================
# Task: Residue depth (rd) processing to generate a df with residue_depth(rd)
# values
# FIXME
# Input: '.tsv' i.e residue depth txt file (output from .zip file manually
# downloaded from the website).
# This should be integrated into the pipeline
# Output: .csv with 3 cols i.e position, rd_values & 3-letter wt aa code(caps)
#=============================================================================
#%% load packages
import sys, os
import pandas as pd
#import numpy as np
#=============================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=============================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
datadir = homedir + '/' + 'git/Data'
#=======
# input
#=======
#indir = 'git/Data/pyrazinamide/input/original'
indir = datadir + '/' + drug + '/' + 'output'
in_filename = '3pl1_rd.tsv'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
, '\nInput path:', indir
, '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_rd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
, '\nOutput path:', outdir
, '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% Read input file
rd_data = pd.read_csv(infile, sep = '\t')
print('Reading input file:', infile
, '\nNo. of rows:', len(rd_data)
, '\nNo. of cols:', len(rd_data.columns))
print('Column names:', rd_data.columns
, '\n===============================================================')
#========================
# creating position col
#========================
# Extracting residue number from index and assigning
# the values to a column [position]. Then convert the position col to numeric.
rd_data['position'] = rd_data.index.str.extract('([0-9]+)').values
# converting position to numeric
rd_data['position'] = pd.to_numeric(rd_data['position'])
rd_data['position'].dtype
print('Extracted residue num from index and assigned as a column:'
, '\ncolumn name: position'
, '\ntotal no. of cols now:', len(rd_data.columns)
, '\n=============================================================')
#========================
# Renaming amino-acid
# and all-atom cols
#========================
print('Renaming columns:'
, '\ncolname==> # chain:residue: wt_3letter_caps'
, '\nYES... the column name *actually* contains a # ..!'
, '\ncolname==> all-atom: rd_values'
, '\n=============================================================')
rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
print('Column names:', rd_data.columns)
#========================
# extracting df with the
# desired columns
#========================
print('Extracting relevant columns for writing df as csv')
rd_df = rd_data[['position','rd_values','wt_3letter_caps']]
if len(rd_df) == len(rd_data):
print('PASS: extracted df has expected no. of rows'
,'\nExtracted df dim:'
,'\nNo. of rows:', len(rd_df)
,'\nNo. of cols:', len(rd_df.columns))
else:
print('FAIL: no. of rows mimatch'
, '\nExpected no. of rows:', len(rd_data)
, '\nGot no. of rows:', len(rd_df)
, '\n=========================================================')
#%% write file
print('Writing file:'
, '\nFilename:', out_filename
, '\nPath:', outdir
, '\n=============================================================')
rd_df.to_csv(outfile, header = True, index = False)
print('Finished writing:', out_filename
, '\nNo. of rows:', len(rd_df)
, '\nNo. of cols:', len(rd_df.columns)
, '\n=============================================================')
#%% end of script
#=======================================================================

View file

@ -1,18 +0,0 @@
#!/bin/bash
#https://www.biostars.org/p/336891/
#python Mutate.py -v -o /path/to/output.fasta mutation_map_file.csv input.fasta
# pnca_all_muts_msa_FIXME: This should be formatted like this from python script
# change to a cmd script that takes this "prefix" as an input
for i in $(cat pnca_all_muts_msa_FIXME.csv); do echo "3PL1,${i}"; done > pnca_copy.txt
# make sure there is no new line at the end of the mutation file (snps.csv)
#python3 Mutate.py -v -o /home/tanu/git/Data/pyrazinamide/input/output.fasta mut_map.csv 3pl1.fasta.txt
python3 mutate.py -v -o /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt /home/tanu/git/Data/pyrazinamide/output/pnca_all_muts_msa.csv /home/tanu/git/Data/pyrazinamide/input/pnca_fasta.txt
# remove fasta style header lines in the output i.e
# lines beginning with '>' so the file is just the mutated seqs
sed -i '/^>.*$/d' /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt
printf 'No. of lines after cleaning: '
cat /home/tanu/git/Data/pyrazinamide/output/pnca_msa.txt | wc -l

View file

@ -1,54 +0,0 @@
#!/bin/bash
#=======================================================================
# Task: read a pdb file and generate a dssp output file
# Input:
# pdb file
# Output:
# pdb_code.dssp
# needs dssp exe on linux
# more efficient to run dssp exe locally
# note: double quotes for variable interpolation
#=======================================================================
# #%%specify variables for input and output paths and filenames
drug='pyrazinamide'
gene='pncA'
#convert to lowercase for consistency in filenames
gene_l=$(printf $gene | awk '{print tolower($0)}')
gene_match="${gene}_p."
#printf "${gene_match}\n"
#==========
# data dir
#==========
#indir = 'git/Data/pyrazinamide/input/original'
#datadir="~git/Data"
datadir="${HOME}/git/Data"
#=======
# input
#=======
indir=${datadir}'/'${drug}'/input'
printf "Input dir: ${indir}\n"
in_filename='3pl1.pdb'
#infile=${basedir}${inpath}${in_filename}
infile=${indir}'/'${in_filename}
printf "Input file: ${infile}\n"
#=======
# output
#=======
outdir=${datadir}'/'${drug}$'/output'
printf "Output dir: ${outdir}"
#out_filename='3pl1.dssp'
out_filename="${gene_l}.dssp"
outfile=${outdir}'/'${out_filename}
printf "Output file: ${outfile}\n"
#%%end of variable assignment for input and output files
#================================================================
# command line arg to run dssp and create output file
dssp -i ${infile} -o ${outfile}
printf "Finished writing: ${outfile}\n"
#printf "Filename: ${out_filename}\nlocated in: ${outdir}\n"

View file

@ -1,221 +0,0 @@
#=======================================================================
# Task: To generate a logo plot or bar plot but coloured
# aa properties.
# step1: read mcsm file and OR file
# step2: plot wild type positions
# step3: plot mutants per position coloured by aa properties
# step4: make the size of the letters/bars prop to OR if you can!
# useful links
# https://stackoverflow.com/questions/5438474/plotting-a-sequence-logo-using-ggplot2
# https://omarwagih.github.io/ggseqlogo/
# https://kkdey.github.io/Logolas-pages/workflow.html
# A new sequence logo plot to highlight enrichment and depletion.
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6288878/
#very good: http://www.cbs.dtu.dk/biotools/Seq2Logo-2.0/
#=======================================================================
#%% specify curr dir
getwd()
setwd('~/git/LSHTM_analysis/plotting_test/')
getwd()
#=======================================================================
#%% load packages
# header file
header_dir = '~/git/LSHTM_analysis/'
source(paste0(header_dir, '/', 'my_header.R'))
#=======================================================================
#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene,'_p.')
cat(gene_match)
#===========
# data dir
#===========
datadir = paste0('~/git/Data')
#===========
# input
#===========
# source R script 'combining_two_df.R'
#indir = paste0(datadir, '/', drug, '/', 'output') # reading files
indir = '../meta_data_analysis' # sourcing R script
in_filename = 'combining_df_ps.R'
infile = paste0(indir, '/', in_filename)
cat(paste0('Input is a R script: ', '\'', infile, '\'')
, '\n========================================================')
#===========
# output
#===========
# 1) lineage dist of all muts
outdir = paste0('~/git/Data', '/', drug, '/', 'output/plots') #same as indir
#cat('Output dir: ', outdir, '\n')
#file_type = '.svg'
#out_filename1 = paste0(tolower(gene), '_lineage_dist_ps', file_type)
#outfile1 = paste0(outdir, '/', out_filename1)
#cat(paste0('Output plot1 :', outfile1)
# , '\n========================================================')
#%% end of variable assignment for input and output files
#=======================================================================
##%% read input file
cat('Reading input file(sourcing R script):', in_filename)
source(infile)
#==========================
# This will return:
# df with NA for pyrazinamide:
# merged_df2
# merged_df3
# df without NA for pyrazinamide:
# merged_df2_comp
# merged_df3_comp
#===========================
###########################
# Data for plots
# you need merged_df2 or merged_df2_comp
# since this is one-many relationship
# i.e the same SNP can belong to multiple lineages
# using the _comp dataset means
# we lose some muts and at this level, we should use
# as much info as available, hence use df with NA
# This will the first plotting df
# Then subset this to extract dr muts only (second plottig df)
###########################
#%%%%%%%%%%%%%%%%%%%%%%%%%
# uncomment as necessary
# REASSIGNMENT
#my_data = merged_df2
#my_data = merged_df2_comp
#my_data = merged_df3
my_data = merged_df3_comp
#%%%%%%%%%%%%%%%%%%%%%%%%%%
# delete variables not required
rm(merged_df2, merged_df2_comp, merged_df3, merged_df3_comp)
# quick checks
colnames(my_data)
str(my_data)
c1 = unique(my_data$Position)
nrow(my_data)
cat('No. of rows in my_data:', nrow(my_data)
, '\nDistinct positions corresponding to snps:', length(c1)
, '\n===========================================================')
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
# FIXME: Think and decide what you want to remove
# mut_pos_occurence < 1 or sample_pos_occurrence <1
# get freq count of positions so you can subset freq<1
require(data.table)
#setDT(my_data)[, mut_pos_occurrence := .N, by = .(Position)] #265, 14
#extract freq_pos>1
#my_data_snp = my_data[my_data$occurrence!=1,]
#u = unique(my_data_snp$Position) #73
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# REASSIGNMENT to prevent changing code
my_data_snp = my_data
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#=======================================================================
#%% logo plots from dataframe
#############
# PLOTS: ggseqlogo with custom height
# https://omarwagih.github.io/ggseqlogo/
#############
#require(ggplot2)
#require(tidyverse)
library(ggseqlogo)
foo = my_data_snp[, c("Position", "Mutant_type","ratioDUET", "OR"
, "mut_prop_polarity", "mut_prop_water") ]
# log10OR
# FIXME: at the source script (when calculating AFandOR)
my_data_snp$log10or = log10(my_data_snp$OR)
bar = my_data_snp[, c('Position', 'Mutant_type', 'OR', 'logor', 'log10or')]
bar_or = my_data_snp[, c('Position', 'Mutant_type', 'OR')]
wide_df_or <- bar_or %>% spread(Position, OR, fill = 0)
wide_df_or = as.matrix(wide_df_or)
rownames(wide_df_or) = wide_df_or[,1]
wide_df_or = wide_df_or[,-1]
# custom height (OR) logo plot: yayy works
ggseqlogo(wide_df_or, method='custom', seq_type='aa') + ylab('my custom height') +
theme(legend.position = "bottom"
, axis.text.x = element_text(size = 11
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = 15
, angle = 0
, hjust = 1
, vjust = 0))+
labs(title = "AA logo plot"
, x = "Wild-type Position"
, y = "OR")
#%% end of logo plot with OR as height
#=======================================================================
# extracting data with log10OR
bar_logor = my_data_snp[, c('Position', 'Mutant_type', 'log10or')]
wide_df_logor <- bar_logor %>% spread(Position, log10or, fill = 0)
wide_df_logor = as.matrix(wide_df_logor)
rownames(wide_df_logor) = wide_df_logor[,1]
wide_df_logor = wide_df_logor[,-1]
# custom height (log10OR) logo plot: yayy works
ggseqlogo(wide_df_logor, method='custom', seq_type='aa') + ylab('my custom height') +
theme(legend.position = "bottom"
, axis.text.x = element_text(size = 11
, angle = 90
, hjust = 1
, vjust = 0.4)
, axis.text.y = element_text(size = 15
, angle = 0
, hjust = 1
, vjust = 0))+
labs(title = "AA logo plot"
, x = "Wild-type Position"
, y = "Log10(OR)")
#=======================================================================
#%% logo plot from sequence
#################
# Plot: LOGOLAS (ED plots)
# link: https://github.com/kkdey/Logolas
# on all pncA samples: output of mutate.py
################
library(Logolas)
seqs = read.csv('~/git//Data/pyrazinamide/output/pnca_msa.txt'
, header = FALSE
, stringsAsFactors = FALSE)$V1
# my_data: useful!
logomaker(seqs, type = "EDLogo", color_type = 'per_symbol'
, return_heights = TRUE)
logomaker(seqs, type = "Logo", color_type = 'per_symbol')
#%% end of script
#=======================================================================