adding clean files for rerun 35k dataset
This commit is contained in:
parent
943513a338
commit
a7f21cfb14
32 changed files with 157 additions and 44550 deletions
@@ -1,319 +0,0 @@
#!/usr/bin/env Rscript
#require('compare')
require('getopt', quietly = TRUE) # needed to parse command line arguments
#########################################################
# TASK: Calculate Allele Frequency and Odds Ratio from the
# master data and add the calculated params to the meta_data
# extracted by data_extraction.py
#########################################################
#getwd()
setwd('~/git/LSHTM_analysis/scripts')
cat(c(getwd(), '\n'))

# Command line args
spec = matrix(c(
  "drug", "d", 1, "character",
  "gene", "g", 1, "character"
), byrow = TRUE, ncol = 4)

opt = getopt(spec)

drug = opt$drug
gene = opt$gene

if(is.null(drug) | is.null(gene)) {
  stop('Missing arguments: --drug and --gene must both be specified (case-sensitive)')
}
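# Example invocation (hypothetical script name; flags as defined in spec above):
#   Rscript af_or_calcs.R --drug pyrazinamide --gene pncA
# NOTE: the hard-coded drug/gene assignments below override whatever
# getopt parsed; comment them out to honour the command line.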
#options(scipen = 999) # disable scientific notation in R
#options(scipen = 4)

#%% variable assignment: input and output paths & filenames
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = paste0(gene, '_p.')
cat(gene_match)

#===========
# input
#===========
# infile1: raw data
#indir = 'git/Data/pyrazinamide/input/original'
indir = paste0('~/git/Data')
in_filename = 'original_tanushree_data_v2.csv'
#in_filename = 'mtb_gwas_v3.csv'
infile = paste0(indir, '/', in_filename)
cat(paste0('Reading infile1: raw data ', infile))

# infile2: gene associated meta data file to extract valid snps and add calcs to.
# This is outfile3 from data_extraction.py
indir_metadata = paste0('~/git/Data', '/', drug, '/', 'output')
in_filename_metadata = 'pnca_metadata.csv'
infile_metadata = paste0(indir_metadata, '/', in_filename_metadata)
cat(paste0('Reading infile2: gene associated metadata: ', infile_metadata))

#===========
# output
#===========
#outdir = 'git/Data/pyrazinamide/output'
outdir = paste0('~/git/Data', '/', drug, '/', 'output')
#out_filename = paste0(tolower(gene), '_meta_data_with_AF_OR.csv')
out_filename = paste0(tolower(gene), '_af_or.csv')
outfile = paste0(outdir, '/', out_filename)
cat(paste0('Output file with full path: ', outfile))
#%% end of variable assignment for input and output files

#########################################################
# 1: Read master/raw data stored in Data/
#########################################################
raw_data_all = read.csv(infile, stringsAsFactors = F)

# build the column names to extract
dr_muts_col = paste0('dr_mutations_', drug)
other_muts_col = paste0('other_mutations_', drug)

cat('Extracting columns based on variables:\n'
    , drug
    , '\n'
    , dr_muts_col
    , '\n'
    , other_muts_col
    , '\n===============================================================')

raw_data = raw_data_all[, c("id"
                            , drug
                            , dr_muts_col
                            , other_muts_col)]
rm(raw_data_all)

rm(indir, in_filename, infile)

#===========
# 1a: exclude NA
#===========
raw_data = raw_data[!is.na(raw_data[[drug]]), ]

total_samples = length(unique(raw_data$id))
cat(paste0('Total samples without NA in ', drug, ' is: ', total_samples))

# sanity check: should be TRUE
is.numeric(total_samples)

#===========
# 1b: combine the two mutation columns
#===========
all_muts_colname = paste0('all_mutations_', drug)
#raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)
raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]])
head(raw_data[[all_muts_colname]])

#===========
# 1c: create yet another column that contains all the mutations, but in lower case
#===========
head(raw_data[[all_muts_colname]])
raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]])
head(raw_data$all_muts_gene)

# sanity checks
#table(grepl("gene_p", raw_data$all_muts_gene))
cat(paste0('converting gene match: ', gene_match, ' to lowercase'))
gene_match = tolower(gene_match)

table(grepl(gene_match, raw_data$all_muts_gene))

# sanity check: should be TRUE
#sum(table(grepl("gene_p", raw_data$all_muts_gene))) == total_samples
if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){
  cat('PASS: total no. of samples match')
} else{
  cat('FAIL: no. of samples mismatch')
}

#########################################################
# 2: Read valid snps for which OR can be calculated
#########################################################
cat(paste0('Reading metadata infile: ', infile_metadata))

gene_metadata = read.csv(infile_metadata
                         #, file.choose()
                         , stringsAsFactors = F
                         , header = T)

# clear variables
rm(in_filename_metadata, infile_metadata)

# count NAs in the pyrazinamide column
tot_pza_na = sum(is.na(gene_metadata$pyrazinamide))
expected_rows = nrow(gene_metadata) - tot_pza_na

# drop NAs from the pyrazinamide column
gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]), ]

# sanity check
if(nrow(gene_snps_or) == expected_rows){
  cat('PASS: no. of rows match expected_rows')
} else{
  cat('FAIL: nrows mismatch.')
}

# extract unique snps to iterate over for AF and OR calcs
gene_snps_unique = unique(gene_snps_or$mutation)

cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique)))

#===========================================================================================
#########################
# custom chisq function:
# to calculate OR
#########################
i = "pnca_p.trp68gly"
mut = grepl(i, raw_data$all_muts_gene)
mut = as.numeric(mut)

dst = raw_data[[drug]]

#x = as.numeric(mut)
#y = dst

mychisq_or = function(x, y){
  tab = as.matrix(table(x, y))
  a = tab[2, 2]
  if (a == 0){ a <- 0.5 }
  b = tab[2, 1]
  if (b == 0){ b <- 0.5 }
  c = tab[1, 2]
  if (c == 0){ c <- 0.5 }
  d = tab[1, 1]
  if (d == 0){ d <- 0.5 }
  (a/b)/(c/d)
}
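# Worked example of the OR above (hypothetical counts, not from this dataset):
# table(x = dst, y = mut) is oriented so that
#   a = tab[2,2] (resistant, mut present),  b = tab[2,1] (resistant, mut absent)
#   c = tab[1,2] (sensitive, mut present),  d = tab[1,1] (sensitive, mut absent)
# OR = (a/b)/(c/d) = (a*d)/(b*c); any zero cell is replaced with 0.5
# (a Haldane-Anscombe style continuity correction) so the OR stays finite.
# e.g. a = 30, b = 70, c = 10, d = 90  ->  OR = (30/70)/(10/90) ~ 3.86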
or_mychisq = mychisq_or(dst, mut)
print(paste0('mychisq OR: ', or_mychisq))

#=====================================
# OR calcs using the following:
# 1) chisq.test
# 2) fisher
# 3) modified chisq.test
# 4) logistic
# 5) adjusted logistic?
# 6) kinship (separate script)

#======================================
# TEST FOR a few muts: sapply and df
#===============================================
snps <- gene_snps_unique # reassign so you can test with a subset of muts
#snps <- gene_snps_unique[1:2]
cat(paste0('Running calculations for: ', length(snps), ' nssnps\n'
           , 'gene: ', gene
           , '\ndrug: ', drug))

# DV: pyrazinamide 0 or 1
dst = raw_data[[drug]]

# initialise an empty df
ors_df = data.frame()

x = sapply(snps, function(m){

  mut = grepl(m, raw_data$all_muts_gene)
  mut = as.numeric(mut)
  cat(paste0('Running mutation: ', m, '\n'))

  model <- glm(dst ~ mut, family = binomial)

  #-------------------
  # allele frequency
  #-------------------
  afs = mean(mut)

  #-------------------
  # logistic model: row 2 of the coefficient table is the 'mut' term
  # (columns: Estimate, Std. Error, z value, Pr(>|z|))
  #-------------------
  beta_logistic = summary(model)$coefficients[2, 1]

  or_logistic = exp(summary(model)$coefficients[2, 1])
  #print(paste0('logistic OR: ', or_logistic))

  pval_logistic = summary(model)$coefficients[2, 4]
  #print(paste0('logistic pval: ', pval_logistic))

  se_logistic = summary(model)$coefficients[2, 2]
  #print(paste0('logistic SE: ', se_logistic))

  zval_logistic = summary(model)$coefficients[2, 3]
  #print(paste0('logistic zval: ', zval_logistic))

  ci_mod = exp(confint(model))[2, ]
  #print(paste0('logistic CI: ', ci_mod))

  ci_lower_logistic = ci_mod[["2.5 %"]]
  ci_upper_logistic = ci_mod[["97.5 %"]]

  #-------------------
  # custom_chisq and fisher: OR, p-value and CI
  #-------------------
  or_mychisq = mychisq_or(dst, mut)
  #print(paste0('mychisq OR: ', or_mychisq))

  ft = fisher.test(table(dst, mut)) # run once, reuse estimate, p-value and CI
  odds_fisher = ft$estimate
  or_fisher = odds_fisher[[1]]

  pval_fisher = ft$p.value

  ci_lower_fisher = ft$conf.int[1]
  ci_upper_fisher = ft$conf.int[2]

  #-------------------
  # chi sq estimates
  #-------------------
  ct = chisq.test(table(dst, mut))
  est_chisq = ct$statistic[[1]]; print(est_chisq)

  pval_chisq = ct$p.value

  # build a row to append to df
  row = data.frame(mutation = m
                   , af = afs
                   , beta_logistic = beta_logistic
                   , or_logistic = or_logistic
                   , pval_logistic = pval_logistic
                   , se_logistic = se_logistic
                   , zval_logistic = zval_logistic
                   , ci_low_logistic = ci_lower_logistic
                   , ci_hi_logistic = ci_upper_logistic
                   , or_mychisq = or_mychisq
                   , or_fisher = or_fisher
                   , pval_fisher = pval_fisher
                   , ci_low_fisher = ci_lower_fisher
                   , ci_hi_fisher = ci_upper_fisher
                   , est_chisq = est_chisq
                   , pval_chisq = pval_chisq
                   )
  #print(row)

  ors_df <<- rbind(ors_df, row)

})
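# Design note: ors_df is grown via global assignment (<<-) from inside sapply,
# whose own return value x is discarded. An equivalent, more idiomatic sketch:
#   rows = lapply(snps, function(m){ ... ; row })  # return each row instead
#   ors_df = do.call(rbind, rows)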
#%%======================================================
# Write file with the calculated ORs and AFs
cat(paste0('Writing output file: '
           , '\nFilename: ', out_filename))

write.csv(ors_df, outfile
          , row.names = F)

cat(paste0('Finished writing: '
           , outfile
           , '\nNo. of rows: ', nrow(ors_df)
           , '\nNo. of cols: ', ncol(ors_df)))
#************************************************
cat('\n======================================================================\n')
cat('End of script: calculated AF, OR, p-values and saved file')
@@ -1,51 +0,0 @@
#!/usr/bin/env python3
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import re
import os
#%%
def myalign(ref_seq, pdb_seq):

    myalign_dict = {}
    alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)
    #alignments = pairwise2.align.localxx(ref, struct)

    match = []

    for a, b in zip(alignments[0][0], alignments[0][1]):
        if a == b:
            match.append('|')
        else:
            match.append(' ')

    #print(match)
    print(alignments[0][0])
    print("".join(match))
    print(alignments[0][1])

    result_align = alignments[0][1]
    #print(result_align)
    print('===============================================================\n')

    # update dict
    myalign_dict.update({'aligned_fasta': result_align})

    # find start and end of the first aligned residue (i.e. skip leading gaps)
    aa_regex = r'\w'
    m = re.search(aa_regex, result_align)
    #m = my_match.span()
    offset = m.start()
    offset_end = m.end()

    print('start of match:', offset
          , '\nend of match:', offset_end)
    print('===============================================================\n')

    # update dict
    myalign_dict.update({'start_match': offset})
    myalign_dict.update({'end_match': offset_end})

    return myalign_dict
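# Hypothetical usage (sequences invented for illustration; the exact alignment
# depends on pairwise2's scoring and tie-breaking):
#   d = myalign('MRALIGD', 'RALIGD')
#   d['aligned_fasta']  ->  e.g. '-RALIGD' (pdb seq padded with a leading gap)
#   d['start_match']    ->  1  (index of the first non-gap residue)
#   d['end_match']      ->  2  (end of that first regex match, not of the alignment)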
@@ -1,24 +0,0 @@
#!/usr/bin/python3

#=======================================================================
# TASK: select specified chains from the pdb & save a cropped PDB with
# the selected chains. Useful for dimer, etc. modelling.

# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
#=======================================================================

from Bio.PDB import PDBParser, PDBIO, Select


# Select() subclass: accept_chain() returns 1 for every chain id in 'chain'
class ChainExtract(Select):
    def __init__(self, chain):
        self.chain = chain

    def accept_chain(self, chain):
        #print(dir(chain))
        if chain.id in self.chain:
            return 1
        else:
            return 0
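# Minimal usage sketch (file names hypothetical):
#   p = PDBParser(QUIET=True)
#   structure = p.get_structure('complex', 'complex.pdb')
#   io = PDBIO()
#   io.set_structure(structure)
#   io.save('complex_AB.pdb', ChainExtract(['A', 'B']))  # keep chains A and B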
@@ -1,28 +0,0 @@
#!/usr/bin/python3

#=======================================================================
# TASK: extract chain from pdb and save each chain as a separate file

# link for saving each chain as a separate file
#=======================================================================
__description__ = \
"""
pdb_chain_splitter.py

extracts chains and saves them as separate pdb files.
"""
__author__ = "Tanushree Tunstall"
__date__ = ""

from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser

class ChainSelect(Select):
    def __init__(self, chain):
        self.chain = chain

    def accept_chain(self, chain):
        if chain.get_id() == self.chain:
            return 1
        else:
            return 0
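# Sketch of the per-chain splitting loop (paths hypothetical):
#   p = PDBParser(QUIET=True)
#   structure = p.get_structure('complex', 'complex.pdb')
#   io = PDBIO()
#   io.set_structure(structure)
#   for chain in structure.get_chains():
#       io.save('chain_' + chain.get_id() + '.pdb', ChainSelect(chain.get_id()))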
@@ -1,177 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 2 (mcsm normalised data)
# to be consistent, like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs on common cols by detecting them
# includes sanity checks

#=======================================================================
#%% load packages
import sys, os
import pandas as pd
import numpy as np
import re
#from varname import nameof

#%% end of variable assignment for input and output files
#=======================================================================
#%% function/method to combine dfs

def detect_common_cols(df1, df2):
    """
    Detect common cols

    @param df1: df
    @type df1: pandas df

    @param df2: df
    @type df2: pandas df

    @return: common cols
    @type: list
    """
    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
    print('Length of comm_cols:', len(common_cols)
          , '\nmerging column/s:', common_cols
          , '\ntype:', type(common_cols)
          , '\ndtypes in merging columns:\n', df1[common_cols].dtypes)

    return common_cols
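# Hypothetical usage (toy frames, illustration only):
#   df1 = pd.DataFrame({'mutationinformation': ['A1B'], 'foo': [1]})
#   df2 = pd.DataFrame({'mutationinformation': ['A1B'], 'bar': [2]})
#   detect_common_cols(df1, df2)  ->  ['mutationinformation']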
#%% function to combine 2 dfs by detecting common cols and performing
# sanity checks on the output df
def combine_dfs_with_checks(df1, df2, my_join = 'outer'):
    """
    Combine 2 dfs by finding merging columns automatically

    @param df1: data frame
    @type df1: pandas df

    @param df2: data frame
    @type df2: pandas df

    @param my_join: join type for merging
    @type my_join: string

    @return: combined_df
    @type: pandas df
    """

    print('Finding common cols and merging cols:'
          , '\n=========================================================')

    common_cols = np.intersect1d(df1.columns, df2.columns).tolist()
    print('Length of comm_cols:', len(common_cols)
          , '\nmerging column/s:', common_cols
          , '\ntype:', type(common_cols))

    #print('\ndtypes in merging columns:\n', df1[common_cols].dtypes)

    print('selecting consistent dtypes for merging (object i.e. string)')
    #merging_cols = df1[common_cols].select_dtypes(include = [object]).columns.tolist()
    #merging_cols = df1[common_cols].select_dtypes(include = ['int64']).columns.tolist()
    merging_cols = common_cols.copy()

    nmerging_cols = len(merging_cols)
    print(' length of merging cols:', nmerging_cols
          , '\nmerging cols:', merging_cols, 'type:', type(merging_cols)
          , '\n=========================================================')

    #========================
    # merge 1 (combined_df)
    # concatenating 2 dfs:
    # df1, df2
    #========================
    # checking cross-over of mutations in the two dfs to merge
    ndiff_1 = df1[merging_cols].squeeze().isin(df2[merging_cols].squeeze()).sum()
    ndiff1 = df1.shape[0] - ndiff_1
    print('There are', ndiff1, 'unmatched mutations in left df')

    #missing_mutinfo = df1[~left_df['mutationinformation'].isin(df2['mutationinformation'])]
    #missing_mutinfo.to_csv('infoless_muts.csv')

    ndiff_2 = df2[merging_cols].squeeze().isin(df1[merging_cols].squeeze()).sum()
    ndiff2 = df2.shape[0] - ndiff_2
    print('There are', ndiff2, 'unmatched mutations in right df')

    #comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
    #comm_vals_count = len(comm_vals)
    #print('length of common values:', comm_vals_count, '\ntype:', type(comm_vals_count))

    #========================
    # merging dfs & sanity checks
    #========================
    fail = False
    print('combining with:', my_join)
    comb_df = pd.merge(df1, df2, on = merging_cols, how = my_join)

    expected_cols = df1.shape[1] + df2.shape[1] - nmerging_cols

    if my_join == 'right':
        df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
        expected_rows = df2_nd.shape[0]

    if my_join == 'left':
        expected_rows = df1.shape[0]

    #if my_join == 'inner':
    #    expected_rows = comm_vals_count

    #if my_join == 'outer':
    #    df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
    #    df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
    #    expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - comm_vals_count

    if my_join in ('inner', 'outer') and len(merging_cols) > 1:
        #comm_vals = np.intersect1d(df1['mutationinformation'], df2['mutationinformation'])
        print('length of merging_cols > 1, therefore omitting row checks')
        combined_df = comb_df.copy()
        expected_rows = len(combined_df)

    else:
        comm_vals = np.intersect1d(df1[merging_cols], df2[merging_cols])
        print('length of merging_cols == 1, calculating expected rows in merged_df')
        combined_df = comb_df.drop_duplicates(subset = merging_cols, keep = 'first')
        if my_join == 'inner':
            expected_rows = len(comm_vals)
        if my_join == 'outer':
            df1_nd = df1.drop_duplicates(merging_cols, keep = 'first')
            df2_nd = df2.drop_duplicates(merging_cols, keep = 'first')
            expected_rows = df1_nd.shape[0] + df2_nd.shape[0] - len(comm_vals)

    if len(combined_df) == expected_rows and len(combined_df.columns) == expected_cols:
        print('PASS: successfully combined dfs with:', my_join, 'join')
    else:
        print('FAIL: combined_df\'s expected rows and cols not matched')
        fail = True
        print('\nExpected no. of rows:', expected_rows
              , '\nGot:', len(combined_df)
              , '\nExpected no. of cols:', expected_cols
              , '\nGot:', len(combined_df.columns))
    if fail:
        sys.exit()

    #if clean:
    #foo = combined_df2.filter(regex = r'.*_x|_y', axis = 1)
    #print(foo.columns)
    #print('Detected duplicate cols with suffix: _x _y'
    #      , '\nDropping duplicate cols and cleaning')

    # drop cols containing the suffix '_y', then rename cols to strip the '_x' suffix
    combined_df_clean = combined_df.drop(combined_df.filter(regex = r'.*_y').columns, axis = 1)
    combined_df_clean.rename(columns = lambda x: re.sub('_x$', '', x), inplace = True)

    return combined_df_clean

#%% end of function
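# Hypothetical usage (toy frames, illustration only):
#   left  = pd.DataFrame({'mutation': ['A1B', 'C2D'], 'or': [1.2, 0.8]})
#   right = pd.DataFrame({'mutation': ['A1B'], 'af': [0.05]})
#   combined = combine_dfs_with_checks(left, right, my_join = 'left')
#   # -> 2 rows, 3 cols (expected_cols = 2 + 2 - 1)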
#=======================================================================
@@ -1,287 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
# FIXME: change filename 2 (mcsm normalised data)
# to be consistent, like (pnca_complex_mcsm_norm.csv): changed manually, but ensure this is done in the mcsm pipeline
#=======================================================================
# Task: combine 2 dfs with aa position as the linking column

# Input: 2 dfs
# <gene.lower()>_complex_mcsm_norm.csv
# <gene.lower()>_foldx.csv

# Output: .csv of all 2 dfs combined

# useful link
# https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#=======================================================================
#%% load packages
import sys, os
import pandas as pd
import numpy as np
#from varname import nameof
import argparse

#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()

# FIXME: local imports
#from combining import combine_dfs_with_checks
from combining_FIXME import detect_common_cols
#=======================================================================
#%% command line args
#arg_parser = argparse.ArgumentParser()
#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
#args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
gene_match = gene + '_p.'

#drug = args.drug
#gene = args.gene
#======
# dirs
#======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'

#=======
# input
#=======
in_filename_mcsm = gene.lower() + '_complex_mcsm_norm.csv'
in_filename_foldx = gene.lower() + '_foldx.csv'
in_filename_dssp = gene.lower() + '_dssp.csv'
in_filename_kd = gene.lower() + '_kd.csv'
in_filename_rd = gene.lower() + '_rd.csv'
in_filename_snpinfo = 'ns' + gene.lower() + '_snp_info.csv'
in_filename_afor = gene.lower() + '_af_or.csv'
in_filename_afor_kin = gene.lower() + '_af_or_kinship.csv'


infile_mcsm = outdir + '/' + in_filename_mcsm
infile_foldx = outdir + '/' + in_filename_foldx
infile_dssp = outdir + '/' + in_filename_dssp
infile_kd = outdir + '/' + in_filename_kd
infile_rd = outdir + '/' + in_filename_rd
infile_snpinfo = indir + '/' + in_filename_snpinfo
infile_afor = outdir + '/' + in_filename_afor
infile_afor_kin = outdir + '/' + in_filename_afor_kin


print('\nInput path:', outdir
      , '\nInput filename mcsm:', infile_mcsm
      , '\nInput filename foldx:', infile_foldx
      , '\nInput filename dssp:', infile_dssp
      , '\nInput filename kd:', infile_kd
      , '\nInput filename rd:', infile_rd
      , '\nInput filename snp info:', infile_snpinfo
      , '\nInput filename af or:', infile_afor
      , '\nInput filename afor kinship:', infile_afor_kin
      , '\n============================================================')

#=======
# output
#=======
out_filename_comb = gene.lower() + '_all_params.csv'
outfile_comb = outdir + '/' + out_filename_comb
print('Output filename:', outfile_comb
      , '\n============================================================')

o_join = 'outer'
l_join = 'left'
r_join = 'right'
i_join = 'inner'

# end of variable assignment for input and output files
#%%=====================================================================
mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
mcsm_df.columns = mcsm_df.columns.str.lower()
foldx_df = pd.read_csv(infile_foldx, sep = ',')

print('==================================='
      , '\nFirst merge: mcsm + foldx'
      , '\n===================================')
#mcsm_foldx_dfs = combine_dfs_with_checks(mcsm_df, foldx_df, my_join = o_join)
merging_cols_m1 = detect_common_cols(mcsm_df, foldx_df)

mcsm_foldx_dfs = pd.merge(mcsm_df, foldx_df, on = merging_cols_m1, how = 'outer')
ncols_m1 = len(mcsm_foldx_dfs.columns)
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('==================================='
      , '\nSecond merge: dssp + kd'
      , '\n===================================')

dssp_df = pd.read_csv(infile_dssp, sep = ',')
kd_df = pd.read_csv(infile_kd, sep = ',')
rd_df = pd.read_csv(infile_rd, sep = ',')

#dssp_kd_dfs = combine_dfs_with_checks(dssp_df, kd_df, my_join = o_join)
merging_cols_m2 = detect_common_cols(dssp_df, kd_df)

dssp_kd_dfs = pd.merge(dssp_df, kd_df, on = merging_cols_m2, how = 'outer')

print('==================================='
      , '\nThird merge: dssp_kd_dfs + rd_df'
      , '\n===================================')
#dssp_kd_rd_dfs = combine_dfs_with_checks(dssp_kd_dfs, rd_df, my_join = o_join)
merging_cols_m3 = detect_common_cols(dssp_kd_dfs, rd_df)
dssp_kd_rd_dfs = pd.merge(dssp_kd_dfs, rd_df, on = merging_cols_m3, how = 'outer')

ncols_m3 = len(dssp_kd_rd_dfs.columns)
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('==================================='
      , '\nFourth merge: First merge + Third merge'
      , '\n===================================')
#combined_dfs = combine_dfs_with_checks(mcsm_foldx_dfs, dssp_kd_rd_dfs, my_join = i_join) # gives wrong!
merging_cols_m4 = detect_common_cols(mcsm_foldx_dfs, dssp_kd_rd_dfs)
combined_df_expected_cols = ncols_m1 + ncols_m3 - len(merging_cols_m4)
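# Column arithmetic behind the sanity check below (hypothetical counts):
# if the first merge produced 20 cols, the third merge 10 cols, and they
# share 2 merging cols, the combined df should have 20 + 10 - 2 = 28 cols.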
combined_df = pd.merge(mcsm_foldx_dfs, dssp_kd_rd_dfs, on = merging_cols_m4, how = 'inner')

if len(combined_df) == len(mcsm_df) and len(combined_df.columns) == combined_df_expected_cols:
    print('PASS: successfully combined 5 dfs'
          , '\nnrows combined_df:', len(combined_df)
          , '\ncols combined_df:', len(combined_df.columns))
else:
    sys.exit('FAIL: check individual df merges')
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#%% OR combining
afor_df = pd.read_csv(infile_afor, sep = ',')
afor_df.columns = afor_df.columns.str.lower()

if afor_df['mutation'].shape[0] == afor_df['mutation'].nunique():
    print('No duplicate muts detected in afor_df')
else:
    print('Dropping duplicate muts detected in afor_df')
    afor_df = afor_df.drop_duplicates(subset = 'mutation', keep = 'first')


snpinfo_df_all = pd.read_csv(infile_snpinfo, sep = ',')
snpinfo_df = snpinfo_df_all[['mutation', 'mutationinformation']]

if snpinfo_df['mutation'].shape[0] == snpinfo_df['mutation'].nunique():
    print('No duplicate muts detected in snpinfo_df')
else:
    dups = snpinfo_df['mutation'].duplicated().sum()
    print(dups, 'duplicate muts detected in snpinfo_df'
          , '\nDim:', snpinfo_df.shape)
    print('Dropping duplicate muts')
    snpinfo_df = snpinfo_df.drop_duplicates(subset = 'mutation', keep = 'first')
    print('Dim:', snpinfo_df.shape)

print('==================================='
      , '\nFifth merge: afor_df + snpinfo_df'
      , '\n===================================')

merging_cols_m5 = detect_common_cols(afor_df, snpinfo_df)

afor_snpinfo_dfs = pd.merge(afor_df, snpinfo_df, on = merging_cols_m5, how = 'left')
if len(afor_snpinfo_dfs) == afor_df.shape[0]:
    print('PASS: successfully combined with left join'
          , '\nDim of df1:', afor_df.shape
          , '\nDim of df2:', snpinfo_df.shape)
else:
    sys.exit('FAIL: unsuccessful merge')

#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
afor_kin_df = pd.read_csv(infile_afor_kin, sep = ',')
afor_kin_df.columns = afor_kin_df.columns.str.lower()

print('==================================='
      , '\nSixth merge: afor_snpinfo_dfs + afor_kin_df'
      , '\n===================================')

merging_cols_m6 = detect_common_cols(afor_snpinfo_dfs, afor_kin_df)

print('Dim of df1:', afor_snpinfo_dfs.shape
      , '\nDim of df2:', afor_kin_df.shape
      , '\nno. of merging_cols:', len(merging_cols_m6))

ors_df = pd.merge(afor_snpinfo_dfs, afor_kin_df, on = merging_cols_m6, how = 'outer')

print('Dim of ors_df:', ors_df.shape)

#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print('==================================='
      , '\nSeventh merge: combined_df + ors_df'
      , '\n===================================')

merging_cols_m7 = detect_common_cols(combined_df, ors_df)

print('Dim of df1:', combined_df.shape
      , '\nDim of df2:', ors_df.shape
      , '\nno. of merging_cols:', len(merging_cols_m7))

print('checking mutations in the two dfs:'
      , '\nmuts in df1 present in df2:'
      , combined_df['mutationinformation'].isin(ors_df['mutationinformation']).sum()
      , '\nmuts in df2 present in df1:'
      , ors_df['mutationinformation'].isin(combined_df['mutationinformation']).sum())

#print('\nNo. of common muts:', np.intersect1d(combined_df['mutationinformation'], ors_df['mutationinformation']))

#combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'outer') # FIXME
combined_df_all = pd.merge(combined_df, ors_df, on = merging_cols_m7, how = 'left')

outdf_expected_rows = len(combined_df)
outdf_expected_cols = len(combined_df.columns) + len(ors_df.columns) - len(merging_cols_m7)

print('\nDim of combined_df_all:', combined_df_all.shape
      , '\nwith join type: left')

if combined_df_all.shape[1] == outdf_expected_cols:
    print('combined_df has expected no. of cols')
if combined_df_all.shape[0] == outdf_expected_rows:
    print('combined_df has expected no. of rows')
else:
    print('WARNING: nrows discrepancy noted'
          , '\nFIX IT')
print('Finished combining')
#%%~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write csv

combined_df_all.to_csv(outfile_comb, index = False)

#=======================================================================
#%% in case you FIX the function: combine_dfs_with_checks
#def main():

#    print('Reading input files:')
#mcsm_df = pd.read_csv(infile_mcsm, sep = ',')
#mcsm_df.columns = mcsm_df.columns.str.lower()

#foldx_df = pd.read_csv(infile_foldx, sep = ',')

#dssp_df = pd.read_csv(infile_dssp, sep = ',')
#dssp_df.columns = dssp_df.columns.str.lower()

#kd_df = pd.read_csv(infile_kd, sep = ',')
#kd_df.columns = kd_df.columns.str.lower()

#rd_df = pd.read_csv(infile_rd, sep = ',')



#if __name__ == '__main__':
#    main()
#=======================================================================
#%% end of script
@@ -1,9 +0,0 @@
#!/usr/bin/env Rscript
print('R Argument Test')

cmd <- paste(commandArgs(), collapse = " ")
cat("How R was invoked:\n")
cat(cmd, "\n")

args <- commandArgs(trailingOnly = TRUE)
cat(c('Command Line Arguments supplied: ', args))
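# Example (hypothetical): `Rscript arg_test.R foo bar` would print the full
# invocation line (R binary, --file=..., --args) followed by:
#   Command Line Arguments supplied:  foo bar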
@@ -21,6 +21,9 @@ Created on Tue Aug 6 12:56:03 2019
 # where each row is a separate mutation
 # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique
+
+# NOTE
+# drtype is renamed to 'resistance' in the 35k dataset

 # output files: all lower case
 # 0) <gene>_common_ids.csv
 # 1) <gene>_ambiguous_muts.csv
@@ -60,6 +63,7 @@ os.getcwd()

 # import aa dict
 from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+from tidy_split import tidy_split
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@@ -96,8 +100,8 @@ datadir = homedir + '/' + 'git/Data'
 #=======
 # input
 #=======
-in_filename = 'original_tanushree_data_v2.csv'
-#in_filename = 'mtb_gwas_v3.csv'
+#in_filename = 'original_tanushree_data_v2.csv' #19k
+in_filename = 'mtb_gwas_meta_v3.csv' #33k
 infile = datadir + '/' + in_filename
 print('Input file: ', infile
       , '\n============================================================')
@@ -121,17 +125,45 @@ master_data = pd.read_csv(infile, sep = ',')
 #list(master_data.columns)

 # extract relevant columns from the meta data related to the drug
-meta_data = master_data[['id'
-                         ,'country'
-                         ,'lineage'
-                         ,'sublineage'
-                         ,'drtype'
-                         , drug
-                         , dr_muts_col
-                         , other_muts_col
-                         ]]
-
-del(master_data)
+#meta_data_ch = master_data[['id'
+#                            , 'country'
+#                            , 'lineage'
+#                            , 'sublineage'
+##                           , 'drtype' #19k only
+#                            , 'resistance'
+#                            , drug
+#                            , dr_muts_col
+#                            , other_muts_col]]
+
+core_cols = ['id'
+             , 'country'
+             , 'country2'
+             , 'geographic_source'
+             , 'region'
+             , 'date'
+             , 'strain'
+             , 'lineage'
+             , 'sublineage' # drtype renamed to resistance
+             , 'resistance'
+             , 'location'
+             , 'host_body_site'
+             , 'environment_material'
+             , 'host_status'
+             , 'hiv_status'
+             , 'HIV_status'
+             , 'isolation_source']
+
+variable_based_cols = [drug
+                       , dr_muts_col
+                       , other_muts_col]
+
+cols_to_extract = core_cols + variable_based_cols
+
+meta_data = master_data[cols_to_extract]
+
+del(master_data, variable_based_cols, cols_to_extract)

 # checks and results
 total_samples = meta_data['id'].nunique()
@@ -269,14 +301,23 @@ print('gene to extract:', gene_match )
 #===============
 # FIXME: replace drug with variable containing the drug name
 # !!! important !!!
-meta_data_dr = meta_data[['id'
-                          ,'country'
-                          ,'lineage'
-                          ,'sublineage'
-                          ,'drtype'
-                          , drug
-                          , dr_muts_col
-                          ]]
+#meta_data_dr = meta_data[['id'
+#                          ,'country'
+#                          ,'lineage'
+#                          ,'sublineage'
+#                          ,'drtype'
+#                          , drug
+#                          , dr_muts_col
+#                          ]]
+
+dr_based_cols = [drug, dr_muts_col]
+
+cols_to_extract = core_cols + dr_based_cols
+
+meta_data_dr = meta_data[cols_to_extract]
+
+del(dr_based_cols, cols_to_extract)

 print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
 print('actual dim:', meta_data_dr.shape
       , '\n===============================================================')
@@ -306,14 +347,22 @@ dr_id = pd.Series(dr_id)
 print('Extracting dr_muts from:', other_muts_col, 'with other meta_data')
 # FIXME: replace drug with variable containing the drug name
 # !!! important !!!
-meta_data_other = meta_data[['id'
-                             ,'country'
-                             ,'lineage'
-                             ,'sublineage'
-                             ,'drtype'
-                             , drug
-                             , other_muts_col
-                             ]]
+#meta_data_other = meta_data[['id'
+#                             ,'country'
+#                             ,'lineage'
+#                             ,'sublineage'
+##                            ,'drtype'
+#                             , drug
+#                             , other_muts_col
+#                             ]]
+
+dr_based_cols = [drug, other_muts_col]
+
+cols_to_extract = core_cols + dr_based_cols
+
+meta_data_other = meta_data[cols_to_extract]
+
+del(dr_based_cols, cols_to_extract)

 print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
 print('actual dim:', meta_data_other.shape
@@ -373,7 +422,7 @@ print('Writing file:'
       , '\nExpected no. of rows:', len(common_ids)
       , '\n=============================================================')

-common_ids.to_csv(outfile0)
+common_ids.to_csv(outfile0, index = False)
 del(out_filename0)

 # clear variables
@@ -419,44 +468,15 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
 #https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
 print('Performing tidy_split(): to separate the mutations into individual rows')

-# define the split function
-def tidy_split(df, column, sep='|', keep=False):
-    '''
-    Split the values of a column and expand so the new DataFrame has one split
-    value per row. Filters rows where the column is missing.
-
-    Params
-    ------
-    df : pandas.DataFrame
-        dataframe with the column to split and expand
-    column : str
-        the column to split and expand
-    sep : str
-        the string used to split the column's values
-    keep : bool
-        whether to retain the presplit value as its own row
-
-    Returns
-    -------
-    pandas.DataFrame
-        Returns a dataframe with the same columns as `df`.
-    '''
-    indexes = list()
-    new_values = list()
-    #df = df.dropna(subset=[column]) #!!! see this in case you need to uncomment based on use case
-    for i, presplit in enumerate(df[column].astype(str)):
-        values = presplit.split(sep)
-        if keep and len(values) > 1:
-            indexes.append(i)
-            new_values.append(presplit)
-        for value in values:
-            indexes.append(i)
-            new_values.append(value)
-    new_df = df.iloc[indexes, :].copy()
-    new_df[column] = new_values
-    return new_df
-
-#%% end of tidy_split()
+#TIDY SPLIT HERE


 #=========
 # DF1: dr_muts_col
 #=========
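# For reference, a hypothetical tidy_split() call on toy data (the function is
# now imported from the tidy_split module instead of being defined inline):
#   df = pd.DataFrame({'id': [1], 'muts': ['pnca_p.trp68gly; pnca_p.gln10pro']})
#   tidy_split(df, 'muts', sep = ';')
#   # -> two rows, one mutation each, 'id' repeated (whitespace stripped separately)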
@@ -761,12 +781,11 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
 out_filename1 = gene.lower() + '_ambiguous_muts.csv'
 outfile1 = outdir + '/' + out_filename1
 print('Writing file: ambiguous muts'
-      , '\nFilename:', out_filename1
-      , '\nPath:', outdir)
+      , '\nFilename:', outfile1)

 #common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test
 inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-inspect.to_csv(outfile1)
+inspect.to_csv(outfile1, index = False)

 print('Finished writing:', out_filename1
       , '\nNo. of rows:', len(inspect)
@@ -1069,13 +1088,13 @@ else:
     print('FAIL: SNP has NA, Possible mapping issues from dict?'
           , '\nDebug please!'
           , '\n=========================================================')
+    sys.exit()

 out_filename2 = gene.lower() + '_mcsm_snps.csv'
 outfile2 = outdir + '/' + out_filename2

 print('Writing file: mCSM style muts'
-      , '\nFilename:', out_filename2
-      , '\nPath:', outdir
+      , '\nFilename:', outfile2
       , '\nmutation format (SNP): {WT}<POS>{MUT}'
       , '\nNo. of distinct muts:', len(snps_only)
       , '\nNo. of distinct positions:', len(pos_only)
@@ -1083,7 +1102,7 @@ print('Writing file: mCSM style muts'

 snps_only.to_csv(outfile2, header = False, index = False)

-print('Finished writing:', out_filename2
+print('Finished writing:', outfile2
       , '\nNo. of rows:', len(snps_only)
       , '\nNo. of cols:', len(snps_only.columns)
       , '\n=============================================================')
@@ -1099,7 +1118,7 @@ print('Writing file: LF formatted data'
       , '\n============================================================')

 gene_LF1.to_csv(outfile3, header = True, index = False)
-print('Finished writing:', out_filename3
+print('Finished writing:', outfile3
       , '\nNo. of rows:', len(gene_LF1)
       , '\nNo. of cols:', len(gene_LF1.columns)
       , '\n=============================================================')
@@ -1118,11 +1137,11 @@ all_muts_msa.columns.dtype
 all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')

 # create an extra column with protein name
-all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
-all_muts_msa_sorted.head()
+#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1')
+#all_muts_msa_sorted.head()

 # rearrange columns so the fasta name is the first column (required for mutate.script)
-all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
+#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
 all_muts_msa_sorted.head()

 print('Checking NA in snps...')# should be 0
@@ -1138,15 +1157,14 @@ out_filename4 = gene.lower() +'_all_muts_msa.csv'
 outfile4 = outdir + '/' + out_filename4

 print('Writing file: mCSM style muts for msa',
-      '\nFilename:', out_filename4,
-      '\nPath:', outdir,
+      '\nFilename:', outfile4,
       '\nmutation format (SNP): {WT}<POS>{MUT}',
       '\nNo.of lines of msa:', len(all_muts_msa),
       )

 all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)

-print('Finished writing:', out_filename4
+print('Finished writing:', outfile4
       , '\nNo. of rows:', len(all_muts_msa)
       , '\nNo. of cols:', len(all_muts_msa.columns)
       , '\n=============================================================')
@@ -1177,7 +1195,7 @@ print('Writing file: mutational positions'

 pos_only_sorted.to_csv(outfile5, header = True, index = False)

-print('Finished writing:', out_filename5
+print('Finished writing:', outfile5
       , '\nNo. of rows:', len(pos_only_sorted)
       , '\nNo. of cols:', len(pos_only_sorted.columns)
       , '\n=============================================================')
@ -1,218 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
Created on Tue Apr 7 09:30:16 2020
|
|
||||||
|
|
||||||
@author: tanu
|
|
||||||
"""
|
|
||||||
#=======================================================================
|
|
||||||
# TASK:
|
|
||||||
|
|
||||||
#=======================================================================
|
|
||||||
#%% load packages
|
|
||||||
import sys, os
|
|
||||||
import argparse
|
|
||||||
import re
|
|
||||||
import pandas as pd
|
|
||||||
from Bio.PDB import PDBParser
|
|
||||||
from Bio.PDB.DSSP import DSSP
|
|
||||||
import dms_tools2
|
|
||||||
import dms_tools2.dssp
|
|
||||||
import pprint as pp
|
|
||||||
#=======================================================================
|
|
||||||
#%% specify homedir and curr dir
|
|
||||||
homedir = os.path.expanduser('~')
|
|
||||||
|
|
||||||
# set working dir
|
|
||||||
os.getcwd()
|
|
||||||
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
|
||||||
os.getcwd()
|
|
||||||
#=======================================================================
|
|
||||||
#%% command line args
|
|
||||||
arg_parser = argparse.ArgumentParser()
|
|
||||||
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
|
|
||||||
arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) # case sensitive
|
|
||||||
args = arg_parser.parse_args()
|
|
||||||
#=======================================================================
|
|
||||||
#%% variable assignment: input and output
|
|
||||||
#drug = 'pyrazinamide'
|
|
||||||
#gene = 'pncA'
|
|
||||||
#gene_match = gene + '_p.'
|
|
||||||
|
|
||||||
#drug = 'isoniazid'
|
|
||||||
#gene = 'katG'
|
|
||||||
|
|
||||||
#drug = 'cycloserine'
|
|
||||||
#gene = 'alr'
|
|
||||||
|
|
||||||
drug = args.drug
|
|
||||||
gene = args.gene
|
|
||||||
gene_match = gene + '_p.'
|
|
||||||
|
|
||||||
#==========
|
|
||||||
# data dir
|
|
||||||
#==========
|
|
||||||
#indir = 'git/Data/pyrazinamide/input/original'
|
|
||||||
datadir = homedir + '/' + 'git/Data'
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# input
|
|
||||||
#=======
|
|
||||||
#indir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
indir = datadir + '/' + drug + '/' + 'input'
|
|
||||||
in_filename = gene.lower() + '_complex' + '.pdb'
|
|
||||||
#in_filename = 'katg_complex.pdb' # fixme for pnca(consistent filenames i.e pnca_complex.pdb)
|
|
||||||
infile = indir + '/' + in_filename
|
|
||||||
|
|
||||||
#=======
|
|
||||||
# output
|
|
||||||
#=======
|
|
||||||
outdir = datadir + '/' + drug + '/' + 'output'
|
|
||||||
print('Output path:', outdir)
|
|
||||||
|
|
||||||
#out_filename = os.path.splitext(in_filename)[0]+'.dssp' # strip file ext
|
|
||||||
dssp_filename = gene.lower() + '.dssp'
|
|
||||||
dssp_file = outdir + '/' + dssp_filename
|
|
||||||
print('Output dssp:', dssp_file)
|
|
||||||
|
|
||||||
dsspcsv_filename = gene.lower() + '_dssp.csv'
|
|
||||||
dsspcsv_file = outdir + '/' + dsspcsv_filename
|
|
||||||
print('Outfile dssp to csv: ', dsspcsv_file
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
#%% end of variable assignment for input and output files
|
|
||||||
#=======================================================================
|
|
||||||
#%% create .dssp from pdb
|
|
||||||
def dssp_file_from_pdb(inputpdbfile, outfile, DSSP = "dssp"):
|
|
||||||
"""
|
|
||||||
Create a DSSP file from a PDB file
|
|
||||||
|
|
||||||
@param inputpdbfile: pdb file
|
|
||||||
@type inputpdbfile: string
|
|
||||||
|
|
||||||
@param outfile: dssp file
|
|
||||||
@type outfile: string
|
|
||||||
|
|
||||||
@param DSSP: DSSP executable (argument to os.system)
|
|
||||||
@type DSSP: string
|
|
||||||
|
|
||||||
@return: none, creates dssp file
|
|
||||||
"""
|
|
||||||
# out_file = infile +'.dssp'
|
|
||||||
# outfile = os.path.splitext(inputpdbfile)[0]+'.dssp' # strip file ext
|
|
||||||
os.system("%s -i %s -o %s" % (DSSP, inputpdbfile, outfile))
#=======================================================================
#%% extract chain id from dssp

#print(dssp.keys())
#print(dssp.keys()[0][0])
#print(len(dssp))
#print(dssp.keys()[0][0])
#print(dssp.keys()[len(dssp)-1][0])
def extract_chain_dssp(inputpdbfile):
    """
    extracts chain_ids from dssp run on pdb file
    This is to allow processing of dssp output to df
    and for writing as csv file

    Parameters
    ----------
    @param inputpdbfile: pdb file
    @type inputpdbfile: string

    Returns
    -------
    @return: chain_ids from running dssp on pdb file
    @type list
    """
    p = PDBParser()
    # use the function parameter (the original used the global infile here)
    structure = p.get_structure(inputpdbfile, inputpdbfile)
    model = structure[0]
    dssp = DSSP(model, inputpdbfile)

    dssp_chains = []
    for num_aa in range(0, len(dssp)):
        # print(num_aa)
        # extract the chain id only and append to a list
        dssp_chains.append(dssp.keys()[num_aa][0])
    chainsL = list(set(dssp_chains))
    print(chainsL)
    # sort the list (since sets are not ordered) for convenience
    # this will be required for dssp_df
    pdbchainlist = sorted(chainsL)
    print('dssp output for'
          , inputpdbfile, 'contains:', len(pdbchainlist)
          , 'chains:\n', pdbchainlist)
    return pdbchainlist
#=======================================================================
#%% write csv of processed dssp output
def dssp_to_csv(inputdsspfile, outfile, pdbchainlist = ['A']):
    """
    Create a df from a dssp file containing ASA, RSA, SS for all chains

    @param inputdsspfile: dssp file
    @type inputdsspfile: string

    @param outfile: csv file
    @type outfile: string

    @param pdbchainlist: chains to process (DSSP to df processing uses dms_tools2)
    @type pdbchainlist: list

    @return: none, creates csv file
    """
    dssp_df = pd.DataFrame()

    print('Total no. of chains: ', len(pdbchainlist))
    for chain_id in pdbchainlist:
        print('Chain id:', chain_id)
        dssp_cur = pd.DataFrame()
        dssp_cur = dms_tools2.dssp.processDSSP(inputdsspfile, chain = chain_id)
        #!!!Important!!!
        dssp_cur['chain_id'] = chain_id
        dssp_df = pd.concat([dssp_df, dssp_cur]) # DataFrame.append is deprecated
    pp.pprint(dssp_df)

    # Rename column 'amino_acid' as 'wild_type_dssp' and 'site' as 'position'
    # to be the same names as used in the file required for merging later.
    dssp_df.columns
    dssp_df.rename(columns = {'site':'position', 'amino_acid':'wild_type_dssp'}, inplace = True)
    dssp_df.columns

    # sanity check
    # if len(dssp_df) == len(dssp):
    #     print('PASS: dssp_df has the correct length')
    # else:
    #     print('FAIL: length mismatch for dssp_df'
    #           , '\nExpected length:', len(dssp)
    #           , '\nGot length:', len(dssp_df)
    #           , 'Debug please!')

    # write to csv
    dssp_df.to_csv(outfile, header = True, index = False)

    print('Finished writing:', outfile
          , '\nNo. of rows:', len(dssp_df)
          , '\nNo. of cols:', len(dssp_df.columns)
          , '\n==============================================================')
#=======================================================================
#%% call functions
#dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
#my_chains = extract_chain_dssp(infile)
#dssp_to_csv(dssp_file, dsspcsv_file, my_chains)
#%%
#=======================================================================
def main():
    print('Running dssp with the following params:\n'
          , in_filename
          , 'outfile:', dsspcsv_filename)
    dssp_file_from_pdb(infile, dssp_file, DSSP = "dssp")
    my_chains = extract_chain_dssp(infile)
    dssp_to_csv(dssp_file, dsspcsv_file, my_chains)

if __name__ == '__main__':
    main()
#%% end of script
#=======================================================================
#=======================================================================
@@ -1,99 +0,0 @@
#!/usr/bin/env python3

import pandas as pd

DEBUG = False
#%%
#def find_missense(test_df, ref_allele1, alt_allele0):
def find_missense(test_df, ref_allele_column, alt_allele_column, n_diff_colname = 'n_diff', tot_diff_colname = 'tot_diff', ref_a_colname = 'ref_allele', alt_a_colname = 'alt_allele'):

    """Find mismatches in a pairwise comparison of the strings in
    ref_allele_column and alt_allele_column

    Case insensitive: converts strings to uppercase before comparison

    @test_df: df containing columns to compare
    @type: pandas df

    @ref_allele_column: column containing ref allele str
    @type: str (converted to uppercase)

    @alt_allele_column: column containing alt allele str
    @type: str (converted to uppercase)

    @n_diff_colname: user-defined colname for no. of char diffs b/w ref allele str and alt allele str
    @type: str

    @tot_diff_colname: user-defined colname for abs diff, to indicate whether the strings are of equal length
    @type: str

    @ref_a_colname: user-defined colname containing the extracted reference allele
    @type: str

    @alt_a_colname: user-defined colname containing the extracted alt allele
    @type: str

    returns df: with 4 added columns. If column names clash, the function
    column names will override the original columns
    @rtype: pandas df
    """
    for ind, val in test_df.iterrows():
        if DEBUG:
            print('index:', ind, 'value:', val
                  , '\n============================================================')
        ref_a = val[ref_allele_column].upper()
        alt_a = val[alt_allele_column].upper()
        if DEBUG:
            print('ref_allele_string:', ref_a, 'alt_allele_string:', alt_a)
        difference = sum(1 for e in zip(ref_a, alt_a) if e[0] != e[1])
        test_df.at[ind, n_diff_colname] = difference # adding column
        tot_difference = difference + abs(len(ref_a) - len(alt_a))
        test_df.at[ind, tot_diff_colname] = tot_difference # adding column
        if difference != tot_difference:
            print('WARNING: lengths of ref_allele and alt_allele differ at index:', ind
                  , '\nNon-missense muts detected')

        # Now finding the mismatched chars
        ref_aln = ''
        alt_aln = ''
        if ref_a == alt_a:
            ##test_df.at[ind, 'ref_allele'] = 'no_change' # adding column
            ##test_df.at[ind, 'alt_allele'] = 'no_change' # adding column
            test_df.at[ind, ref_a_colname] = 'no_change' # adding column
            test_df.at[ind, alt_a_colname] = 'no_change' # adding column
        elif len(ref_a) == len(alt_a) and len(ref_a) > 0:
            print('ref:', ref_a, 'alt:', alt_a)
            for n in range(len(ref_a)):
                if ref_a[n] != alt_a[n]:
                    ref_aln += ref_a[n]
                    alt_aln += alt_a[n]
            ##test_df.at[ind, 'ref_allele'] = ref_aln
            ##test_df.at[ind, 'alt_allele'] = alt_aln
            test_df.at[ind, ref_a_colname] = ref_aln
            test_df.at[ind, alt_a_colname] = alt_aln
            print('ref:', ref_aln)
            print('alt:', alt_aln)
        else:
            ##test_df.at[ind, 'ref_allele'] = 'ERROR_Not_nsSNP'
            ##test_df.at[ind, 'alt_allele'] = 'ERROR_Not_nsSNP'
            test_df.at[ind, ref_a_colname] = 'ERROR_Not_nsSNP'
            test_df.at[ind, alt_a_colname] = 'ERROR_Not_nsSNP'

    return test_df
#========================================
# a representative example
eg_df = {'chromosome_number': [2288719, 2288766, 2288775, 2288779, 2288827, 1111111, 2222222],
         'ref_allele1': ['Tc', 'AG', 'AGCACCCTG', 'CCCTGGTGGCC', 'CACA', 'AA', 'CAT'],
         'alt_allele0': ['CC', 'CA', 'GGCACCCTGZ', 'TCCTGGTGGCCAAD', 'TACA', 'AA', 'TCZ']}

# snippet of actual data
#eg_df = pd.read_csv('pnca_assoc.txt', sep = '\t', nrows = 10, header = 0, index_col = False)
eg_df = pd.DataFrame(eg_df)

def main():
    #find_missense(eg_df, ref_allele1 = 'ref_allele', alt_allele0 = 'alt_allele')
    find_missense(test_df = eg_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')
    print(eg_df)

if __name__ == '__main__':
    main()
@@ -1,14 +0,0 @@
#!/usr/bin/env Rscript
require('getopt', quietly=TRUE)

spec = matrix(c(
  "drug" , "d", 1, "character",
  "gene" , "g", 1, "character"
), byrow=TRUE, ncol=4)

opt = getopt(spec);

drug = opt$drug
gene = opt$gene

cat(c('\nDrug:', drug, '\nGene:', gene, '\n'))
scripts/kd_df.py
@@ -1,230 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
# Task: Hydrophobicity (Kd) values for an amino acid sequence using
# Kyte-Doolittle.
# Same output as using the expasy server (link below)
# Input: fasta file

# Output: csv file with kd values

# useful links
# https://biopython.org/DIST/docs/api/Bio.SeqUtils.ProtParamData-pysrc.html
# https://web.expasy.org/protscale/pscale/protscale_help.html
#=======================================================================
#%% load packages
import sys, os
import argparse
import pandas as pd
import numpy as np
from pylab import *
from Bio.SeqUtils import ProtParamData
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SeqIO
#from Bio.Alphabet.IUPAC import IUPACProtein
import pprint as pp
#=======================================================================
#%% specify homedir and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name', default = None)
#arg_parser.add_argument('-p', '--plot', help='show plot', action='store_true')
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
#plot = args.plot
gene_match = gene + '_p.'

#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
indir = datadir + '/' + drug + '/' + 'input'
in_filename = '3pl1.fasta.txt'
infile = indir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path:', indir
      , '\n============================================================')

#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_kd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')
#%% end of variable assignment for input and output files
#=======================================================================
#%% kd values from fasta file and output csv
def kd_to_csv(inputfasta, outputkdcsv, windowsize = 3):
    """
    Calculate kd (hydropathy values) from input fasta file

    @param inputfasta: fasta file
    @type inputfasta: string

    @param outputkdcsv: csv file with kd values
    @type outputkdcsv: string

    @param windowsize: window size to perform KD calcs on (Kyte-Doolittle)
    @type windowsize: numeric

    @return: none, writes kd values df as csv
    """
    #========================
    # read input fasta file
    #========================
    fh = open(inputfasta)

    for record in SeqIO.parse(fh, 'fasta'):
        id = record.id
        seq = record.seq
        num_residues = len(seq)
    fh.close()

    sequence = str(seq)
    X = ProteinAnalysis(sequence)

    #===================
    # calculate KD values: same as the expasy server
    #===================
    my_window = windowsize
    offset = round((my_window/2)-0.5)
    # edge weight is set to default (100%)

    kd_values = (X.protein_scale(ProtParamData.kd, window = my_window))
    # sanity checks
    print('Sequence Length:', num_residues)
    print('kd_values Length:', len(kd_values))
    print('Window Length:', my_window)
    print('Window Offset:', offset)
    print('=================================================================')
    print('Checking: len(kd_values) is as expected for the given window size & offset...')
    # protein_scale returns one value per full window,
    # i.e. num_residues - (window - 1) values
    expected_length = num_residues - (my_window - 1)
    if len(kd_values) == expected_length:
        print('PASS: expected and actual length of kd values match')
    else:
        print('FAIL: length mismatch'
              , '\nExpected length:', expected_length
              , '\nActual length:', len(kd_values)
              , '\n=========================================================')

    #===================
    # creating two dfs
    #===================
    # 1) aa sequence and 2) kd_values. Then reset the index for each df,
    # which will allow easy merging of the two dfs.

    # df1: df of aa seq with index reset to start from 1
    # (reflective of the actual aa position in a sequence)
    # Name the wt column 'wild_type_kd' to be the same name used
    # in the file required for merging later.
    dfSeq = pd.DataFrame({'wild_type_kd':list(sequence)})
    dfSeq.index = np.arange(1, len(dfSeq) + 1) # python is not inclusive

    # df2: df of kd_values with index reset to start from offset + 1,
    # subsequently matching the length of the kd_values
    dfVals = pd.DataFrame({'kd_values':kd_values})
    dfVals.index = np.arange(offset + 1, len(dfVals) + 1 + offset)

    # sanity checks
    max(dfVals['kd_values'])
    min(dfVals['kd_values'])

    #===================
    # concatenating dfs
    #===================
    # Merge the two on index
    # (as these are now reflective of the aa position numbers): df1 and df2
    # This will introduce NaNs where values are missing. In our case these
    # are the first and last positions, which have no full window.

    kd_df = pd.concat([dfSeq, dfVals], axis = 1)

    #============================
    # renaming index to position
    #============================
    kd_df = kd_df.rename_axis('position')
    kd_df.head

    print('Checking: position col i.e. index should be numeric')
    if kd_df.index.dtype == 'int64':
        print('PASS: position col is numeric'
              , '\ndtype is:', kd_df.index.dtype)
    else:
        print('FAIL: position col is not numeric'
              , '\nConverting to numeric')
        kd_df.index = kd_df.index.astype('int64') # astype is not in-place, assign back
        print('Checking dtype after conversion:\n'
              , '\ndtype is:', kd_df.index.dtype
              , '\n=========================================================')

    #===============
    # writing file
    #===============
    print('Writing file:'
          , '\nFilename:', outputkdcsv
          # , '\nPath:', outdir
          , '\nExpected no. of rows:', len(kd_df)
          , '\nExpected no. of cols:', len(kd_df.columns)
          , '\n=============================================================')

    kd_df.to_csv(outputkdcsv, header = True, index = True)

    #===============
    # plot: optional!
    #===============
    # http://www.dalkescientific.com/writings/NBN/plotting.html

    # FIXME: save fig
    # extract just the pdb id from 'id' to pass to the plot title
    # foo = re.match(r'(^[0-9]{1}\w{3})', id).groups(1)
    # if doplot:
    plot(kd_values, linewidth = 1.0)
    #axis(xmin = 1, xmax = num_residues)
    xlabel('Residue Number')
    ylabel('Hydrophobicity')
    title('K&D Hydrophobicity for ' + id)
    show()

#%% end of function
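
# quick toy check (illustrative, not in the original) of the window arithmetic
# used above: a 3-residue window over a 10-residue toy sequence must give
# 10 - (3 - 1) = 8 values
_toy_kd = ProteinAnalysis('MKTAYIAKQR').protein_scale(ProtParamData.kd, window = 3)
assert len(_toy_kd) == 10 - (3 - 1)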
#=======================================================================
#%% call function
#kd_to_csv(infile, outfile, windowsize = 3)
#=======================================================================
def main():
    print('Running hydropathy calcs with the following params:\n'
          , in_filename
          , '\noutfile:', out_filename)
    kd_to_csv(infile, outfile, 3)
    print('Finished writing file:'
          , '\nFilename:', outfile
          , '\n=============================================================')

if __name__ == '__main__':
    main()
#%% end of script
#=======================================================================
@@ -1,152 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 11:13:49 2020

@author: tanu
"""
#==============================================================================
# TASK
# To format the snp_info.txt file that has already been processed in bash
# to include mcsm-style muts and gwas-style muts. The idea is that the info file
# will contain all possible mutation format styles, to make it easy to populate
# and link other files with this sort of meta data. For example: the or file
#=======================================================================

# FIXME: add bash info here as well

#%% useful links
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#=======================================================================
#%% specify dirs
import os, sys
import pandas as pd
import numpy as np
import re
import argparse

homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')

#from reference_dict import my_aa_dict
#from reference_dict import low_3letter_dict # equivalent of my_aa_dict
from reference_dict import oneletter_aa_dict
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()

arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive

args = arg_parser.parse_args()
#=======================================================================
#%% variables
#gene = 'pncA'
#drug = 'pyrazinamide'
#gene_match = gene + '_p.'

# cmd variables
gene = args.gene
drug = args.drug
gene_match = gene + '_p.'

#=======================================================================
#%% input and output dirs and files
#=======
# data dir
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'

#=======
# input
#=======

gene_info_filename = 'ns' + gene.lower() + '_snp_info1.txt'
gene_info = indir + '/' + gene_info_filename
print('gene info file: ', gene_info
      , '\n============================================================')

#=======
# output
#=======
gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # the other one is called AFandOR
outfile_snp_info = indir + '/' + gene_snp_info_filename
print('Output file: ', outfile_snp_info
      , '\n============================================================')

#%% read files: preformatted using bash
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10

#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column
# check the mutation format in the existing df
info_df2.head()
info_df2['mut_info'].head()

print('Creating column: mutationinformation')
info_df2_ncols = len(info_df2.columns)

info_df2['wild_type'] = info_df2['mut_info'].str.extract(r'(\w{1})>')
info_df2['position'] = info_df2['mut_info'].str.extract(r'(\d+)')
info_df2['mutant_type'] = info_df2['mut_info'].str.extract(r'>\d+(\w{1})')

info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type']
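
# illustration (hypothetical value; the 'X>123Y' shape is inferred from the
# regexes above): 'K>96T' splits into 'K', '96' and 'T', which concatenate
# to the mcsm-style 'K96T'
_demo = pd.Series(['K>96T'])
_demo.str.extract(r'(\w{1})>')     # -> K
_demo.str.extract(r'(\d+)')        # -> 96
_demo.str.extract(r'>\d+(\w{1})')  # -> T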
# sanity check
ncols_add = 4 # Beware: hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
    print('PASS: Successfully extracted and added mutationinformation (mcsm style) as below\n'
          , info_df2['mutationinformation'].head()
          , '\n=====================================================================')
else:
    print('FAIL: No. of cols mismatch'
          , '\noriginal length:', info_df2_ncols
          , '\nExpected no. of cols:', info_df2_ncols + ncols_add
          , '\nGot no. of cols:', len(info_df2.columns))
    sys.exit()

# update ncols
info_df2_ncols = len(info_df2.columns)

#%% Creating column 'mutation', which has mutations of the format
# <gene_match>.lower()<abc>1<cde>: pnca_p.trp68gly
# match the 'one_letter_code' value to get the dict key (three-letter code)

print('Creating column: mutation')

# dict to use: oneletter_aa_dict
lookup_dict = dict()
for k1, v1 in oneletter_aa_dict.items():
    lookup_dict[k1] = v1['three_letter_code_lower']

# map the one-letter codes to the three-letter codes
# (the original re-ran these two maps once per dict entry inside a loop;
# a single pass is enough)
info_df2['wt_3let'] = info_df2['wild_type'].squeeze().map(lookup_dict)
info_df2['mt_3let'] = info_df2['mutant_type'].squeeze().map(lookup_dict)

# create column mutation
info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let']

# add prefix: gene_match to each value in the column
info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str)

# sanity check
ncols_add = 3 # Beware: hardcoded
if len(info_df2.columns) == info_df2_ncols + ncols_add:
    print('PASS: Successfully created column mutation as below\n'
          , info_df2['mutation'].head()
          , '\n=====================================================================')
else:
    print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols
          , '\nExpected no. of cols:', info_df2_ncols + ncols_add
          , '\nGot no. of cols:', len(info_df2.columns))
    sys.exit()

#%% write file
print('\n====================================================================='
      , '\nWriting output file:\n', outfile_snp_info
      , '\nNo. of rows:', len(info_df2)
      , '\nNo. of cols:', len(info_df2.columns))
info_df2.to_csv(outfile_snp_info, index = False)
@@ -1,334 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 10 11:13:49 2020

@author: tanu
"""
#=======================================================================
#%% useful links
#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#=======================================================================
#%% specify dirs
import os, sys
import pandas as pd
import numpy as np
import re
import argparse

homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
# local import
from find_missense import find_missense
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()

arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide')
arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive

arg_parser.add_argument('-s', '--start_coord', help = 'start of coding region (cds) of gene', default = 2288681) # pnca cds
arg_parser.add_argument('-e', '--end_coord', help = 'end of coding region (cds) of gene', default = 2289241) # pnca cds

args = arg_parser.parse_args()
#=======================================================================
#%% variables
#gene = 'pncA'
#drug = 'pyrazinamide'
#start_cds = 2288681
#end_cds = 2289241

# cmd variables
gene = args.gene
drug = args.drug

start_cds = args.start_coord
end_cds = args.end_coord
#=======================================================================
#%% input and output dirs and files
#=======
# data dir
#=======
datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/input'
outdir = datadir + '/' + drug + '/output'

#=======
# input
#=======

info_filename = 'snp_info.txt'
snp_info = datadir + '/' + info_filename
print('Info file: ', snp_info
      , '\n============================================================')

gene_info_filename = 'ns' + gene.lower() + '_snp_info.txt'
gene_info = indir + '/' + gene_info_filename
print('gene info file: ', gene_info
      , '\n============================================================')

in_filename_or = 'ns' + gene.lower() + '_assoc.txt'
gene_or = indir + '/' + in_filename_or
print('gene OR file: ', gene_or
      , '\n============================================================')

#=======
# output
#=======
gene_or_filename = gene.lower() + '_af_or_kinship.csv' # the other one is called AFandOR
outfile_or_kin = outdir + '/' + gene_or_filename
print('Output file: ', outfile_or_kin
      , '\n============================================================')

#%% read files: preformatted using bash
# or file: '...assoc.txt'
or_df = pd.read_csv(gene_or, sep = '\t', header = 0, index_col = False) # 182, 12 (without filtering for missense muts it was 212, i.e. 30 muts weren't missense)
or_df.head()
or_df.columns
#%% snp_info file: master and gene-specific ones

# gene info
#info_df2 = pd.read_csv('nssnp_info_pnca.txt', sep = '\t', header = 0) #303, 10
info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10
mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100
print('*****RESULT*****'
      , '\nPercentage of MISsense muts in pncA:', mis_mut_cover
      , '\n*****RESULT*****') #65.7%

# large file
#info_df = pd.read_csv('snp_info.txt', sep = '\t', header = None) #12010
info_df = pd.read_csv(snp_info, sep = '\t') #12010
#info_df.columns = ['chromosome_number', 'ref_allele', 'alt_allele', 'snp_info'] #12009, 4

info_df['chromosome_number'].nunique() #10257
mut_cover = (info_df['chromosome_number'].nunique()/info_df['chromosome_number'].count()) * 100
print('*****RESULT*****'
      , '\nPercentage of mutations in pncA:', mut_cover
      , '\n*****RESULT*****') #85.4%

# extract unique chr position numbers
genomic_pos = info_df['chromosome_number'].unique()
genomic_pos_df = pd.DataFrame(genomic_pos, columns = ['chr_pos'])
genomic_pos_df.dtypes

genomic_pos_min = info_df['chromosome_number'].min()
genomic_pos_max = info_df['chromosome_number'].max()

# genomic coords for the pnca coding region
cds_len = (end_cds - start_cds) + 1
pred_prot_len = (cds_len/3) - 1

# mindblowing: the difference b/w bitwise (&) and logical 'and'
# DO NOT want &: "is this bit set to '1' in both variables?" is not the question here
#if (genomic_pos_min <= start_cds) & (genomic_pos_max >= end_cds):
print('*****RESULT*****'
      , '\nlength of coding region:', cds_len, 'bp'
      , '\npredicted protein length:', pred_prot_len, 'aa'
      , '\n*****RESULT*****')

if genomic_pos_min <= start_cds and genomic_pos_max >= end_cds:
    print('PASS: coding region for gene included in snp_info.txt')
else:
    print('FAIL: coding region for gene not included in info file snp_info.txt')
    sys.exit('ERROR: coding region of gene not included in the info file')

#%% Extracting ref allele and alt allele as single letters
# info_df has some of these params as more than a single letter, which means that
# merging ONLY on chromosome_number messes things up... and is WRONG.
# Hence the merge needs to be performed on a unique set of attributes, which in our case
# is chromosome_number, ref_allele and alt_allele

#FIXME: Turn into a function
orig_len = len(or_df.columns)

#find_missense(or_df, 'ref_allele1', 'alt_allele0')
find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')

ncols_add = 4
if len(or_df.columns) == orig_len + ncols_add:
    print('PASS: Successfully extracted ref and alt alleles for missense muts')
else:
    print('FAIL: No. of cols mismatch'
          , '\noriginal length:', orig_len
          , '\nExpected no. of cols:', orig_len + ncols_add
          , '\nGot no. of cols:', len(or_df.columns))
    sys.exit()
del(orig_len, ncols_add)

#%% TRY MERGE
# check dtypes
or_df.dtypes
info_df.dtypes
#or_df.info()

# pandas documentation mentions: "Pandas uses the object dtype for storing strings"
# check how many unique chr_num in info_df are in or_df
genomic_pos_df['chr_pos'].isin(or_df['chromosome_number']).sum() #144

# check how many chr_num in or_df are in info_df: should be ALL of them
or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() #182

# sanity check 2
if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df):
    print('PASS: all genomic locs in or_df have meta data in snp_info.txt')
else:
    sys.exit('FAIL: some genomic locs in or_df DO NOT have meta data in snp_info.txt')
#%% Perform merge

#my_join = 'inner'
#my_join = 'outer'
my_join = 'left'
#my_join = 'right'

#dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = my_join, indicator = True) # not unique!
dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
dfm1['_merge'].value_counts()

# count no. of missense mutations ONLY
dfm1.snp_info.str.count(r'(missense.*)').sum()

dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = my_join, indicator = True)
dfm2['_merge'].value_counts()

# count no. of nan
dfm2['mut_type'].isna().sum()

# drop nan
dfm2_mis = dfm2[dfm2['mut_type'].notnull()].copy() # .copy() avoids SettingWithCopyWarning later

#%% sanity check
# count no. of missense muts
if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum():
    print('PASSED: numbers cross checked'
          , '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum()
          , '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum())

# two ways to filter to get only missense muts
test = dfm1[dfm1['snp_info'].str.count('missense.*') > 0]
dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True]
test.equals(dfm1_mis)

if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]):
    print('PASS: Further cross checks successful')
else:
    print('FAIL: Second cross check unsuccessful. Debug please!')
    sys.exit()

#%% extract mut info into three cols
orig_len = len(dfm2_mis.columns)

dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract(r'(\w{1})>')
dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract(r'(\d+)')
dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract(r'>\d+(\w{1})')

dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type']

# sanity check
ncols_add = 4
if len(dfm2_mis.columns) == orig_len + ncols_add:
    print('PASS: Successfully extracted and added mutationinformation (mcsm style)')
else:
    print('FAIL: No. of cols mismatch'
          , '\noriginal length:', orig_len
          , '\nExpected no. of cols:', orig_len + ncols_add
          , '\nGot no. of cols:', len(dfm2_mis.columns))
    sys.exit()

#%% formatting data for output
print('no. of cols before formatting data:', len(dfm2_mis.columns))

#1) Add column: OR for kinship, calculated from the beta coeff
print('converting beta coeff to OR by the exponent function:\n'
      , dfm2_mis['beta'].head())
dfm2_mis['or_kin'] = np.exp(dfm2_mis['beta'])
print(dfm2_mis['or_kin'].head())
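
# quick numeric check (illustrative, not in the original): a beta of 0 is
# no effect (OR = 1), and a beta of ~0.693 roughly doubles the odds
assert np.exp(0.0) == 1.0
assert round(float(np.exp(0.693)), 2) == 2.0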

#2) rename af column
dfm2_mis.rename(columns = {'af': 'af_kin'
                           , 'beta': 'beta_kin'
                           , 'p_wald': 'pwald_kin'
                           , 'se': 'se_kin', 'logl_H1': 'logl_H1_kin'
                           , 'l_remle': 'l_remle_kin'}, inplace = True)

#3) drop some not-required cols (including duplicates if you want)

#3a) drop duplicate columns
dfm2_mis2 = dfm2_mis.T.drop_duplicates().T # changes dtypes in cols, so only used for detection
dup_cols = set(dfm2_mis.columns).difference(dfm2_mis2.columns)
print('Duplicate columns identified:', dup_cols)
dup_cols = {'alt_allele0', 'ps'} # didn't want to remove tot_diff

print('removing duplicate columns: kept one of the dup_cols, i.e. tot_diff')
dfm2_mis.drop(list(dup_cols), axis = 1, inplace = True)
print(dfm2_mis.columns)
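
# a dtype-preserving alternative for spotting duplicate columns (sketch, not
# used above): compare columns pairwise instead of round-tripping through a
# transpose, which coerces dtypes
def duplicate_columns(df):
    dups = set()
    cols = list(df.columns)
    for i in range(len(cols)):
        for j in range(i + 1, len(cols)):
            if df[cols[i]].equals(df[cols[j]]):
                dups.add(cols[j])
    return dups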

#3b) other not-useful columns
dfm2_mis.drop(['chromosome_text', 'chr', 'symbol', '_merge', ], axis = 1, inplace = True)
dfm2_mis.rename(columns = {'ref_allele1': 'reference_allele'}, inplace = True)

print(dfm2_mis.columns)

#4) reorder columns
orkin_linked = dfm2_mis[['mutationinformation',
                         'wild_type',
                         'position',
                         'mutant_type',
                         'chr_num_allele',
                         'ref_allele',
                         'alt_allele',
                         'mut_info',
                         'mut_type',
                         'gene_id',
                         'gene_number',
                         'mut_region',
                         'reference_allele',
                         'alternate_allele',
                         'chromosome_number',
                         #'afs',
                         'af_kin',
                         'or_kin',
                         # 'ors_logistic',
                         # 'ors_chi_cus',
                         # 'ors_fisher',
                         'pwald_kin',
                         # 'pvals_logistic',
                         # 'pvals_fisher',
                         # 'ci_lb_fisher',
                         # 'ci_ub_fisher',
                         'beta_kin',
                         'se_kin',
                         'logl_H1_kin',
                         'l_remle_kin',
                         # 'stat_chi',
                         # 'pvals_chi',
                         'n_diff',
                         'tot_diff',
                         'n_miss']]

# sanity check after reassigning columns
if orkin_linked.shape == dfm2_mis.shape and set(orkin_linked.columns) == set(dfm2_mis.columns):
    print('PASS: Successfully formatted df with rearranged columns')
else:
    sys.exit('FAIL: something went wrong when rearranging columns!')

#%% write file
print('\n====================================================================='
      , '\nWriting output file:\n', outfile_or_kin
      , '\nNo. of rows:', len(dfm2_mis)
      , '\nNo. of cols:', len(dfm2_mis.columns))
orkin_linked.to_csv(outfile_or_kin, index = False)

#%% diff b/w allele0 and allele1: or_df
#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
#df = or_df.iloc[[5, 15, 17, 19, 34]]
#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)
@@ -1,34 +0,0 @@
#!/usr/bin/env python3
from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from align import myalign
import re
import os
os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')


def main():
    """
    align ref_seq and pdb_seq
    # FIXME: pass command line args i.e. filename
    """
    my_dict = {}
    # the with-statement closes the file handle (the original never closed it)
    with open('align_fastas.txt', 'r') as align_fastas_to_align:
        for record in SeqIO.parse(align_fastas_to_align, "fasta"):
            myid = record.id
            seq = str(record.seq)
            my_dict.update({myid : seq})

    my_keys = list(my_dict.keys())
    my_ref_seq = my_dict[my_keys[0]]
    my_pdb_seq = my_dict[my_keys[1]]

    fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
    print(fasta_alignment)
    print('class:', type(fasta_alignment))


if __name__ == '__main__':
    main()
@@ -1,70 +0,0 @@
#!/usr/bin/env python3

# Copyright 2020, Tanushree Tunstall
# This program is distributed under General Public License v. 3. See the file
# COPYING for a copy of the license.

__description__ = \
"""
chain_extract.py

extracts chain/s from a pdb and saves them together as a single file
"""
__author__ = "Tanushree Tunstall"
__date__ = ""
#=======================================================================
import os, shutil, sys
#from pdbtools.helper import cmdline
from chain_extract import ChainExtract
from Bio.PDB import PDBParser, PDBIO, Select
#from Bio.PDB.PDBParser import PDBParser

#io = PDBIO()
import argparse
#=======================================================================
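
# ChainExtract is imported from a local module that is not part of this diff.
# For illustration only, a minimal Select subclass of the kind it is likely to
# be (hypothetical name and implementation; the real import above is what runs):
class ChainExtractSketch(Select):
    """Keep only residues whose parent chain id is in chain_ids."""
    def __init__(self, chain_ids):
        self.chain_ids = chain_ids
    def accept_chain(self, chain):
        # PDBIO calls this once per chain; a truthy return keeps the chain
        return chain.get_id() in self.chain_ids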

def main():
    """
    Function to call if run from command line.

    Example use:
    pdb_chain_extract.py -f <your_pdb_file> -c <chainid1><chainid2> -p <outpath> -o <outfile>
    Extracts chain 'A' by default.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-i', '--pdb_file', help='provide pdb file', default = 'None')
    arg_parser.add_argument('-c', '--chain', help='chain/s to extract without spaces.', nargs = '+', default = 'A', type = list)
    arg_parser.add_argument('-p', '--out_path', help='specify output path', default = '.', type = str)
    arg_parser.add_argument('-o', '--out_file', help='specify output filename. Will be used as a prefix to append chain id and pdb file extension', default = 'pdbfile', type = str)
    args = arg_parser.parse_args()

    # Extract chains and write them out together as a single file
    pdb_file = args.pdb_file
    print('input pdb file:', pdb_file)

    # type = list makes it a list of lists, hence extracting the list of chains.
    chains = args.chain[0]
    #chains = ['A','B','C']
    print('user supplied chain:', chains)

    # output filename and path
    outpath = args.out_path
    outfile = args.out_file

    # get structure
    p = PDBParser(PERMISSIVE=1)
    structure = p.get_structure(pdb_file, pdb_file)
    print('input pdb filename:', structure.get_id())

    my_chains = chains
    #my_chains = ['G', 'H']
    c_names = ''.join(my_chains)
    print('Extracting chains:', my_chains)
    pdb_chains_file = outpath + '/' + outfile + '_' + c_names + '.pdb'
    io = PDBIO()
    io.set_structure(structure)
    io.save(pdb_chains_file, ChainExtract(my_chains))

if __name__ == "__main__":
    main()
@@ -1,71 +0,0 @@
#!/usr/bin/env python3

# Copyright 2020, Tanushree Tunstall
# This program is distributed under General Public License v. 3. See the file
# COPYING for a copy of the license.

__description__ = \
"""
chain_splitter.py

extracts chain/s from a pdb and saves each chain as a separate file
"""
__author__ = "Tanushree Tunstall"
__date__ = ""
#=======================================================================
import os, shutil, sys
#from pdbtools.helper import cmdline
from chain_splitter import ChainSelect
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
#io = PDBIO()
import argparse
#=======================================================================
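
# As with ChainExtract in the previous script, ChainSelect comes from a local
# module not shown in this diff. A plausible single-chain version
# (hypothetical sketch, for illustration only):
class ChainSelectSketch(Select):
    """Keep only residues belonging to the one requested chain."""
    def __init__(self, chain_id):
        self.chain_id = chain_id
    def accept_chain(self, chain):
        return chain.get_id() == self.chain_id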

def main():
    """
    Function to call if run from command line.

    Example use:
    pdb_chain_splitter.py -f <your_pdb_file> -c <chainid1><chainid2>
    Extracts chain 'A' by default.
    FIXME: extract all chains from the given pdb and write them out individually
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-i', '--pdb_file', help='provide pdb file', default = 'None')
    arg_parser.add_argument('-c', '--chain', help='chain/s to extract without spaces.', nargs = '+', default = 'A', type = list)
    arg_parser.add_argument('-p', '--out_path', help='specify output path', default = '.', type = str)
    arg_parser.add_argument('-o', '--out_file', help='specify output filename. Will be used as a prefix to append chain id and pdb file extension', default = 'pdb_file_chain', type = str)
    args = arg_parser.parse_args()

    # Extract chains and write each chain as a separate file
    pdb_file = args.pdb_file
    print('input pdb file:', pdb_file)

    # type = list makes it a list of lists, hence extracting the list of chains.
    chains = args.chain[0]
    #chains = ['A','B','C']
    print('user supplied chain:', chains)

    # output filename and path
    outpath = args.out_path
    outfile = args.out_file

    # get structure
    p = PDBParser(PERMISSIVE=1)
    structure = p.get_structure(pdb_file, pdb_file)
    print('input pdb filename:', structure.get_id())

    for chain in chains:
        chain = chain.upper()
        print('Extracting chain:', chain)

        pdb_chain_file = outpath + '/' + outfile + '_{}.pdb'.format(chain)

        io = PDBIO()
        io.set_structure(structure)
        io.save('{}'.format(pdb_chain_file), ChainSelect(chain))

if __name__ == "__main__":
    main()
@@ -1 +0,0 @@
Subproject commit 881ff8f27aaf1db4266a84fb03baad3dab552c64
@@ -1,70 +0,0 @@

#======================================================
# renumber pdb file based on user defined start number
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home/tanu/git/Data/cycloserine/input/alr_complex_model.pdb -s 35 -r

/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_residue_renumber /home/tanu/git/Data/rifampicin/input/rpob_complex_model.pdb -s 29 -r

#======================================================
# pdb_seq.py: extract seq from structure
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
#/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -c A -a /home/tanu/git/Data/ethambutol/input/3byw.pdb > 3byw_seq.txt
#======================================================
# gidB
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/3g89.pdb > 3g89_seq.txt
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_seq -a /home/tanu/git/LSHTM_3TB/gid/docking/gidb_chopin1.pdb > gidb_chopin1_seq.txt

# alignment
>3g89A_ATOM chain_length:238
MFGKHPGGLSERGRALLLEGGKALGLDLKPHLEAFSRLYALLQEAGEEEVVVKHFLDSLTLLRLPLWQGPLRVLDLGTGA
GFPGLPLKIVRPELELVLVDATRKKVAFVERAIEVLGLKGARALWGRAEVLAREAGHREAYARAVARAVAPLCVLSELLL
PFLEVGGAAVAMKGPRVEEELAPLPPALERLGGRLGEVLALQLPLSGEARHLVVLEKTAPTPPAYPRRPGVPERHPLC
>gidb_chopin1 _ATOM chain_length:224
MSPIEPAASAIFGPRLGLARRYAEALAGPGVERGLVGPREVGRLWDRHLLNCAVIGELLERGDRVVDIGSGAGLPGVPLA
IARPDLQVVLLEPLLRRTESLREMVTDLGVAVEIVRGRAEESWVQDQLGGSDAAVSRAVAALDKLTKWSMPLIRPNGRML
AIKGERAHDEVREHRRVMIASGAVDVRVVTCGANYLRPPATVVFARRGKQIARGSARMASGGTA

#======================================================
# pdb_mutator.py: mutate residue: FIXME, needs charm
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_mutator -r 39 -m XXX /home/tanu/git/Data/ethambutol/input/3byw.pdb

#======================================================
# pdb_ligand.py: list ligands within pdb
# note: works ONLY for pdbs containing ligands,
# because such pdbs contain a 'HETATM' field
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf_b.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand /home/tanu/git/Data/ethambutol/input/7bvf.pdb

#======================================================
# pdb_hetatm.py: list ligands for valid pdbs AND docked complexes (my use case)
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/cycloserine/input/alr_complex.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/pyrazinamide/input/pnca_complex.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/ethambutol/input/7bvf_b.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/ethambutol/input/7bvf.pdb
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_ligand_tt /home/tanu/git/Data/rifampicin/input/rpob_complex.pdb
#======================================================
# get torsion angles
#======================================================
/home/tanu/git/LSHTM_analysis/scripts/pdbtools/scripts/pdb_torsion /home/tanu/git/Data/rifampicin/input/rpob_complex.pdb > /home/tanu/git/Data/rifampicin/input/rpob_torsion.txt

#^^^^^^^^^^^^^^^^^^^^^^^^^^^^

# my pdb tools

#======================================================
# save specified chains as individual pdbs
#======================================================
./pdb_chain_splitter.py -i /home/tanu/git/Data/ethambutol/input/3byw.pdb -c DF -p /home/tanu/git/Data/ethambutol/input -o 3byw

#======================================================
# save specified chains as one pdb
#======================================================
./pdb_chain_extract.py -i /home/tanu/git/Data/ethambutol/input/3byw.pdb -c DF -p /home/tanu/git/Data/ethambutol/input -o 3byw
|
|
||||||
|
|
|
@ -1,12 +0,0 @@
|
||||||
[extractor]
data_dir = /home/tanu/git/Data
#master_file = original_tanushree_data_v2.csv
master_file = mtb_gwas_v3.csv

# Relative Paths. Per-drug paths will be created like:
#
# /home/tanu/git/Data/<drug name>/input
# /home/tanu/git/Data/<drug name>/output
input_dir = input
output_dir = output
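
# Reading this section from Python might look like the following sketch
# (assumes the file is saved as e.g. config.ini; kept commented so this
# remains a valid config file):
#   import configparser
#   config = configparser.ConfigParser()
#   config.read('config.ini')
#   data_dir = config['extractor']['data_dir']
#   master_file = config['extractor']['master_file']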
scripts/rd_df.py
@@ -1,172 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=============================================================================
# Task: Residue depth (rd) processing to generate a df with residue_depth (rd)
# values

# FIXME
# Input: '.tsv' i.e. residue depth txt file (output from the .zip file manually
# downloaded from the website).
# This should be integrated into the pipeline

# Output: .csv with 3 cols i.e. position, rd_values & 3-letter wt aa code (caps)
#=============================================================================
#%% load packages
import sys, os
import argparse
import pandas as pd
#=============================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')

# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#=======================================================================
#%% command line args
arg_parser = argparse.ArgumentParser()
arg_parser.add_argument('-d', '--drug', help='drug name', default = None)
arg_parser.add_argument('-g', '--gene', help='gene name', default = None) # case sensitive
args = arg_parser.parse_args()
#=======================================================================
#%% variable assignment: input and output
#drug = 'pyrazinamide'
#gene = 'pncA'
drug = args.drug
gene = args.gene
gene_match = gene + '_p.'

#==========
# data dir
#==========
datadir = homedir + '/' + 'git/Data'

#=======
# input
#=======
outdir = datadir + '/' + drug + '/' + 'output'
in_filename = '3pl1_rd.tsv'
infile = outdir + '/' + in_filename
print('Input filename:', in_filename
      , '\nInput path:', outdir
      , '\n=============================================================')
#=======
# output
#=======
outdir = datadir + '/' + drug + '/' + 'output'
out_filename = gene.lower() + '_rd.csv'
outfile = outdir + '/' + out_filename
print('Output filename:', out_filename
      , '\nOutput path:', outdir
      , '\n=============================================================')

#%% end of variable assignment for input and output files
#=======================================================================
#%% rd values from <gene>_rd.tsv values
def rd_to_csv(inputtsv, outputrdcsv):
    """
    Process residue depth (rd) values from input tsv file

    @param inputtsv: tsv file downloaded from {INSERT LINK}
    @type inputtsv: string

    @param outputrdcsv: csv file with rd values
    @type outputrdcsv: string

    @return: none, writes rd values df as csv
    """
    #========================
    # read downloaded tsv file
    #========================
    #%% Read input file
    rd_data = pd.read_csv(inputtsv, sep = '\t')
    print('Reading input file:', inputtsv
          , '\nNo. of rows:', len(rd_data)
          , '\nNo. of cols:', len(rd_data.columns))

    print('Column names:', rd_data.columns
          , '\n===============================================================')
    #========================
    # creating position col
    #========================
    # Extract the residue number from the index and assign
    # the values to a column [position]. Then convert the position col to numeric.
    rd_data['position'] = rd_data.index.str.extract(r'([0-9]+)').values

    # converting position to numeric
    rd_data['position'] = pd.to_numeric(rd_data['position'])
    rd_data['position'].dtype
|
|
||||||
|
|
||||||
print('Extracted residue num from index and assigned as a column:'
|
|
||||||
, '\ncolumn name: position'
|
|
||||||
, '\ntotal no. of cols now:', len(rd_data.columns)
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
#========================
|
|
||||||
# Renaming amino-acid
|
|
||||||
# and all-atom cols
|
|
||||||
#========================
|
|
||||||
print('Renaming columns:'
|
|
||||||
, '\ncolname==> # chain:residue: wt_3letter_caps'
|
|
||||||
, '\nYES... the column name *actually* contains a # ..!'
|
|
||||||
, '\ncolname==> all-atom: rd_values'
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
rd_data.rename(columns = {'# chain:residue':'wt_3letter_caps', 'all-atom':'rd_values'}, inplace = True)
|
|
||||||
print('Column names:', rd_data.columns)
|
|
||||||
|
|
||||||
#========================
|
|
||||||
# extracting df with the
|
|
||||||
# desired columns
|
|
||||||
#========================
|
|
||||||
print('Extracting relevant columns for writing df as csv')
|
|
||||||
|
|
||||||
rd_df = rd_data[['position','rd_values','wt_3letter_caps']]
|
|
||||||
|
|
||||||
if len(rd_df) == len(rd_data):
|
|
||||||
print('PASS: extracted df has expected no. of rows'
|
|
||||||
,'\nExtracted df dim:'
|
|
||||||
,'\nNo. of rows:', len(rd_df)
|
|
||||||
,'\nNo. of cols:', len(rd_df.columns))
|
|
||||||
else:
|
|
||||||
print('FAIL: no. of rows mimatch'
|
|
||||||
, '\nExpected no. of rows:', len(rd_data)
|
|
||||||
, '\nGot no. of rows:', len(rd_df)
|
|
||||||
, '\n=====================================================')
|
|
||||||
|
|
||||||
#===============
|
|
||||||
# writing file
|
|
||||||
#===============
|
|
||||||
print('Writing file:'
|
|
||||||
, '\nFilename:', outputrdcsv
|
|
||||||
# , '\nPath:', outdir
|
|
||||||
# , '\nExpected no. of rows:', len(rd_df)
|
|
||||||
# , '\nExpected no. of cols:', len(rd_df.columns)
|
|
||||||
, '\n=========================================================')
|
|
||||||
|
|
||||||
rd_df.to_csv(outputrdcsv, header = True, index = False)
|
|
||||||
|
|
||||||
#%% end of function
|
|
||||||
#=======================================================================
|
|
||||||
#%% call function
|
|
||||||
#rd_to_csv(infile, outfile)
|
|
||||||
#=======================================================================
|
|
||||||
def main():
|
|
||||||
print('residue depth using the following params\n'
|
|
||||||
, in_filename
|
|
||||||
, '\noutfile:', out_filename)
|
|
||||||
rd_to_csv(infile, outfile)
|
|
||||||
print('Finished Writing file:'
|
|
||||||
, '\nFilename:', outfile
|
|
||||||
, '\n=============================================================')
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main()
|
|
||||||
#%% end of script
|
|
||||||
#=======================================================================
|
|
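For orientation, a minimal sketch of the position-extraction and renaming steps in rd_to_csv() above, using toy data (the real tsv layout is an assumption inferred from the extract/rename calls):

import pandas as pd

# toy frame mimicking the downloaded tsv: a '# chain:residue' col plus rd values,
# with the residue number embedded in the index (assumed layout)
rd_data = pd.DataFrame({'# chain:residue': ['MET', 'ARG'],
                        'all-atom': [2.31, 4.87]},
                       index = ['A:1', 'A:2'])

# pull the digits out of the index and make them numeric
rd_data['position'] = pd.to_numeric(
    rd_data.index.str.extract(r'([0-9]+)', expand = False))
rd_data.rename(columns = {'# chain:residue': 'wt_3letter_caps',
                          'all-atom': 'rd_values'}, inplace = True)
print(rd_data[['position', 'rd_values', 'wt_3letter_caps']])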
11289 scripts/scratch/3byw.pdb
File diff suppressed because it is too large
@ -1,15 +0,0 @@
>Mycobacterium tuberculosis H37Rv|Rv3423c|alr
VKRFWENVGKPNDTTDGRGTTSLAMTPISQTPGLLAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGH
GATRVAQTALGAGAAELGVATVDEALALRADGITAPVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHA
VRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTALRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFT
AFLAQAREQGVRFEVAHLSNSSATMARPDLTFDLVRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIR
AGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSLGGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEG
DEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSPRGRITRTYREAENR

>alr_complex | chain A | 371 aa
LAEAMVDLGAIEHNVRVLREHAGHAQLMAVVKADGYGHGATRVAQTALGAGAAELGVATVDEALALRADGIT
APVLAWLHPPGIDFGPALLADVQVAVSSLRQLDELLHAVRRTGRTATVTVKVDTGLNRNGVGPAQFPAMLTA
LRQAMAEDAVRLRGLMSHMVYADKPDDSINDVQAQRFTAFLAQAREQGVRFEVAHLSNSSATMARPDLTFDL
VRPGIAVYGLSPVPALGDMGLVPAMTVKCAVALVKSIRAGEGVSYGHTWIAPRDTNLALLPIGYADGVFRSL
GGRLEVLINGRRCPGVGRICMDQFMVDLGPGPLDVAEGDEAILFGPGIRGEPTAQDWADLVGTIHYEVVTSP
RGRITRTYREA
@ -1,125 +0,0 @@
#!/usr/bin/env python3

# useful links
#https://towardsdatascience.com/pairwise-sequence-alignment-using-biopython-d1a9d0ba861f
#https://www.biostars.org/p/265338/

from Bio import SeqIO
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
import re
import os

#%%
os.chdir('/home/tanu/git/LSHTM_analysis/scripts/examples')
#%%
def myalign(ref_seq, pdb_seq):
    myalign_dict = {}

    alignments = pairwise2.align.globalxx(ref_seq, pdb_seq)
    #alignments = pairwise2.align.localxx(ref, struct)

    match = []

    for a, b in zip(alignments[0][0], alignments[0][1]):
        if a == b:
            match.append('|')
        else:
            match.append(' ')

    #print(match)
    print(alignments[0][0])
    print("".join(match))
    print(alignments[0][1])

    result_align = alignments[0][1]
    #print(result_align)
    print('===============================================================\n')

    # update dict
    myalign_dict.update({'aligned_fasta': result_align})

    aa_regex = r'\w'
    m = re.search(aa_regex, result_align)
    #m = my_match.span()
    offset = m.start()
    offset_end = m.end()

    print('start of match:', offset
          , '\nend of match:', offset_end)
    print('===============================================================\n')

    # update dict
    myalign_dict.update({'start_match': offset})
    myalign_dict.update({'end_match': offset_end})

    return myalign_dict


def main():
    """
    read file containing reference and pdb_sequence to align
    """
    my_dict = {}
    align_fastas_to_align = open('align_fastas.txt', 'r')
    for record in SeqIO.parse(align_fastas_to_align, "fasta"):
        myid = record.id
        seq = str(record.seq)
        my_dict.update({myid: seq})

    my_keys = list(my_dict.keys())
    my_ref_seq = my_dict[my_keys[0]] # ref_seq
    my_pdb_seq = my_dict[my_keys[1]] # pdb_seq

    fasta_alignment = myalign(my_ref_seq, my_pdb_seq)
    print('this is my result:', fasta_alignment)


if __name__ == '__main__':
    main()

#%% debug: individually
my_dict = {}
align_fastas_to_align = open('align_fastas.txt', 'r')
for record in SeqIO.parse(align_fastas_to_align, "fasta"):
    myid = record.id
    seq = str(record.seq)
    #print(myid, seq)
    my_dict.update({myid: seq})

print(my_dict)
print(my_dict.keys())

my_keys = list(my_dict.keys())
alignments = pairwise2.align.globalxx(my_dict[my_keys[0]], my_dict[my_keys[1]])
match = []

for a, b in zip(alignments[0][0], alignments[0][1]):
    if a == b:
        match.append('|')
    else:
        match.append(' ')

#print(match)
print(alignments[0][0])
print("".join(match))
print(alignments[0][1])

result_align = alignments[0][1]
#print(result_align)
print('===============================================================\n')

#offset = ''
aa_regex = r'\w'
m = re.search(aa_regex, result_align)
#m = my_match.span()
offset = m.start()
offset_end = m.end()

print('start of match:', offset, '\nend of match:', offset_end)
print('===============================================================\n')
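A quick hedged check of the alignment logic with made-up sequences: globalxx scores matches only, so the leading gaps in the returned pdb sequence give exactly the start offset that myalign() recovers with re.search:

from Bio import pairwise2
import re

aln = pairwise2.align.globalxx('VKRFWENVGK', 'FWENVGK')
print(aln[0][0])  # VKRFWENVGK (gapped reference)
print(aln[0][1])  # ---FWENVGK (gapped pdb sequence)
print(re.search(r'\w', aln[0][1]).start())  # 3 leading gaps = start offset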
File diff suppressed because it is too large
@ -1,52 +0,0 @@
#!/usr/bin/python3

#=======================================================================
# TASK: select specified chains from the structure & save a cropped structure with
# the selected chains. Useful for dimer, etc. modelling.

# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-structure-file
#=======================================================================
#%%
import os, sys
from Bio.PDB import PDBParser, PDBIO, Select
#%% homedir and curr dir and local imports
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/Data/ethambutol/input/')
os.getcwd()
#%%
# Select() subclass: accept_chain() returns True for every chain in 'chains'
class ChainExtract(Select):
    def __init__(self, chain):
        self.chain = chain

    def accept_chain(self, chain):
        #print(dir(chain))
        if chain.id in self.chain:
            return 1
        else:
            return 0

def main():
    p = PDBParser(PERMISSIVE=1)
    structure = p.get_structure("3byw", "3byw.pdb")

    my_chains = ['G', 'H'] # specify selected chains
    c_names = ''.join(my_chains)

    pdb_chains_file = 'pdb_crop_' + c_names + '.pdb'

    io = PDBIO()
    io.set_structure(structure)
    #io.save(structure.get_id() + "_crop.structure", ChainExtract(my_chains))
    io.save(pdb_chains_file, ChainExtract(my_chains))

if __name__ == '__main__':
    main()
#%%
# test
#my_chains = ['G', 'H'] # specify selected chains
#foo = ''.join(my_chains) # to append to file name
#pdb_chains_file = '_{}.pdb'.format(my_chains)
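How the Select hook works here: PDBIO.save() consults accept_chain() for every chain in the structure and writes only the accepted ones, so cropping to an arbitrary chain subset is a single save call. A minimal sketch, assuming 3byw.pdb sits in the working dir and ChainExtract is defined as above:

from Bio.PDB import PDBParser, PDBIO

p = PDBParser(PERMISSIVE=1)
structure = p.get_structure('3byw', '3byw.pdb')       # assumed to be in cwd
io = PDBIO()
io.set_structure(structure)
io.save('pdb_crop_DF.pdb', ChainExtract(['D', 'F']))  # keep only chains D and F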
@ -1,49 +0,0 @@
#!/usr/bin/python3

#=======================================================================
# TASK: extract chain from pdb and save each chain as a separate file

# link for saving each chain as a separate file
# https://stackoverflow.com/questions/11685716/how-to-extract-chains-from-a-pdb-file
# command line args
# https://stackoverflow.com/questions/15753701/how-can-i-pass-a-list-as-a-command-line-argument-with-argparse
#=======================================================================

#%%
import os, sys
from Bio.PDB import Select, PDBIO
from Bio.PDB.PDBParser import PDBParser
#%% homedir and curr dir and local imports
homedir = os.path.expanduser('~')
# set working dir
os.getcwd()
os.chdir(homedir + '/git/LSHTM_analysis/scripts')
os.getcwd()
#%%
class ChainSelect(Select):
    def __init__(self, chain):
        self.chain = chain

    def accept_chain(self, chain):
        if chain.get_id() == self.chain:
            return 1
        else:
            return 0

def main():
    chains = ['A', 'B', 'C', 'F']
    p = PDBParser(PERMISSIVE=1)
    #structure = p.get_structure(pdb_file, pdb_file)
    structure = p.get_structure('3byw', '/home/tanu/git/Data/ethambutol/input/3byw.pdb') # id, pdb file
    #print('STRUCTURE:', structure.get_id())
    # pdb_filename = print()
    for chain in chains:
        pdb_chain_file = 'pdb_file_chain_{}.pdb'.format(chain)

        io = PDBIO()
        io.set_structure(structure)
        io.save('{}'.format(pdb_chain_file), ChainSelect(chain))

# If run from command line...
if __name__ == "__main__":
    main()
@ -1,45 +0,0 @@
#!/usr/bin/env python
import os
from biopandas.pdb import PandasPdb

#%%
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')

#%%
file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', 'alr_complex.pdb']

file_list = ['7bvf_b.pdb']
#file_list = ['pnca_complex.pdb']
file_list = ['alr_complex.pdb']
BORING_LIGANDS = ["HOH", "CA", "SO4", "IOD", "NA", "CL", "GOL", "PO4"]
#%% df with list
ligands_dict = {}
for pdb_id in file_list:
    ppdb = PandasPdb()
    pdb_file = ppdb.read_pdb(pdb_id)
    het = pdb_file.df['HETATM']
    het_list = list(set(het['residue_name']))
    ligands = [l for l in het_list if l not in BORING_LIGANDS]
    lig_dict = {pdb_id: ligands}
    #lig_dict = {pdb_id: het_list} # include BORING_LIGANDS

    ligands_dict.update(lig_dict)
print(ligands_dict)
print('pdb_code:', pdb_file.code) # works only in case of a valid pdb
print('pdb_path:', pdb_file.pdb_path) # works for a bespoke pdb but prints the full path
print('pdb_filename:', os.path.basename(pdb_file.pdb_path)) # prints only the last part i.e. filename
#%% test with one
ppdb = PandasPdb()
pdb_file = ppdb.read_pdb('7bvf_b.pdb')
het = pdb_file.df['HETATM']
het_list = list(set(het['residue_name']))
print(het_list)
ligands = [l for l in het_list if l not in BORING_LIGANDS]
print(ligands)

#%% extract last part from file path
print(os.path.basename(homedir + '/git/LSHTM_analysis/scripts/examples'))
print(os.path.basename('alr_complex.pdb'))
foo = os.path.basename(pdb_file.pdb_path)
print(foo)
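A toy check of the BORING_LIGANDS filter with made-up HETATM residue names (waters, ions and cryo additives are dropped; anything else is treated as a ligand of interest):

BORING_LIGANDS = ["HOH", "CA", "SO4", "IOD", "NA", "CL", "GOL", "PO4"]
het_list = ['HOH', 'PZA', 'SO4', 'MG']  # made-up residue names
ligands = [l for l in het_list if l not in BORING_LIGANDS]
print(ligands)  # ['PZA', 'MG']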
@ -1,81 +0,0 @@
#!/usr/bin/env python
import os
from Bio.PDB import *
from biopandas.pdb import PandasPdb
from collections import defaultdict, OrderedDict
import pandas as pd
from functools import reduce
#%% see version of pandas
#print(pd.__version__)

#%%
homedir = os.path.expanduser('~')
os.chdir(homedir + '/git/LSHTM_analysis/scripts/examples')
# links
#https://www.pythonprogramming.in/pandas-count-distinct-values-of-one-column-depend-on-another-column.html
#https://datascience.stackexchange.com/questions/32328/export-pandas-to-dictionary-by-combining-multiple-row-values

# 3 way merge
#https://stackoverflow.com/questions/23668427/pandas-three-way-joining-multiple-dataframes-on-columns
#https://stackoverflow.com/questions/52223045/merge-multiple-dataframes-based-on-a-common-column

#%% Read data
file_list = ['7bvf_b.pdb']
file_list = ['3byw.pdb']
#file_list = ['7bvf_b.pdb', 'pnca_complex.pdb', '3byw']
#%%

for pdb in file_list:
    print(pdb)
    p = PDBParser()
    structure = p.get_structure(pdb, pdb)
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    pass # print(atom) to inspect each atom

#%% biopandas
pdb_dict = {}
for pdb_id in file_list:
    ppdb = PandasPdb()
    pdb_file = ppdb.read_pdb(pdb_id)
    #dir(pdb_file)
    atm_df = pdb_file.df['ATOM']
    #print('column names:', atm_df.columns)

    pdb_chains = list(set(atm_df['chain_id']))
    print('pdb chains:', pdb_chains)

    total_chains = len(pdb_chains)
    print('total no. of chains:', total_chains)

    chain_info = {}
    #atm_df_s = atm_df.sort_values(by=['atom_number', 'chain_id', 'residue_number'])
    c_start = atm_df.groupby('chain_id').residue_number.min()
    print(c_start)
    c_start_df = pd.DataFrame({'chain_id': c_start.index, 'start_res': c_start.values})

    c_end = atm_df.groupby('chain_id').residue_number.max()
    print(c_end)
    c_end_df = pd.DataFrame({'chain_id': c_end.index, 'end_res': c_end.values})

    c_length = atm_df.groupby('chain_id').residue_number.nunique()
    print(c_length)
    c_length_df = pd.DataFrame({'chain_id': c_length.index, 'chain_len': c_length.values})

    # combine the 3 series and keep 'chain_id' as a column
    # reduce + lambda works well (it merges however many dataframes you pass)
    # using pd.concat creates extra chain_id cols
    c_df = reduce(lambda left, right: pd.merge(left, right, on = 'chain_id'), [c_start_df, c_end_df, c_length_df])

    # convert df to dict with 'chain_id' as key and columns as list of values
    chain_dict = c_df.set_index('chain_id').T.to_dict('list')
    print(chain_dict)
#%% Idea
#protein_name: total_chains: 8, total ligands/hetatm = 3
#df of chain details
#chain start_res end_res len_chain
#pdb tools script to separate one chain

# remove water and
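A self-contained toy demo (made-up residue numbers) of the three-way merge and dict conversion above:

from functools import reduce
import pandas as pd

c_start_df  = pd.DataFrame({'chain_id': ['A', 'B'], 'start_res': [1, 5]})
c_end_df    = pd.DataFrame({'chain_id': ['A', 'B'], 'end_res': [371, 368]})
c_length_df = pd.DataFrame({'chain_id': ['A', 'B'], 'chain_len': [371, 364]})

# merge all three frames on chain_id, then pivot into a dict of lists
c_df = reduce(lambda left, right: pd.merge(left, right, on = 'chain_id'),
              [c_start_df, c_end_df, c_length_df])
print(c_df.set_index('chain_id').T.to_dict('list'))
# {'A': [1, 371, 371], 'B': [5, 368, 364]}  i.e. [start_res, end_res, chain_len]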
File diff suppressed because it is too large
60 scripts/tidy_split.py Normal file
@ -0,0 +1,60 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#=======================================================================
#%% homedir and curr dir and local imports
#homedir = os.path.expanduser('~')
# set working dir
#os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd()
#%%=====================================================================
# define the split function
def tidy_split(df, column, sep = '|', keep = False):
    '''
    Split the values of a column and expand so the new DataFrame has one split
    value per row. Filters rows where the column is missing.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    '''
    indexes = list()
    new_values = list()
    #df = df.dropna(subset = [column]) #!!!!-----see this in case you need to uncomment based on use case
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df
#%%=====================================================================
# end of tidy_split()
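A hedged usage sketch of tidy_split() with a made-up, pipe-separated mutation column:

import pandas as pd

# toy frame: two samples, one with two pipe-separated mutations
df = pd.DataFrame({'sample': ['s1', 's2'],
                   'mutation': ['pncA_p.A3E|pncA_p.H57D', 'pncA_p.Q10P']})
long_df = tidy_split(df, 'mutation', sep = '|')
print(long_df)
#   sample     mutation
# 0     s1   pncA_p.A3E
# 0     s1  pncA_p.H57D
# 1     s2  pncA_p.Q10P
# s1 now occupies one row per mutation; s2 keeps its single row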