diff --git a/scripts/af_or_calcs_scratch.R b/scripts/af_or_calcs_scratch.R deleted file mode 100644 index 7e18859..0000000 --- a/scripts/af_or_calcs_scratch.R +++ /dev/null @@ -1,565 +0,0 @@ -######################################################### -# TASK: To compare OR from master snps -# chisq, fisher test and logistic and adjusted logistic -######################################################### -getwd() -setwd('~/git/LSHTM_analysis/scripts') -getwd() - -#install.packages("logistf") -library(logistf) -######################################################### -#%% variable assignment: input and output paths & filenames -drug = 'pyrazinamide' -gene = 'pncA' -gene_match = paste0(gene,'_p.') -cat(gene_match) - -#=========== -# input and output dirs -#=========== -datadir = paste0('~/git/Data') -indir = paste0(datadir, '/', drug, '/', 'input') -outdir = paste0(datadir, '/', drug, '/', 'output') - -#=========== -# input and output files -#=========== -in_filename = 'original_tanushree_data_v2.csv' -#in_filename = 'mtb_gwas_v3.csv' -infile = paste0(datadir, '/', in_filename) -cat(paste0('Reading infile1: raw snps', ' ', infile) ) - -# infile2: _gene associated meta snps file to extract valid snps and add calcs to. -# This is outfile3 from snps_extraction.py -in_filename_metadata = paste0(tolower(gene), '_metadata.csv') -infile_metadata = paste0(outdir, '/', in_filename_metadata) -cat(paste0('Reading infile2: gene associated metadata:', infile_metadata)) - -#=========== -# output -#=========== -out_filename = paste0(tolower(gene),'_', 'af_or.csv') -outfile = paste0(outdir, '/', out_filename) -cat(paste0('Output file with full path:', outfile)) -#%% end of variable assignment for input and output files - -######################################################### -# 1: Read master/raw snps stored in snps/ -##################################################### - -#=============== -# Step 1: read raw snps (all remove entries with NA in pza column) -#=============== -raw_data_all = read.csv(infile, stringsAsFactors = F) - -# building cols to extract -dr_muts_col = paste0('dr_mutations_', drug) -other_muts_col = paste0('other_mutations_', drug) - -cat('Extracting columns based on variables:\n' - , drug - , '\n' - , dr_muts_col - , '\n' - , other_muts_col - , '\n===============================================================') - -raw_data = raw_data_all[,c("id" - , drug - , dr_muts_col - , other_muts_col)] -rm(raw_data_all) - -rm(indir, in_filename, infile) - -#=========== -# 1a: exclude na -#=========== -raw_data = raw_data[!is.na(raw_data[[drug]]),] - -total_samples = length(unique(raw_data$id)) -cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples)) - -# sanity check: should be true -is.numeric(total_samples) - -#=========== -# 1b: combine the two mutation columns -#=========== -#raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide) - -all_muts_colname = paste0('all_mutations_', drug) -raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]]) -head(raw_data[[all_muts_colname]]) - -#=========== -# 1c: create yet another column that contains all the mutations but in lower case -#=========== -head(raw_data[[all_muts_colname]]) -raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) -head(raw_data$all_muts_gene) - -# sanity checks -#table(grepl("gene_p",raw_data$all_muts_gene)) -cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase')) -gene_match = tolower(gene_match) - -table(grepl(gene_match,raw_data$all_muts_gene)) - -# sanity check -if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){ - cat('PASS: Total no. of samples match') -} else{ - cat('FAIL: No. of samples mismatch') -} - -######################################################### -# 2: Read valid snps for which OR -# can be calculated -######################################################### -cat(paste0('Reading metadata infile:', infile_metadata)) - -gene_metadata = read.csv(infile_metadata - #, file.choose() - , stringsAsFactors = F - , header = T) - - -# clear variables -rm(in_filename_metadata, infile_metadata) - -# count na in pyrazinamide column -tot_pza_na = sum(is.na(gene_metadata$pyrazinamide)) -expected_rows = nrow(gene_metadata) - tot_pza_na - -# drop na from the pyrazinamide colum -gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),] - -# sanity check -if(nrow(gene_snps_or) == expected_rows){ - cat('PASS: no. of rows match with expected_rows') -} else{ - cat('FAIL: nrows mismatch.') -} - -# extract unique snps to iterate over for AF and OR calcs -gene_snps_unique = unique(gene_snps_or$mutation) - -cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique))) -#===================================== -#OR calcs using the following 4 -#1) chisq.test -#2) fisher -#3) modified chisq.test -#4) logistic -#5) adjusted logistic? - -#====================================== -######################### -# custom chisq function: -# To calculate OR -######################### -#i = "pnca_p.trp68gly" -#mut = grepl(i,raw_data$all_muts_gene) -#dst = raw_data[[drug]] - -#x = as.numeric(mut) -#y = dst - -mychisq_or = function(x,y){ - tab = as.matrix(table(x,y)) - a = tab[2,2] - if (a==0){ a<-0.5} - b = tab[2,1] - if (b==0){ b<-0.5} - c = tab[1,2] - if (c==0){ c<-0.5} - d = tab[1,1] - if (d==0){ d<-0.5} - (a/b)/(c/d) - -} - -#====================================== -#============== -# TEST WITH ONE -#============== -i = "pnca_p.trp68gly" -i = "pnca_p.gln10pro" -i = "pnca_p.leu159arg" - -# IV -table(grepl(i,raw_data$all_muts_gene)) -mut = grepl(i,raw_data$all_muts_gene) - -# DV -#dst = raw_data$pyrazinamide -dst = raw_data[[drug]] # or raw_data[,drug] - -# 2X2 table -table(mut, dst) - -# CV -#c = raw_data$id[mut] -c = raw_data$id[grepl(i,raw_data$all_muts_gene)] -#sid = grepl(raw_data$id[mut], raw_data$id) # warning -#argument 'pattern' has length > 1 and only the first element will be used -#grepl(raw_data$id=="ERR2512440", raw_data$id) - -sid = grepl(paste(c,collapse="|"), raw_data$id) -table(sid) - -# 3X2 table -table(mut, dst, sid) - -#=================================================== -# compare ORs from different calcs - -#1) chisq -chisq.test(table(mut,dst)) -chisq_estimate = chisq.test(table(mut,dst))$statistic -est_chisq = chisq_estimate[[1]]; print(paste0('chi sq estimate:', est_chisq))# numeric part only -pval_chisq = chisq.test(table(mut,dst))$p.value; print(paste0('pvalue:', pval_chisq)) - - -#2) fisher -fisher.test(table(mut, dst)) -fisher.test(table(mut, dst))$p.value -est_fisher = fisher.test(table(mut, dst))$estimate -or_fisher = est_fisher[[1]]; print(paste0('OR fisher:', or_fisher))# numeric part only -pval_fisher = fisher.test(table(mut, dst))$p.value; print(paste0('pval fisher:', pval_fisher)) - - -#3) custom chisq -or_mychisq = mychisq_or(mut,dst) - -#4) logistic -summary(model<-glm(dst ~ mut, family = binomial)) - -or_logistic = exp(summary(model)$coefficients[2,1]); print(paste0('OR logistic:', or_logistic)) -pval_logistic = summary(model)$coefficients[2,4]; print(paste0('pval logistic:', pval_logistic)) - -# extract SE of the logistic model for a given snp -logistic_se = summary(model)$coefficients[2,2]; print(paste0('SE:', logistic_se)) - -# extract Z of the logistic model for a given snp -logistic_zval = summary(model)$coefficients[2,3]; print(paste0('Z-value:', logistic_zval)) - -# extract confint interval of snp (2 steps, since the output is a named number) -ci_mod = exp(confint(model))[2,]; print(paste0('CI:', ci_mod)) -#logistic_ci = paste(ci_mod[["2.5 %"]], ",", ci_mod[["97.5 %"]]) - -logistic_ci_lower = ci_mod[["2.5 %"]]; print(paste0('CI_lower:', logistic_ci_lower)) -logistic_ci_upper = ci_mod[["97.5 %"]]; print(paste0('CI_upper:', logistic_ci_upper)) - -# adjusted logistic or: doesn't seem to make a difference -summary(model2<-glm(dst ~ mut + sid, family = binomial)) -or_logistic2 = exp(summary(model2)$coefficients[2,1]); print(paste0('Adjusted OR logistic:', or_logistic2)) -pval_logistic2 = summary(model2)$coefficients[2,4]; print(paste0('Adjusted pval logistic:',pval_logistic2)) - -# print all output -writeLines(c(paste0("mutation:", i) - , paste0("=========================") - , paste0("OR logistic:", or_logistic,"--->", "pval logistic:", pval_logistic ) - , paste0("OR adjusted logistic:", or_logistic2,"--->", "pval adjusted logistic:", pval_logistic2) - , paste0("OR fisher:", or_fisher, "--->","pval fisher:", pval_fisher ) - , paste0("OR custom chisq:", or_mychisq, "--->","pval fisher:", pval_fisher ) - , paste0("Chisq estimate:", est_chisq, "--->","pval chisq:", pval_chisq))) -#%%======================================================== -# looping with sapply: a subset of mutations -#%%======================================================== - -snps = c("pnca_p.trp68gly", "pnca_p.leu4ser", "pnca_p.leu159arg","pnca_p.his57arg" ) -#snps = snps_test[1:2] -snps - -# custom chisq -ors = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - mychisq_or(mut,dst) -}) -head(ors) - -# pvalue fisher, to be used with custom chisq -pvals = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - fisher.test(mut,dst)$p.value -}) -head(pvals) - -# allele frequency -afs = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - mean(mut) -}) -head(afs) - -# logistic reg parameters: individual sapply - -#-------------- -## logistci or -#-------------- - ors_logistic = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - model<-glm(dst ~ mut, family = binomial) - or_logistic = exp(summary(model)$coefficients[2,1]) - -}) -ors_logistic -head(ors_logistic); head(names(ors_logistic)) - -#------------------- -## logistic p-value -#-------------- -pvals_logistic = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - model<-glm(dst ~ mut , family = binomial) - pval_logistic = summary(model)$coefficients[2,4] -}) - -head(pvals_logistic); head(names(pvals_logistic)) - -#-------------- -## logistic se -#-------------- -se_logistic = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - model<-glm(dst ~ mut , family = binomial) - logistic_se = summary(model)$coefficients[2,2] -}) - -head(se_logistic); head(names(se_logistic)) - -#-------------- -## logistic z-value -#-------------- -zval_logistic = sapply(gene_snps_unique,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - model<-glm(dst ~ mut , family = binomial) - logistic_zval = summary(model)$coefficients[2,3] -}) - -head(zval_logistic); head(names(zval_logistic)) -#-------------- -## logistic ci - lower bound -#-------------- -ci_lb_logistic = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - model<-glm(dst ~ mut , family = binomial) - ci_mod = exp(confint(model))[2,] - logistic_ci_lower = ci_mod[["2.5 %"]] -}) - -head(ci_lb_logistic); head(names(ci_lb_logistic)) -#-------------- -## logistic ci - upper bound -#-------------- -ci_ub_logistic = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - model<-glm(dst ~ mut , family = binomial) - ci_mod = exp(confint(model))[2,] - logistic_ci_upper = ci_mod[["97.5 %"]] - -}) - -head(ci_ub_logistic); head(names(ci_ub_logistic)) - -#-------------- -# adjusted logistic with sample id: Doesn't seem to make a difference -#-------------- -logistic_ors2 = sapply(snps,function(m){ - mut = grepl(m,raw_data$all_muts_gene) - c = raw_data$id[mut] - sid = grepl(paste(c,collapse="|"), raw_data$id) - model2<-glm(dst ~ mut + sid, family = binomial) - or_logistic2 = exp(summary(model2)$coefficients[2,1]) - #pval_logistic2 = summary(model2)$coefficients[2,4] -}) -head(logistic_ors) - -#=!=!=!=!=!=!=!=!=!=!=!=!=!=! -# COMMENT: individual sapply seem wasteful -#=!=!=!=!=!=!=!=!=!=!=!=!=!=! - -#%%======================================================== -# sapply with multiple values being returned as df -#Link: https://gist.github.com/primaryobjects/33adabc337edd67b4a8d -#%%======================================================== -snps = c("pnca_p.trp68gly", "pnca_p.leu4ser", "pnca_p.leu159arg","pnca_p.his57arg" ) -#snps = snps_test[1:4] -snps - -# yayy works! -# DV: pyrazinamide 0 or 1 -dst = raw_data[[drug]] - -# initialise an empty df -or_df = data.frame() - -x = sapply(snps,function(m){ - - #df = data.frame() - - mut = grepl(m,raw_data$all_muts_gene) - model<-glm(dst ~ mut, family = binomial) - - # allele frequency - afs = mean(mut) - - # logistic model - beta_logistic = summary(model)$coefficients[2,1] - - or_logistic = exp(summary(model)$coefficients[2,1]) - print(paste0('logistic OR:', or_logistic)) - - pval_logistic = summary(model)$coefficients[2,4] - print(paste0('logistic pval:', pval_logistic)) - - se_logistic = summary(model)$coefficients[2,2] - zval_logistic = summary(model)$coefficients[2,3] - ci_mod = exp(confint(model))[2,] - ci_lower_logistic = ci_mod[["2.5 %"]] - ci_upper_logistic = ci_mod[["97.5 %"]] - - # custom_chisq and fisher: OR p-value and CI - or_mychisq = mychisq_or(dst, mut) - - or_fisher = fisher.test(dst, mut)$estimate - or_fisher = or_fisher[[1]] - - pval_fisher = fisher.test(dst, mut)$p.value - - ci_lower_fisher = fisher.test(dst, mut)$conf.int[1] - ci_upper_fisher = fisher.test(dst, mut)$conf.int[2] - - # chi sq estimates - estimate_chisq = chisq.test(dst, mut)$statistic; estimate_chisq - est_chisq = estimate_chisq[[1]]; print(est_chisq) - - pval_chisq = chisq.test(dst, mut)$p.value - - # build a row to append to df - row = data.frame(mutation = m - , af = afs - , beta_logistic = beta_logistic - , or_logistic = or_logistic - , pval_logistic = pval_logistic - , se_logistic = se_logistic - , zval_logistic = zval_logistic - , ci_low_logistic = ci_lower_logistic - , ci_hi_logistic = ci_upper_logistic - , or_mychisq = or_mychisq - , or_fisher = or_fisher - , pval_fisher = pval_fisher - , ci_low_fisher= ci_lower_fisher - , ci_hi_fisher = ci_upper_fisher - , est_chisq = est_chisq - , pval_chisq = pval_chisq - ) - #print(row) - - or_df <<- rbind(or_df, row) - -}) - -write.csv(or_df, 'test_ors.csv') - -#================================= -# testing logistic or with maxit, etc. - -print(paste0('subset to iterate over;', snps)) - -# start loop - perfectSeparation <- function(w) { - if(grepl("fitted probabilities numerically 0 or 1 occurred", - as.character(w))) {ww <<- ww+1} -} - -for (i in snps){ - - print(i) - - # IV - #mut<-as.numeric(grepl(i,raw_data$all_muts_gene)) - mut = grepl(i,raw_data$all_muts_gene) - table(mut) - - # DV - #dst<-as.numeric(raw_data[[drug]]) - dst = raw_data[[drug]] - - # table - print(table(dst, mut)) - - #===================== - # logistic regression, glm.control(maxit = n) - #https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression - #===================== - #n = 1 - summary(model<-glm(dst ~ mut - , family = binomial - #, control = glm.control(maxit = 1) - #, options(warn = 1) - )) - #, warning = perfectSeparation)) - or_logistic = exp(summary(model)$coefficients[2,1]); print(or_logistic) - pval_logistic_maxit = summary(model)$coefficients[2,4]; print(pval_logistic_maxit) - logistic_se = summary(model)$coefficients[2,2] - logistic_zval = summary(model)$coefficients[2,3] - ci_mod = exp(confint(model))[2,] - logistic_ci_lower = ci_mod[["2.5 %"]] - logistic_ci_upper = ci_mod[["97.5 %"]] - - #===================== - # fishers test - #===================== - #attributes(fisher.test(table(dst, mut))) - or_fisher = fisher.test(table(dst, mut))$estimate - or_fisher = or_fisher[[1]]; or_fisher - - pval_fisher = fisher.test(table(dst, mut))$p.value ; pval_fisher - - #===================== - # custom chi square - #===================== - or_mychisq = mychisq_or(mut,dst) - - #===================== - # chi square - #===================== - #chisq.test(y = dst, x = mut) - #attributes(chisq.test(table(dst, mut))) - est_chisq = chisq.test(table(dst, mut))$statistic - est_chisq = est_chisq[[1]]; est_chisq - - pval_chisq = chisq.test(table(dst, mut))$p.value; pval_chisq - - # all output - writeLines(c(paste0("mutation:", i) - , paste0("=========================") - , paste0("OR logistic:", or_logistic,"--->", "pval logistic:", pval_logistic ) - , paste0("OR fisher:", or_fisher, "--->","pval fisher:", pval_fisher ) - , paste0("OR custom chisq:", or_mychisq, "--->","pval fisher:", pval_fisher ) - , paste0("Chisq estimate:", est_chisq, "--->","pval chisq:", pval_chisq))) -} - #===================== - # fishers test - #===================== - #attributes(fisher.test(table(dst, mut))) - or_fisher = fisher.test(table(dst, mut))$estimate - or_fisher = or_fisher[[1]]; or_fisher - - pval_fisher = fisher.test(table(dst, mut))$p.value ; pval_fisher - - - # https://stats.stackexchange.com/questions/259635/what-is-the-difference-using-a-fishers-exact-test-vs-a-logistic-regression-for - #mymethod = "minlike" # default - mymethod = "central" - #mymethod = "blaker" - - exact2x2(table(dst, mut),tsmethod=mymethod) - - mcnemar.exact(x,y=NULL, conf.level=.95) - -#===================================================================== diff --git a/scripts/nssnp_info_format.py b/scripts/nssnp_info_format.py new file mode 100755 index 0000000..6917e35 --- /dev/null +++ b/scripts/nssnp_info_format.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Wed Jun 10 11:13:49 2020 + +@author: tanu +""" +#============================================================================== +# TASK +# To format snp_fino.txt file that has already been processed in bash +# to include mcsm style muts and gwas style muts. The idea is that the info file +# will contain all possible mutation format style to make it easy to populate +# and link other files with this sort of meta data. For example: or file +#======================================================================= + +# FIXME : add bash info here as well + +#%% useful links +#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/ +#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/ +#https://stackoverflow.com/questions/40348541/pandas-diff-with-string +#======================================================================= +#%% specify dirs +import os, sys +import pandas as pd +import numpy as np +import re +import argparse + +homedir = os.path.expanduser('~') +os.chdir(homedir + '/git/LSHTM_analysis/scripts') + +#from reference_dict import my_aa_dict +#from reference_dict import low_3letter_dict # equivalent of my_aa_dict +from reference_dict import oneletter_aa_dict +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() + +arg_parser.add_argument('-d', '--drug', help = 'drug name', default = 'pyrazinamide') +arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = 'pncA') # case sensitive + +args = arg_parser.parse_args() +#======================================================================= +#%% variables +#gene = 'pncA' +#drug = 'pyrazinamide' +#gene_match = gene +'_p.' + +# cmd variables +gene = args.gene +drug = args.drug +gene_match = gene +'_p.' + +#======================================================================= +#%% input and output dirs and files +#======= +# data dir +#======= +datadir = homedir + '/' + 'git/Data' +indir = datadir + '/' + drug + '/input' +outdir = datadir + '/' + drug + '/output' + +#======= +# input +#======= + +gene_info_filename = 'ns'+ gene.lower()+ '_snp_info1.txt' +gene_info = indir + '/' + gene_info_filename +print('gene info file: ', gene_info + , '\n============================================================') + +#======= +# output +#======= +gene_snp_info_filename = 'ns' + gene.lower() + '_snp_info.csv' # other one is called AFandOR +outfile_snp_info = indir + '/' + gene_snp_info_filename +print('Output file: ', outfile_snp_info + , '\n============================================================') + +#%% read files: preformatted using bash +info_df2 = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10 + +#%% Split into three cols with 1-letter aa_code & combine to get mutationinformation column +# check mutation format in exisiting df +info_df2.head() +info_df2['mut_info'].head() + +print('Creating column: mutationinformation') +info_df2_ncols = len(info_df2.columns) + +info_df2['wild_type'] = info_df2['mut_info'].str.extract('(\w{1})>') +info_df2['position'] = info_df2['mut_info'].str.extract('(\d+)') +info_df2['mutant_type'] = info_df2['mut_info'].str.extract('>\d+(\w{1})') + +info_df2['mutationinformation'] = info_df2['wild_type'] + info_df2['position'] + info_df2['mutant_type'] + +# sanity check +ncols_add = 4 # Beware hardcoded +if len(info_df2.columns) == info_df2_ncols + ncols_add: + print('PASS: Succesfully extracted and added mutationinformation (mcsm style) as below\n' + , info_df2['mutationinformation'].head() + , '\n=====================================================================') +else: + print('FAIL: No. of cols mismatch' + ,'\noriginal length:', info_df2_ncols + , '\nExpected no. of cols:', info_df2_ncols + ncols_add + , '\nGot no. of cols:', len(info_df2.columns)) + sys.exit() + +# update ncols +info_df2_ncols = len(info_df2.columns) + +#%% Creating column 'mutation' which as mutation of the format; +# .lower()1: pnca_p.trp68gly +# match the 'one_letter_code' value to get the dict key (three-letter code) + +print('Creating column: mutation') + +# dict to use: oneletter_aa_dict +lookup_dict = dict() +for k1, v1 in oneletter_aa_dict.items(): + lookup_dict[k1] = v1['three_letter_code_lower'] +for k2, v2 in lookup_dict.items(): + info_df2['wt_3let'] = info_df2['wild_type'].squeeze().map(lookup_dict) + info_df2['mt_3let'] = info_df2['mutant_type'].squeeze().map(lookup_dict) + +# create column mutation +info_df2['mutation'] = info_df2['wt_3let'] + info_df2['position'] + info_df2['mt_3let'] + +# add prefix: gene_match to each value in column +info_df2['mutation'] = gene_match.lower() + info_df2['mutation'].astype(str) + +# sanity check +ncols_add = 3 # Beware hardcoded +if len(info_df2.columns) == info_df2_ncols + ncols_add: + print('PASS: Succesfully created column mutation as below\n' + , info_df2['mutation'].head() + , '\n=====================================================================') +else: + print('FAIL: No. of cols mismatch\noriginal length:', info_df2_ncols + , '\nExpected no. of cols:', info_df2_ncols + ncols_add + , '\nGot no. of cols:', len(info_df2.columns)) + sys.exit() + +#%% write file +print('\n=====================================================================' + , '\nWriting output file:\n', outfile_snp_info + , '\nNo.of rows:', len(info_df2) + , '\nNo. of cols:', len(info_df2.columns)) +info_df2.to_csv(outfile_snp_info, index = False) + diff --git a/scripts/or_kinship_link.py b/scripts/or_kinship_link.py index 9bd02d0..6444401 100755 --- a/scripts/or_kinship_link.py +++ b/scripts/or_kinship_link.py @@ -221,9 +221,9 @@ else: print('FAIL: Second cross check unsuccessfull. Debug please!') sys.exit() +#%% extract mut info into three cols orig_len = len(dfm2_mis.columns) -#%% extract mut info into three cols dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>') dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)') dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})') diff --git a/scripts/reference_dict.py b/scripts/reference_dict.py index 8087009..6e7c0e8 100644 --- a/scripts/reference_dict.py +++ b/scripts/reference_dict.py @@ -57,12 +57,17 @@ print('Input filename:', in_filename #%% end of variable assignment for input and output files #======================================================================= #%% Read input file -my_aa = pd.read_csv(infile) #20, 6 +aa_table = pd.read_csv(infile) #20, 6 +#------------------------ +#1) 3-letter (lower) code as key +#------------------------- # assign the one_letter code as the row names so that it is easier to create # a dict of dicts using index #my_aa = pd.read_csv('aa_codes.csv', index_col = 0) #20, 6 #a way to it since it is the first column -my_aa = my_aa.set_index('three_letter_code_lower') #20, 5 +my_aa = aa_table.set_index('three_letter_code_lower') #20, 5 +my_aa.columns +my_aa.index #================== # convert file @@ -75,6 +80,40 @@ my_aa = my_aa.set_index('three_letter_code_lower') #20, 5 my_aa_dict = my_aa.to_dict('index') #20, with 5 subkeys print('Printing my_aa_dict:', my_aa_dict.keys()) +#FIXME : use the below in all code +low_3letter_dict = my_aa.to_dict('index') #20, with 5 subkeys +print('Printing lower-case 3 letter aa dict:',low_3letter_dict.keys()) + +#------------------------ +#2) 1-letter code as key +#------------------------- +aa_1let = aa_table.set_index('one_letter_code') #20, 5 +aa_1let.columns +aa_1let.index + +oneletter_aa_dict = aa_1let.to_dict('index') #20, with 5 subkeys +print('Printing one letter aa dict:', oneletter_aa_dict.keys()) + +#------------------------ +#3) amino acid name as key +#------------------------- +aa_name = aa_table.set_index('amino_acid_name') #20, 5 +aa_name.columns +aa_name.index + +aa_name_dict = aa_name.to_dict('index') #20, with 5 subkeys +print('Printing amino acid names aa dict:', aa_name_dict.keys()) + +#------------------------ +#3) 3 letter uppercase as key +#------------------------- +aa_up3let = aa_table.set_index('three_letter_code_upper') #20, 5 +aa_up3let.columns +aa_up3let.index + +up_3letter_aa_dict = aa_up3let.to_dict('index') #20, with 5 subkeys +print('Printing upper case 3 letter aa dict:', up_3letter_aa_dict.keys()) + #================================================ # dict of aa with their corresponding properties # This is defined twice