From d410459e256b006173762142c62c116ad032554b Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Mon, 13 Jan 2020 16:09:49 +0000 Subject: [PATCH] added file to generate mutant seqs --- .../scripts/generate_mut_sequences.py | 244 ++++++++++++++++++ 1 file changed, 244 insertions(+) create mode 100644 mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py diff --git a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py new file mode 100644 index 0000000..60018f3 --- /dev/null +++ b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Tue Jun 25 08:46:36 2019 + +@author: tanushree +""" +############################################ +# load libraries +import os +import pandas as pd +from Bio import SeqIO +############################################ +#******************************************************************** +# TASK: Read in fasta files and create mutant sequences akin to a MSA, +# to allow generation of logo plots + +# Requirements: +# input: Fasta file of protein/target for which mut seqs will be created + # path: "Data//input/original/" +# output: MSA for mutant sequences + # path: "Data//input/processed/" +#*********************************************************************** +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +############# specify variables for input and output paths and filenames +homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde +basedir = "/git/Data/pyrazinamide/input" + +# input +inpath = "/original" +in_filename_fasta = "/3pl1.fasta.txt" +infile_fasta = homedir + basedir + inpath + in_filename_fasta +print("Input file is:", infile_fasta) + +inpath_p = "/processed" +in_filename_meta_data = "/meta_data_with_AFandOR.csv" +infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data +print("Input file is:", infile_meta_data) + +# output: only path specified, filenames in respective sections +outpath = "/processed" + +################## end of variable assignment for input and output files +#========== +#read files +#========== +############# +#fasta file +############# +#my_file = infile_fasta + +my_fasta = str() +for seq_record in SeqIO.parse(infile_fasta, "fasta"): + my_seq = seq_record.seq + my_fasta = str(my_seq) #convert to a string + print(my_fasta) +# print( len(my_fasta) ) +# print( type(my_fasta) ) + +len(my_fasta) + +############# +# SNP info +############# +# read mutant_info file and extract cols with positions and mutant_info +# This should be all samples with pncA muts +#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15 +#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22 +my_data = pd.read_csv(infile_meta_data) #3093, 22 +list(my_data.columns) + +#FIXME: You need a better way to identify this +# remove positions not in the structure +#pos_remove = 186 +my_data = my_data[my_data.position != 186] #3092, 22 + +# if multiple positions, then try the example below; +# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas +#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] + +#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2 +mut_info1 = my_data[['position', 'mutant_type']] #3092, 2 + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +############### +# data cleaning +################ +# extract only those positions that have a frequency count of pos>1 +###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy + +# add a column of frequency for each position +#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3 +mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3 + +# sort by position +mut_info2 = mut_info1.sort_values(by=['position']) + +#FIXME +#__main__:1: SettingWithCopyWarning: +#A value is trying to be set on a copy of a slice from a DataFrame. +#Try using .loc[row_indexer,col_indexer] = value instead + +#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy + +#sort dataframe by freq values so the row indices are in order! +#mut_info2 = mut_info1.sort_values(by = 'freq_pos' +# , axis = 0 +# , ascending = False +# , inplace = False +# , na_position = 'last') + +#mut_info2 = mut_info2.reset_index( drop = True) + + +# count how many pos have freq 1 as you will need to exclude those +mut_info2[mut_info2.freq_pos == 1].sum() #20 + +# extract entries with freq_pos>1 +# should be 3093-211 = 3072 +mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072 + +# reset index to allow iteration <<<<<<<< IMPORTANT +mut_info = mut_info3.reset_index(drop = True) + +del(mut_info1, mut_info2, mut_info3, my_data) + +#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +################### +# generate mut seqs +################### +mut_seqsL = [] * len(mut_info) + +# iterate +for i, pos in enumerate(mut_info['position']): + print('index:', i, 'position:', pos) + mut = mut_info['mutant_type'][i] +# print(mut) +# print( type(mut) ) + print('index:', i, 'position:', pos, 'mutant', mut) + + my_fastaL = list(my_fasta) + offset_pos = pos-1 #due to counting starting from 0 + my_fastaL[offset_pos] = mut +# print(my_fastaL) + mut_seq = "".join(my_fastaL) +# print(mut_seq + '\n') + mut_seqsL.append(mut_seq) +# print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut, mut_seq) + +############### +# sanity check +################ +len_orig = len(my_fasta) +# checking if all the mutant sequences have the same length as the original fasta file sequence +for seqs in mut_seqsL: +# print(seqs) +# print(len(seqs)) + if len(seqs) != len_orig: + print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig) + else: + print('**Hooray** Length of mutant and original sequences match') + +del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs) + +############ +# write file +############ +#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' +#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt' + +print(outpath) +out_filename_gene = "/gene_msa.txt" +outfile_gene = homedir + basedir + outpath + out_filename_gene +print("Output file is:", outfile_gene) + +with open(outfile_gene, 'w') as file_handler: + for item in mut_seqsL: + file_handler.write("{}\n".format(item)) + +R="\n".join(mut_seqsL) +f = open('Columns.csv','w') +f.write(R) +f.close() + + +################################################################################# +# extracting only positions with SNPs so that when you plot only those positions +################################################################################ +#mut_seqsL = mut_seqsL[:3] #just trying with 3 seqs + +# create a list of unique positions +pos = mut_info['position'] #3072 +posL = list(set(list(pos))) #110 +del(pos) + +snp_seqsL = [] * len(mut_seqsL) + +for j, mut_seq in enumerate(mut_seqsL): + print (j, mut_seq) +# print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file) + mut_seqsE = list(mut_seq) +# extract specific posistions (corres to SNPs) from list of mutant sequences + snp_seqL1 = [mut_seqsE[i-1] for i in posL] #should be 110 +# print(snp_seqL1) +# print(len(snp_seqL1)) + snp_seq_clean = "".join(snp_seqL1) + snp_seqsL.append(snp_seq_clean) + +############### +# sanity check +################ +no_unique_snps = len(posL) + +# checking if all the mutant sequences have the same length as the original fasta file sequence +for seqs in snp_seqsL: +# print(seqs) +# print(len(seqs)) + if len(seqs) != no_unique_snps: + print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', no_unique_snps) + else: + print('**Hooray** Length of mutant and original sequences match') + +del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean) + + +############ +# write file +############ +#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' +#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt' + +print(outpath) +out_filename_snps = "/snps_msa.txt" +outfile_snps = homedir + basedir + outpath + out_filename_snps +print("Output file is:", outfile_snps) + +with open(outfile_snps, 'w') as file_handler: + for item in snp_seqsL: + file_handler.write("{}\n".format(item)) + +R="\n".join(snp_seqsL) +f = open('Columns.csv','w') +f.write(R) +f.close()