added file to generate mutant seqs

2020-01-13 16:09:49 +00:00 · 2020-01-13 16:09:49 +00:00 · d410459e25
commit d410459e25
parent e3f4c630a1
1 changed files with 244 additions and 0 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py
+++ b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py
@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jun 25 08:46:36 2019
+
+@author: tanushree
+"""
+############################################
+# load libraries
+import os
+import pandas as pd
+from Bio import SeqIO
+############################################
+#********************************************************************
+# TASK: Read in fasta files and create mutant sequences akin to a MSA,
+# to allow generation of logo plots
+
+# Requirements:
+# input: Fasta file of protein/target for which mut seqs will be created
+	# path: "Data/<drug>/input/original/<filename>"
+# output: MSA for mutant sequences
+	# path: "Data/<drug>/input/processed/<filename>"
+#***********************************************************************
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+############# specify variables for input and output paths and filenames
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+basedir = "/git/Data/pyrazinamide/input"
+
+# input
+inpath = "/original"
+in_filename_fasta = "/3pl1.fasta.txt"
+infile_fasta = homedir + basedir + inpath + in_filename_fasta
+print("Input file is:", infile_fasta)
+
+inpath_p = "/processed"
+in_filename_meta_data = "/meta_data_with_AFandOR.csv"
+infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data
+print("Input file is:", infile_meta_data)
+
+# output: only path specified, filenames in respective sections
+outpath = "/processed"
+
+################## end of variable assignment for input and output files
+#==========
+#read files
+#==========
+#############
+#fasta file
+#############
+#my_file = infile_fasta
+
+my_fasta = str()
+for seq_record in SeqIO.parse(infile_fasta, "fasta"):
+    my_seq = seq_record.seq
+    my_fasta = str(my_seq) #convert to a string
+    print(my_fasta)
+#    print( len(my_fasta) )
+#    print( type(my_fasta) )
+
+len(my_fasta)
+
+#############
+# SNP info
+#############
+# read mutant_info file and extract cols with positions and mutant_info
+# This should be all samples with pncA muts
+#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15
+#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22
+my_data = pd.read_csv(infile_meta_data) #3093, 22
+list(my_data.columns)
+
+#FIXME: You need a better way to identify this
+# remove positions not in the structure
+#pos_remove = 186
+my_data = my_data[my_data.position != 186] #3092, 22
+
+# if multiple positions, then try the example below;
+# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
+#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
+
+#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2
+mut_info1 = my_data[['position', 'mutant_type']] #3092, 2
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+###############
+# data cleaning
+################
+# extract only those positions that have a frequency count of pos>1
+###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy
+
+# add a column of frequency for each position
+#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3
+mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3
+
+# sort by position
+mut_info2 = mut_info1.sort_values(by=['position'])
+
+#FIXME
+#__main__:1: SettingWithCopyWarning: 
+#A value is trying to be set on a copy of a slice from a DataFrame.
+#Try using .loc[row_indexer,col_indexer] = value instead
+
+#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
+
+#sort dataframe by freq values so the row indices are in order!
+#mut_info2 = mut_info1.sort_values(by = 'freq_pos'
+#                      , axis = 0
+#                      , ascending = False
+#                      , inplace = False
+#                      , na_position = 'last')
+
+#mut_info2 = mut_info2.reset_index( drop = True)
+
+
+# count how many pos have freq 1 as you will need to exclude those
+mut_info2[mut_info2.freq_pos == 1].sum() #20
+
+# extract entries with freq_pos>1
+# should be 3093-211 = 3072
+mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
+
+# reset index to allow iteration <<<<<<<< IMPORTANT
+mut_info = mut_info3.reset_index(drop = True)
+
+del(mut_info1, mut_info2, mut_info3, my_data)
+
+#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+###################
+# generate mut seqs
+###################
+mut_seqsL = [] * len(mut_info) 
+
+# iterate 
+for i, pos in enumerate(mut_info['position']):
+    print('index:', i, 'position:', pos)
+    mut = mut_info['mutant_type'][i]
+#    print(mut)
+#    print( type(mut) )
+    print('index:', i, 'position:', pos, 'mutant', mut)
+
+    my_fastaL = list(my_fasta)
+    offset_pos = pos-1 #due to counting starting from 0
+    my_fastaL[offset_pos] = mut
+#    print(my_fastaL)
+    mut_seq = "".join(my_fastaL)
+#    print(mut_seq + '\n')
+    mut_seqsL.append(mut_seq)
+#    print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut,  mut_seq)
+
+###############
+# sanity check
+################
+len_orig = len(my_fasta)
+#    checking if all the mutant sequences have the same length as the original fasta file sequence
+for seqs in mut_seqsL:
+#    print(seqs)
+#    print(len(seqs))
+    if len(seqs) != len_orig:
+        print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig)
+    else: 
+        print('**Hooray** Length of mutant and original sequences match')
+ 
+del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)       
+      
+############
+# write file
+############
+#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
+#filepath =  homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
+
+print(outpath)
+out_filename_gene = "/gene_msa.txt"
+outfile_gene = homedir + basedir + outpath + out_filename_gene
+print("Output file is:", outfile_gene)
+
+with open(outfile_gene, 'w') as file_handler:
+    for item in mut_seqsL:
+        file_handler.write("{}\n".format(item))
+        
+R="\n".join(mut_seqsL)
+f = open('Columns.csv','w')
+f.write(R)
+f.close()
+
+
+#################################################################################
+# extracting only positions with SNPs so that when you plot only those positions
+################################################################################
+#mut_seqsL = mut_seqsL[:3]  #just trying with 3 seqs
+
+# create a list of unique positions
+pos = mut_info['position'] #3072
+posL = list(set(list(pos))) #110
+del(pos)
+
+snp_seqsL = [] * len(mut_seqsL)
+
+for j, mut_seq in enumerate(mut_seqsL):
+    print (j, mut_seq)
+#    print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file)
+    mut_seqsE = list(mut_seq)
+# extract specific posistions (corres to SNPs) from list of mutant sequences
+    snp_seqL1 = [mut_seqsE[i-1] for i in posL]    #should be 110
+#    print(snp_seqL1)
+#    print(len(snp_seqL1))
+    snp_seq_clean = "".join(snp_seqL1)
+    snp_seqsL.append(snp_seq_clean)
+
+###############
+# sanity check
+################
+no_unique_snps = len(posL)
+
+# checking if all the mutant sequences have the same length as the original fasta file sequence
+for seqs in snp_seqsL:
+#    print(seqs)
+#    print(len(seqs))
+    if len(seqs) != no_unique_snps:
+        print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', no_unique_snps)
+    else: 
+        print('**Hooray** Length of mutant and original sequences match')
+        
+del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean)
+
+
+############
+# write file
+############
+#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
+#filepath =  homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt'
+
+print(outpath)
+out_filename_snps = "/snps_msa.txt"
+outfile_snps = homedir + basedir + outpath + out_filename_snps
+print("Output file is:", outfile_snps)
+
+with open(outfile_snps, 'w') as file_handler:
+    for item in snp_seqsL:
+        file_handler.write("{}\n".format(item))
+        
+R="\n".join(snp_seqsL)
+f = open('Columns.csv','w')
+f.write(R)
+f.close()