#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Jun 25 08:46:36 2019 @author: tanushree """ ############################################ # load libraries import os import pandas as pd from Bio import SeqIO ############################################ #******************************************************************** # TASK: Read in fasta files and create mutant sequences akin to a MSA, # to allow generation of logo plots # Requirements: # input: Fasta file of protein/target for which mut seqs will be created # path: "Data//input/original/" # output: MSA for mutant sequences # path: "Data//input/processed/" #*********************************************************************** #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ############# specify variables for input and output paths and filenames homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde basedir = "/git/Data/pyrazinamide/input" # input inpath = "/original" in_filename_fasta = "/3pl1.fasta.txt" infile_fasta = homedir + basedir + inpath + in_filename_fasta print("Input file is:", infile_fasta) inpath_p = "/processed" in_filename_meta_data = "/meta_data_with_AFandOR.csv" infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data print("Input file is:", infile_meta_data) # output: only path specified, filenames in respective sections outpath = "/processed" ################## end of variable assignment for input and output files #========== #read files #========== ############# #fasta file ############# #my_file = infile_fasta my_fasta = str() for seq_record in SeqIO.parse(infile_fasta, "fasta"): my_seq = seq_record.seq my_fasta = str(my_seq) #convert to a string print(my_fasta) # print( len(my_fasta) ) # print( type(my_fasta) ) len(my_fasta) ############# # SNP info ############# # read mutant_info file and extract cols with positions and mutant_info # This should be all samples with pncA muts #my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15 #my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22 my_data = pd.read_csv(infile_meta_data) #3093, 22 list(my_data.columns) #FIXME: You need a better way to identify this # remove positions not in the structure #pos_remove = 186 my_data = my_data[my_data.position != 186] #3092, 22 # if multiple positions, then try the example below; # https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas #df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] #mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2 mut_info1 = my_data[['position', 'mutant_type']] #3092, 2 #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ############### # data cleaning ################ # extract only those positions that have a frequency count of pos>1 ###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy # add a column of frequency for each position #mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3 mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3 # sort by position mut_info2 = mut_info1.sort_values(by=['position']) #FIXME #__main__:1: SettingWithCopyWarning: #A value is trying to be set on a copy of a slice from a DataFrame. #Try using .loc[row_indexer,col_indexer] = value instead #See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy #sort dataframe by freq values so the row indices are in order! #mut_info2 = mut_info1.sort_values(by = 'freq_pos' # , axis = 0 # , ascending = False # , inplace = False # , na_position = 'last') #mut_info2 = mut_info2.reset_index( drop = True) # count how many pos have freq 1 as you will need to exclude those mut_info2[mut_info2.freq_pos == 1].sum() #20 # extract entries with freq_pos>1 # should be 3093-211 = 3072 mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072 # reset index to allow iteration <<<<<<<< IMPORTANT mut_info = mut_info3.reset_index(drop = True) del(mut_info1, mut_info2, mut_info3, my_data) #%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ################### # generate mut seqs ################### mut_seqsL = [] * len(mut_info) # iterate for i, pos in enumerate(mut_info['position']): print('index:', i, 'position:', pos) mut = mut_info['mutant_type'][i] # print(mut) # print( type(mut) ) print('index:', i, 'position:', pos, 'mutant', mut) my_fastaL = list(my_fasta) offset_pos = pos-1 #due to counting starting from 0 my_fastaL[offset_pos] = mut # print(my_fastaL) mut_seq = "".join(my_fastaL) # print(mut_seq + '\n') mut_seqsL.append(mut_seq) # print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut, mut_seq) ############### # sanity check ################ len_orig = len(my_fasta) # checking if all the mutant sequences have the same length as the original fasta file sequence for seqs in mut_seqsL: # print(seqs) # print(len(seqs)) if len(seqs) != len_orig: print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig) else: print('**Hooray** Length of mutant and original sequences match') del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs) ############ # write file ############ #filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' #filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt' print(outpath) out_filename_gene = "/gene_msa.txt" outfile_gene = homedir + basedir + outpath + out_filename_gene print("Output file is:", outfile_gene) with open(outfile_gene, 'w') as file_handler: for item in mut_seqsL: file_handler.write("{}\n".format(item)) R="\n".join(mut_seqsL) f = open('Columns.csv','w') f.write(R) f.close() ################################################################################# # extracting only positions with SNPs so that when you plot only those positions ################################################################################ #mut_seqsL = mut_seqsL[:3] #just trying with 3 seqs # create a list of unique positions pos = mut_info['position'] #3072 posL = list(set(list(pos))) #110 del(pos) snp_seqsL = [] * len(mut_seqsL) for j, mut_seq in enumerate(mut_seqsL): print (j, mut_seq) # print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file) mut_seqsE = list(mut_seq) # extract specific posistions (corres to SNPs) from list of mutant sequences snp_seqL1 = [mut_seqsE[i-1] for i in posL] #should be 110 # print(snp_seqL1) # print(len(snp_seqL1)) snp_seq_clean = "".join(snp_seqL1) snp_seqsL.append(snp_seq_clean) ############### # sanity check ################ no_unique_snps = len(posL) # checking if all the mutant sequences have the same length as the original fasta file sequence for seqs in snp_seqsL: # print(seqs) # print(len(seqs)) if len(seqs) != no_unique_snps: print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', no_unique_snps) else: print('**Hooray** Length of mutant and original sequences match') del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean) ############ # write file ############ #filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' #filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt' print(outpath) out_filename_snps = "/snps_msa.txt" outfile_snps = homedir + basedir + outpath + out_filename_snps print("Output file is:", outfile_snps) with open(outfile_snps, 'w') as file_handler: for item in snp_seqsL: file_handler.write("{}\n".format(item)) R="\n".join(snp_seqsL) f = open('Columns.csv','w') f.write(R) f.close()