#!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Tue Jun 25 08:46:36 2019 @author: tanushree """ ############################################ # load libraries import os import pandas as pd from Bio import SeqIO ############################################ #******************************************************************** # TASK: Read in fasta files and create mutant sequences akin to a MSA, # to allow generation of logo plots # Requirements: # input: Fasta file of protein/target for which mut seqs will be created # path: "Data//input/original/" # output: MSA for mutant sequences # path: "Data//input/processed/" #*********************************************************************** #%% # specify input and output variables homedir = os.path.expanduser('~') #======= # input #======= ############# # fasta file ############# indir = 'git/Data/pyrazinamide/input/original' in_filename_fasta = "3pl1.fasta.txt" infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta print(infile_fasta) ############# # meta data ############# # FIXME when you change the dir struc inpath_p = "git/Data/pyrazinamide/input/processed" in_filename_meta_data = "meta_data_with_AFandOR.csv" infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data print("Input file is:", infile_meta_data) #======= # output #======= outdir = 'git/Data/pyrazinamide/output' # filenames in respective sections ################## end of variable assignment for input and output files #%% #========== # read files #========== ############# # fasta file ############# my_fasta_o = str() for seq_record in SeqIO.parse(infile_fasta, "fasta"): my_seq = seq_record.seq my_fasta_o = str(my_seq) #convert to a string print(my_fasta_o) print(len(my_fasta_o)) # print( type(my_fasta) ) # remove non_struc positions from fasta def remove_char(str, n): first_part = str[:n] last_part = str[n+1:] return first_part + last_part #print(remove_char('Python', 0)) ns_pos_o = 186 offset = 1 # 0 based indexing ns_pos = ns_pos_o - offset my_fasta = remove_char(my_fasta_o, ns_pos) print("orig length:", len(my_fasta_o)) print("new length:", len(my_fasta)) ############# # SNP info and no of MSA to generate ############# # read mutant_info file and extract cols with positions and mutant_info # This should be all samples with pncA muts #my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15 my_data = pd.read_csv(infile_meta_data) list(my_data.columns) #my_data['OR'].value_counts() #my_data['OR'].isna().sum() #FIXME: You need a better way to identify this # ideally this file should not contain any non_struc pos # remove positions not in the structure my_data = my_data[my_data.position != ns_pos_o] #3092, 22 # if multiple positions, then try the example below; # https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas #df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] mut_info1 = my_data[['position', 'mutant_type']] #%% ################ # data cleaning ################ # extract only those positions that have a frequency count of pos>1 ###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy # add a column of frequency for each position #mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts()) # sort by position mut_info2 = mut_info1.sort_values(by=['position']) # count how many pos have freq 1 as you will need to exclude those mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos # extract entries with freq_pos>1 # should be 3093-211 = 3072 mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072 print("orig length:", len(mut_info1)) print("No. of excluded values:", mutfreq1_count) print("new length:", len(mut_info3)) # sanity check if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ): print("Sanity check passed: Filtered data correctly") else: print("Error: Debug you code") # reset index to allow iteration !!!!!!!!!! IMPORTANT mut_info = mut_info3.reset_index(drop = True) ##del(mut_info1, mut_info2, mut_info3, my_data) ################### # generate mut seqs ################### mut_seqsL = [] * len(mut_info) # iterate for i, pos in enumerate(mut_info['position']): my_fastaL = list(my_fasta) mut = mut_info['mutant_type'][i] offset_pos = pos-1 print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) my_fastaL[offset_pos] = mut print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) mut_seq = "".join(my_fastaL) # print(mut_seq + '\n') print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut, mut_seq) mut_seqsL.append(mut_seq) ############### # sanity check ################ len_orig = len(my_fasta) # checking if all the mutant sequences have the same length as the original fasta file sequence for seqs in mut_seqsL: # print(seqs) # print(len(seqs)) if len(seqs) != len_orig: print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig) else: print('**Hooray** Length of mutant and original sequences match') del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs) ############ # write file ############ #filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' #filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt' print(outdir) out_filename = "gene_msa.txt" outfile_gene = homedir + '/' + outdir + '/' + out_filename print(outfile_gene) with open(outfile_gene, 'w') as file_handler: for item in mut_seqsL: file_handler.write("{}\n".format(item)) #R = "\n".join(mut_seqsL) #f = open('Columns.csv','w') #f.write(R) #f.close()