From f9837b474ca81272c172970cc3e6c7521a544312 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 25 Feb 2020 18:13:18 +0000 Subject: [PATCH] updating mut_seq script --- .../scripts/generate_mut_sequences.py | 222 +++++++----------- 1 file changed, 85 insertions(+), 137 deletions(-) mode change 100644 => 100755 mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py diff --git a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py old mode 100644 new mode 100755 index 60018f3..04a1cc0 --- a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py +++ b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py @@ -21,110 +21,121 @@ from Bio import SeqIO # output: MSA for mutant sequences # path: "Data//input/processed/" #*********************************************************************** -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -############# specify variables for input and output paths and filenames -homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde -basedir = "/git/Data/pyrazinamide/input" - +#%% +# specify input and output variables +homedir = os.path.expanduser('~') +#======= # input -inpath = "/original" -in_filename_fasta = "/3pl1.fasta.txt" -infile_fasta = homedir + basedir + inpath + in_filename_fasta -print("Input file is:", infile_fasta) +#======= +############# +# fasta file +############# +indir = 'git/Data/pyrazinamide/input/original' +in_filename_fasta = "3pl1.fasta.txt" +infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta +print(infile_fasta) -inpath_p = "/processed" -in_filename_meta_data = "/meta_data_with_AFandOR.csv" -infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data +############# +# meta data +############# +# FIXME when you change the dir struc +inpath_p = "git/Data/pyrazinamide/input/processed" +in_filename_meta_data = "meta_data_with_AFandOR.csv" +infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data print("Input file is:", infile_meta_data) -# output: only path specified, filenames in respective sections -outpath = "/processed" +#======= +# output +#======= +outdir = 'git/Data/pyrazinamide/output' +# filenames in respective sections ################## end of variable assignment for input and output files +#%% #========== -#read files +# read files #========== -############# -#fasta file -############# -#my_file = infile_fasta -my_fasta = str() +############# +# fasta file +############# +my_fasta_o = str() for seq_record in SeqIO.parse(infile_fasta, "fasta"): my_seq = seq_record.seq - my_fasta = str(my_seq) #convert to a string - print(my_fasta) -# print( len(my_fasta) ) + my_fasta_o = str(my_seq) #convert to a string + print(my_fasta_o) + print(len(my_fasta_o)) # print( type(my_fasta) ) -len(my_fasta) +# remove non_struc positions from fasta +def remove_char(str, n): + first_part = str[:n] + last_part = str[n+1:] + return first_part + last_part +#print(remove_char('Python', 0)) + +ns_pos_o = 186 +offset = 1 # 0 based indexing +ns_pos = ns_pos_o - offset +my_fasta = remove_char(my_fasta_o, ns_pos) +print("orig length:", len(my_fasta_o)) +print("new length:", len(my_fasta)) ############# -# SNP info +# SNP info and no of MSA to generate ############# # read mutant_info file and extract cols with positions and mutant_info # This should be all samples with pncA muts #my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15 -#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22 -my_data = pd.read_csv(infile_meta_data) #3093, 22 +my_data = pd.read_csv(infile_meta_data) list(my_data.columns) +#my_data['OR'].value_counts() +#my_data['OR'].isna().sum() #FIXME: You need a better way to identify this +# ideally this file should not contain any non_struc pos # remove positions not in the structure -#pos_remove = 186 -my_data = my_data[my_data.position != 186] #3092, 22 +my_data = my_data[my_data.position != ns_pos_o] #3092, 22 # if multiple positions, then try the example below; # https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas #df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] -#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2 -mut_info1 = my_data[['position', 'mutant_type']] #3092, 2 - -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% -############### +mut_info1 = my_data[['position', 'mutant_type']] +#%% +################ # data cleaning ################ # extract only those positions that have a frequency count of pos>1 ###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy # add a column of frequency for each position -#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3 -mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3 +#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') +mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts()) # sort by position mut_info2 = mut_info1.sort_values(by=['position']) -#FIXME -#__main__:1: SettingWithCopyWarning: -#A value is trying to be set on a copy of a slice from a DataFrame. -#Try using .loc[row_indexer,col_indexer] = value instead - -#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy - -#sort dataframe by freq values so the row indices are in order! -#mut_info2 = mut_info1.sort_values(by = 'freq_pos' -# , axis = 0 -# , ascending = False -# , inplace = False -# , na_position = 'last') - -#mut_info2 = mut_info2.reset_index( drop = True) - - # count how many pos have freq 1 as you will need to exclude those -mut_info2[mut_info2.freq_pos == 1].sum() #20 +mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos # extract entries with freq_pos>1 # should be 3093-211 = 3072 mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072 +print("orig length:", len(mut_info1)) +print("No. of excluded values:", mutfreq1_count) +print("new length:", len(mut_info3)) +# sanity check +if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ): + print("Sanity check passed: Filtered data correctly") +else: + print("Error: Debug you code") -# reset index to allow iteration <<<<<<<< IMPORTANT +# reset index to allow iteration !!!!!!!!!! IMPORTANT mut_info = mut_info3.reset_index(drop = True) -del(mut_info1, mut_info2, mut_info3, my_data) +##del(mut_info1, mut_info2, mut_info3, my_data) -#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% ################### # generate mut seqs ################### @@ -132,20 +143,19 @@ mut_seqsL = [] * len(mut_info) # iterate for i, pos in enumerate(mut_info['position']): - print('index:', i, 'position:', pos) - mut = mut_info['mutant_type'][i] -# print(mut) -# print( type(mut) ) - print('index:', i, 'position:', pos, 'mutant', mut) - my_fastaL = list(my_fasta) - offset_pos = pos-1 #due to counting starting from 0 + mut = mut_info['mutant_type'][i] + offset_pos = pos-1 + + print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) my_fastaL[offset_pos] = mut -# print(my_fastaL) + print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut) + mut_seq = "".join(my_fastaL) # print(mut_seq + '\n') + print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut, mut_seq) mut_seqsL.append(mut_seq) -# print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut, mut_seq) + ############### # sanity check @@ -161,84 +171,22 @@ for seqs in mut_seqsL: print('**Hooray** Length of mutant and original sequences match') del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs) - + ############ # write file ############ #filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' #filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt' - -print(outpath) -out_filename_gene = "/gene_msa.txt" -outfile_gene = homedir + basedir + outpath + out_filename_gene -print("Output file is:", outfile_gene) +print(outdir) +out_filename = "gene_msa.txt" +outfile_gene = homedir + '/' + outdir + '/' + out_filename +print(outfile_gene) with open(outfile_gene, 'w') as file_handler: for item in mut_seqsL: file_handler.write("{}\n".format(item)) -R="\n".join(mut_seqsL) -f = open('Columns.csv','w') -f.write(R) -f.close() - - -################################################################################# -# extracting only positions with SNPs so that when you plot only those positions -################################################################################ -#mut_seqsL = mut_seqsL[:3] #just trying with 3 seqs - -# create a list of unique positions -pos = mut_info['position'] #3072 -posL = list(set(list(pos))) #110 -del(pos) - -snp_seqsL = [] * len(mut_seqsL) - -for j, mut_seq in enumerate(mut_seqsL): - print (j, mut_seq) -# print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file) - mut_seqsE = list(mut_seq) -# extract specific posistions (corres to SNPs) from list of mutant sequences - snp_seqL1 = [mut_seqsE[i-1] for i in posL] #should be 110 -# print(snp_seqL1) -# print(len(snp_seqL1)) - snp_seq_clean = "".join(snp_seqL1) - snp_seqsL.append(snp_seq_clean) - -############### -# sanity check -################ -no_unique_snps = len(posL) - -# checking if all the mutant sequences have the same length as the original fasta file sequence -for seqs in snp_seqsL: -# print(seqs) -# print(len(seqs)) - if len(seqs) != no_unique_snps: - print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', no_unique_snps) - else: - print('**Hooray** Length of mutant and original sequences match') - -del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean) - - -############ -# write file -############ -#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile' -#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt' - -print(outpath) -out_filename_snps = "/snps_msa.txt" -outfile_snps = homedir + basedir + outpath + out_filename_snps -print("Output file is:", outfile_snps) - -with open(outfile_snps, 'w') as file_handler: - for item in snp_seqsL: - file_handler.write("{}\n".format(item)) - -R="\n".join(snp_seqsL) -f = open('Columns.csv','w') -f.write(R) -f.close() +#R = "\n".join(mut_seqsL) +#f = open('Columns.csv','w') +#f.write(R) +#f.close()