updating mut_seq script

2020-02-25 18:13:18 +00:00 · 2020-02-25 18:13:18 +00:00 · 7b393a2b13
commit 7b393a2b13
parent 2805fdda40
1 changed files with 85 additions and 137 deletions
--- a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py
+++ b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py
@ -21,110 +21,121 @@ from Bio import SeqIO
 # output: MSA for mutant sequences
 	# path: "Data/<drug>/input/processed/<filename>"
 #***********************************************************************
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-############# specify variables for input and output paths and filenames
-homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
-basedir = "/git/Data/pyrazinamide/input"
-
+#%%
+# specify input and output variables
+homedir = os.path.expanduser('~')
+#=======
 # input
-inpath = "/original"
-in_filename_fasta = "/3pl1.fasta.txt"
-infile_fasta = homedir + basedir + inpath + in_filename_fasta
-print("Input file is:", infile_fasta)
+#=======
+#############
+# fasta file
+#############
+indir = 'git/Data/pyrazinamide/input/original'
+in_filename_fasta = "3pl1.fasta.txt"
+infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta
+print(infile_fasta)

-inpath_p = "/processed"
-in_filename_meta_data = "/meta_data_with_AFandOR.csv"
-infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data
+#############
+# meta data
+#############
+# FIXME when you change the dir struc
+inpath_p = "git/Data/pyrazinamide/input/processed"
+in_filename_meta_data = "meta_data_with_AFandOR.csv"
+infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data
 print("Input file is:", infile_meta_data)

-# output: only path specified, filenames in respective sections
-outpath = "/processed"
+#=======
+# output 
+#=======
+outdir =   'git/Data/pyrazinamide/output'
+# filenames in respective sections

 ################## end of variable assignment for input and output files
+#%%
 #==========
-#read files
+# read files
 #==========
-#############
-#fasta file
-#############
-#my_file = infile_fasta

-my_fasta = str()
+#############
+# fasta file
+#############
+my_fasta_o = str()
 for seq_record in SeqIO.parse(infile_fasta, "fasta"):
    my_seq = seq_record.seq
-    my_fasta = str(my_seq) #convert to a string
-    print(my_fasta)
-#    print( len(my_fasta) )
+    my_fasta_o = str(my_seq) #convert to a string
+    print(my_fasta_o)
+    print(len(my_fasta_o))
 #    print( type(my_fasta) )

-len(my_fasta)
+# remove non_struc positions from fasta
+def remove_char(str, n):
+    first_part = str[:n] 
+    last_part = str[n+1:]
+    return first_part + last_part
+#print(remove_char('Python', 0))
+
+ns_pos_o = 186
+offset = 1 # 0 based indexing
+ns_pos = ns_pos_o - offset 
+my_fasta = remove_char(my_fasta_o, ns_pos)
+print("orig length:", len(my_fasta_o))
+print("new length:", len(my_fasta))

 #############
-# SNP info
+# SNP info and no of MSA to generate
 #############
 # read mutant_info file and extract cols with positions and mutant_info
 # This should be all samples with pncA muts
 #my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15
-#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22
-my_data = pd.read_csv(infile_meta_data) #3093, 22
+my_data = pd.read_csv(infile_meta_data)
 list(my_data.columns)
+#my_data['OR'].value_counts()
+#my_data['OR'].isna().sum()

 #FIXME: You need a better way to identify this
+# ideally this file should not contain any non_struc pos
 # remove positions not in the structure
-#pos_remove = 186
-my_data = my_data[my_data.position != 186] #3092, 22
+my_data = my_data[my_data.position != ns_pos_o] #3092, 22

 # if multiple positions, then try the example below;
 # https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
 #df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]

-#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2
-mut_info1 = my_data[['position', 'mutant_type']] #3092, 2
-
-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-###############
+mut_info1 = my_data[['position', 'mutant_type']]
+#%%
+################
 # data cleaning
 ################
 # extract only those positions that have a frequency count of pos>1
 ###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy

 # add a column of frequency for each position
-#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3
-mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3
+#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count')
+mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts())

 # sort by position
 mut_info2 = mut_info1.sort_values(by=['position'])

-#FIXME
-#__main__:1: SettingWithCopyWarning: 
-#A value is trying to be set on a copy of a slice from a DataFrame.
-#Try using .loc[row_indexer,col_indexer] = value instead
-
-#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
-
-#sort dataframe by freq values so the row indices are in order!
-#mut_info2 = mut_info1.sort_values(by = 'freq_pos'
-#                      , axis = 0
-#                      , ascending = False
-#                      , inplace = False
-#                      , na_position = 'last')
-
-#mut_info2 = mut_info2.reset_index( drop = True)
-
-
 # count how many pos have freq 1 as you will need to exclude those
-mut_info2[mut_info2.freq_pos == 1].sum() #20
+mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos

 # extract entries with freq_pos>1
 # should be 3093-211 = 3072
 mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
+print("orig length:", len(mut_info1))
+print("No. of excluded values:", mutfreq1_count)
+print("new length:", len(mut_info3))
+# sanity check
+if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ):
+    print("Sanity check passed: Filtered data correctly")
+else:
+    print("Error: Debug you code")

-# reset index to allow iteration <<<<<<<< IMPORTANT
+# reset index to allow iteration !!!!!!!!!! IMPORTANT
 mut_info = mut_info3.reset_index(drop = True)

-del(mut_info1, mut_info2, mut_info3, my_data)
+##del(mut_info1, mut_info2, mut_info3, my_data)

-#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 ###################
 # generate mut seqs
 ###################
@ -132,20 +143,19 @@ mut_seqsL = [] * len(mut_info)

 # iterate 
 for i, pos in enumerate(mut_info['position']):
-    print('index:', i, 'position:', pos)
-    mut = mut_info['mutant_type'][i]
-#    print(mut)
-#    print( type(mut) )
-    print('index:', i, 'position:', pos, 'mutant', mut)
-
    my_fastaL = list(my_fasta)
-    offset_pos = pos-1 #due to counting starting from 0
+    mut = mut_info['mutant_type'][i]
+    offset_pos = pos-1
+    
+    print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut)
    my_fastaL[offset_pos] = mut
-#    print(my_fastaL)
+    print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut)
+     
    mut_seq = "".join(my_fastaL)
 #    print(mut_seq + '\n')
+    print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut,  mut_seq)
    mut_seqsL.append(mut_seq)
-#    print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut,  mut_seq)
+

 ###############
 # sanity check
@ -167,78 +177,16 @@ del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)
 ############
 #filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
 #filepath =  homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
-
-print(outpath)
-out_filename_gene = "/gene_msa.txt"
-outfile_gene = homedir + basedir + outpath + out_filename_gene
-print("Output file is:", outfile_gene)
+print(outdir)
+out_filename = "gene_msa.txt"
+outfile_gene = homedir + '/' + outdir + '/' + out_filename
+print(outfile_gene)

 with open(outfile_gene, 'w') as file_handler:
    for item in mut_seqsL:
        file_handler.write("{}\n".format(item))
        
-R="\n".join(mut_seqsL)
-f = open('Columns.csv','w')
-f.write(R)
-f.close()
-
-
-#################################################################################
-# extracting only positions with SNPs so that when you plot only those positions
-################################################################################
-#mut_seqsL = mut_seqsL[:3]  #just trying with 3 seqs
-
-# create a list of unique positions
-pos = mut_info['position'] #3072
-posL = list(set(list(pos))) #110
-del(pos)
-
-snp_seqsL = [] * len(mut_seqsL)
-
-for j, mut_seq in enumerate(mut_seqsL):
-    print (j, mut_seq)
-#    print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file)
-    mut_seqsE = list(mut_seq)
-# extract specific posistions (corres to SNPs) from list of mutant sequences
-    snp_seqL1 = [mut_seqsE[i-1] for i in posL]    #should be 110
-#    print(snp_seqL1)
-#    print(len(snp_seqL1))
-    snp_seq_clean = "".join(snp_seqL1)
-    snp_seqsL.append(snp_seq_clean)
-
-###############
-# sanity check
-################
-no_unique_snps = len(posL)
-
-# checking if all the mutant sequences have the same length as the original fasta file sequence
-for seqs in snp_seqsL:
-#    print(seqs)
-#    print(len(seqs))
-    if len(seqs) != no_unique_snps:
-        print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', no_unique_snps)
-    else: 
-        print('**Hooray** Length of mutant and original sequences match')
-        
-del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean)
-
-
-############
-# write file
-############
-#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
-#filepath =  homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt'
-
-print(outpath)
-out_filename_snps = "/snps_msa.txt"
-outfile_snps = homedir + basedir + outpath + out_filename_snps
-print("Output file is:", outfile_snps)
-
-with open(outfile_snps, 'w') as file_handler:
-    for item in snp_seqsL:
-        file_handler.write("{}\n".format(item))
-        
-R="\n".join(snp_seqsL)
-f = open('Columns.csv','w')
-f.write(R)
-f.close()
+#R = "\n".join(mut_seqsL)
+#f = open('Columns.csv','w')
+#f.write(R)
+#f.close()