updating mut_seq script
This commit is contained in:
parent
2805fdda40
commit
7b393a2b13
1 changed files with 85 additions and 137 deletions
222
mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py
Normal file → Executable file
222
mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py
Normal file → Executable file
|
@ -21,110 +21,121 @@ from Bio import SeqIO
|
||||||
# output: MSA for mutant sequences
|
# output: MSA for mutant sequences
|
||||||
# path: "Data/<drug>/input/processed/<filename>"
|
# path: "Data/<drug>/input/processed/<filename>"
|
||||||
#***********************************************************************
|
#***********************************************************************
|
||||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
#%%
|
||||||
############# specify variables for input and output paths and filenames
|
# specify input and output variables
|
||||||
homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
|
homedir = os.path.expanduser('~')
|
||||||
basedir = "/git/Data/pyrazinamide/input"
|
#=======
|
||||||
|
|
||||||
# input
|
# input
|
||||||
inpath = "/original"
|
#=======
|
||||||
in_filename_fasta = "/3pl1.fasta.txt"
|
#############
|
||||||
infile_fasta = homedir + basedir + inpath + in_filename_fasta
|
# fasta file
|
||||||
print("Input file is:", infile_fasta)
|
#############
|
||||||
|
indir = 'git/Data/pyrazinamide/input/original'
|
||||||
|
in_filename_fasta = "3pl1.fasta.txt"
|
||||||
|
infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta
|
||||||
|
print(infile_fasta)
|
||||||
|
|
||||||
inpath_p = "/processed"
|
#############
|
||||||
in_filename_meta_data = "/meta_data_with_AFandOR.csv"
|
# meta data
|
||||||
infile_meta_data = homedir + basedir + inpath_p + in_filename_meta_data
|
#############
|
||||||
|
# FIXME when you change the dir struc
|
||||||
|
inpath_p = "git/Data/pyrazinamide/input/processed"
|
||||||
|
in_filename_meta_data = "meta_data_with_AFandOR.csv"
|
||||||
|
infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data
|
||||||
print("Input file is:", infile_meta_data)
|
print("Input file is:", infile_meta_data)
|
||||||
|
|
||||||
# output: only path specified, filenames in respective sections
|
#=======
|
||||||
outpath = "/processed"
|
# output
|
||||||
|
#=======
|
||||||
|
outdir = 'git/Data/pyrazinamide/output'
|
||||||
|
# filenames in respective sections
|
||||||
|
|
||||||
################## end of variable assignment for input and output files
|
################## end of variable assignment for input and output files
|
||||||
|
#%%
|
||||||
#==========
|
#==========
|
||||||
#read files
|
# read files
|
||||||
#==========
|
#==========
|
||||||
#############
|
|
||||||
#fasta file
|
|
||||||
#############
|
|
||||||
#my_file = infile_fasta
|
|
||||||
|
|
||||||
my_fasta = str()
|
#############
|
||||||
|
# fasta file
|
||||||
|
#############
|
||||||
|
my_fasta_o = str()
|
||||||
for seq_record in SeqIO.parse(infile_fasta, "fasta"):
|
for seq_record in SeqIO.parse(infile_fasta, "fasta"):
|
||||||
my_seq = seq_record.seq
|
my_seq = seq_record.seq
|
||||||
my_fasta = str(my_seq) #convert to a string
|
my_fasta_o = str(my_seq) #convert to a string
|
||||||
print(my_fasta)
|
print(my_fasta_o)
|
||||||
# print( len(my_fasta) )
|
print(len(my_fasta_o))
|
||||||
# print( type(my_fasta) )
|
# print( type(my_fasta) )
|
||||||
|
|
||||||
len(my_fasta)
|
# remove non_struc positions from fasta
|
||||||
|
def remove_char(str, n):
|
||||||
|
first_part = str[:n]
|
||||||
|
last_part = str[n+1:]
|
||||||
|
return first_part + last_part
|
||||||
|
#print(remove_char('Python', 0))
|
||||||
|
|
||||||
|
ns_pos_o = 186
|
||||||
|
offset = 1 # 0 based indexing
|
||||||
|
ns_pos = ns_pos_o - offset
|
||||||
|
my_fasta = remove_char(my_fasta_o, ns_pos)
|
||||||
|
print("orig length:", len(my_fasta_o))
|
||||||
|
print("new length:", len(my_fasta))
|
||||||
|
|
||||||
#############
|
#############
|
||||||
# SNP info
|
# SNP info and no of MSA to generate
|
||||||
#############
|
#############
|
||||||
# read mutant_info file and extract cols with positions and mutant_info
|
# read mutant_info file and extract cols with positions and mutant_info
|
||||||
# This should be all samples with pncA muts
|
# This should be all samples with pncA muts
|
||||||
#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15
|
#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15
|
||||||
#my_data = pd.read_csv('meta_data_with_AFandOR.csv') #3093, 22
|
my_data = pd.read_csv(infile_meta_data)
|
||||||
my_data = pd.read_csv(infile_meta_data) #3093, 22
|
|
||||||
list(my_data.columns)
|
list(my_data.columns)
|
||||||
|
#my_data['OR'].value_counts()
|
||||||
|
#my_data['OR'].isna().sum()
|
||||||
|
|
||||||
#FIXME: You need a better way to identify this
|
#FIXME: You need a better way to identify this
|
||||||
|
# ideally this file should not contain any non_struc pos
|
||||||
# remove positions not in the structure
|
# remove positions not in the structure
|
||||||
#pos_remove = 186
|
my_data = my_data[my_data.position != ns_pos_o] #3092, 22
|
||||||
my_data = my_data[my_data.position != 186] #3092, 22
|
|
||||||
|
|
||||||
# if multiple positions, then try the example below;
|
# if multiple positions, then try the example below;
|
||||||
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
|
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
|
||||||
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
|
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
|
||||||
|
|
||||||
#mut_info1 = my_data[['Position', 'Mutant_type']] #335, 2
|
mut_info1 = my_data[['position', 'mutant_type']]
|
||||||
mut_info1 = my_data[['position', 'mutant_type']] #3092, 2
|
#%%
|
||||||
|
################
|
||||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
||||||
###############
|
|
||||||
# data cleaning
|
# data cleaning
|
||||||
################
|
################
|
||||||
# extract only those positions that have a frequency count of pos>1
|
# extract only those positions that have a frequency count of pos>1
|
||||||
###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy
|
###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy
|
||||||
|
|
||||||
# add a column of frequency for each position
|
# add a column of frequency for each position
|
||||||
#mut_info1['freq_pos'] = mut_info1.groupby('Position')['Position'].transform('count') #335,3
|
#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count')
|
||||||
mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count') #3092,3
|
mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts())
|
||||||
|
|
||||||
# sort by position
|
# sort by position
|
||||||
mut_info2 = mut_info1.sort_values(by=['position'])
|
mut_info2 = mut_info1.sort_values(by=['position'])
|
||||||
|
|
||||||
#FIXME
|
|
||||||
#__main__:1: SettingWithCopyWarning:
|
|
||||||
#A value is trying to be set on a copy of a slice from a DataFrame.
|
|
||||||
#Try using .loc[row_indexer,col_indexer] = value instead
|
|
||||||
|
|
||||||
#See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
|
|
||||||
|
|
||||||
#sort dataframe by freq values so the row indices are in order!
|
|
||||||
#mut_info2 = mut_info1.sort_values(by = 'freq_pos'
|
|
||||||
# , axis = 0
|
|
||||||
# , ascending = False
|
|
||||||
# , inplace = False
|
|
||||||
# , na_position = 'last')
|
|
||||||
|
|
||||||
#mut_info2 = mut_info2.reset_index( drop = True)
|
|
||||||
|
|
||||||
|
|
||||||
# count how many pos have freq 1 as you will need to exclude those
|
# count how many pos have freq 1 as you will need to exclude those
|
||||||
mut_info2[mut_info2.freq_pos == 1].sum() #20
|
mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos
|
||||||
|
|
||||||
# extract entries with freq_pos>1
|
# extract entries with freq_pos>1
|
||||||
# should be 3093-211 = 3072
|
# should be 3093-211 = 3072
|
||||||
mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
|
mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
|
||||||
|
print("orig length:", len(mut_info1))
|
||||||
|
print("No. of excluded values:", mutfreq1_count)
|
||||||
|
print("new length:", len(mut_info3))
|
||||||
|
# sanity check
|
||||||
|
if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ):
|
||||||
|
print("Sanity check passed: Filtered data correctly")
|
||||||
|
else:
|
||||||
|
print("Error: Debug you code")
|
||||||
|
|
||||||
# reset index to allow iteration <<<<<<<< IMPORTANT
|
# reset index to allow iteration !!!!!!!!!! IMPORTANT
|
||||||
mut_info = mut_info3.reset_index(drop = True)
|
mut_info = mut_info3.reset_index(drop = True)
|
||||||
|
|
||||||
del(mut_info1, mut_info2, mut_info3, my_data)
|
##del(mut_info1, mut_info2, mut_info3, my_data)
|
||||||
|
|
||||||
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
|
||||||
###################
|
###################
|
||||||
# generate mut seqs
|
# generate mut seqs
|
||||||
###################
|
###################
|
||||||
|
@ -132,20 +143,19 @@ mut_seqsL = [] * len(mut_info)
|
||||||
|
|
||||||
# iterate
|
# iterate
|
||||||
for i, pos in enumerate(mut_info['position']):
|
for i, pos in enumerate(mut_info['position']):
|
||||||
print('index:', i, 'position:', pos)
|
|
||||||
mut = mut_info['mutant_type'][i]
|
|
||||||
# print(mut)
|
|
||||||
# print( type(mut) )
|
|
||||||
print('index:', i, 'position:', pos, 'mutant', mut)
|
|
||||||
|
|
||||||
my_fastaL = list(my_fasta)
|
my_fastaL = list(my_fasta)
|
||||||
offset_pos = pos-1 #due to counting starting from 0
|
mut = mut_info['mutant_type'][i]
|
||||||
|
offset_pos = pos-1
|
||||||
|
|
||||||
|
print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut)
|
||||||
my_fastaL[offset_pos] = mut
|
my_fastaL[offset_pos] = mut
|
||||||
# print(my_fastaL)
|
print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut)
|
||||||
|
|
||||||
mut_seq = "".join(my_fastaL)
|
mut_seq = "".join(my_fastaL)
|
||||||
# print(mut_seq + '\n')
|
# print(mut_seq + '\n')
|
||||||
|
print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut, mut_seq)
|
||||||
mut_seqsL.append(mut_seq)
|
mut_seqsL.append(mut_seq)
|
||||||
# print('original:', my_fasta, ',', 'replaced at', pos, 'with', mut, mut_seq)
|
|
||||||
|
|
||||||
###############
|
###############
|
||||||
# sanity check
|
# sanity check
|
||||||
|
@ -161,84 +171,22 @@ for seqs in mut_seqsL:
|
||||||
print('**Hooray** Length of mutant and original sequences match')
|
print('**Hooray** Length of mutant and original sequences match')
|
||||||
|
|
||||||
del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)
|
del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)
|
||||||
|
|
||||||
############
|
############
|
||||||
# write file
|
# write file
|
||||||
############
|
############
|
||||||
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
|
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
|
||||||
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
|
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
|
||||||
|
print(outdir)
|
||||||
print(outpath)
|
out_filename = "gene_msa.txt"
|
||||||
out_filename_gene = "/gene_msa.txt"
|
outfile_gene = homedir + '/' + outdir + '/' + out_filename
|
||||||
outfile_gene = homedir + basedir + outpath + out_filename_gene
|
print(outfile_gene)
|
||||||
print("Output file is:", outfile_gene)
|
|
||||||
|
|
||||||
with open(outfile_gene, 'w') as file_handler:
|
with open(outfile_gene, 'w') as file_handler:
|
||||||
for item in mut_seqsL:
|
for item in mut_seqsL:
|
||||||
file_handler.write("{}\n".format(item))
|
file_handler.write("{}\n".format(item))
|
||||||
|
|
||||||
R="\n".join(mut_seqsL)
|
#R = "\n".join(mut_seqsL)
|
||||||
f = open('Columns.csv','w')
|
#f = open('Columns.csv','w')
|
||||||
f.write(R)
|
#f.write(R)
|
||||||
f.close()
|
#f.close()
|
||||||
|
|
||||||
|
|
||||||
#################################################################################
|
|
||||||
# extracting only positions with SNPs so that when you plot only those positions
|
|
||||||
################################################################################
|
|
||||||
#mut_seqsL = mut_seqsL[:3] #just trying with 3 seqs
|
|
||||||
|
|
||||||
# create a list of unique positions
|
|
||||||
pos = mut_info['position'] #3072
|
|
||||||
posL = list(set(list(pos))) #110
|
|
||||||
del(pos)
|
|
||||||
|
|
||||||
snp_seqsL = [] * len(mut_seqsL)
|
|
||||||
|
|
||||||
for j, mut_seq in enumerate(mut_seqsL):
|
|
||||||
print (j, mut_seq)
|
|
||||||
# print(mut_seq[101]) #testing, this should be P, T V (in order of the mut_info file)
|
|
||||||
mut_seqsE = list(mut_seq)
|
|
||||||
# extract specific posistions (corres to SNPs) from list of mutant sequences
|
|
||||||
snp_seqL1 = [mut_seqsE[i-1] for i in posL] #should be 110
|
|
||||||
# print(snp_seqL1)
|
|
||||||
# print(len(snp_seqL1))
|
|
||||||
snp_seq_clean = "".join(snp_seqL1)
|
|
||||||
snp_seqsL.append(snp_seq_clean)
|
|
||||||
|
|
||||||
###############
|
|
||||||
# sanity check
|
|
||||||
################
|
|
||||||
no_unique_snps = len(posL)
|
|
||||||
|
|
||||||
# checking if all the mutant sequences have the same length as the original fasta file sequence
|
|
||||||
for seqs in snp_seqsL:
|
|
||||||
# print(seqs)
|
|
||||||
# print(len(seqs))
|
|
||||||
if len(seqs) != no_unique_snps:
|
|
||||||
print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', no_unique_snps)
|
|
||||||
else:
|
|
||||||
print('**Hooray** Length of mutant and original sequences match')
|
|
||||||
|
|
||||||
del(mut_seq, mut_seqsE, mut_seqsL, seqs, snp_seqL1, snp_seq_clean)
|
|
||||||
|
|
||||||
|
|
||||||
############
|
|
||||||
# write file
|
|
||||||
############
|
|
||||||
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
|
|
||||||
#filepath = homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/snps_msa.txt'
|
|
||||||
|
|
||||||
print(outpath)
|
|
||||||
out_filename_snps = "/snps_msa.txt"
|
|
||||||
outfile_snps = homedir + basedir + outpath + out_filename_snps
|
|
||||||
print("Output file is:", outfile_snps)
|
|
||||||
|
|
||||||
with open(outfile_snps, 'w') as file_handler:
|
|
||||||
for item in snp_seqsL:
|
|
||||||
file_handler.write("{}\n".format(item))
|
|
||||||
|
|
||||||
R="\n".join(snp_seqsL)
|
|
||||||
f = open('Columns.csv','w')
|
|
||||||
f.write(R)
|
|
||||||
f.close()
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue