LSHTM_analysis/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 08:46:36 2019

@author: tanushree
"""
############################################
# load libraries
import os
import pandas as pd
import numpy as np
from Bio import SeqIO
############################################
#********************************************************************
# TASK: Read in fasta files and create mutant sequences akin to a MSA,
# to allow generation of logo plots

# Requirements:
# input: Fasta file of protein/target for which mut seqs will be created
	# path: "Data/<drug>/input/original/<filename>"
# output: MSA for mutant sequences
	# path: "Data/<drug>/input/processed/<filename>"
#***********************************************************************
#%%
# specify input and output variables
homedir = os.path.expanduser('~')
#=======
# input
#=======
#############
# fasta file
#############
indir = 'git/Data/pyrazinamide/input/original'
in_filename_fasta = "3pl1.fasta.txt"
infile_fasta = homedir + '/' + indir + '/' + in_filename_fasta
print(infile_fasta)

#############
# meta data
#############
# FIXME when you change the dir struc
inpath_p = "git/Data/pyrazinamide/input/processed"
in_filename_meta_data = "meta_data_with_AFandOR.csv"
infile_meta_data = homedir + '/' + inpath_p + '/' + in_filename_meta_data
print("Input file is:", infile_meta_data)

#=======
# output
#=======
outdir = 'git/Data/pyrazinamide/output'
# filenames in respective sections

################## end of variable assignment for input and output files
#%%
#==========
# read files
#==========

#############
# fasta file
#############
my_fasta_o = str()
for seq_record in SeqIO.parse(infile_fasta, "fasta"):
    my_seq = seq_record.seq
    my_fasta_o = str(my_seq) #convert to a string
    print(my_fasta_o)
    print(len(my_fasta_o))
#    print( type(my_fasta) )

# remove non_struc positions from fasta
def remove_char(str, n):
    first_part = str[:n]
    last_part = str[n+1:]
    return first_part + last_part
#print(remove_char('Python', 0))

ns_pos_o = 186
offset = 1 # 0 based indexing
ns_pos = ns_pos_o - offset
my_fasta = remove_char(my_fasta_o, ns_pos)
print("orig length:", len(my_fasta_o))
print("new length:", len(my_fasta))

#############
# SNP info and no of MSA to generate
#############
# read mutant_info file and extract cols with positions and mutant_info
# This should be all samples with pncA muts
#my_data = pd.read_csv('mcsm_complex1_normalised.csv')
my_data = pd.read_csv(infile_meta_data)
list(my_data.columns)
#my_data['OR'].value_counts()
#my_data['OR'].isna().sum()

#FIXME: You need a better way to identify this
# ideally this file should not contain any non_struc pos
# remove positions not in the structure
my_data = my_data[my_data.position != ns_pos_o]

# if multiple positions, then try the example below;
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]

# count mutations per sample
mut_info = my_data[['id', 'Mutationinformation', 'wild_type', 'position', 'mutant_type']]

# test
foo = mut_info[mut_info.Mutationinformation.str.contains('C72Y')]

foo = mut_info.pivot_table(values = ['Mutationinformation']
                           , index = ['Mutationinformation', 'id']
#                           , columns =
                           , aggfunc = 'count')

# table
foo_tab = mut_info.pivot_table(values = ['Mutationinformation']
#                           , index = ['Mutationinformation']
                           , columns = ['id', 'Mutationinformation']
                           , aggfunc = 'count'
#                           , margins = True)
                           )
foo_tab.stack('id')

mut_info.to_csv('mutinfo.csv')

mut_info1 = my_data[['position', 'mutant_type']]
#%%
################
# data cleaning
################
# extract only those positions that have a frequency count of pos>1
###mut_info['freq_pos'] = mut_info.groupby('Position').count()#### dodgy

# add a column of frequency for each position
#mut_info1['freq_pos'] = mut_info1.groupby('position')['position'].transform('count')
mut_info1['freq_pos'] = mut_info1.position.map(mut_info1.position.value_counts())

# sort by position
mut_info2 = mut_info1.sort_values(by=['position'])

# count how many pos have freq 1 as you will need to exclude those
mutfreq1_count = mut_info2[mut_info2.freq_pos == 1].sum().freq_pos

# extract entries with freq_pos>1
# should be 3093-211 = 3072
mut_info3 = mut_info2.loc[mut_info2['freq_pos'] >1] #3072
print("orig length:", len(mut_info1))
print("No. of excluded values:", mutfreq1_count)
print("new length:", len(mut_info3))
# sanity check
if ( (len(mut_info1) - mutfreq1_count) == len(mut_info3) ):
    print("Sanity check passed: Filtered data correctly")
else:
    print("Error: Debug you code")

# reset index to allow iteration !!!!!!!!!! IMPORTANT
mut_info = mut_info3.reset_index(drop = True)

##del(mut_info1, mut_info2, mut_info3, my_data)

###################
# generate mut seqs
###################
mut_seqsL = [] * len(mut_info)

# iterate
for i, pos in enumerate(mut_info['position']):
    my_fastaL = list(my_fasta)
    mut = mut_info['mutant_type'][i]
    offset_pos = pos-1

    print('1-index:', pos, '0-index cur:', offset_pos, my_fastaL[offset_pos], 'mut:', mut)
    my_fastaL[offset_pos] = mut
    print('1-index:', pos, '0-index new:', offset_pos, my_fastaL[offset_pos], 'mut:', mut)

    mut_seq = "".join(my_fastaL)
#    print(mut_seq + '\n')
    print('original:', my_fasta, ',', 'replaced:', my_fasta[offset_pos], 'at', pos, 'with', mut,  mut_seq)
    mut_seqsL.append(mut_seq)


###############
# sanity check
################
len_orig = len(my_fasta)
#    checking if all the mutant sequences have the same length as the original fasta file sequence
for seqs in mut_seqsL:
#    print(seqs)
#    print(len(seqs))
    if len(seqs) != len_orig:
        print('sequence lengths mismatch' +'\n', 'mutant seq length:', len(seqs), 'vs original seq length:', len_orig)
    else:
        print('**Hooray** Length of mutant and original sequences match')

del(i, len_orig, mut, mut_seq, my_fastaL, offset_pos, pos, seqs)

############
# write file
############
#filepath = homedir +'/git/LSHTM_Y1_PNCA/combined_v3/logo_plot/snp_seqsfile'
#filepath =  homedir + '/git/LSHTM_Y1_PNCA/mcsm_analysis/pyrazinamide/Data/gene_msa.txt'
print(outdir)
out_filename = "gene_msa.txt"
outfile_gene = homedir + '/' + outdir + '/' + out_filename
print(outfile_gene)

with open(outfile_gene, 'w') as file_handler:
    for item in mut_seqsL:
        file_handler.write("{}\n".format(item))

#R = "\n".join(mut_seqsL)
#f = open('Columns.csv','w')
#f.write(R)
#f.close()