#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 12:56:03 2019

@author: tanu
"""
# FIXME: include error checking to ensure you only
# concentrate on positions that have structural info?

#%% load libraries
###################
import os, sys
import pandas as pd
#import numpy as np
#from pandas.api.types import is_string_dtype
#from pandas.api.types import is_numeric_dtype

#%%
#========================================================
# TASK: extract ALL pncA mutations from GWAS data
#========================================================
#%%
####################
# my working dir
os.getcwd()
homedir = os.path.expanduser('~')  # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#%%
# NOTE: imported here (not at top) because it lives in the dir chdir'd to above
from reference_dict import my_aa_dict  # CHECK DIR STRUC THERE!
#%%
# NOTE: out_dir MUST exist
# user-defined dirs for the pyrazinamide/pnca analysis
drug = 'pyrazinamide'
gene = 'pnca'

out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
# = out_dir + drug
data_dir = homedir + '/git/Data'

if not os.path.exists(data_dir):
    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
    os.makedirs(data_dir)
    # FIX: the original called die(), which is not defined anywhere (NameError);
    # terminate cleanly instead — the freshly created dir has no raw data yet
    sys.exit(1)

if not os.path.exists(out_dir):
    print('Error!', out_dir, 'does not exist. Please create it')
    # FIX: sys.exit() instead of the interactive-only exit() helper
    sys.exit(1)
else:
    print('Dir exists: Carrying on')

# NOTE: the large commented-out scaffolding that created the
# Data/Scripts/Results/Plots sub-dir structure was removed for clarity;
# see git history if it is ever needed again.

#%%
#==============================================================================
############
# STEP 1: Read file original_tanushree_data_v2.csv
############
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
meta_data = pd.read_csv(data_file, sep=',')

# column names
list(meta_data.columns)

# extract relevant columns from the meta data related to pyrazinamide
meta_data = meta_data[['id'
                       , 'country'
                       , 'lineage'
                       , 'sublineage'
                       , 'drtype'
                       , 'pyrazinamide'
                       , 'dr_mutations_pyrazinamide'
                       , 'other_mutations_pyrazinamide'
                       ]]

# checks
total_samples = meta_data['id'].nunique()  # 19265

# counts NA per column
meta_data.isna().sum()

# glance
meta_data.head()

# pyrazinamide counts (equivalent of table in R)
meta_data.pyrazinamide.value_counts()
#%%
############
############
# STEP 2: extract entries containing selected genes:
# pyrazinamide: pncA_p.
# in the dr_mutations and other_mutations columns,
# as we are interested in the mutations in the protein coding region only
# (corresponding to a structure), and drop the entries with NA
#############
# FIX: the original pattern 'pncA_p.*' left the '.' as a regex wildcard, so it
# could also match non-coding annotations (e.g. a promoter tag); the dot is now
# escaped to match 'pncA_p.' literally (e.g. 'pncA_p.Gln10Pro'), per the
# stated intent above.
# FIX: removed two throwaway meta_pza assignments — the second silently
# overwrote the first and both were deleted immediately (dead code); the
# per-column checks below cover the same ground.

##########################
# pyrazinamide: pncA_p.
##########################
meta_data_pnca = meta_data[['id'
                            , 'country'
                            , 'lineage'
                            , 'sublineage'
                            , 'drtype'
                            , 'pyrazinamide'
                            , 'dr_mutations_pyrazinamide'
                            , 'other_mutations_pyrazinamide'
                            ]]
del(meta_data)

# sanity checks
# dr_mutations only
meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.', na=False)]
meta_pnca_dr['id'].nunique()
del(meta_pnca_dr)

# other mutations
meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains(r'pncA_p\.', na=False)]
meta_pnca_other['id'].nunique()
del(meta_pnca_other)

# now extract "all" mutations: rows with a pncA_p. mutation in either column
# FIX: na=False added so NA entries cannot poison the boolean OR mask
meta_pnca_all = meta_data_pnca.loc[
    meta_data_pnca.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.', na=False)
    | meta_data_pnca.other_mutations_pyrazinamide.str.contains(r'pncA_p\.', na=False)
    ]
meta_pnca_all['id'].nunique()

pnca_samples = len(meta_pnca_all)
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
comp_pnca_samples = pnca_samples - pnca_na
#=#=#=#=#=#=#
# COMMENT: use it later to check number of complete samples from LF data
#=#=#=#=#=#=#

# sanity checks
meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
meta_pnca_all.other_mutations_pyrazinamide.value_counts()

# more sanity checks
# !CAUTION!: muts will change depending on your gene
# dr muts : insert
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.Gln10Pro')]
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.Phe106Leu')]  # empty
# dr muts, continued (dots escaped so mutation names match literally)
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.Val139Leu')]
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.')]  # exists: rows
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.')]  # exists: rows

# other_muts
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains(r'pncA_p\.Gln10Pro')]  # empty
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains(r'pncA_p\.Phe106Leu')]

#=#=#=#=#=#=#=#=#=#
# FIXME
# COMMENTS: both mutations columns are separated by ;
# CHECK if there are mutations that exist both in dr and other_muts!
# doesn't make any sense for the same mut to exist in both, I would have thought!
#=#=#=#=#=#=#=#=#=#

# remove not required variables
del(meta_data_pnca)

############
# STEP 3: split the columns of
# a) dr_mutation_... (;) as the column has snps related to multiple genes.
# useful links
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
# this one works beautifully
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
############
# sanity check: counts NA per column after subsetting (i.e. pncA_p. rows only)
meta_pnca_all.isna().sum()
#=#=#=#=#=#=#=#=#=#
# COMMENT: no NA's in dr_mutations/other_mutations columns
#=#=#=#=#=#=#=#=#=#


# define the split function
def tidy_split(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one
    split value per row. Filters rows where the column is missing.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    # df = df.dropna(subset=[column])  # <<< uncomment if your use case needs NA rows dropped
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            # keep the unsplit original alongside its fragments
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df


########
# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' and remove leading whitespace
# https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
########
meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep=';')
# remove leading white space, else these are counted as distinct mutations
meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip()

########
# 3b: call function on 'other_mutations_pyrazinamide' and remove leading whitespace
########
meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep=';')
# NOTE(review): this uses .strip() (leading AND trailing) whereas 3a uses
# .lstrip() — kept as-is to preserve behaviour; confirm the asymmetry is intended
meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip()

##########
# Step 4: reshape data so that all mutations are in one column and the
# annotations for the mutation reflect the column name
# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
# id_vars   : columns left unaltered
# var_name  : 'mutation_info' (which source column the value came from)
# value_name: 'mutation' (the mutation string itself)
##########
meta_pnca_WF1.columns
meta_pnca_LF0 = pd.melt(meta_pnca_WF1
                        , id_vars=['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
                        , var_name='mutation_info'
                        , value_name='mutation')

# sanity check: melt stacks the two mutation columns, so length doubles
if len(meta_pnca_LF0) == len(meta_pnca_WF1) * 2:
    print('sanity check passed: Long format df has the expected length')
else:
    print("Sanity check failed: Debug please!")

###########
# Step 5: this is still dirty data. Filter LF data so that you only have
# mutations corresponding to pncA_p.
# this will be your list you run OR calcs
###########
# FIX: dot escaped (was 'pncA_p.*' with a wildcard dot) and .copy() added so
# the later column assignments (steps 8-10) operate on a real frame rather
# than a view (avoids SettingWithCopyWarning / silently lost writes)
meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains(r'pncA_p\.')].copy()

# sanity checks
# unique samples
meta_pnca_LF1['id'].nunique()
if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
    print("Sanity check passed: No of samples with pncA mutations match")
else:
    print("Sanity check failed: Debug please!")

# check all mutations are indeed in the protein coding region (begin with pncA_p.)
# FIX: dot escaped here too, otherwise count() counts a wildcard match
meta_pnca_LF1['mutation'].str.count(r'pncA_p\.').sum()  # 3093

# and check against the length of the df, which should match
if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count(r'pncA_p\.').sum():
    print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
else:
    print("Sanity check failed: Debug please!")

###########
# Step 6: filter dataframe with "na" in the drug column
# This is because for OR, you can't use the snps that have NA in the
# specified drug column (it breaks the calcs in the R loop).
# NOT NEEDED FOR all snps, only for extracting valid OR snps
###########
del(meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)

###########
# Step 7: count unique pncA_p mutations (all and comp cases)
###########
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('mutation_info').nunique()
meta_pnca_LF1['id'].nunique()
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('id').nunique()

###########
# Step 8: convert all snps only (IN LOWERCASE)
# because my mcsm integrated file has lowercase
###########
# convert mutation to lower case as it needs to exactly match the dict key
#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower()
# Step 8 (cont.): lower-case the mutation strings so they exactly match the
# lowercase lookup-dict keys / mcsm file entries.
# .loc on the rhs keeps pandas from warning about chained assignment
meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()

###########
# Step 9: split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map the three letter code to one from the
# reference_dict imported already. Mutations were lower-cased above so the
# entries match the dict keys.
###########
#=======
# Step 9a: build a lookup dict {three_letter_code: one_letter_code} from
# my_aa_dict, then map it separately onto the wild-type and mutant codes.
# The three letter code is regex-extracted from the dataframe; .squeeze()
# turns the one-column extract result into a Series, which map() needs.
#=======
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['one_letter_code']

# FIX: raw strings for the regexes — '\w'/'\d' inside plain strings are
# invalid escape sequences — and the dot is escaped to match 'pnca_p.' literally
wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p\.(\w{3})').squeeze()
meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)

mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)

# extract position info from the mutation column separately using regex
meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)')

# clear variables
del(k, v, wt, mut, lookup_dict)

#=========
# Step 9b: same idea, with a lookup dict
# {three_letter_code: aa_prop_water}, for both wild_type and mutant.
#=========
# Step 9b: lookup dict {three_letter_code: aa_prop_water}, mapped onto both
# the wild-type and mutant three-letter codes (as in step 9a).
# A dict comprehension replaces the manual loop (no stray k/v left behind).
#=========
lookup_dict = {k: v['aa_prop_water'] for k, v in my_aa_dict.items()}
#print(lookup_dict)
# FIX: raw strings + escaped dot (see step 9a) — plain-string '\w'/'\d'
# are invalid escapes and an unescaped '.' is a regex wildcard
wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p\.(\w{3})').squeeze()  # Series, so map() works
meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)  # added two more cols
# clear variables
del(wt, mut, lookup_dict)

#========
# Step 9c: lookup dict {three_letter_code: aa_prop_polarity}, mapped onto
# both wild_type and mutant as above.
#=========
lookup_dict = {k: v['aa_prop_polarity'] for k, v in my_aa_dict.items()}
#print(lookup_dict)
wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p\.(\w{3})').squeeze()  # Series, so map() works
meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)  # added two more cols
# clear variables
del(wt, mut, lookup_dict)

########
# Step 10: combine the wild_type + position + mutant_type columns to generate
# Mutationinformation (matches the mCSM output field).
# .map(str) keeps the concatenation safe should position ever become int-typed
#########
meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']

#=#=#=#=#=#=#
# Step 11:
# COMMENT: there is more processing in the older version of this script;
# consult if necessary — possibly due to the presence of true_wt.
# Since this file doesn't contain any true_wt, we won't need it (hopefully!)
#=#=#=#=#=#=#
#%%
###########
# Step 12: output files for only SNPs to run mCSM
###########
#=========
# Step 12a: all SNPs to run mCSM
#=========
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())

# assign meaningful colnames
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
#list(snps_only.columns)
snps_only.isna().sum()  # should be 0

# output csv: all SNPs for mCSM analysis
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_snps_'
nrows = len(snps_only)

#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
output_file_path = data_dir + '/input/processed/' + drug + '/'

if not os.path.exists(output_file_path):
    print(output_file_path, 'does not exist. Creating')
    os.makedirs(output_file_path)
    # FIX: the original called exit() here, killing the script immediately
    # after creating the directory it had just announced it was "Creating",
    # so no output files were ever written on a fresh run. Carry on instead.

output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename)  # <<<- check

# write to csv: without column or row names
# (bad practice: numbers at the start of a filename)
snps_only.to_csv(output_filename, header=False, index=False)

#=========
# Step 12b: all snps with annotation (kept commented out, as in the original)
#=========
#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
#                               , 'drtype', 'pyrazinamide'
#                               , 'mutation_info', 'mutation', 'Mutationinformation']]
#len(pnca_snps_ALL)
# sanity check
#meta_pnca_LF1['mutation'].nunique()
#my_fname2 = '_snps_with_metadata_'
#nrows = len(pnca_snps_ALL)
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
#print(output_filename)  # <<<- check
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)

#=========
# Step 12c: comp snps for OR calcs with
#=========
# Step 12c: comp snps for OR calcs, with annotation
#=========
# remove all NA's from the pyrazinamide column of LF1
meta_pnca_LF1.isna().sum()  # counts NA per column

# remove NA
meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])

# sanity check: should be True
# FIX: the original left this comparison as a bare expression whose result
# was silently discarded; report it like the script's other sanity checks
if len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum():
    print('sanity check passed: NA rows dropped as expected')
else:
    print('Sanity check failed: Debug please!')

# unique counts
meta_pnca_LF2['mutation'].nunique()
meta_pnca_LF2.groupby('mutation_info').nunique()

# sanity check
meta_pnca_LF2['id'].nunique()
# should be True
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
    print('sanity check passed: complete numbers match')
else:
    print('Error: Please Debug!')

# value counts
meta_pnca_LF2.mutation.value_counts()
#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()

# valid/comp snps
# uncomment as necessary
pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
len(pnca_snps_COMP)

# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname3 = '_comp_snps_with_metadata_'
nrows = len(pnca_snps_COMP)

#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
#print(output_filename)  # <<<- check
# write out file
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)

#=========
# Step 12d: comp snps only
#=========
# output csv: comp SNPs for info (i.e. snps for which OR exist)
# specify variable name for output file
snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_comp_snps_'
nrows = len(snps_only)

output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename)  # <<<- check

# write to csv: without column or row names
snps_only.to_csv(output_filename, header=False, index=False)

#=#=#=#=#=#=#=#
# COMMENT: LF1 is the file to extract all unique snps for mcsm,
# but you have that already in the file called pnca_snps...
# LF2: is the file for extracting snps tested for DS and hence OR calcs
#=#=#=#=#=#=#=#

###########
# Step 13: write out the whole long-format df, i.e. the meta data reshaped
# so that each row is a unique snp (rather than a unique id, as in the
# original wide data).
# ***** This is the file the AF and OR calculations get ADDED to *****
###########
# output csv: the entire DF
# build the output filename from the gene name and a fixed suffix
gene = 'pnca'
#drug = 'pyrazinamide'
fname_suffix = '_metadata'
#nrows = len(meta_pnca_LF1)
output_filename = output_file_path + gene + fname_suffix + '.csv'
print(output_filename)  # <<<- check

# write out the full frame
# NOTE(review): the row index is written out (no index=False) — presumably
# intentional; confirm against the downstream AF/OR scripts
meta_pnca_LF1.to_csv(output_filename)