import commit

2020-01-08 16:15:33 +00:00 · 2020-01-08 16:15:33 +00:00 · bccfe68192
commit bccfe68192
39 changed files with 6837 additions and 0 deletions
--- a/meta_data_analysis/pnca_data_extraction.py
+++ b/meta_data_analysis/pnca_data_extraction.py
@ -0,0 +1,626 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+"""
+
+# FIXME: include error checking to enure you only
+# concentrate on positions that have structural info?
+
+#%% load libraries
+###################
+# load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+
+#from pandas.api.types import is_string_dtype
+#from pandas.api.types import is_numeric_dtype
+
+# to create dir
+#my_dir = os.path.expanduser('~/some_dir')
+#make sure mcsm_analysis/ exists
+#or specify the output directory
+
+#%%
+#%%
+#%%
+#========================================================
+# TASK: extract ALL pncA mutations from GWAS data
+#========================================================
+#%%
+####################
+# my working dir
+os.getcwd()
+homedir = os.path.expanduser('~') # spyder/python doesn't recognise tilde
+os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
+os.getcwd()
+#%%
+from reference_dict import my_aa_dict #CHECK DIR STRUC THERE!
+#%%
+#NOTE: Out_dir MUST exis
+# User defined dir strpyrazinamide
+drug = 'pyrazinamide'
+gene = 'pnca'
+out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
+# = out_dir + drug
+data_dir = homedir + '/git/Data'
+
+if not os.path.exists(data_dir):
+    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
+    os.makedirs(data_dir)
+    die()
+
+if not os.path.exists(out_dir):
+    print('Error!', out_dir, 'does not exist. Please create it')
+    exit()
+    
+#if not os.path.exists(work_dir):
+#    print('creating dir that does not exist', 'dir_name:', work_dir)
+#    os.makedirs(work_dir)
+else:
+    print('Dir exists: Carrying on')
+
+# now create sub dir structure within work_dir
+# pyrazinamide/mcsm_analysis
+
+# we need three dir
+# Data
+# Scripts
+    # Plotting
+# Results
+    # Plots
+    
+# create a list of dir names
+#dir_names = ['Data', 'Scripts', 'Results']
+
+
+#for i in dir_names:
+#    this_dir = (work_dir + '/' + i)
+#    if not os.path.exists(this_dir):
+#        print('creating dir that does not exist:', this_dir)
+#        os.makedirs(this_dir)
+#else:
+#    print('Dir exists: Carrying on')
+      
+# Create sub dirs
+# 1)        
+# Scripts
+    # Plotting
+#subdir_plotting = work_dir + '/Scripts/Plotting'
+#if not os.path.exists(subdir_plotting):
+#      print('creating dir that does not exist:', subdir_plotting)
+#      os.makedirs(subdir_plotting)
+#else:
+#    print('Dir exists: Carrying on')
+ 
+# 2)    
+# Results
+    # Plots
+#subdir_plots = work_dir + '/Results/Plots'        
+#if not os.path.exists(subdir_plots):
+#        print('creating dir that does not exist:', subdir_plots)
+#        os.makedirs(subdir_plots)    
+#else:
+#    print('Dir exists: Carrying on')
+         
+# clear varaibles
+#del(dir_names, drug, i, subdir_plots, subdir_plotting)
+
+#exit()
+#%%
+#==============================================================================
+############
+# STEP 1: Read file original_tanushree_data_v2.csv
+############
+data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
+meta_data = pd.read_csv(data_file, sep = ',') 
+
+# column names
+list(meta_data.columns)
+
+# extract elevant columns to extract from meta data related to the pyrazinamide
+meta_data = meta_data[['id'
+       ,'country'
+       ,'lineage'
+       ,'sublineage'
+       ,'drtype'
+       , 'pyrazinamide'
+       , 'dr_mutations_pyrazinamide'
+       , 'other_mutations_pyrazinamide'
+        ]] 
+
+# checks
+total_samples = meta_data['id'].nunique() # 19265
+
+# counts NA per column
+meta_data.isna().sum()
+
+# glance
+meta_data.head()
+
+# equivalent of table in R
+# pyrazinamide counts
+meta_data.pyrazinamide.value_counts() 
+
+#%%
+############
+# STEP 2: extract entries containing selected genes: 
+# pyrazinamide: pnca_p.
+# in the dr_mutations and other mutations"
+# as we are interested in the mutations in the protein coding region only 
+# (corresponding to a structure)
+# and drop the entries with NA
+#############
+meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+
+del(meta_pza)
+
+##########################
+# pyrazinamide: pnca_p.
+##########################
+meta_data_pnca = meta_data[['id'
+       ,'country'
+       ,'lineage'
+       ,'sublineage'
+       ,'drtype'
+       , 'pyrazinamide'
+       , 'dr_mutations_pyrazinamide'
+       , 'other_mutations_pyrazinamide'
+        ]] 
+
+del(meta_data)
+
+# sanity checks
+
+# dr_mutations only
+meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pnca_dr['id'].nunique() 
+del(meta_pnca_dr)
+
+# other mutations
+meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
+meta_pnca_other['id'].nunique() 
+del(meta_pnca_other)
+
+# Now extract "all" mutations
+meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
+
+meta_pnca_all['id'].nunique() 
+pnca_samples = len(meta_pnca_all)
+pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() 
+comp_pnca_samples = pnca_samples - pnca_na 
+
+#=#=#=#=#=#=#
+# COMMENT: use it later to check number of complete samples from LF data
+#=#=#=#=#=#=#
+
+# sanity checks
+meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
+meta_pnca_all.other_mutations_pyrazinamide.value_counts()
+
+# more sanity checks 
+# !CAUTION!: muts will change depending on your gene
+
+# dr muts : insert
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro')] # 
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')] # empty
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
+
+meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
+m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists #  rows
+
+# other_muts
+meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty
+meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Phe106Leu')]
+
+#=#=#=#=#=#=#=#=#=#
+# FIXME
+# COMMENTS: both mutations columns are separated by ; 
+# CHECK if there are mutations that exist both in dr and other_muts!
+# doesn't make any sense for the same mut to exist in both, I would have thought!
+#=#=#=#=#=#=#=#=#=#
+
+# remove not required variables
+del(meta_data_pnca)
+
+############
+# STEP 3: split the columns of 
+# a) dr_mutation_... (;) as 
+# the column has snps related to multiple genes.
+# useful links
+# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
+# this one works beautifully
+# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
+############
+
+# sanity check: counts NA per column afer subsetted df: i.e in meta_pza(with pncA_p. extracted mutations)
+meta_pnca_all.isna().sum()
+
+#=#=#=#=#=#=#=#=#=#
+# COMMENT: no NA's in dr_mutations/other_mutations_columns
+#=#=#=#=#=#=#=#=#=#
+# define the split function
+def tidy_split(df, column, sep='|', keep=False):
+    """
+    Split the values of a column and expand so the new DataFrame has one split
+    value per row. Filters rows where the column is missing.
+
+    Params
+    ------
+    df : pandas.DataFrame
+        dataframe with the column to split and expand
+    column : str
+        the column to split and expand
+    sep : str
+        the string used to split the column's values
+    keep : bool
+        whether to retain the presplit value as it's own row
+
+    Returns
+    -------
+    pandas.DataFrame
+        Returns a dataframe with the same columns as `df`.
+    """
+    indexes = list()
+    new_values = list()
+    #df = df.dropna(subset=[column])#<<<<<<-----see this incase you need to uncomment based on use case
+    for i, presplit in enumerate(df[column].astype(str)):
+        values = presplit.split(sep)
+        if keep and len(values) > 1:
+            indexes.append(i)
+            new_values.append(presplit)
+        for value in values:
+            indexes.append(i)
+            new_values.append(value)
+    new_df = df.iloc[indexes, :].copy()
+    new_df[column] = new_values
+    return new_df
+
+########
+# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' column and remove leading white spaces
+#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
+########    
+meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep = ';') 
+
+# remove leading white space else these are counted as distinct mutations as well
+meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip() 
+
+########
+# 3b: call function on 'other_mutations_pyrazinamide' column and remove leading white spaces
+######## 
+meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep = ';') 
+
+# remove the leading white spaces in the column
+meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip() 
+
+##########
+# Step 4: Reshape data so that all mutations are in one column and the 
+# annotations for the mutation reflect the column name
+# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
+
+# data frame “df” is passed to melt() function
+# id_vars is the variable which need to be left unaltered
+# var_name are the column names so we named it as 'mutation_info'
+# value_name are its values so we named it as 'mutation'
+##########
+meta_pnca_WF1.columns
+
+meta_pnca_LF0 = pd.melt(meta_pnca_WF1
+                      , id_vars = ['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
+                      , var_name = 'mutation_info'
+                      , value_name = 'mutation') 
+
+# sanity check: should be true
+if len(meta_pnca_LF0) == len(meta_pnca_WF1)*2:
+    print('sanity check passed: Long format df has the expected length')
+else:
+    print("Sanity check failed: Debug please!")
+
+###########
+# Step 5: This is still dirty data. Filter LF data so that you only have
+# mutations corresponding to pnca_p. 
+# this will be your list you run OR calcs 
+###########
+meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains('pncA_p.*')] 
+
+# sanity checks
+# unique samples
+meta_pnca_LF1['id'].nunique()
+if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
+    print("Sanity check passed: No of samples with pncA mutations match")
+else:
+    print("Sanity check failed: Debug please!")
+
+# count if all the mutations are indeed in the protein coding region 
+# i.e begin with pnca_p
+meta_pnca_LF1['mutation'].str.count('pncA_p.').sum() # 3093
+
+# should  be true.
+# and check against the length of the df, which should match
+if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count('pncA_p.').sum():
+    print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
+else:
+    print("Sanity check failed: Debug please!")
+
+###########
+# Step 6: Filter dataframe with "na" in the drug column
+# This is because for OR, you can't use the snps that have the
+# NA in the specified drug column
+# it creates problems when performing calcs in R inside the loop
+# so best to filter it here
+###########
+# NOT NEEDED FOR all snps, only for extracting valid OR snps
+del (meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)
+
+###########
+# Step 7: count unique pncA_p mutations (all and comp cases)
+###########
+meta_pnca_LF1['mutation'].nunique() 
+meta_pnca_LF1.groupby('mutation_info').nunique()
+
+meta_pnca_LF1['id'].nunique()  
+meta_pnca_LF1['mutation'].nunique() 
+meta_pnca_LF1.groupby('id').nunique()
+
+###########
+# Step 8: convert all snps only (IN LOWERCASE)
+# because my mcsm file intergated has lowercase
+###########
+# convert mutation to lower case as it needs to exactly match the dict key
+#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower() # WARNINGS: suggested to use .loc
+meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()
+
+###########
+# Step 9 : Split 'mutation' column into three:  wild_type, position and
+# mutant_type separately. Then map three letter code to one from the 
+# referece_dict imported pncaeady. First convert to mutation to lowercase
+# to allow to match entries from dict 
+###########
+#=======
+# Step 9a: iterate through the dict, create a lookup dict i.e
+# lookup_dict = {three_letter_code: one_letter_code}.
+# lookup dict should be the key and the value (you want to create a column for)
+# Then use this to perform the mapping separetly for wild type and mutant cols.
+# The three letter code is extracted using a regex match from the dataframe and then converted
+# to 'pandas series'since map only works in pandas series
+#=======
+# initialise a sub dict that is a lookup dict for three letter code to one
+lookup_dict = dict()
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['one_letter_code']
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)
+
+# extract position info from mutation column separetly using regex
+meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)') 
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+#=========
+# Step 9b: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_water} 
+# Do this for both wild_type and mutant as above.
+#=========
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+lookup_dict = dict()
+
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_water']
+    #print(lookup_dict)
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)
+    
+# added two more cols
+
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+#========
+# Step 9c: iterate through the dict, create a lookup dict that i.e
+# lookup_dict =  {three_letter_code: aa_prop_polarity} 
+# Do this for both wild_type and mutant as above.
+#=========
+# initialise a sub dict that is lookup dict for three letter code to aa prop
+lookup_dict = dict()
+
+for k, v in my_aa_dict.items():
+    lookup_dict[k] = v['aa_prop_polarity']
+    #print(lookup_dict)
+    wt = meta_pnca_LF1['mutation'].str.extract('pnca_p.(\w{3})').squeeze() # converts to a series that map works on
+    meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)   
+    mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
+    meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)
+    
+# added two more cols
+    
+# clear variables
+del(k, v, wt, mut, lookup_dict)
+
+########
+# Step 10: combine the wild_type+poistion+mutant_type columns to generate 
+# Mutationinformation (matches mCSM output field)
+# Remember to use .map(str) for int col types to allow string concatenation
+#########
+meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']
+
+#=#=#=#=#=#=#
+# Step 11:
+# COMMENT: there is more processing in the older version of this script
+# consult if necessary
+# possibly due to the presence of true_wt
+# since this file doesn't contain any true_wt, we won't need it(hopefully!)
+#=#=#=#=#=#=#
+
+#%%
+###########
+# Step 12: Output files for only SNPs to run mCSM
+###########
+
+#=========
+# Step 12a: all SNPs to run mCSM
+#=========
+snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique()) 
+pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique()) 
+
+# assign meaningful colnames 
+#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
+#list(snps_only.columns)
+snps_only.isna().sum() # should be 0
+
+# output csv: all SNPS for mCSM analysis
+# specify variable name for output file
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname1 = '_snps_'
+nrows = len(snps_only) 
+
+#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
+#output_file_path = work_dir + '/Data/'
+output_file_path = data_dir + '/input/processed/' + drug + '/'
+
+if not os.path.exists(output_file_path):
+    print( output_file_path, 'does not exist. Creating')
+    os.makedirs(output_file_path)
+    exit()
+
+output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
+print(output_filename) #<<<- check
+
+# write to csv: without column or row names
+# Bad practice: numbers at the start of a filename
+snps_only.to_csv(output_filename, header = False, index = False)
+
+#=========
+# Step 12b: all snps with annotation
+#=========
+# all snps, selected cols
+#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
+#                               , 'drtype', 'pyrazinamide'
+#                               , 'mutation_info', 'mutation', 'Mutationinformation']] 
+
+#len(pnca_snps_ALL) 
+
+# sanity check
+#meta_pnca_LF1['mutation'].nunique() 
+
+# output csv: WITH column but WITHOUT row names(all snps with meta data)
+# specify variable name for output file
+#gene = 'pnca'
+#drug = 'pyrazinamide'
+#my_fname2 = '_snps_with_metadata_'
+#nrows = len(pnca_snps_ALL) 
+
+#output_file_path = work_dir + '/Data/'
+#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
+#print(output_filename)  #<<<- check
+
+# write out file
+#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)
+
+#=========
+# Step 12c: comp snps for OR calcs with annotation
+#=========
+# remove all NA's from pyrazinamide column from LF1
+    
+# counts NA per column
+meta_pnca_LF1.isna().sum()
+
+# remove NA
+meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])
+
+# sanity checks
+# should be True
+len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum()
+
+# unique counts
+meta_pnca_LF2['mutation'].nunique() 
+
+meta_pnca_LF2.groupby('mutation_info').nunique() 
+
+# sanity check
+meta_pnca_LF2['id'].nunique() 
+
+# should be True
+if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
+    print ('sanity check passed: complete numbers match')
+else:
+    print('Error: Please Debug!')
+
+# value counts
+meta_pnca_LF2.mutation.value_counts()
+#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()
+
+# valid/comp snps
+# uncomment as necessary
+pnca_snps_COMP  = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
+len(pnca_snps_COMP) 
+
+# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
+# specify variable name for output file
+
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname3 = '_comp_snps_with_metadata_'
+nrows = len(pnca_snps_COMP) 
+
+#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
+#print(output_filename) #<<<-check
+
+# write out file
+#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)
+
+
+#=========
+# Step 12d: comp snps only
+#=========
+# output csv: comp SNPS for info (i.e snps for which OR exist)
+# specify variable name for output file
+
+snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
+
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname1 = '_comp_snps_'
+nrows = len(snps_only) 
+
+output_filename = output_file_path + gene + my_fname1  + str(nrows) + '.csv'
+print(output_filename) #<<<- check
+
+# write to csv: without column or row names
+snps_only.to_csv(output_filename, header = False, index = False)
+
+
+#=#=#=#=#=#=#=#
+# COMMENT: LF1 is the file to extract all unique snps for mcsm 
+# but you have that already in file called pnca_snps...
+# LF2: is the file for extracting snps tested for DS and hence OR calcs
+#=#=#=#=#=#=#=#
+
+###########
+# Step 13 : Output the whole df i.e 
+# file for meta_data which is now formatted with
+# each row as a unique snp rather than the original version where
+# each row is a unique id
+# ***** This is the file you will ADD the AF and OR calculations to *****
+###########
+
+# output csv: the entire DF
+# specify variable name for output file
+gene = 'pnca'
+#drug = 'pyrazinamide'
+my_fname4 = '_metadata'
+#nrows = len(meta_pnca_LF1)
+output_filename = output_file_path + gene + my_fname4  + '.csv'
+print(output_filename) #<<<-check
+
+# write out file
+meta_pnca_LF1.to_csv(output_filename)