#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 6 12:56:03 2019

@author: tanu
"""
# FIXME: include error checking to ensure you only
# concentrate on positions that have structural info?

#%% load libraries
###################
import os, sys
import pandas as pd
#import numpy as np
#from pandas.api.types import is_string_dtype
#from pandas.api.types import is_numeric_dtype

#%%
#========================================================
# TASK: extract ALL pncA mutations from GWAS data
#========================================================
#%%
####################
# my working dir
os.getcwd()
homedir = os.path.expanduser('~')  # spyder/python doesn't recognise tilde
os.chdir(homedir + '/git/LSHTM_analysis/meta_data_analysis')
os.getcwd()
#%%
# NOTE: imported here (not at top) because it lives in the dir chdir'd to above
from reference_dict import my_aa_dict  # CHECK DIR STRUC THERE!
#%%
# NOTE: out_dir MUST exist
# user-defined dirs for the pyrazinamide/pnca analysis
drug = 'pyrazinamide'
gene = 'pnca'

out_dir = homedir + '/git/LSHTM_analysis/mcsm_analysis/'
# = out_dir + drug
data_dir = homedir + '/git/Data'

if not os.path.exists(data_dir):
    print('Error!', data_dir, 'does not exist. Please ensure it exists and contains the appropriate raw data')
    os.makedirs(data_dir)
    # FIX: the original called die(), which is not defined anywhere (NameError);
    # terminate cleanly instead — the freshly created dir has no raw data yet
    sys.exit(1)

if not os.path.exists(out_dir):
    print('Error!', out_dir, 'does not exist. Please create it')
    # FIX: sys.exit() instead of the interactive-only exit() helper
    sys.exit(1)
else:
    print('Dir exists: Carrying on')

# NOTE: the large commented-out scaffolding that created the
# Data/Scripts/Results/Plots sub-dir structure was removed for clarity;
# see git history if it is ever needed again.

#%%
#==============================================================================
############
# STEP 1: Read file original_tanushree_data_v2.csv
############
data_file = data_dir + '/input/original/original_tanushree_data_v2.csv'
meta_data = pd.read_csv(data_file, sep=',')

# column names
list(meta_data.columns)

# extract relevant columns from the meta data related to pyrazinamide
meta_data = meta_data[['id'
                       , 'country'
                       , 'lineage'
                       , 'sublineage'
                       , 'drtype'
                       , 'pyrazinamide'
                       , 'dr_mutations_pyrazinamide'
                       , 'other_mutations_pyrazinamide'
                       ]]

# checks
total_samples = meta_data['id'].nunique()  # 19265

# counts NA per column
meta_data.isna().sum()

# glance
meta_data.head()

# pyrazinamide counts (equivalent of table in R)
meta_data.pyrazinamide.value_counts()
#%%
############
############
# STEP 2: extract entries containing selected genes:
# pyrazinamide: pncA_p.
# in the dr_mutations and other_mutations columns,
# as we are interested in the mutations in the protein coding region only
# (corresponding to a structure), and drop the entries with NA
#############
# FIX: the original pattern 'pncA_p.*' left the '.' as a regex wildcard, so it
# could also match non-coding annotations (e.g. a promoter tag); the dot is now
# escaped to match 'pncA_p.' literally (e.g. 'pncA_p.Gln10Pro'), per the
# stated intent above.
# FIX: removed two throwaway meta_pza assignments — the second silently
# overwrote the first and both were deleted immediately (dead code); the
# per-column checks below cover the same ground.

##########################
# pyrazinamide: pncA_p.
##########################
meta_data_pnca = meta_data[['id'
                            , 'country'
                            , 'lineage'
                            , 'sublineage'
                            , 'drtype'
                            , 'pyrazinamide'
                            , 'dr_mutations_pyrazinamide'
                            , 'other_mutations_pyrazinamide'
                            ]]
del(meta_data)

# sanity checks
# dr_mutations only
meta_pnca_dr = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.', na=False)]
meta_pnca_dr['id'].nunique()
del(meta_pnca_dr)

# other mutations
meta_pnca_other = meta_data_pnca.loc[meta_data_pnca.other_mutations_pyrazinamide.str.contains(r'pncA_p\.', na=False)]
meta_pnca_other['id'].nunique()
del(meta_pnca_other)

# now extract "all" mutations: rows with a pncA_p. mutation in either column
# FIX: na=False added so NA entries cannot poison the boolean OR mask
meta_pnca_all = meta_data_pnca.loc[
    meta_data_pnca.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.', na=False)
    | meta_data_pnca.other_mutations_pyrazinamide.str.contains(r'pncA_p\.', na=False)
    ]
meta_pnca_all['id'].nunique()

pnca_samples = len(meta_pnca_all)
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
comp_pnca_samples = pnca_samples - pnca_na
#=#=#=#=#=#=#
# COMMENT: use it later to check number of complete samples from LF data
#=#=#=#=#=#=#

# sanity checks
meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
meta_pnca_all.other_mutations_pyrazinamide.value_counts()

# more sanity checks
# !CAUTION!: muts will change depending on your gene
# dr muts : insert
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.Gln10Pro')]
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.Phe106Leu')]  # empty
# dr muts, continued (dots escaped so mutation names match literally)
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.Val139Leu')]
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.')]  # exists: rows
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains(r'pncA_p\.')]  # exists: rows

# other_muts
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains(r'pncA_p\.Gln10Pro')]  # empty
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains(r'pncA_p\.Phe106Leu')]

#=#=#=#=#=#=#=#=#=#
# FIXME
# COMMENTS: both mutations columns are separated by ;
# CHECK if there are mutations that exist both in dr and other_muts!
# doesn't make any sense for the same mut to exist in both, I would have thought!
#=#=#=#=#=#=#=#=#=#

# remove not required variables
del(meta_data_pnca)

############
# STEP 3: split the columns of
# a) dr_mutation_... (;) as the column has snps related to multiple genes.
# useful links
# https://stackoverflow.com/questions/17116814/pandas-how-do-i-split-text-in-a-column-into-multiple-rows
# this one works beautifully
# https://stackoverflow.com/questions/12680754/split-explode-pandas-dataframe-string-entry-to-separate-rows
############
# sanity check: counts NA per column after subsetting (i.e. pncA_p. rows only)
meta_pnca_all.isna().sum()
#=#=#=#=#=#=#=#=#=#
# COMMENT: no NA's in dr_mutations/other_mutations columns
#=#=#=#=#=#=#=#=#=#


# define the split function
def tidy_split(df, column, sep='|', keep=False):
    """
    Split the values of a column and expand so the new DataFrame has one
    split value per row. Filters rows where the column is missing.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row

    Returns
    -------
    pandas.DataFrame
        Returns a dataframe with the same columns as `df`.
    """
    indexes = list()
    new_values = list()
    # df = df.dropna(subset=[column])  # <<< uncomment if your use case needs NA rows dropped
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            # keep the unsplit original alongside its fragments
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df


########
# 3a: call tidy_split() on 'dr_mutations_pyrazinamide' and remove leading whitespace
# https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
########
meta_pnca_WF0 = tidy_split(meta_pnca_all, 'dr_mutations_pyrazinamide', sep=';')
# remove leading white space, else these are counted as distinct mutations
meta_pnca_WF0['dr_mutations_pyrazinamide'] = meta_pnca_WF0['dr_mutations_pyrazinamide'].str.lstrip()

########
# 3b: call function on 'other_mutations_pyrazinamide' and remove leading whitespace
########
meta_pnca_WF1 = tidy_split(meta_pnca_WF0, 'other_mutations_pyrazinamide', sep=';')
# NOTE(review): this uses .strip() (leading AND trailing) whereas 3a uses
# .lstrip() — kept as-is to preserve behaviour; confirm the asymmetry is intended
meta_pnca_WF1['other_mutations_pyrazinamide'] = meta_pnca_WF1['other_mutations_pyrazinamide'].str.strip()

##########
# Step 4: reshape data so that all mutations are in one column and the
# annotations for the mutation reflect the column name
# LINK: http://www.datasciencemadesimple.com/reshape-wide-long-pandas-python-melt-function/
# id_vars   : columns left unaltered
# var_name  : 'mutation_info' (which source column the value came from)
# value_name: 'mutation' (the mutation string itself)
##########
meta_pnca_WF1.columns
meta_pnca_LF0 = pd.melt(meta_pnca_WF1
                        , id_vars=['id', 'country', 'lineage', 'sublineage', 'drtype', 'pyrazinamide']
                        , var_name='mutation_info'
                        , value_name='mutation')

# sanity check: melt stacks the two mutation columns, so length doubles
if len(meta_pnca_LF0) == len(meta_pnca_WF1) * 2:
    print('sanity check passed: Long format df has the expected length')
else:
    print("Sanity check failed: Debug please!")

###########
# Step 5: this is still dirty data. Filter LF data so that you only have
# mutations corresponding to pncA_p.
# this will be your list you run OR calcs
###########
# FIX: dot escaped (was 'pncA_p.*' with a wildcard dot) and .copy() added so
# the later column assignments (steps 8-10) operate on a real frame rather
# than a view (avoids SettingWithCopyWarning / silently lost writes)
meta_pnca_LF1 = meta_pnca_LF0[meta_pnca_LF0['mutation'].str.contains(r'pncA_p\.')].copy()

# sanity checks
# unique samples
meta_pnca_LF1['id'].nunique()
if len(meta_pnca_all) == meta_pnca_LF1['id'].nunique():
    print("Sanity check passed: No of samples with pncA mutations match")
else:
    print("Sanity check failed: Debug please!")

# check all mutations are indeed in the protein coding region (begin with pncA_p.)
# FIX: dot escaped here too, otherwise count() counts a wildcard match
meta_pnca_LF1['mutation'].str.count(r'pncA_p\.').sum()  # 3093

# and check against the length of the df, which should match
if len(meta_pnca_LF1) == meta_pnca_LF1['mutation'].str.count(r'pncA_p\.').sum():
    print("Sanity check passed: Long format data containing pnca mutations indeed correspond to pncA_p region")
else:
    print("Sanity check failed: Debug please!")

###########
# Step 6: filter dataframe with "na" in the drug column
# This is because for OR, you can't use the snps that have NA in the
# specified drug column (it breaks the calcs in the R loop).
# NOT NEEDED FOR all snps, only for extracting valid OR snps
###########
del(meta_pnca_WF0, meta_pnca_WF1, meta_pnca_LF0, meta_pnca_all)

###########
# Step 7: count unique pncA_p mutations (all and comp cases)
###########
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('mutation_info').nunique()
meta_pnca_LF1['id'].nunique()
meta_pnca_LF1['mutation'].nunique()
meta_pnca_LF1.groupby('id').nunique()

###########
# Step 8: convert all snps only (IN LOWERCASE)
# because my mcsm integrated file has lowercase
###########
# convert mutation to lower case as it needs to exactly match the dict key
#meta_pnca_LF1['mutation'] = meta_pnca_LF1.mutation.str.lower()
# Step 8 (cont.): lower-case the mutation strings so they exactly match the
# lowercase lookup-dict keys / mcsm file entries.
# .loc on the rhs keeps pandas from warning about chained assignment
meta_pnca_LF1['mutation'] = meta_pnca_LF1.loc[:, 'mutation'].str.lower()

###########
# Step 9: split 'mutation' column into three: wild_type, position and
# mutant_type separately. Then map the three letter code to one from the
# reference_dict imported already. Mutations were lower-cased above so the
# entries match the dict keys.
###########
#=======
# Step 9a: build a lookup dict {three_letter_code: one_letter_code} from
# my_aa_dict, then map it separately onto the wild-type and mutant codes.
# The three letter code is regex-extracted from the dataframe; .squeeze()
# turns the one-column extract result into a Series, which map() needs.
#=======
lookup_dict = dict()
for k, v in my_aa_dict.items():
    lookup_dict[k] = v['one_letter_code']

# FIX: raw strings for the regexes — '\w'/'\d' inside plain strings are
# invalid escape sequences — and the dot is escaped to match 'pnca_p.' literally
wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p\.(\w{3})').squeeze()
meta_pnca_LF1['wild_type'] = wt.map(lookup_dict)

mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mutant_type'] = mut.map(lookup_dict)

# extract position info from the mutation column separately using regex
meta_pnca_LF1['position'] = meta_pnca_LF1['mutation'].str.extract(r'(\d+)')

# clear variables
del(k, v, wt, mut, lookup_dict)

#=========
# Step 9b: same idea, with a lookup dict
# {three_letter_code: aa_prop_water}, for both wild_type and mutant.
#=========
# Step 9b: lookup dict {three_letter_code: aa_prop_water}, mapped onto both
# the wild-type and mutant three-letter codes (as in step 9a).
# A dict comprehension replaces the manual loop (no stray k/v left behind).
#=========
lookup_dict = {k: v['aa_prop_water'] for k, v in my_aa_dict.items()}
#print(lookup_dict)
# FIX: raw strings + escaped dot (see step 9a) — plain-string '\w'/'\d'
# are invalid escapes and an unescaped '.' is a regex wildcard
wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p\.(\w{3})').squeeze()  # Series, so map() works
meta_pnca_LF1['wt_prop_water'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_water'] = mut.map(lookup_dict)  # added two more cols
# clear variables
del(wt, mut, lookup_dict)

#========
# Step 9c: lookup dict {three_letter_code: aa_prop_polarity}, mapped onto
# both wild_type and mutant as above.
#=========
lookup_dict = {k: v['aa_prop_polarity'] for k, v in my_aa_dict.items()}
#print(lookup_dict)
wt = meta_pnca_LF1['mutation'].str.extract(r'pnca_p\.(\w{3})').squeeze()  # Series, so map() works
meta_pnca_LF1['wt_prop_polarity'] = wt.map(lookup_dict)
mut = meta_pnca_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze()
meta_pnca_LF1['mut_prop_polarity'] = mut.map(lookup_dict)  # added two more cols
# clear variables
del(wt, mut, lookup_dict)

########
# Step 10: combine the wild_type + position + mutant_type columns to generate
# Mutationinformation (matches the mCSM output field).
# .map(str) keeps the concatenation safe should position ever become int-typed
#########
meta_pnca_LF1['Mutationinformation'] = meta_pnca_LF1['wild_type'] + meta_pnca_LF1.position.map(str) + meta_pnca_LF1['mutant_type']

#=#=#=#=#=#=#
# Step 11:
# COMMENT: there is more processing in the older version of this script;
# consult if necessary — possibly due to the presence of true_wt.
# Since this file doesn't contain any true_wt, we won't need it (hopefully!)
#=#=#=#=#=#=#
#%%
###########
# Step 12: output files for only SNPs to run mCSM
###########
#=========
# Step 12a: all SNPs to run mCSM
#=========
snps_only = pd.DataFrame(meta_pnca_LF1['Mutationinformation'].unique())
pos_only = pd.DataFrame(meta_pnca_LF1['position'].unique())

# assign meaningful colnames
#snps_only.rename({0 : 'all_pnca_snps'}, axis = 1, inplace = True)
#list(snps_only.columns)
snps_only.isna().sum()  # should be 0

# output csv: all SNPs for mCSM analysis
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_snps_'
nrows = len(snps_only)

#output_file_path = '/home/tanu/git/Data/input/processed/pyrazinamide/'
output_file_path = data_dir + '/input/processed/' + drug + '/'

if not os.path.exists(output_file_path):
    print(output_file_path, 'does not exist. Creating')
    os.makedirs(output_file_path)
    # FIX: the original called exit() here, killing the script immediately
    # after creating the directory it had just announced it was "Creating",
    # so no output files were ever written on a fresh run. Carry on instead.

output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename)  # <<<- check

# write to csv: without column or row names
# (bad practice: numbers at the start of a filename)
snps_only.to_csv(output_filename, header=False, index=False)

#=========
# Step 12b: all snps with annotation (kept commented out, as in the original)
#=========
#pnca_snps_ALL = meta_pnca_LF1[['id','country','lineage', 'sublineage'
#                               , 'drtype', 'pyrazinamide'
#                               , 'mutation_info', 'mutation', 'Mutationinformation']]
#len(pnca_snps_ALL)
# sanity check
#meta_pnca_LF1['mutation'].nunique()
#my_fname2 = '_snps_with_metadata_'
#nrows = len(pnca_snps_ALL)
#output_filename = output_file_path + gene + my_fname2 + str(nrows) + '.csv'
#print(output_filename)  # <<<- check
#pnca_snps_ALL.to_csv(output_filename, header = True, index = False)

#=========
# Step 12c: comp snps for OR calcs with
#=========
# Step 12c: comp snps for OR calcs, with annotation
#=========
# remove all NA's from the pyrazinamide column of LF1
meta_pnca_LF1.isna().sum()  # counts NA per column

# remove NA
meta_pnca_LF2 = meta_pnca_LF1.dropna(subset=['pyrazinamide'])

# sanity check: should be True
# FIX: the original left this comparison as a bare expression whose result
# was silently discarded; report it like the script's other sanity checks
if len(meta_pnca_LF2) == len(meta_pnca_LF1) - meta_pnca_LF1['pyrazinamide'].isna().sum():
    print('sanity check passed: NA rows dropped as expected')
else:
    print('Sanity check failed: Debug please!')

# unique counts
meta_pnca_LF2['mutation'].nunique()
meta_pnca_LF2.groupby('mutation_info').nunique()

# sanity check
meta_pnca_LF2['id'].nunique()
# should be True
if meta_pnca_LF2['id'].nunique() == comp_pnca_samples:
    print('sanity check passed: complete numbers match')
else:
    print('Error: Please Debug!')

# value counts
meta_pnca_LF2.mutation.value_counts()
#meta_pnca_LF2.groupby(['mutation_info', 'mutation']).size()

# valid/comp snps
# uncomment as necessary
pnca_snps_COMP = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
len(pnca_snps_COMP)

# output csv: WITH column but WITHOUT row names (COMP snps with meta data)
# specify variable name for output file
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname3 = '_comp_snps_with_metadata_'
nrows = len(pnca_snps_COMP)

#output_filename = output_file_path + gene + my_fname3 + str(nrows) + '.csv'
#print(output_filename)  # <<<- check
# write out file
#pnca_snps_COMP.to_csv(output_filename, header = True, index = False)

#=========
# Step 12d: comp snps only
#=========
# output csv: comp SNPs for info (i.e. snps for which OR exist)
# specify variable name for output file
snps_only = pd.DataFrame(meta_pnca_LF2['Mutationinformation'].unique())
gene = 'pnca'
#drug = 'pyrazinamide'
my_fname1 = '_comp_snps_'
nrows = len(snps_only)

output_filename = output_file_path + gene + my_fname1 + str(nrows) + '.csv'
print(output_filename)  # <<<- check

# write to csv: without column or row names
snps_only.to_csv(output_filename, header=False, index=False)

#=#=#=#=#=#=#=#
# COMMENT: LF1 is the file to extract all unique snps for mcsm,
# but you have that already in the file called pnca_snps...
# LF2: is the file for extracting snps tested for DS and hence OR calcs
#=#=#=#=#=#=#=#

###########
# Step 13: write out the whole long-format df, i.e. the meta data reshaped
# so that each row is a unique snp (rather than a unique id, as in the
# original wide data).
# ***** This is the file the AF and OR calculations get ADDED to *****
###########
# output csv: the entire DF
# build the output filename from the gene name and a fixed suffix
gene = 'pnca'
#drug = 'pyrazinamide'
fname_suffix = '_metadata'
#nrows = len(meta_pnca_LF1)
output_filename = output_file_path + gene + fname_suffix + '.csv'
print(output_filename)  # <<<- check

# write out the full frame
# NOTE(review): the row index is written out (no index=False) — presumably
# intentional; confirm against the downstream AF/OR scripts
meta_pnca_LF1.to_csv(output_filename)