From 6330a2e716b0a0e5cbff049ca22de64bb603f153 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Thu, 14 Apr 2022 10:16:12 +0100 Subject: [PATCH] added v2 for data extraction --- scripts/data_extraction.py | 2 +- scripts/data_extraction_v2.py | 1109 +++++++++++++++++++++++++++++++++ 2 files changed, 1110 insertions(+), 1 deletion(-) create mode 100644 scripts/data_extraction_v2.py diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index b991490..241ff5b 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -622,7 +622,7 @@ print('Dim of dr_df:', dr_df.shape #%% #========= -# DF2: other_mutations_pyrazinamdie +# DF2: other_muts_col #========= ######## # tidy_split(): on other_muts_col column and remove leading white spaces diff --git a/scripts/data_extraction_v2.py b/scripts/data_extraction_v2.py new file mode 100644 index 0000000..aa7ec93 --- /dev/null +++ b/scripts/data_extraction_v2.py @@ -0,0 +1,1109 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +Created on Tue Aug 6 12:56:03 2019 + +@author: tanu +''' + +# FIXME: include error checking to enure you only +# concentrate on positions that have structural info? + +# FIXME: import dirs.py to get the basic dir paths available +#======================================================================= +# TASK: extract ALL matched mutations from GWAS data +# Input data file has the following format: each row = unique sample id +# id,country,lineage,sublineage,drtype,drug,dr_muts_col,other_muts_col... +# 0,sampleID,USA,lineage2,lineage2.2.1,Drug-resistant,0.0,WT,gene_matchPOS; pncA_c.POS... +# where multiple mutations and multiple mutation types are separated by ';'. +# We are interested in the protein coding region i.e mutation with the_'p.' format. +# This script splits the mutations on the ';' and extracts protein coding muts only +# where each row is a separate mutation +# sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique + + +# output files: all lower case +# 1) _gwas.csv +# 2) _common_ids.csv +# 3) _ambiguous_muts.csv +# 4) _mcsm_formatted_snps.csv +# 5) _metadata_poscounts.csv +# 6) _metadata.csv +# 7) _all_muts_msa.csv +# 8) _mutational_positons.csv + +#------------ +# NOTE +#----------- +# drtype is renamed to 'resistance' in the 35k dataset +# all colnames in the ouput files lowercase + +#------------- +# requires +#------------- +#reference_dict.py +#tidy_split.py +#======================================================================= +#%% load libraries +import os, sys +import re +import pandas as pd +import numpy as np +import argparse +from statistics import mean, median, mode +from statistics import multimode +#======================================================================= +#%% dir and local imports +homedir = os.path.expanduser('~') +# set working dir +os.getcwd() +os.chdir(homedir + '/git/LSHTM_analysis/scripts') +os.getcwd() +#======================================================================= +#%% command line args +arg_parser = argparse.ArgumentParser() +arg_parser.add_argument('-d', '--drug', help='drug name (case sensitive)', default = None) +arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', default = None) +arg_parser.add_argument('--datadir', help = 'Data Directory. By default, it assmumes homedir + git/Data') +arg_parser.add_argument('-i', '--input_dir', help = 'Input dir containing pdb files. By default, it assmumes homedir + + input') +arg_parser.add_argument('-o', '--output_dir', help = 'Output dir for results. By default, it assmes homedir + + output') +arg_parser.add_argument('-m', '--make_dirs', help = 'Make dir for input and output', action='store_true') + +arg_parser.add_argument('--debug', action ='store_true', help = 'Debug Mode') + +args = arg_parser.parse_args() +#======================================================================= +#%% variable assignment: input and output paths & filenames +drug = args.drug +gene = args.gene +datadir = args.datadir +indir = args.input_dir +outdir = args.output_dir +make_dirs = args.make_dirs + +#%% input and output dirs and files +#======= +# dirs +#======= +if not datadir: + datadir = homedir + '/' + 'git/Data' + +if not indir: + indir = datadir + '/' + drug + '/input_v2' + +if not outdir: + outdir = datadir + '/' + drug + '/output_v2' + +if make_dirs: + print('make_dirs is turned on, creating data dir:', datadir) + try: + os.makedirs(datadir, exist_ok = True) + print("Directory '%s' created successfully" %datadir) + except OSError as error: + print("Directory '%s' can not be created") + + print('make_dirs is turned on, creating indir:', indir) + try: + os.makedirs(indir, exist_ok = True) + print("Directory '%s' created successfully" %indir) + except OSError as error: + print("Directory '%s' can not be created") + + print('make_dirs is turned on, creating outdir:', outdir) + try: + os.makedirs(outdir, exist_ok = True) + print("Directory '%s' created successfully" %outdir) + except OSError as error: + print("Directory '%s' can not be created") + +# handle missing dirs here +if not os.path.isdir(datadir): + print('ERROR: Data directory does not exist:', datadir + , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs') + sys.exit() +if not os.path.isdir(indir): + print('ERROR: Input directory does not exist:', indir + , '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs') + sys.exit() +if not os.path.isdir(outdir): + print('ERROR: Output directory does not exist:', outdir + , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs') + sys.exit() + +# Requires +from reference_dict import my_aa_dict # CHECK DIR STRUC THERE! +from tidy_split import tidy_split + +#======================================================================= +gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + +nssnp_match = gene_match +'[A-Za-z]{3}[0-9]+[A-Za-z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +wt_regex = gene_match.lower()+'([A-Za-z]{3})' +print('wt regex:', wt_regex) + +mut_regex = r'[0-9]+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'([0-9]+)' +print('position regex:', pos_regex) + +# building cols to extract +dr_muts_col = 'dr_mutations_' + drug +other_muts_col = 'other_mutations_' + drug +resistance_col = 'drtype' + +print('Extracting columns based on variables:\n' + , drug + , '\n' + , dr_muts_col + , '\n' + , other_muts_col + , '\n' + , resistance_col + , '\n===============================================================') +#======================================================================= + +#======= +# input +#======= +#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k +in_filename_master = 'mtb_gwas_meta_v6.csv' #35k +infile_master = datadir + '/' + in_filename_master +print('Input file: ', infile_master + , '\n============================================================') + +#======= +# output +#======= +# several output files: in respective sections at the time of outputting files +print('Output filename: in the respective sections' + , '\nOutput path: ', outdir + , '\n=============================================================') + +#%%end of variable assignment for input and output files +#======================================================================= +#%% Read input file +master_data = pd.read_csv(infile_master, sep = ',') + +# column names +#list(master_data.columns) + +# extract elevant columns to extract from meta data related to the drug +if in_filename_master == 'original_tanushree_data_v2.csv': + meta_data = master_data[['id' + , 'country' + , 'lineage' + , 'sublineage' + , 'drtype' + , drug + , dr_muts_col + , other_muts_col]] +else: + core_cols = ['id' + , 'sample' + #, 'patient_id' + #, 'strain' + , 'lineage' + , 'sublineage' + , 'country_code' + #, 'geographic_source' + , resistance_col] + + variable_based_cols = [drug + , dr_muts_col + , other_muts_col] + + cols_to_extract = core_cols + variable_based_cols + print('Extracting', len(cols_to_extract), 'columns from master data') + + meta_data = master_data[cols_to_extract] + del(master_data, variable_based_cols, cols_to_extract) + +print('Extracted meta data from filename:', in_filename_master + , '\nDim:', meta_data.shape) + +# checks and results +total_samples = meta_data['id'].nunique() +print('RESULT: Total samples:', total_samples + , '\n===========================================================') + +# counts NA per column +meta_data.isna().sum() +print('No. of NAs/column:' + '\n', meta_data.isna().sum() + , '\n===========================================================\n') + +#%% Quick checks: +meta_data['lineage'].value_counts() +meta_data['lineage'].value_counts().sum() +meta_data['lineage'].nunique() + +meta_data['id'].nunique() +meta_data['sample'].nunique() +meta_data['id'].equals(meta_data['sample']) +foo = meta_data.copy() +foo['Diff'] = np.where( foo['id'] == foo['sample'] , '1', '0') +foo['Diff'].value_counts() + +meta_data['drtype'].value_counts() +meta_data['drtype'].value_counts().sum() + +#%% Write file: _gwas.csv +check_file = outdir + '/' + gene.lower() + '_gwas.csv' +meta_data.to_csv(check_file, index = False) +print('\n----------------------------------' + , '\nWriting subsetted gwas data:', gene + , '\n----------------------------------' + , '\nFile', check_file + , '\nDim:', meta_data.shape) + +# equivalent of table in R +# drug counts: complete samples for OR calcs +meta_data[drug].value_counts() +print('===========================================================\n' + , 'RESULT: Total no. of samples tested for', drug, meta_data[drug].value_counts().sum() + , '\n===========================================================\n' + , 'RESULT: No. of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts() + , '\n===========================================================\n' + , 'RESULT: Percentage of Sus and Res', drug, 'samples:\n', meta_data[drug].value_counts(normalize = True)*100 + , '\n===========================================================') +#%% Important sanity checks +# This is to find out how many samples have 1 and more than 1 mutation,so you +# can use it to check if your data extraction process for dr_muts +# and other_muts has worked correctly AND also to check the dim of the +# final formatted data. +# This will have: unique COMBINATION of sample id and mutations +#======== +# First: counting mutations in dr_muts_col column +#======== +print('Now counting WT &', nssnp_match, 'muts within the column:', dr_muts_col) +# drop na and extract a clean df +clean_df = meta_data.dropna(subset=[dr_muts_col]) + +# sanity check: count na +na_count = meta_data[dr_muts_col].isna().sum() + +if len(clean_df) == (total_samples - na_count): + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples + , '\n==========================================================') +else: + sys.exit('FAIL: Could not drop NAs') + +dr_gene_count = 0 +wt = 0 +id_dr = [] +id2_dr = [] +#nssnp_match_regex = re.compile(nssnp_match) + +for i, id in enumerate(clean_df.id): + #print (i, id) + #id_dr.append(id) + #count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_dr = len(re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE)) + #print(count_gene_dr) + if count_gene_dr > 0: + id_dr.append(id) + if count_gene_dr > 1: + id2_dr.append(id) + #print(id, count_gene_dr) + dr_gene_count = dr_gene_count + count_gene_dr + count_wt = clean_df[dr_muts_col].iloc[i].count('WT') + wt = wt + count_wt + +print('RESULTS:') +print('Total WT in dr_muts_col:', wt) +print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count) +print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) ) +print('=================================================================') + +del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) + +#======== +# Second: counting mutations in other_muts_col column +#======== +print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col) +# drop na and extract a clean df +clean_df = meta_data.dropna(subset=[other_muts_col]) + +# sanity check: count na +na_count = meta_data[other_muts_col].isna().sum() + +if len(clean_df) == (total_samples - na_count): + print('PASS: clean_df extracted: length is', len(clean_df) + , '\nNo.of NAs =', na_count, '/', total_samples + , '\n=========================================================') +else: + sys.exit('FAIL: Could not drop NAs') + +other_gene_count = 0 +wt_other = 0 +id_other = [] +id2_other = [] + +for i, id in enumerate(clean_df.id): + #print (i, id) + #id_other.append(id) + #count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_other = len(re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE)) + if count_gene_other > 0: + id_other.append(id) + if count_gene_other > 1: + id2_other.append(id) + #print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other + count_wt = clean_df[other_muts_col].iloc[i].count('WT') + wt_other = wt_other + count_wt + +print('RESULTS:') +print('Total WT in other_muts_col:', wt_other) +print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) ) +print('=================================================================') + +print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count + , '\n===================================================================') +expected_rows = dr_gene_count + other_gene_count + +#del( wt_other, clean_df, i, id, na_count, id2_other, count_gene_other, count_wt) +del(clean_df, na_count, i, id, wt_other, id2_other, count_gene_other, count_wt ) + +#%% Extracting dr and other muts separately along with common cols +print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data') +print('muts to extract:', nssnp_match ) + +#=============== +# dr mutations: extract gene_match entries with meta data and ONLY dr_muts col +#=============== +if in_filename_master == 'original_tanushree_data_v2.csv': + meta_data_dr = meta_data[['id' + ,'country' + ,'lineage' + ,'sublineage' + ,'drtype' + , drug + , dr_muts_col]] +else: + dr_based_cols = [drug, dr_muts_col] + cols_to_extract = core_cols + dr_based_cols + print('Extracting', len(cols_to_extract), 'columns from meta data') + meta_data_dr = meta_data[cols_to_extract] + del(dr_based_cols, cols_to_extract) + +if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1): + print('PASS: Dimensions match' + , '\n===============================================================') +else: + print('FAIL: Dimensions mismatch:' + , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1) + , '\nGot:', meta_data_dr.shape + , '\n===============================================================') + sys.exit() + +# Extract within this the nsSNPs for gene of interest using string match +#meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] +meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +print('gene_snp_match in dr:', len(meta_gene_dr)) + +dr_id = meta_gene_dr['id'].unique() +print('RESULT: No. of samples with dr muts in', gene, ':', len(dr_id)) + +if len(id_dr) == len(meta_gene_dr): + print('PASS: lengths match' + , '\n===============================================================') +else: + print('FAIL: length mismatch' + , '\nExpected len:', len(id_dr) + , '\nGot:', len(meta_gene_dr)) + sys.exit() + +dr_id = pd.Series(dr_id) + +#================= +# other mutations: extract nssnp_match entries from other_muts_col +#================== +print('Extracting other_muts from:', other_muts_col,'with other meta_data') +print('muts to extract:', nssnp_match) + +if in_filename_master == 'original_tanushree_data_v2.csv': + meta_data_other = meta_data[['id' + , 'country' + , 'lineage' + , 'sublineage' + , 'drtype' + , drug + , other_muts_col]] +else: + other_based_cols = [drug, other_muts_col] + cols_to_extract = core_cols + other_based_cols + print('Extracting', len(cols_to_extract), 'columns from meta data') + meta_data_other = meta_data[cols_to_extract] + del(other_based_cols, cols_to_extract) + +if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1): + print('PASS: Dimensions match' + , '\n===============================================================') +else: + print('FAIL: Dimensions mismatch:' + , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1) + , '\nGot:', meta_data_other.shape + , '\n===============================================================') + sys.exit() + +# Extract within this nSSNP for gene of interest using string match +#meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] +meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +print('gene_snp_match in other:', len(meta_gene_other)) + +other_id = meta_gene_other['id'].unique() +print('RESULT: No. of samples with other muts in', gene, ':', len(other_id)) + +if len(id_other) == len(meta_gene_other): + print('PASS: lengths match' + , '\n==============================================================') +else: + print('FAIL: length mismatch' + , '\nExpected len:', len(id_other) + , '\nGot:', len(meta_gene_other)) + sys.exit() + +other_id = pd.Series(other_id) +#%% Find common IDs +#================== +# Common ids +#================== +print('Now extracting common_ids...') +common_mut_ids = dr_id.isin(other_id).sum() +print('RESULT: No. of common ids:', common_mut_ids) + +# sanity checks +# check if True: should be since these are common +if dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum(): + print('PASS: Cross check on no. of common ids') +else: + sys.exit('FAIL: Cross check on no. of common ids failed') + +# check if the common are indeed the same! +# bit of a tautology, but better safe than sorry! +common_ids = dr_id[dr_id.isin(other_id)] +common_ids = common_ids.reset_index() +common_ids.columns = ['index', 'id'] + +common_ids2 = other_id[other_id.isin(dr_id)] +common_ids2 = common_ids2.reset_index() +common_ids2.columns = ['index', 'id2'] + +# should be True +if common_ids['id'].equals(common_ids2['id2']): + print('PASS: Further cross checks on common ids') + nu_common_ids = common_ids.nunique() +else: + sys.exit('FAIL: Further cross checks on common ids') + +# good sanity check: use it later to check gene_sample_counts +expected_gene_samples = (len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids) +print('Expected no. of gene samples:', expected_gene_samples + , '\n=================================================================') +#%% Write file: _common_ids.csv +#print(outdir) +out_filename_cid = gene.lower() + '_common_ids.csv' +outfile_cid = outdir + '/' + out_filename_cid +print('\n----------------------------------' + '\nWriting file: common_ids' + '\n----------------------------------' + , '\nFile:', outfile_cid + , '\nNo. of rows:', len(common_ids) + , '\n=============================================================') + +common_ids.to_csv(outfile_cid, index = False) + +del(out_filename_cid) + +# clear variables +del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) + +#%% Extract gene specific nsSNPs: all nsSNPs i.e.'nssnp_match' +print('Extracting nsSNP match:', gene, 'mutations from cols:\n' + , dr_muts_col, 'and', other_muts_col, 'using string match:' + , '\n===================================================================') +#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ] +meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match + , na = False + , regex = True + , case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ] + +extracted_gene_samples = meta_gene_all['id'].nunique() +print('RESULT: No. of unique samples with', gene, 'nsSNP muts:', extracted_gene_samples + , '\n===================================================================') + +# sanity check: length of gene samples +print('Performing sanity check:') +if extracted_gene_samples == expected_gene_samples: + print('PASS: expected & actual no. of nssnp gene samples match' + , '\nNo. of gene samples:', len(meta_gene_all) + , '\n=========================================================') +else: + sys.exit('FAIL: Length mismatch in gene samples!') + +# count NA in drug column +gene_na = meta_gene_all[drug].isna().sum() +print('No. of gene samples without', drug, 'testing:', gene_na) + +# use it later to check number of complete samples from LF data +comp_gene_samples = len(meta_gene_all) - gene_na +print('Complete gene samples tested for', drug, ':', comp_gene_samples) +print('=================================================================') + +# Comment: This is still dirty data since these +# are samples that have nsSNP muts, but can have others as well +# since the format for mutations is mut1; mut2, etc. +print('This is still dirty data: samples have ', nssnp_match, 'muts but may have others as well' + , '\nsince the format for mutations is mut1; mut2, etc.' + , '\n=============================================================') + +print('Performing tidy_split(): to separate the mutations into individual rows') + +#%% tidy_split(): dr_muts_col +#========= +# DF1: dr_muts_col +# and remove leading white spaces +#========= +col_to_split1 = dr_muts_col +print ('Firstly, applying tidy split on dr muts df', meta_gene_dr.shape + , '\ncolumn name to apply tidy_split():', col_to_split1 + , '\n============================================================') +# apply tidy_split() +dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') +# remove leading white space else these are counted as distinct mutations as well +dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.strip() + +# extract only the samples/rows with nssnp_match +#dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] +dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match, regex = True, case = False)] + +print('Lengths after tidy split and extracting', nssnp_match, 'muts:' + , '\nOld length:' , len(meta_gene_dr) + , '\nLength after split:', len(dr_WF0) + , '\nLength of nssnp df:', len(dr_gene_WF0) + , '\nExpected len:', dr_gene_count + , '\n=============================================================') +if len(dr_gene_WF0) == dr_gene_count: + print('PASS: length matches expected length' + , '\n===============================================================') +else: + sys.exit('FAIL: lengths mismatch') + +# count the freq of 'dr_muts' samples +dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] +print('Dim of dr_muts_df:', dr_muts_df.shape) + +# add freq column +dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') +#dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count') +print('Revised dim of dr_muts_df:', dr_muts_df.shape) + +c1 = dr_muts_df.dr_sample_freq.value_counts() +print('Counting no. of sample frequency:\n', c1 + , '\n===================================================================') + +# sanity check: length of gene samples +if len(dr_gene_WF0) == c1.sum(): + print('PASS: WF data has expected length' + , '\nLength of dr_gene WFO:', c1.sum() + , '\n===============================================================') +else: + sys.exit('FAIL: length mismatch!') + +# Important: Assign 'column name' on which split was performed as an extra column +# This is so you can identify if mutations are dr_type or other in the final df +dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) +print('Dim of dr_df:', dr_df.shape + , '\n==============================================================' + , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' + , '\n===============================================================') +#%% tidy_split(): other_muts_col +#========= +# DF2: other_muts_col +# and remove leading white spaces +#========= +col_to_split2 = other_muts_col +print ('applying second tidy split() separately on other muts df', meta_gene_other.shape + , '\ncolumn name to apply tidy_split():', col_to_split2 + , '\n============================================================') + +# apply tidy_split() +other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') +# remove the leading white spaces in the column +other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() + +# extract only the samples/rows with nssnp_match +#other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(nssnp_match, regex = True, case = False)] + +print('Lengths after tidy split and extracting', gene_match, 'muts:', + '\nOld length:' , len(meta_gene_other), + '\nLength after split:', len(other_WF1), + '\nLength of nssnp df:', len(other_gene_WF1), + '\nExpected len:', other_gene_count + , '\n=============================================================') +if len(other_gene_WF1) == other_gene_count: + print('PASS: length matches expected length' + , '\n===============================================================') +else: + sys.exit('FAIL: lengths mismatch') + +# count the freq of 'other muts' samples +other_muts_df = other_gene_WF1 [['id', other_muts_col]] +print('Dim of other_muts_df:', other_muts_df.shape) + +# add freq column +other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count') +print('Revised dim of other_muts_df:', other_muts_df.shape) + +c2 = other_muts_df.other_sample_freq.value_counts() +print('Counting no. of sample frequency:\n', c2) +print('=================================================================') +# sanity check: length of gene samples +if len(other_gene_WF1) == c2.sum(): + print('PASS: WF data has expected length' + , '\nLength of other_gene WFO:', c2.sum() + , '\n===============================================================') +else: + sys.exit('FAIL: Length mismatch') + +# Important: Assign 'column name' on which split was performed as an extra column +# This is so you can identify if mutations are dr_type or other in the final df +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) +print('dim of other_df:', other_df.shape + , '\n===============================================================' + , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' + , '\n===============================================================') +#%% NEW check1: dr muts and other muts +#foo = dr_WF0[dr_muts_col][~dr_WF0[dr_muts_col].isin(dr_df[dr_muts_col])] +#bar = other_WF1[other_muts_col][~other_WF1[other_muts_col].isin(other_df[other_muts_col])] +dr_df['id'][~dr_df['id'].isin(other_df['id'])].count() +other_df['id'][~other_df['id'].isin(dr_df['id'])].count() + +#%% Concatenating two dfs: gene_LF0 +# equivalent of rbind in R +#========== +# Concatentating the two dfs: equivalent of rbind in R +#========== +# Important: Change column names to allow concat: +# dr_muts.. & other_muts : 'mutation' +print('Now concatenating the two dfs by row' + , '\nFirst assigning a common colname: "mutation" to the col containing muts' + , '\nThis is done for both dfs' + , '\n===================================================================') + +dr_df.columns +dr_df.rename(columns = {dr_muts_col: 'mutation'}, inplace = True) +dr_df.columns + +other_df.columns +other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) +other_df.columns + +if len(dr_df.columns) == len(other_df.columns): + print('Checking dfs for concatening by rows:' + , '\nDim of dr_df:', dr_df.shape + , '\nDim of other_df:', other_df.shape + , '\nExpected nrows:', len(dr_df) + len(other_df) + , '\n=============================================================') +else: + sys.exit('FAIL: No. of cols mismatch for concatenating') + +# checking colnames before concat +print('Checking colnames BEFORE concatenating the two dfs...') +if (set(dr_df.columns) == set(other_df.columns)): + print('PASS: column names match necessary for merging two dfs') +else: + sys.exit('FAIL: Colnames mismatch for concatenating!') + +# concatenate (axis = 0): Rbind +print('Now appending the two dfs: Rbind') +gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) + +print('Finding stop mutations in concatenated df') +stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum() +if stop_muts == 0: + print('PASS: No stop mutations detected') +else: + print('stop mutations detected' + , '\nNo. of stop muts:', stop_muts, '\n' + , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count()) + , '\nNow removing them') + +gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')] +print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp)) + +gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)] +print('snps only by direct regex:', len(gene_LF0)) + +if gene_LF0_nssnp.equals(gene_LF0): + print('PASS: regex for extracting nssnp worked correctly & stop mutations successfully removed' + , '\nUsing the regex extracted df') +else: + sys.exit('FAIL: posssibly regex or no of stop mutations' + , 'Regex being used:', nssnp_match) + #sys.exit() + +# checking colnames and length after concat +print('Checking colnames AFTER concatenating the two dfs...') +if (set(dr_df.columns) == set(gene_LF0.columns)): + print('PASS: column names match' + , '\n=============================================================') +else: + sys.exit('FAIL: Colnames mismatch AFTER concatenating') + +print('Checking concatenated df') +if len(gene_LF0) == (len(dr_df) + len(other_df))- stop_muts: + print('PASS:length of df after concat match' + , '\n===============================================================') +else: + sys.exit('FAIL: length mismatch') + +#%% NEW check2:id, sample, etc. +gene_LF0['id'].count() +gene_LF0['id'].nunique() +gene_LF0['sample'].nunique() +gene_LF0['mutation_info'].value_counts() +gene_LF0[drug].isna().sum() +#%% Concatenating two dfs sanity checks: gene_LF1 +#========================================================= +# This is hopefully clean data, but just double check +# Filter LF data so that you only have +# mutations corresponding to nssnp_match (case insensitive) +# this will be your list you run OR calcs +#========================================================= +print('Length of gene_LF0:', len(gene_LF0) + , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene + , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match) + +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] + +if len(gene_LF0) == len(gene_LF1): + print('PASS: length of gene_LF0 and gene_LF1 match', + '\nConfirming extraction and concatenating worked correctly' + , '\n==========================================================') +else: + print('FAIL: BUT NOT FATAL!' + , '\ngene_LF0 and gene_LF1 lengths differ' + , '\nsuggesting error in extraction process' + , ' use gene_LF1 for downstreama analysis' + , '\n=========================================================') + +print('Length of dfs pre and post processing...' + , '\ngene WF data (unique samples in each row):', extracted_gene_samples + , '\ngene LF data (unique mutation in each row):', len(gene_LF1) + , '\n=============================================================') + +# sanity check for extraction +# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows' +print('Verifying whether extraction process worked correctly...') +if len(gene_LF1) == expected_rows: + print('PASS: extraction process performed correctly' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) + , '\n=========================================================') +else: + print('FAIL: extraction process has bugs' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nDebug please' + , '\n=========================================================') + +# more sanity checks +print('Performing some more sanity checks...') + +# From LF1: useful for OR counts +# no. of unique muts +distinct_muts = gene_LF1.mutation.value_counts() +print('Distinct genomic mutations:', len(distinct_muts)) + +# no. of samples contributing the unique muts +gene_LF1.id.nunique() +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique()) + +# no. of dr and other muts +mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique()) + +# sanity check +if len(distinct_muts) == mut_grouped.sum() : + print('PASS:length of LF1 is as expected' + , '\n===============================================================') +else: + print('FAIL: Mistmatch in count of muts' + , '\nExpected count:', len(distinct_muts) + , '\nActual count:', mut_grouped.sum() + , '\nMuts should be distinct within dr* and other* type' + , '\nInspecting...possibly ambiguous muts' + , '\nNo. of possible ambiguous muts:', mut_grouped.sum() - len(distinct_muts) + , '\n=========================================================') + +muts_split = list(gene_LF1.groupby('mutation_info')) +dr_muts = muts_split[0][1].mutation +other_muts = muts_split[1][1].mutation +print('splitting muts by mut_info:', muts_split) +print('no.of dr_muts samples:', len(dr_muts)) +print('no. of other_muts samples', len(other_muts)) + +#%% Ambiguous muts +# IMPORTANT: The same mutation cannot be classed as a drug AND 'others' +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' + , '\n===============================================================') +else: + print('PASS: NO ambiguous muts detected' + , '\nMuts are unique to dr_ and other_ mutation class' + , '\n=========================================================') + +# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected! +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('Finding ambiguous muts...' + , '\n=========================================================' + , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum() + , '\nThese are:', dr_muts[dr_muts.isin(other_muts)] + , '\n=========================================================' + , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum() + , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)] + , '\n=========================================================') + + print('Counting no. of ambiguous muts...' + , '\nNo. of ambiguous muts in dr:' + , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in other:' + , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) + , '\n=========================================================') + + if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): + common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() + print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) + , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') + print('\n===========================================================') +else: + #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!') + print('No: ambiguous muts present') + +#print('Counting no. of ambiguous muts...') +#if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): +# common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() +# print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) +# , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') +# print('\n===========================================================') +#else: +# print('Error: ambiguous muts detected, but extraction failed. Debug!' +# , '\nNo. of ambiguous muts in dr:' +# , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) +# , '\nNo. of ambiguous muts in other:' +# , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) +# , '\n=========================================================') + +#%% Ambiguous muts: revised annotation for mutation_info +ambiguous_muts_df = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + +ambiguous_muts_value_counts = ambiguous_muts_df.groupby('mutation')['mutation_info'].value_counts() +ambiguous_muts_value_counts + +bar = gene_LF1.copy() +bar.equals(gene_LF1) + +gene_LF1_orig = gene_LF1.copy() + +#===================================== +# Now create a new df that will have: + # ambiguous muts + # mutation_info + # revised mutation_info +# The revised label is based on value_counts +# for mutaiton_info. The corresponding mutation_info: +# label is chosen that corresponds to the max of value counts +#===================================== +ambig_muts_rev_df = pd.DataFrame() + +for i in common_muts: + #print(i) + temp_df = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']] + + # DANGER: ASSUMES TWO STATES ONLY and that value_counts sorts by descending + max_info_table = gene_LF1[gene_LF1['mutation'] == i][['mutation', 'mutation_info']].value_counts() + + revised_label = max_info_table[[0]].index[0][1] # max value_count + old_label = max_info_table[[1]].index[0][1] # min value_count + print('\nAmbiguous mutation labels...' + , '\nSetting mutation_info for', i, 'to', revised_label) + temp_df['mutation_info_REV'] = np.where( (temp_df['mutation_info'] == old_label) + , revised_label + , temp_df['mutation_info']) + ambig_muts_rev_df = pd.concat([ambig_muts_rev_df,temp_df]) + +ambig_muts_rev_df['mutation_info'].value_counts() +ambig_muts_rev_df['mutation_info_REV'].value_counts() + +#================= +# Merge ambig muts +# with gene_LF1 +#=================== +bar = gene_LF1.copy() +bar['mutation_info_old'] = bar['mutation_info'] + +ambig_muts_rev_df.index +bar.loc[ambig_muts_rev_df.index, 'mutation_info'] = ambig_muts_rev_df['mutation_info_REV'] +bar['mutation_info_old'].value_counts() +bar['mutation_info'].value_counts() +foo = bar.iloc[ambig_muts_rev_df.index] +foo[['mutation', 'mutation_info', 'mutation_info_old']] + +# CHECK if there are still any ambiguous muts + +#%% ROUND THE HOUSES: DELETE +foo = ambig_muts_rev_df[['mutation', 'mutation_info_REV']] +fooU = foo.drop_duplicates() + +#https://cmdlinetips.com/2021/04/convert-two-column-values-from-pandas-dataframe-to-a-dictionary/ +fooD = fooU.set_index('mutation').to_dict()['mutation_info_REV'] +fooD +#https://stackoverflow.com/questions/45334020/pandas-map-column-data-based-on-value-from-another-column-using-if-to-determine +#df['New_State_Name'] = np.where(df['Name']=='Person1',df['State'].map(state_map),df['State'].map(state_map2)) +bar['NEW'] = bar['mutation_info_old'] +bar['NEW'].value_counts() +bar['mutation_info_old'].value_counts() + +bar['NEW'].value_counts() +bar2 = bar[['mutation', 'mutation_info', 'NEW']] +bar2.isnull().sum() + +bar['NEW'].value_counts() +bar['mutation_info'].value_counts() +bar.loc[bar['mutation'].isin(fooD.keys()), 'NEW'] = bar['mutation_info'].map(fooD) +bar['NEW'] = bar['NEW'].map(fooD).fillna(bar['NEW']) + +bar['NEW2'].value_counts() +bar['NEW'].value_counts() +#%% OUTFILE 3, write file: ambiguous muts and ambiguous mut counts + +#print(outdir) +=================== +# ambiguous muts +#======================= +#dr_muts.to_csv('dr_muts.csv', header = True) +#other_muts.to_csv('other_muts.csv', header = True) +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' + outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts + print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_muts) + inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + inspect.to_csv(outfile_ambig_muts, index = True) + + print('Finished writing:', out_filename_ambig_muts + , '\nNo. of rows:', len(inspect) + , '\nNo. of cols:', len(inspect.columns) + , '\nNo. of rows = no. of samples with the ambiguous muts present:' + , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() + , '\n=============================================================') + del(out_filename_ambig_muts) + +#======================= +# ambiguous mut counts +#======================= +out_filename_ambig_mut_counts = gene.lower() + '_ambiguous_mut_counts.csv' +outfile_ambig_mut_counts = outdir + '/' + out_filename_ambig_mut_counts +print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_mut_counts) + +ambiguous_muts_value_counts.to_csv(outfile_ambig_mut_counts, index = True) + +#%% clear variables +del(id_dr, id_other + #, meta_data + , meta_gene_dr, meta_gene_other + #, mut_grouped + , muts_split, other_WF1, other_df, other_muts_df, other_gene_count, gene_LF0, gene_na) + +del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_muts_df, dr_gene_WF0, dr_gene_count, expected_gene_samples, other_gene_WF1) + +#%% Splitting mutation column to get mCSM style muts i.e. mutationinformation column + +#%% NEW TODO: map mutationinformation +#%% NEW: mappings +#======================= +# column name: +#======================= +# mapping 1: labels +dm_om_label_map = {dr_muts_col: 'DM' + , other_muts_col: 'OM'} + +gene_LF0['mutation_info_labels'] = gene_LF0['mutation_info'].map(dm_om_label_map) + +# mapping 2: numeric +dm_om_num_map = {dr_muts_col: 1 + , other_muts_col: 0} + +gene_LF0['dm_om_numeric'] = gene_LF0['mutation_info'].map(dm_om_num_map) + +gene_LF0['mutation_info'].value_counts() +gene_LF0['mutation_info_labels'].value_counts() +gene_LF0['dm_om_numeric'].value_counts() + +#============================ +# column name: +#============================ +gene_LF0['drtype'].value_counts() + +# mapping: numeric +drtype_map = {'XDR': 5 + , 'Pre-XDR': 4 + , 'MDR': 3 + , 'Pre-MDR': 2 + , 'Other': 1 + , 'Sensitive': 0} +gene_LF0['drtype_numeric'] = gene_LF0['drtype'].map(drtype_map) + +gene_LF0['drtype'].value_counts() +gene_LF0['drtype_numeric'].value_counts() + +#%% multimode +# COPY dst column +gene_LF0['dst'] = gene_LF0[drug] # to allow cross checking +gene_LF0['dst_multimode'] = gene_LF0[drug] + +# sanity check +gene_LF0[drug].value_counts() +gene_LF0['dst_multimode'].value_counts() +gene_LF0[drug].isna().sum() + +if gene_LF0[drug].value_counts().sum()+gene_LF0[drug].isna().sum() == len(gene_LF0): + print('\nPASS:', 'Numbers match') +else: + print('\nFAIL:', 'Numbers mismatch') + +gene_LF0['mutation'].value_counts() +#data.C.isnull().groupby([df['A'],df['B']]).sum().astype(int).reset_index(name='count') +gene_LF0[drug].isnull().groupby(gene_LF0['mutation']).sum() + +# GOAL is to populate na in the dst column from the count of the dm_om_numeric column +gene_LF0['dst_multimode'].isnull().groupby(gene_LF0['mutationinformation']).sum() + +# COPY mutationinformation for sanity check +#data['mutation'] = data['mutationinformation'] +gene_LF0['mutationinformation'] = gene_LF0['mutation'] + +# Get multimode for dm_om_numeric column +dm_om_multimode = gene_LF0.groupby('mutationinformation')['dm_om_numeric'].agg(multimode) +dm_om_multimode + +gene_LF0['dst_multimode'] = gene_LF0['dst_multimode'].fillna(dm_om_multimode) +gene_LF0['dst_noNA'] = gene_LF0['dst_multimode'].apply(lambda x: np.nanmax(x)) +print(gene_LF0) + +# Finally created a revised dst with the max from the multimode +gene_LF0['dst_mode'] = gene_LF0.groupby('mutationinformation')['dst_noNA'].max()