data extraction tidy up

This commit is contained in:
Tanushree Tunstall 2020-07-08 13:26:33 +01:00
parent a4670b9944
commit c958cc1081
2 changed files with 195 additions and 105 deletions

View file

@ -50,6 +50,7 @@ Created on Tue Aug 6 12:56:03 2019
#======================================================================= #=======================================================================
#%% load libraries #%% load libraries
import os, sys import os, sys
import re
import pandas as pd import pandas as pd
#import numpy as np #import numpy as np
import argparse import argparse
@ -72,12 +73,14 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
args = arg_parser.parse_args() args = arg_parser.parse_args()
#======================================================================= #=======================================================================
#%% variable assignment: input and output paths & filenames #%% variable assignment: input and output paths & filenames
drug = args.drug
gene = args.gene
#drug = 'pyrazinamide' #drug = 'pyrazinamide'
#gene = 'pncA' #gene = 'pncA'
drug = args.drug
gene = args.gene
gene_match = gene + '_p.' gene_match = gene + '_p.'
nssnp_match = gene_match+'[A-Z]{3}[0-9]+[A-Z]{3}'
# building cols to extract # building cols to extract
dr_muts_col = 'dr_mutations_' + drug dr_muts_col = 'dr_mutations_' + drug
@ -93,14 +96,16 @@ print('Extracting columns based on variables:\n'
#======================================================================= #=======================================================================
#%% input and output dirs and files #%% input and output dirs and files
#======= #=======
# data dir # dirs
#======= #=======
datadir = homedir + '/' + 'git/Data' datadir = homedir + '/' + 'git/Data'
indir = datadir + '/' + drug + '/' + 'input'
outdir = datadir + '/' + drug + '/' + 'output'
#======= #=======
# input # input
#======= #=======
#in_filename = 'original_tanushree_data_v2.csv' #19k in_filename = 'original_tanushree_data_v2.csv' #19k
in_filename = 'mtb_gwas_meta_v3.csv' #33k in_filename = 'mtb_gwas_meta_v3.csv' #33k
infile = datadir + '/' + in_filename infile = datadir + '/' + in_filename
print('Input file: ', infile print('Input file: ', infile
@ -109,9 +114,7 @@ print('Input file: ', infile
#======= #=======
# output # output
#======= #=======
# several output files # several output files: in respective sections at the time of outputting files
# output filenames in respective sections at the time of outputting files
outdir = datadir + '/' + drug + '/' + 'output'
print('Output filename: in the respective sections' print('Output filename: in the respective sections'
, '\nOutput path: ', outdir , '\nOutput path: ', outdir
, '\n=============================================================') , '\n=============================================================')
@ -126,15 +129,14 @@ master_data = pd.read_csv(infile, sep = ',')
# extract elevant columns to extract from meta data related to the drug # extract elevant columns to extract from meta data related to the drug
#meta_data_ch = master_data[['id' meta_data = master_data[['id'
#, 'country' , 'country'
#, 'lineage' , 'lineage'
#, 'sublineage' , 'sublineage'
##, 'drtype' #19k only , 'drtype' #19k only
#, 'resistance' , drug
#, drug , dr_muts_col
#, dr_muts_col , other_muts_col]]
#, other_muts_col]]
core_cols = ['id' core_cols = ['id'
@ -160,6 +162,7 @@ variable_based_cols = [drug
, other_muts_col] , other_muts_col]
cols_to_extract = core_cols + variable_based_cols cols_to_extract = core_cols + variable_based_cols
print('Extracting', len(cols_to_extract), 'columns from master data')
meta_data = master_data[cols_to_extract] meta_data = master_data[cols_to_extract]
@ -174,18 +177,21 @@ print('RESULT: Total samples:', total_samples
meta_data.isna().sum() meta_data.isna().sum()
print('No. of NAs/column:' + '\n', meta_data.isna().sum() print('No. of NAs/column:' + '\n', meta_data.isna().sum()
, '\n===========================================================') , '\n===========================================================')
#
# glance # glance
meta_data.head() #meta_data.head()
#total_samples - NA pyrazinamide = ?
# 19K: 19265-6754 = 12511
# 33K: 33681 - 23823 = 9858
# equivalent of table in R # equivalent of table in R
# drug counts # drug counts: complete samples for OR calcs
meta_data[drug].value_counts() meta_data[drug].value_counts()
print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts()
, '\n===========================================================') , '\n===========================================================')
# clear variables # clear variables
del(in_filename,infile) del(in_filename, infile)
#del(outdir) #del(outdir)
#%% #%%
# !!!IMPORTANT!!! sanity check: # !!!IMPORTANT!!! sanity check:
@ -208,11 +214,10 @@ na_count = meta_data[dr_muts_col].isna().sum()
if len(clean_df) == (total_samples - na_count): if len(clean_df) == (total_samples - na_count):
print('PASS: clean_df extracted: length is', len(clean_df) print('PASS: clean_df extracted: length is', len(clean_df)
, '\nNo.of NAs =', na_count, '/', total_samples , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples
, '\n==========================================================') , '\n==========================================================')
else: else:
print('FAIL: dropping NA failed' sys.exit('FAIL: Could not drop NA')
, '\n==========================================================')
dr_gene_count = 0 dr_gene_count = 0
wt = 0 wt = 0
@ -238,7 +243,7 @@ print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count)
print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) ) print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) )
print('=================================================================') print('=================================================================')
del(i, id, wt, id2_dr, clean_df, na_count, count_gene_dr, count_wt) del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
#======== #========
# Second: counting <gene_match> mutations in dr_muts_col column # Second: counting <gene_match> mutations in dr_muts_col column
@ -258,6 +263,7 @@ if len(clean_df) == (total_samples - na_count):
else: else:
print('FAIL: dropping NA failed' print('FAIL: dropping NA failed'
, '\n=========================================================') , '\n=========================================================')
sys.exit()
other_gene_count = 0 other_gene_count = 0
wt_other = 0 wt_other = 0
@ -268,7 +274,7 @@ for i, id in enumerate(clean_df.id):
# print (i, id) # print (i, id)
# id_other.append(id) # id_other.append(id)
# count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match') # count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match')
count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match)
if count_gene_other > 0: if count_gene_other > 0:
id_other.append(id) id_other.append(id)
if count_gene_other > 1: if count_gene_other > 1:
@ -313,14 +319,20 @@ print('gene to extract:', gene_match )
dr_based_cols = [drug, dr_muts_col] dr_based_cols = [drug, dr_muts_col]
cols_to_extract = core_cols + dr_based_cols cols_to_extract = core_cols + dr_based_cols
print('Extracting', len(cols_to_extract), 'columns from meta data')
meta_data_dr = meta_data[cols_to_extract] meta_data_dr = meta_data[cols_to_extract]
del(dr_based_cols, cols_to_extract) del(dr_based_cols, cols_to_extract)
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) ) if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1):
print('actual dim:', meta_data_dr.shape print('PASS: Dimensions match')
, '\n===============================================================') else:
print('FAIL: Dimensions mismatch:'
, 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1)
, '\nGot:', meta_data_dr.shape
, '\n===============================================================')
sys.exit()
# Extract within this the gene of interest using string match # Extract within this the gene of interest using string match
#meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] #meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)]
@ -338,13 +350,14 @@ if len(id_dr) == len(meta_gene_dr):
else: else:
print('FAIL: length mismatch' print('FAIL: length mismatch'
, '\n===============================================================') , '\n===============================================================')
sys.exit()
dr_id = pd.Series(dr_id) dr_id = pd.Series(dr_id)
#================= #=================
# other mutations: extract gene_match entries # other mutations: extract gene_match entries from other_muts_col
#================== #==================
print('Extracting dr_muts from:', other_muts_col,'with other meta_data') print('Extracting other_muts from:', other_muts_col,'with other meta_data')
# FIXME: replace drug with variable containing the drug name # FIXME: replace drug with variable containing the drug name
# !!! important !!! # !!! important !!!
#meta_data_other = meta_data[['id' #meta_data_other = meta_data[['id'
@ -356,17 +369,24 @@ print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
# , other_muts_col # , other_muts_col
# ]] # ]]
dr_based_cols = [drug, other_muts_col] other_based_cols = [drug, other_muts_col]
cols_to_extract = core_cols + dr_based_cols cols_to_extract = core_cols + other_based_cols
print('Extracting', len(cols_to_extract), 'columns from meta data')
meta_data_other = meta_data[cols_to_extract] meta_data_other = meta_data[cols_to_extract]
del(dr_based_cols, cols_to_extract) del(other_based_cols, cols_to_extract)
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
print('actual dim:', meta_data_other.shape if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1):
, '\n===============================================================') print('PASS: Dimensions match')
else:
print('FAIL: Dimensions mismatch:'
, 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1)
, '\nGot:', meta_data_other.shape
, '\n===============================================================')
sys.exit()
# Extract within this the gene of interest using string match # Extract within this the gene of interest using string match
meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)]
@ -383,6 +403,7 @@ if len(id_other) == len(meta_gene_other):
else: else:
print('FAIL: length mismatch' print('FAIL: length mismatch'
, '\n===============================================================') , '\n===============================================================')
sys.exit()
other_id = pd.Series(other_id) other_id = pd.Series(other_id)
#%% Find common IDs #%% Find common IDs
@ -392,9 +413,12 @@ print('RESULT: No. of common ids:', common_mut_ids)
# sanity checks # sanity checks
# check if True: should be since these are common # check if True: should be since these are common
dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum() if dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum():
print('PASS: Cross check on no. of common ids')
else:
sys.exit('FAIL: Cross check on no. of common ids failed')
# check if the 24 Ids that are common are indeed the same! # check if the common are indeed the same!
# bit of a tautology, but better safe than sorry! # bit of a tautology, but better safe than sorry!
common_ids = dr_id[dr_id.isin(other_id)] common_ids = dr_id[dr_id.isin(other_id)]
common_ids = common_ids.reset_index() common_ids = common_ids.reset_index()
@ -405,25 +429,29 @@ common_ids2 = common_ids2.reset_index()
common_ids2.columns = ['index', 'id2'] common_ids2.columns = ['index', 'id2']
# should be True # should be True
print(common_ids['id'].equals(common_ids2['id2'])) if common_ids['id'].equals(common_ids2['id2']):
print('PASS: Further cross checks on common ids')
else:
sys.exit('FAIL: Further cross checks on common ids')
# good sanity check: use it later to check gene_sample_counts # good sanity check: use it later to check gene_sample_counts
expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids ) expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids )
print('expected no. of gene samples:', expected_gene_samples) print('Expected no. of gene samples:', expected_gene_samples)
print('=================================================================') print('=================================================================')
#%% write file #%% write file
#print(outdir) #print(outdir)
out_filename0 = gene.lower() + '_common_ids.csv' out_filename_cid = gene.lower() + '_common_ids.csv'
outfile0 = outdir + '/' + out_filename0 outfile_cid = outdir + '/' + out_filename_cid
#FIXME: CHECK line len(common_ids) #FIXME: CHECK line len(common_ids)
print('Writing file:' print('Writing file:'
, '\nFile:', outfile0 , '\nFile:', outfile_cid
, '\nExpected no. of rows:', len(common_ids) , '\nNo. of rows:', len(common_ids)
, '\n=============================================================') , '\n=============================================================')
common_ids.to_csv(outfile0, index = False) common_ids.to_csv(outfile_cid, index = False)
del(out_filename0)
del(out_filename_cid)
# clear variables # clear variables
del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
@ -441,20 +469,19 @@ print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples
# sanity check: length of gene samples # sanity check: length of gene samples
print('Performing sanity check:') print('Performing sanity check:')
if extracted_gene_samples == expected_gene_samples: if extracted_gene_samples == expected_gene_samples:
print('No. of gene samples:', len(meta_gene_all) print('PASS: expected & actual no. of gene samples match'
, '\nPASS: expected & actual no. of gene samples match' , '\nNo. of gene samples:', len(meta_gene_all)
, '\n=========================================================') , '\n=========================================================')
else: else:
print('FAIL: Debug please!' sys.exit('FAIL: Length mismatch in gene samples!')
, '\n===============================================================')
# count NA in drug column # count NA in drug column
gene_na = meta_gene_all[drug].isna().sum() gene_na = meta_gene_all[drug].isna().sum()
print('No. of gene samples without pza testing i.e NA in pza column:', gene_na) print('No. of gene samples without', drug, 'testing:', gene_na)
# use it later to check number of complete samples from LF data # use it later to check number of complete samples from LF data
comp_gene_samples = len(meta_gene_all) - gene_na comp_gene_samples = len(meta_gene_all) - gene_na
print('comp gene samples tested for pza:', comp_gene_samples) print('comp gene samples tested for', drug, ':', comp_gene_samples)
print('=================================================================') print('=================================================================')
# Comment: This is still dirty data since these # Comment: This is still dirty data since these
@ -464,19 +491,9 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
, '\nsince the format for mutations is mut1; mut2, etc.' , '\nsince the format for mutations is mut1; mut2, etc.'
, '\n=============================================================') , '\n=============================================================')
#%% tidy_split():Function to split mutations on specified delimiter: ';'
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
print('Performing tidy_split(): to separate the mutations into indivdual rows') print('Performing tidy_split(): to separate the mutations into indivdual rows')
#TIDY SPLIT HERE
#========= #=========
# DF1: dr_muts_col # DF1: dr_muts_col
#========= #=========
@ -495,7 +512,7 @@ dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip()
# extract only the samples/rows with gene_match # extract only the samples/rows with gene_match
dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)]
print('lengths after tidy split and extracting', gene_match, 'muts:' print('Lengths after tidy split and extracting', gene_match, 'muts:'
, '\nold length:' , len(meta_gene_dr) , '\nold length:' , len(meta_gene_dr)
, '\nlen after split:', len(dr_WF0) , '\nlen after split:', len(dr_WF0)
, '\ndr_gene_WF0 length:', len(dr_gene_WF0) , '\ndr_gene_WF0 length:', len(dr_gene_WF0)
@ -507,6 +524,7 @@ if len(dr_gene_WF0) == dr_gene_count:
else: else:
print('FAIL: lengths mismatch' print('FAIL: lengths mismatch'
, '\n===============================================================') , '\n===============================================================')
sys.exit()
# count the freq of 'dr_muts' samples # count the freq of 'dr_muts' samples
dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]]
@ -515,29 +533,42 @@ print('dim of dr_muts_df:', dr_muts_df.shape)
# add freq column # add freq column
dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count')
#dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count') #dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count')
print('revised dim of dr_muts_df:', dr_muts_df.shape) print('Revised dim of dr_muts_df:', dr_muts_df.shape)
c1 = dr_muts_df.dr_sample_freq.value_counts() c1 = dr_muts_df.dr_sample_freq.value_counts()
print('counting no. of sample frequency:\n', c1 print('Counting no. of sample frequency:\n', c1
, '\n===================================================================') , '\n===================================================================')
# sanity check: length of gene samples # sanity check: length of gene samples
if len(dr_gene_WF0) == c1.sum(): if len(dr_gene_WF0) == c1.sum():
print('PASS: WF data has expected length' print('PASS: WF data has expected length'
, '\nlength of dr_gene WFO:', c1.sum() , '\nLength of dr_gene WFO:', c1.sum()
, '\n===============================================================') , '\n===============================================================')
else: else:
print('FAIL: Debug please!' sys.exit('FAIL: length mismatch!')
, '\n===============================================================')
#!!! Important !!! #!!! Important !!!
# Assign 'column name' on which split was performed as an extra column # Assign 'column name' on which split was performed as an extra column
# This is so you can identify if mutations are dr_type or other in the final df # This is so you can identify if mutations are dr_type or other in the final df
dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) dr_df_all = dr_gene_WF0.assign(mutation_info = dr_muts_col)
print('dim of dr_df:', dr_df.shape print('Dim of dr_df:', dr_df_all.shape
, '\n==============================================================' , '\n=============================================================='
, '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category'
, '\n===============================================================') , '\n===============================================================')
# Further clean up to remove stop mutations as these are similar in format to
# gene match but dont have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde)
dr_stop = dr_df_all[dr_muts_col].str.count('\*').sum()
print('Removing stop mutations denoted by an asterix'
, '\nNo. of stop mutations:',dr_stop)
dr_df = dr_df_all[~dr_df_all[dr_muts_col].str.contains('\*')]
if len(dr_df) == len(dr_df_all) - dr_stop:
print('PASS: Successfully removed stop mutatiosn from dr_df')
else:
sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?')
#%% #%%
#========= #=========
# DF2: other_mutations_pyrazinamdie # DF2: other_mutations_pyrazinamdie
@ -558,7 +589,7 @@ other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip()
# extract only the samples/rows with gene_match # extract only the samples/rows with gene_match
other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)]
print('lengths after tidy split and extracting', gene_match, 'muts:', print('Lengths after tidy split and extracting', gene_match, 'muts:',
'\nold length:' , len(meta_gene_other), '\nold length:' , len(meta_gene_other),
'\nlen after split:', len(other_WF1), '\nlen after split:', len(other_WF1),
'\nother_gene_WF1 length:', len(other_gene_WF1), '\nother_gene_WF1 length:', len(other_gene_WF1),
@ -568,36 +599,50 @@ if len(other_gene_WF1) == other_gene_count:
print('PASS: length matches with expected length' print('PASS: length matches with expected length'
, '\n===============================================================') , '\n===============================================================')
else: else:
print('FAIL: lengths mismatch' sys.exit('FAIL: lengths mismatch')
, '\n===============================================================')
# count the freq of 'other muts' samples # count the freq of 'other muts' samples
other_muts_df = other_gene_WF1 [['id', other_muts_col]] other_muts_df = other_gene_WF1 [['id', other_muts_col]]
print('dim of other_muts_df:', other_muts_df.shape) print('Dim of other_muts_df:', other_muts_df.shape)
# add freq column # add freq column
other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count') other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count')
print('revised dim of other_muts_df:', other_muts_df.shape) print('Revised dim of other_muts_df:', other_muts_df.shape)
c2 = other_muts_df.other_sample_freq.value_counts() c2 = other_muts_df.other_sample_freq.value_counts()
print('counting no. of sample frequency:\n', c2) print('Counting no. of sample frequency:\n', c2)
print('=================================================================') print('=================================================================')
# sanity check: length of gene samples # sanity check: length of gene samples
if len(other_gene_WF1) == c2.sum(): if len(other_gene_WF1) == c2.sum():
print('PASS: WF data has expected length' print('PASS: WF data has expected length'
, '\nlength of other_gene WFO:', c2.sum() , '\nLength of other_gene WFO:', c2.sum()
, '\n===============================================================') , '\n===============================================================')
else: else:
print('FAIL: Debug please!' sys.exit('FAIL: Length mismatch')
, '\n===============================================================')
#!!! Important !!! #!!! Important !!!
# Assign 'column name' on which split was performed as an extra column # Assign 'column name' on which split was performed as an extra column
# This is so you can identify if mutations are dr_type or other in the final df # This is so you can identify if mutations are dr_type or other in the final df
other_df = other_gene_WF1.assign(mutation_info = other_muts_col) other_df_all = other_gene_WF1.assign(mutation_info = other_muts_col)
print('dim of other_df:', other_df.shape print('dim of other_df:', other_df_all.shape
, '\n===============================================================' , '\n==============================================================='
, '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category'
, '\n===============================================================') , '\n===============================================================')
# Further clean up to remove stop mutations as these are similar in format to
# gene match but don't have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde)
other_stop = other_df_all[other_muts_col].str.count('\*').sum()
print('Removing stop mutations denoted by an asterix'
, '\nNo. of stop mutations:',other_stop)
other_df = other_df_all[~other_df_all[other_muts_col].str.contains('\*')]
if len(other_df) == len(other_df_all) - other_stop:
print('PASS: Successfully removed stop mutatiosn from other_df')
else:
sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?')
#%% #%%
#========== #==========
# Concatentating the two dfs: equivalent of rbind in R # Concatentating the two dfs: equivalent of rbind in R
@ -606,8 +651,8 @@ print('dim of other_df:', other_df.shape
# change column names to allow concat: # change column names to allow concat:
# dr_muts.. & other_muts : 'mutation' # dr_muts.. & other_muts : 'mutation'
print('Now concatenating the two dfs by row' print('Now concatenating the two dfs by row'
, '\nfirst assigning a common colname: "mutation" to the col containing muts' , '\nFirst assigning a common colname: "mutation" to the col containing muts'
, '\nthis is done for both dfs' , '\nThis is done for both dfs'
, '\n===================================================================') , '\n===================================================================')
dr_df.columns dr_df.columns
@ -618,39 +663,58 @@ other_df.columns
other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True) other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True)
other_df.columns other_df.columns
print('Now appending the two dfs:' if len(dr_df.columns) == len(other_df.columns):
, '\ndr_df dim:', dr_df.shape print('Checking dfs for concatening by rows:'
, '\nother_df dim:', other_df.shape , '\ndr_df dim:', dr_df.shape
, '\ndr_df length:', len(dr_df) , '\nother_df dim:', other_df.shape
, '\nother_df length:', len(other_df) , '\nexpected nrows:', len(dr_df) + len(other_df)
, '\nexpected length:', len(dr_df) + len(other_df) , '\n=============================================================')
, '\n=============================================================') else:
sys.exit('FAIL: No. of cols mismatch for concatenating')
# checking colnames before concat # checking colnames before concat
print('checking colnames BEFORE concatenating the two dfs...') print('checking colnames BEFORE concatenating the two dfs...')
if (set(dr_df.columns) == set(other_df.columns)): if (set(dr_df.columns) == set(other_df.columns)):
print('PASS: column names match necessary for merging two dfs') print('PASS: column names match necessary for merging two dfs')
else: else:
print('FAIL: Debug please!') sys.exit('FAIL: Colnames mismatch for concatenating!')
# concatenate (axis = 0): Rbind # concatenate (axis = 0): Rbind
gene_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) print('Now appending the two dfs: Rbind')
gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0)
print('Finding stop mutations in concatenated df')
stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum()
stop_muts_group = gene_LF_comb['mutation'].str.counts('\*').value_counts
gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count())
gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)]
# checking colnames and length after concat # checking colnames and length after concat
print('checking colnames AFTER concatenating the two dfs...') print('checking colnames AFTER concatenating the two dfs...')
if (set(dr_df.columns) == set(gene_LF0.columns)): if (set(dr_df.columns) == set(gene_LF0.columns)):
print('PASS: column names match') print('PASS: column names match
, '\n=============================================================')
else: else:
print('FAIL: Debug please!') sys.exit('FAIL: Colnames mismatch AFTER concatenating')
print('checking length AFTER concatenating the two dfs...')
print('checking concatened df')
if len(gene_LF0) == len(dr_df) + len(other_df): if len(gene_LF0) == len(dr_df) + len(other_df):
print('PASS:length of df after concat match' print('PASS:length of df after concat match'
, '\n===============================================================') , '\n===============================================================')
else: else:
print('FAIL: length mismatch' sys.exit('FAIL: length mismatch')
, '\n===============================================================')
#%% #%%
########### ###########
# This is hopefully clean data, but just double check # This is hopefully clean data, but just double check
@ -658,13 +722,37 @@ else:
# mutations corresponding to gene_match* (string match pattern) # mutations corresponding to gene_match* (string match pattern)
# this will be your list you run OR calcs # this will be your list you run OR calcs
########### ###########
#my_regex = 'pncA_p.[A-Z]{3}[0-9]+[A-Z]{3}'
gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, case = False)]
len(gene_LF1)
print('length of gene_LF0:', len(gene_LF0), print('length of gene_LF0:', len(gene_LF0),
'\nThis should be what you need. But just double check and extract', gene_match, '\nThis should be what you need. But just double check and extract', gene_match,
'\nfrom LF0 (concatenated data) using string match:', gene_match) '\nfrom LF0 (concatenated data) using string match:', gene_match
, '\nand double checking for stop mutations')
if gene_LF0['mutation'].str.count('\*').sum() > 0:
sys.exit('FAIL: Stop mutations detected post concatenating dfs. Resolve at source!')
print('Double checking and creating df: gene_LF1') print('Double checking and creating df: gene_LF1')
gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)] gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)]
if len(gene_LF0) == len(gene_LF1): if len(gene_LF0) == len(gene_LF1):
print('PASS: length of gene_LF0 and gene_LF1 match', print('PASS: length of gene_LF0 and gene_LF1 match',
'\nconfirming extraction and concatenating worked correctly') '\nconfirming extraction and concatenating worked correctly')

View file

@ -18,7 +18,9 @@ import pandas as pd
#os.chdir(homedir + '/git/LSHTM_analysis/scripts') #os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd() #os.getcwd()
#%%===================================================================== #%%=====================================================================
# define the split function # tidy_split():Function to split mutations on specified delimiter: ';'
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
def tidy_split(df, column, sep = '|', keep = False): def tidy_split(df, column, sep = '|', keep = False):
''' '''
Split the values of a column and expand so the new DataFrame has one split Split the values of a column and expand so the new DataFrame has one split