data extraction tidy up
This commit is contained in:
parent
e4328df255
commit
65e6b28d9e
2 changed files with 195 additions and 105 deletions
|
@ -50,6 +50,7 @@ Created on Tue Aug 6 12:56:03 2019
|
|||
#=======================================================================
|
||||
#%% load libraries
|
||||
import os, sys
|
||||
import re
|
||||
import pandas as pd
|
||||
#import numpy as np
|
||||
import argparse
|
||||
|
@ -72,12 +73,14 @@ arg_parser.add_argument('-g', '--gene', help='gene name (case sensitive)', defau
|
|||
args = arg_parser.parse_args()
|
||||
#=======================================================================
|
||||
#%% variable assignment: input and output paths & filenames
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
|
||||
#drug = 'pyrazinamide'
|
||||
#gene = 'pncA'
|
||||
|
||||
drug = args.drug
|
||||
gene = args.gene
|
||||
gene_match = gene + '_p.'
|
||||
nssnp_match = gene_match+'[A-Z]{3}[0-9]+[A-Z]{3}'
|
||||
|
||||
# building cols to extract
|
||||
dr_muts_col = 'dr_mutations_' + drug
|
||||
|
@ -93,14 +96,16 @@ print('Extracting columns based on variables:\n'
|
|||
#=======================================================================
|
||||
#%% input and output dirs and files
|
||||
#=======
|
||||
# data dir
|
||||
# dirs
|
||||
#=======
|
||||
datadir = homedir + '/' + 'git/Data'
|
||||
indir = datadir + '/' + drug + '/' + 'input'
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
|
||||
#=======
|
||||
# input
|
||||
#=======
|
||||
#in_filename = 'original_tanushree_data_v2.csv' #19k
|
||||
in_filename = 'original_tanushree_data_v2.csv' #19k
|
||||
in_filename = 'mtb_gwas_meta_v3.csv' #33k
|
||||
infile = datadir + '/' + in_filename
|
||||
print('Input file: ', infile
|
||||
|
@ -109,9 +114,7 @@ print('Input file: ', infile
|
|||
#=======
|
||||
# output
|
||||
#=======
|
||||
# several output files
|
||||
# output filenames in respective sections at the time of outputting files
|
||||
outdir = datadir + '/' + drug + '/' + 'output'
|
||||
# several output files: in respective sections at the time of outputting files
|
||||
print('Output filename: in the respective sections'
|
||||
, '\nOutput path: ', outdir
|
||||
, '\n=============================================================')
|
||||
|
@ -126,15 +129,14 @@ master_data = pd.read_csv(infile, sep = ',')
|
|||
|
||||
# extract elevant columns to extract from meta data related to the drug
|
||||
|
||||
#meta_data_ch = master_data[['id'
|
||||
#, 'country'
|
||||
#, 'lineage'
|
||||
#, 'sublineage'
|
||||
##, 'drtype' #19k only
|
||||
#, 'resistance'
|
||||
#, drug
|
||||
#, dr_muts_col
|
||||
#, other_muts_col]]
|
||||
meta_data = master_data[['id'
|
||||
, 'country'
|
||||
, 'lineage'
|
||||
, 'sublineage'
|
||||
, 'drtype' #19k only
|
||||
, drug
|
||||
, dr_muts_col
|
||||
, other_muts_col]]
|
||||
|
||||
|
||||
core_cols = ['id'
|
||||
|
@ -160,6 +162,7 @@ variable_based_cols = [drug
|
|||
, other_muts_col]
|
||||
|
||||
cols_to_extract = core_cols + variable_based_cols
|
||||
print('Extracting', len(cols_to_extract), 'columns from master data')
|
||||
|
||||
meta_data = master_data[cols_to_extract]
|
||||
|
||||
|
@ -174,18 +177,21 @@ print('RESULT: Total samples:', total_samples
|
|||
meta_data.isna().sum()
|
||||
print('No. of NAs/column:' + '\n', meta_data.isna().sum()
|
||||
, '\n===========================================================')
|
||||
|
||||
#
|
||||
# glance
|
||||
meta_data.head()
|
||||
#meta_data.head()
|
||||
#total_samples - NA pyrazinamide = ?
|
||||
# 19K: 19265-6754 = 12511
|
||||
# 33K: 33681 - 23823 = 9858
|
||||
|
||||
# equivalent of table in R
|
||||
# drug counts
|
||||
# drug counts: complete samples for OR calcs
|
||||
meta_data[drug].value_counts()
|
||||
print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts()
|
||||
, '\n===========================================================')
|
||||
|
||||
# clear variables
|
||||
del(in_filename,infile)
|
||||
del(in_filename, infile)
|
||||
#del(outdir)
|
||||
#%%
|
||||
# !!!IMPORTANT!!! sanity check:
|
||||
|
@ -208,11 +214,10 @@ na_count = meta_data[dr_muts_col].isna().sum()
|
|||
|
||||
if len(clean_df) == (total_samples - na_count):
|
||||
print('PASS: clean_df extracted: length is', len(clean_df)
|
||||
, '\nNo.of NAs =', na_count, '/', total_samples
|
||||
, '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples
|
||||
, '\n==========================================================')
|
||||
else:
|
||||
print('FAIL: dropping NA failed'
|
||||
, '\n==========================================================')
|
||||
sys.exit('FAIL: Could not drop NA')
|
||||
|
||||
dr_gene_count = 0
|
||||
wt = 0
|
||||
|
@ -238,7 +243,7 @@ print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count)
|
|||
print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) )
|
||||
print('=================================================================')
|
||||
|
||||
del(i, id, wt, id2_dr, clean_df, na_count, count_gene_dr, count_wt)
|
||||
del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
|
||||
|
||||
#========
|
||||
# Second: counting <gene_match> mutations in dr_muts_col column
|
||||
|
@ -258,6 +263,7 @@ if len(clean_df) == (total_samples - na_count):
|
|||
else:
|
||||
print('FAIL: dropping NA failed'
|
||||
, '\n=========================================================')
|
||||
sys.exit()
|
||||
|
||||
other_gene_count = 0
|
||||
wt_other = 0
|
||||
|
@ -268,7 +274,7 @@ for i, id in enumerate(clean_df.id):
|
|||
# print (i, id)
|
||||
# id_other.append(id)
|
||||
# count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match')
|
||||
count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match)
|
||||
count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match)
|
||||
if count_gene_other > 0:
|
||||
id_other.append(id)
|
||||
if count_gene_other > 1:
|
||||
|
@ -313,19 +319,25 @@ print('gene to extract:', gene_match )
|
|||
dr_based_cols = [drug, dr_muts_col]
|
||||
|
||||
cols_to_extract = core_cols + dr_based_cols
|
||||
print('Extracting', len(cols_to_extract), 'columns from meta data')
|
||||
|
||||
meta_data_dr = meta_data[cols_to_extract]
|
||||
|
||||
del(dr_based_cols, cols_to_extract)
|
||||
|
||||
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
|
||||
print('actual dim:', meta_data_dr.shape
|
||||
, '\n===============================================================')
|
||||
if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1):
|
||||
print('PASS: Dimensions match')
|
||||
else:
|
||||
print('FAIL: Dimensions mismatch:'
|
||||
, 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1)
|
||||
, '\nGot:', meta_data_dr.shape
|
||||
, '\n===============================================================')
|
||||
sys.exit()
|
||||
|
||||
# Extract within this the gene of interest using string match
|
||||
#meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)]
|
||||
meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)]
|
||||
|
||||
meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)]
|
||||
|
||||
dr_id = meta_gene_dr['id'].unique()
|
||||
print('RESULT: No. of samples with dr muts in pncA:', len(dr_id))
|
||||
print('checking RESULT:',
|
||||
|
@ -338,13 +350,14 @@ if len(id_dr) == len(meta_gene_dr):
|
|||
else:
|
||||
print('FAIL: length mismatch'
|
||||
, '\n===============================================================')
|
||||
sys.exit()
|
||||
|
||||
dr_id = pd.Series(dr_id)
|
||||
|
||||
#=================
|
||||
# other mutations: extract gene_match entries
|
||||
# other mutations: extract gene_match entries from other_muts_col
|
||||
#==================
|
||||
print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
|
||||
print('Extracting other_muts from:', other_muts_col,'with other meta_data')
|
||||
# FIXME: replace drug with variable containing the drug name
|
||||
# !!! important !!!
|
||||
#meta_data_other = meta_data[['id'
|
||||
|
@ -356,17 +369,24 @@ print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
|
|||
# , other_muts_col
|
||||
# ]]
|
||||
|
||||
dr_based_cols = [drug, other_muts_col]
|
||||
other_based_cols = [drug, other_muts_col]
|
||||
|
||||
cols_to_extract = core_cols + dr_based_cols
|
||||
cols_to_extract = core_cols + other_based_cols
|
||||
print('Extracting', len(cols_to_extract), 'columns from meta data')
|
||||
|
||||
meta_data_other = meta_data[cols_to_extract]
|
||||
|
||||
del(dr_based_cols, cols_to_extract)
|
||||
del(other_based_cols, cols_to_extract)
|
||||
|
||||
print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
|
||||
print('actual dim:', meta_data_other.shape
|
||||
, '\n===============================================================')
|
||||
|
||||
if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1):
|
||||
print('PASS: Dimensions match')
|
||||
else:
|
||||
print('FAIL: Dimensions mismatch:'
|
||||
, 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1)
|
||||
, '\nGot:', meta_data_other.shape
|
||||
, '\n===============================================================')
|
||||
sys.exit()
|
||||
|
||||
# Extract within this the gene of interest using string match
|
||||
meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)]
|
||||
|
@ -383,6 +403,7 @@ if len(id_other) == len(meta_gene_other):
|
|||
else:
|
||||
print('FAIL: length mismatch'
|
||||
, '\n===============================================================')
|
||||
sys.exit()
|
||||
|
||||
other_id = pd.Series(other_id)
|
||||
#%% Find common IDs
|
||||
|
@ -392,9 +413,12 @@ print('RESULT: No. of common ids:', common_mut_ids)
|
|||
|
||||
# sanity checks
|
||||
# check if True: should be since these are common
|
||||
dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum()
|
||||
|
||||
# check if the 24 Ids that are common are indeed the same!
|
||||
if dr_id.isin(other_id).sum() == other_id.isin(dr_id).sum():
|
||||
print('PASS: Cross check on no. of common ids')
|
||||
else:
|
||||
sys.exit('FAIL: Cross check on no. of common ids failed')
|
||||
|
||||
# check if the common are indeed the same!
|
||||
# bit of a tautology, but better safe than sorry!
|
||||
common_ids = dr_id[dr_id.isin(other_id)]
|
||||
common_ids = common_ids.reset_index()
|
||||
|
@ -405,25 +429,29 @@ common_ids2 = common_ids2.reset_index()
|
|||
common_ids2.columns = ['index', 'id2']
|
||||
|
||||
# should be True
|
||||
print(common_ids['id'].equals(common_ids2['id2']))
|
||||
if common_ids['id'].equals(common_ids2['id2']):
|
||||
print('PASS: Further cross checks on common ids')
|
||||
else:
|
||||
sys.exit('FAIL: Further cross checks on common ids')
|
||||
|
||||
# good sanity check: use it later to check gene_sample_counts
|
||||
expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids )
|
||||
print('expected no. of gene samples:', expected_gene_samples)
|
||||
print('Expected no. of gene samples:', expected_gene_samples)
|
||||
print('=================================================================')
|
||||
#%% write file
|
||||
#print(outdir)
|
||||
out_filename0 = gene.lower() + '_common_ids.csv'
|
||||
outfile0 = outdir + '/' + out_filename0
|
||||
out_filename_cid = gene.lower() + '_common_ids.csv'
|
||||
outfile_cid = outdir + '/' + out_filename_cid
|
||||
|
||||
#FIXME: CHECK line len(common_ids)
|
||||
print('Writing file:'
|
||||
, '\nFile:', outfile0
|
||||
, '\nExpected no. of rows:', len(common_ids)
|
||||
, '\nFile:', outfile_cid
|
||||
, '\nNo. of rows:', len(common_ids)
|
||||
, '\n=============================================================')
|
||||
|
||||
common_ids.to_csv(outfile0, index = False)
|
||||
del(out_filename0)
|
||||
common_ids.to_csv(outfile_cid, index = False)
|
||||
|
||||
del(out_filename_cid)
|
||||
|
||||
# clear variables
|
||||
del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2)
|
||||
|
@ -441,20 +469,19 @@ print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples
|
|||
# sanity check: length of gene samples
|
||||
print('Performing sanity check:')
|
||||
if extracted_gene_samples == expected_gene_samples:
|
||||
print('No. of gene samples:', len(meta_gene_all)
|
||||
, '\nPASS: expected & actual no. of gene samples match'
|
||||
print('PASS: expected & actual no. of gene samples match'
|
||||
, '\nNo. of gene samples:', len(meta_gene_all)
|
||||
, '\n=========================================================')
|
||||
else:
|
||||
print('FAIL: Debug please!'
|
||||
, '\n===============================================================')
|
||||
sys.exit('FAIL: Length mismatch in gene samples!')
|
||||
|
||||
# count NA in drug column
|
||||
gene_na = meta_gene_all[drug].isna().sum()
|
||||
print('No. of gene samples without pza testing i.e NA in pza column:', gene_na)
|
||||
print('No. of gene samples without', drug, 'testing:', gene_na)
|
||||
|
||||
# use it later to check number of complete samples from LF data
|
||||
comp_gene_samples = len(meta_gene_all) - gene_na
|
||||
print('comp gene samples tested for pza:', comp_gene_samples)
|
||||
print('comp gene samples tested for', drug, ':', comp_gene_samples)
|
||||
print('=================================================================')
|
||||
|
||||
# Comment: This is still dirty data since these
|
||||
|
@ -464,19 +491,9 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
|
|||
, '\nsince the format for mutations is mut1; mut2, etc.'
|
||||
, '\n=============================================================')
|
||||
|
||||
#%% tidy_split():Function to split mutations on specified delimiter: ';'
|
||||
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
|
||||
|
||||
print('Performing tidy_split(): to separate the mutations into indivdual rows')
|
||||
|
||||
|
||||
|
||||
|
||||
#TIDY SPLIT HERE
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#=========
|
||||
# DF1: dr_muts_col
|
||||
#=========
|
||||
|
@ -495,7 +512,7 @@ dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip()
|
|||
# extract only the samples/rows with gene_match
|
||||
dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)]
|
||||
|
||||
print('lengths after tidy split and extracting', gene_match, 'muts:'
|
||||
print('Lengths after tidy split and extracting', gene_match, 'muts:'
|
||||
, '\nold length:' , len(meta_gene_dr)
|
||||
, '\nlen after split:', len(dr_WF0)
|
||||
, '\ndr_gene_WF0 length:', len(dr_gene_WF0)
|
||||
|
@ -507,6 +524,7 @@ if len(dr_gene_WF0) == dr_gene_count:
|
|||
else:
|
||||
print('FAIL: lengths mismatch'
|
||||
, '\n===============================================================')
|
||||
sys.exit()
|
||||
|
||||
# count the freq of 'dr_muts' samples
|
||||
dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]]
|
||||
|
@ -515,29 +533,42 @@ print('dim of dr_muts_df:', dr_muts_df.shape)
|
|||
# add freq column
|
||||
dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count')
|
||||
#dr_muts_df['dr_sample_freq'] = dr_muts_df.loc[dr_muts_df.groupby('id')].transform('count')
|
||||
print('revised dim of dr_muts_df:', dr_muts_df.shape)
|
||||
print('Revised dim of dr_muts_df:', dr_muts_df.shape)
|
||||
|
||||
c1 = dr_muts_df.dr_sample_freq.value_counts()
|
||||
print('counting no. of sample frequency:\n', c1
|
||||
print('Counting no. of sample frequency:\n', c1
|
||||
, '\n===================================================================')
|
||||
|
||||
# sanity check: length of gene samples
|
||||
if len(dr_gene_WF0) == c1.sum():
|
||||
print('PASS: WF data has expected length'
|
||||
, '\nlength of dr_gene WFO:', c1.sum()
|
||||
, '\nLength of dr_gene WFO:', c1.sum()
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: Debug please!'
|
||||
, '\n===============================================================')
|
||||
sys.exit('FAIL: length mismatch!')
|
||||
|
||||
#!!! Important !!!
|
||||
# Assign 'column name' on which split was performed as an extra column
|
||||
# This is so you can identify if mutations are dr_type or other in the final df
|
||||
dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col)
|
||||
print('dim of dr_df:', dr_df.shape
|
||||
dr_df_all = dr_gene_WF0.assign(mutation_info = dr_muts_col)
|
||||
print('Dim of dr_df:', dr_df_all.shape
|
||||
, '\n=============================================================='
|
||||
, '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category'
|
||||
, '\n===============================================================')
|
||||
|
||||
# Further clean up to remove stop mutations as these are similar in format to
|
||||
# gene match but dont have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde)
|
||||
dr_stop = dr_df_all[dr_muts_col].str.count('\*').sum()
|
||||
print('Removing stop mutations denoted by an asterix'
|
||||
, '\nNo. of stop mutations:',dr_stop)
|
||||
|
||||
dr_df = dr_df_all[~dr_df_all[dr_muts_col].str.contains('\*')]
|
||||
|
||||
if len(dr_df) == len(dr_df_all) - dr_stop:
|
||||
print('PASS: Successfully removed stop mutatiosn from dr_df')
|
||||
else:
|
||||
sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?')
|
||||
|
||||
#%%
|
||||
#=========
|
||||
# DF2: other_mutations_pyrazinamdie
|
||||
|
@ -558,7 +589,7 @@ other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip()
|
|||
# extract only the samples/rows with gene_match
|
||||
other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)]
|
||||
|
||||
print('lengths after tidy split and extracting', gene_match, 'muts:',
|
||||
print('Lengths after tidy split and extracting', gene_match, 'muts:',
|
||||
'\nold length:' , len(meta_gene_other),
|
||||
'\nlen after split:', len(other_WF1),
|
||||
'\nother_gene_WF1 length:', len(other_gene_WF1),
|
||||
|
@ -568,36 +599,50 @@ if len(other_gene_WF1) == other_gene_count:
|
|||
print('PASS: length matches with expected length'
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: lengths mismatch'
|
||||
, '\n===============================================================')
|
||||
sys.exit('FAIL: lengths mismatch')
|
||||
|
||||
# count the freq of 'other muts' samples
|
||||
other_muts_df = other_gene_WF1 [['id', other_muts_col]]
|
||||
print('dim of other_muts_df:', other_muts_df.shape)
|
||||
print('Dim of other_muts_df:', other_muts_df.shape)
|
||||
|
||||
# add freq column
|
||||
other_muts_df['other_sample_freq'] = other_muts_df.groupby('id')['id'].transform('count')
|
||||
print('revised dim of other_muts_df:', other_muts_df.shape)
|
||||
print('Revised dim of other_muts_df:', other_muts_df.shape)
|
||||
|
||||
c2 = other_muts_df.other_sample_freq.value_counts()
|
||||
print('counting no. of sample frequency:\n', c2)
|
||||
print('Counting no. of sample frequency:\n', c2)
|
||||
print('=================================================================')
|
||||
# sanity check: length of gene samples
|
||||
if len(other_gene_WF1) == c2.sum():
|
||||
print('PASS: WF data has expected length'
|
||||
, '\nlength of other_gene WFO:', c2.sum()
|
||||
, '\nLength of other_gene WFO:', c2.sum()
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: Debug please!'
|
||||
, '\n===============================================================')
|
||||
|
||||
sys.exit('FAIL: Length mismatch')
|
||||
|
||||
#!!! Important !!!
|
||||
# Assign 'column name' on which split was performed as an extra column
|
||||
# This is so you can identify if mutations are dr_type or other in the final df
|
||||
other_df = other_gene_WF1.assign(mutation_info = other_muts_col)
|
||||
print('dim of other_df:', other_df.shape
|
||||
other_df_all = other_gene_WF1.assign(mutation_info = other_muts_col)
|
||||
print('dim of other_df:', other_df_all.shape
|
||||
, '\n==============================================================='
|
||||
, '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category'
|
||||
, '\n===============================================================')
|
||||
|
||||
|
||||
# Further clean up to remove stop mutations as these are similar in format to
|
||||
# gene match but don't have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde)
|
||||
other_stop = other_df_all[other_muts_col].str.count('\*').sum()
|
||||
print('Removing stop mutations denoted by an asterix'
|
||||
, '\nNo. of stop mutations:',other_stop)
|
||||
|
||||
other_df = other_df_all[~other_df_all[other_muts_col].str.contains('\*')]
|
||||
|
||||
if len(other_df) == len(other_df_all) - other_stop:
|
||||
print('PASS: Successfully removed stop mutatiosn from other_df')
|
||||
else:
|
||||
sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?')
|
||||
|
||||
#%%
|
||||
#==========
|
||||
# Concatentating the two dfs: equivalent of rbind in R
|
||||
|
@ -606,8 +651,8 @@ print('dim of other_df:', other_df.shape
|
|||
# change column names to allow concat:
|
||||
# dr_muts.. & other_muts : 'mutation'
|
||||
print('Now concatenating the two dfs by row'
|
||||
, '\nfirst assigning a common colname: "mutation" to the col containing muts'
|
||||
, '\nthis is done for both dfs'
|
||||
, '\nFirst assigning a common colname: "mutation" to the col containing muts'
|
||||
, '\nThis is done for both dfs'
|
||||
, '\n===================================================================')
|
||||
|
||||
dr_df.columns
|
||||
|
@ -618,39 +663,58 @@ other_df.columns
|
|||
other_df.rename(columns = {other_muts_col: 'mutation'}, inplace = True)
|
||||
other_df.columns
|
||||
|
||||
print('Now appending the two dfs:'
|
||||
, '\ndr_df dim:', dr_df.shape
|
||||
, '\nother_df dim:', other_df.shape
|
||||
, '\ndr_df length:', len(dr_df)
|
||||
, '\nother_df length:', len(other_df)
|
||||
, '\nexpected length:', len(dr_df) + len(other_df)
|
||||
, '\n=============================================================')
|
||||
if len(dr_df.columns) == len(other_df.columns):
|
||||
print('Checking dfs for concatening by rows:'
|
||||
, '\ndr_df dim:', dr_df.shape
|
||||
, '\nother_df dim:', other_df.shape
|
||||
, '\nexpected nrows:', len(dr_df) + len(other_df)
|
||||
, '\n=============================================================')
|
||||
else:
|
||||
sys.exit('FAIL: No. of cols mismatch for concatenating')
|
||||
|
||||
# checking colnames before concat
|
||||
print('checking colnames BEFORE concatenating the two dfs...')
|
||||
if (set(dr_df.columns) == set(other_df.columns)):
|
||||
print('PASS: column names match necessary for merging two dfs')
|
||||
else:
|
||||
print('FAIL: Debug please!')
|
||||
sys.exit('FAIL: Colnames mismatch for concatenating!')
|
||||
|
||||
# concatenate (axis = 0): Rbind
|
||||
gene_LF0 = pd.concat([dr_df, other_df], ignore_index = True, axis = 0)
|
||||
print('Now appending the two dfs: Rbind')
|
||||
gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0)
|
||||
|
||||
print('Finding stop mutations in concatenated df')
|
||||
stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum()
|
||||
stop_muts_group = gene_LF_comb['mutation'].str.counts('\*').value_counts
|
||||
gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count())
|
||||
|
||||
|
||||
gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)]
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# checking colnames and length after concat
|
||||
print('checking colnames AFTER concatenating the two dfs...')
|
||||
if (set(dr_df.columns) == set(gene_LF0.columns)):
|
||||
print('PASS: column names match')
|
||||
print('PASS: column names match
|
||||
, '\n=============================================================')
|
||||
else:
|
||||
print('FAIL: Debug please!')
|
||||
sys.exit('FAIL: Colnames mismatch AFTER concatenating')
|
||||
|
||||
print('checking length AFTER concatenating the two dfs...')
|
||||
|
||||
print('checking concatened df')
|
||||
if len(gene_LF0) == len(dr_df) + len(other_df):
|
||||
print('PASS:length of df after concat match'
|
||||
, '\n===============================================================')
|
||||
else:
|
||||
print('FAIL: length mismatch'
|
||||
, '\n===============================================================')
|
||||
sys.exit('FAIL: length mismatch')
|
||||
|
||||
#%%
|
||||
###########
|
||||
# This is hopefully clean data, but just double check
|
||||
|
@ -658,13 +722,37 @@ else:
|
|||
# mutations corresponding to gene_match* (string match pattern)
|
||||
# this will be your list you run OR calcs
|
||||
###########
|
||||
#my_regex = 'pncA_p.[A-Z]{3}[0-9]+[A-Z]{3}'
|
||||
|
||||
|
||||
gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, case = False)]
|
||||
len(gene_LF1)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
print('length of gene_LF0:', len(gene_LF0),
|
||||
'\nThis should be what you need. But just double check and extract', gene_match,
|
||||
'\nfrom LF0 (concatenated data) using string match:', gene_match)
|
||||
'\nfrom LF0 (concatenated data) using string match:', gene_match
|
||||
, '\nand double checking for stop mutations')
|
||||
|
||||
if gene_LF0['mutation'].str.count('\*').sum() > 0:
|
||||
sys.exit('FAIL: Stop mutations detected post concatenating dfs. Resolve at source!')
|
||||
|
||||
print('Double checking and creating df: gene_LF1')
|
||||
gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)]
|
||||
|
||||
|
||||
|
||||
if len(gene_LF0) == len(gene_LF1):
|
||||
print('PASS: length of gene_LF0 and gene_LF1 match',
|
||||
'\nconfirming extraction and concatenating worked correctly')
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue