Fixed whitespace problem in mcsm input that caused issues when merging

This commit is contained in:
Tanushree Tunstall 2020-07-14 14:07:23 +01:00
parent 46b1505fdf
commit 7d36e0e36b
6 changed files with 108 additions and 98 deletions

View file

@ -13,7 +13,6 @@ import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
import numpy as np
#=======================================================================
#%% specify input and curr dir
homedir = os.path.expanduser('~')
@ -25,10 +24,6 @@ os.getcwd()
#%% variable assignment: input and output
drug = 'pyrazinamide'
gene = 'pncA'
#drug = args.drug
#gene = args.gene
gene_match = gene + '_p.'
#==========
# dirs
@ -41,7 +36,6 @@ outdir = datadir + '/' + drug + '/' + 'output'
# input:
#=======
# 1) result_urls (from outdir)
in_filename_mcsm_output = gene.lower() + '_mcsm_output.csv' #(outfile, from mcsm_results.py)
infile_mcsm_output = outdir + '/' + in_filename_mcsm_output
print('Input file:', infile_mcsm_output
@ -57,9 +51,11 @@ print('Output file:', out_filename_mcsm_norm
#=======================================================================
print('Reading input file')
mcsm_data = pd.read_csv(infile_mcsm_output, sep = ',')
mcsm_data_raw = pd.read_csv(infile_mcsm_output, sep = ',')
# strip white space from both ends in all columns
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
mcsm_data.columns
# PredAffLog = affinity_change_log
# DUETStability_Kcalpermol = DUET_change_kcalpermol
dforig_shape = mcsm_data.shape
@ -72,7 +68,7 @@ print('dim of infile:', dforig_shape)
print('Assigning meaningful colnames i.e without spaces and hyphen and reflecting units'
, '\n===================================================================')
my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info from this col will be extracted and the column discarded
, 'Mutation information': 'mutation_information' # {wild_type}<position>{mutant_type}
, 'Mutation information': 'mutationinformation' # {wild_type}<position>{mutant_type}
, 'Wild-type': 'wild_type' # one letter amino acid code
, 'Position': 'position' # number
, 'Mutant-type': 'mutant_type' # one letter amino acid code
@ -83,17 +79,17 @@ my_colnames_dict = {'Predicted Affinity Change': 'PredAffLog' # relevant info fr
mcsm_data.rename(columns = my_colnames_dict, inplace = True)
#%%===========================================================================
# populate mutation_information column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutation_information which is currently empty\n', mcsm_data['mutation_information'])
mcsm_data['mutation_information'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutation_information']
# populate mutationinformation column:mcsm style muts {WT}<POS>{MUT}
print('Populating column : mutationinformation which is currently empty\n', mcsm_data['mutationinformation'])
mcsm_data['mutationinformation'] = mcsm_data['wild_type'] + mcsm_data['position'].astype(str) + mcsm_data['mutant_type']
print('checking after populating:\n', mcsm_data['mutationinformation']
, '\n===================================================================')
# Remove spaces b/w pasted columns
print('removing white space within column: \mutation_information')
mcsm_data['mutation_information'] = mcsm_data['mutation_information'].str.replace(' ', '')
print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_information']
, '\n===================================================================')
# Remove spaces between pasted columns: no longer needed, as whitespace is stripped at the time of import
#print('removing white space within column: \mutationinformation')
#mcsm_data['mutationinformation'] = mcsm_data['mutationinformation'].str.replace(' ', '')
#print('Correctly formatted column: mutationinformation\n', mcsm_data['mutationinformation']
# , '\n===================================================================')
#%% Remove whitespace from column
#orig_dtypes = mcsm_data.dtypes
#https://stackoverflow.com/questions/33788913/pythonic-efficient-way-to-strip-whitespace-from-every-pandas-data-frame-cell-tha/33789292
@ -103,7 +99,7 @@ print('Correctly formatted column: mutation_information\n', mcsm_data['mutation_
# very important
print('Sanity check:'
, '\nChecking duplicate mutations')
if mcsm_data['mutation_information'].duplicated().sum() == 0:
if mcsm_data['mutationinformation'].duplicated().sum() == 0:
print('PASS: No duplicate mutations detected (as expected)'
, '\nDim of data:', mcsm_data.shape
, '\n===============================================================')
@ -111,7 +107,7 @@ else:
print('FAIL (but not fatal): Duplicate mutations detected'
, '\nDim of df with duplicates:', mcsm_data.shape
, 'Removing duplicate entries')
mcsm_data = mcsm_data.drop_duplicates(['mutation_information'])
mcsm_data = mcsm_data.drop_duplicates(['mutationinformation'])
print('Dim of data after removing duplicate muts:', mcsm_data.shape
, '\n===============================================================')
#%%===========================================================================
@ -248,7 +244,7 @@ print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'
#=============================================================================
#%% ensuring dtypes are string for the non-numeric cols
#) char cols
char_cols = ['PredAffLog', 'mutation_information', 'wild_type', 'mutant_type', 'chain'
char_cols = ['PredAffLog', 'mutationinformation', 'wild_type', 'mutant_type', 'chain'
, 'ligand_id', 'duet_outcome', 'ligand_outcome', 'wild_pos', 'wild_chain_pos']
#mcsm_data[char_cols] = mcsm_data[char_cols].astype(str)
@ -298,8 +294,8 @@ else:
, '\nis', expected_ncols_toadd, 'the no. of expected cols to add?'
, '\n===============================================================')
#%%============================================================================
# Remove white space everywhere before output: bit me when merging!?
mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '')
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
# writing file
print('Writing formatted df to csv')