Fixed white space problem in mcsm input that caused issues when merging

This commit is contained in:
Tanushree Tunstall 2020-07-14 14:07:23 +01:00
parent 5a2084ba11
commit 8dc2fa7326
6 changed files with 108 additions and 98 deletions

View file

@ -183,7 +183,11 @@ def format_mcsm_output(mcsm_outputcsv):
#############
# Read file
#############
mcsm_data = pd.read_csv(mcsm_outputcsv, sep = ',')
mcsm_data_raw = pd.read_csv(mcsm_outputcsv, sep = ',')
# strip leading and trailing white space in every string (object dtype) column
mcsm_data = mcsm_data_raw.apply(lambda x: x.str.strip() if x.dtype == 'object' else x)
dforig_shape = mcsm_data.shape
print('dimensions of input file:', dforig_shape)
@ -396,8 +400,7 @@ def format_mcsm_output(mcsm_outputcsv):
print('removing white space within created column: wild_chain_pos')
mcsm_data['wild_chain_pos'] = mcsm_data['wild_chain_pos'].str.replace(' ', '')
print('Correctly formatted column: wild_chain_pos\n', mcsm_data['wild_chain_pos'].head()
, '\n=========================================================')
, '\n=========================================================')
#%%=====================================================================
#############
# ensuring correct dtype in non-numeric cols
@ -426,8 +429,8 @@ def format_mcsm_output(mcsm_outputcsv):
mcsm_data_fs = mcsm_data_f.sort_values(by = ['position'])
print('sorted df:\n', mcsm_data_fs.head())
# Remove white space from column names before output: stray spaces caused problems when merging
mcsm_data_fs.columns = mcsm_data_fs.columns.str.replace(' ', '')
# Ensuring column names are lowercase before output
mcsm_data_fs.columns = mcsm_data_fs.columns.str.lower()
#%%=====================================================================
#############
# sanity check before writing file