62 lines
1.9 KiB
Python
62 lines
1.9 KiB
Python
#!/usr/bin/env python3
|
|
# -*- coding: utf-8 -*-
|
|
'''
|
|
Created on Tue Aug 6 12:56:03 2019
|
|
|
|
@author: tanu
|
|
'''
|
|
#=======================================================================
|
|
#%% load libraries
|
|
import os, sys
|
|
import pandas as pd
|
|
#import numpy as np
|
|
#=======================================================================
|
|
#%% homedir and curr dir and local imports
|
|
#homedir = os.path.expanduser('~')
|
|
# set working dir
|
|
#os.getcwd()
|
|
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
|
#os.getcwd()
|
|
#%%=====================================================================
|
|
# tidy_split(): function to split a column's values (e.g. mutations) on a specified delimiter (default: '|')
|
|
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
|
|
|
|
def tidy_split(df, column: str, sep: str = '|', keep: bool = False):
    '''
    Split the values of a column and expand so the new DataFrame has one
    split value per row.

    The column is cast with ``astype(str)`` before splitting, so non-string
    values are split on their string form. Rows with a missing value in
    `column` are NOT dropped by this function; call
    ``df.dropna(subset=[column])`` beforehand if they should be excluded.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row; the presplit
        row is only added when the value actually splits into more than
        one piece, and it is placed before its pieces

    Returns
    -------
    pandas.DataFrame
        A copy with the same columns as `df`. Each input row is repeated
        once per split value (keeping its original index label), and
        `column` holds the individual split values as strings. `df` itself
        is not modified.
    '''
    row_positions = []  # positional (iloc) index of the source row for each output row
    split_values = []   # value of `column` for each output row

    # NOTE: depending on the use case, rows with a missing value can be removed
    # up front with: df = df.dropna(subset=[column])
    for position, presplit in enumerate(df[column].astype(str)):
        pieces = presplit.split(sep)

        # Optionally emit the unsplit value first, but only when it really
        # contained the separator (otherwise it would duplicate its one piece).
        if keep and len(pieces) > 1:
            row_positions.append(position)
            split_values.append(presplit)

        for piece in pieces:
            row_positions.append(position)
            split_values.append(piece)

    # Select by position so duplicate or non-integer index labels are safe;
    # copy so that assigning the column below never touches the caller's frame.
    new_df = df.iloc[row_positions, :].copy()
    new_df[column] = split_values
    return new_df
|
|
#%%=====================================================================
|
|
#end of tidy_split()
|
|
|
|
|