LSHTM_analysis/scripts/tidy_split.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug  6 12:56:03 2019

@author: tanu
'''
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#=======================================================================
#%% homedir and current dir and local imports
#homedir = os.path.expanduser('~')
# set working dir
#os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd()
#%%=====================================================================
# tidy_split(): function to split a column's values (e.g. mutations) on a specified delimiter (default: '|')
#https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas

def tidy_split(df, column, sep = '|', keep = False, dropna = False):
    '''
    Split the values of a column and expand so the new DataFrame has one split
    value per row.

    Every value in `column` is converted with `astype(str)` and split on
    `sep`; the source row is repeated once per resulting piece. Repeated rows
    keep the index label of the row they came from.

    Rows where the column is missing are NOT filtered unless `dropna` is True;
    by default they go through the same string conversion as any other value
    and therefore still appear in the result.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row (emitted just
        before its pieces, and only when the value actually splits into more
        than one piece)
    dropna : bool
        whether to drop rows whose `column` value is missing before splitting.
        Defaults to False, which leaves such rows in place.

    Returns
    -------
    pandas.DataFrame
        Returns a new dataframe with the same columns as `df`; `df` itself is
        not modified.
    '''
    if dropna:
        # Rebind locally so the positional indices gathered below refer to
        # the filtered frame; the caller's dataframe is left untouched.
        df = df.dropna(subset=[column])

    indexes = []
    new_values = []
    # `i` is the row *position* (not the index label), to be used with iloc.
    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            indexes.append(i)
            new_values.append(presplit)
        indexes.extend([i] * len(values))
        new_values.extend(values)

    # Repeat each source row once per emitted value, then overwrite the
    # split column with the individual pieces.
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df
#%%=====================================================================
#end of tidy_split()