#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019

@author: tanu
'''
#=======================================================================
#%% load libraries
import os
import sys

import pandas as pd
#import numpy as np
#=======================================================================
#%% homdir and curr dir and local imports
#homedir = os.path.expanduser('~')
# set working dir
#os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd()
#%%=====================================================================
# define the split function
def tidy_split(df: pd.DataFrame, column: str, sep: str = '|',
               keep: bool = False) -> pd.DataFrame:
    '''
    Split the values of a column and expand so the new DataFrame has one
    split value per row.

    Every value in `column` is converted to a string before splitting.
    Rows where the column is missing are NOT filtered: the `dropna` step
    is disabled, so a missing value is kept as a row holding its string
    form.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row (only
        applies to values that actually split into more than one piece)

    Returns
    -------
    pandas.DataFrame
        Returns a new dataframe with the same columns as `df`; `df`
        itself is not modified. Each source row is repeated once per
        split value (index labels are repeated accordingly) and
        `column` holds the individual split strings.
    '''
    # Positional (iloc) row numbers, one entry per output row, paired
    # one-to-one with the value that output row receives in `column`.
    indexes = []
    new_values = []

    # Left disabled on purpose; enable if rows with a missing value in
    # `column` should be dropped for your use case.
    #df = df.dropna(subset=[column])

    for i, presplit in enumerate(df[column].astype(str)):
        values = presplit.split(sep)
        if keep and len(values) > 1:
            # Emit the unsplit value first, ahead of its pieces.
            indexes.append(i)
            new_values.append(presplit)
        for value in values:
            indexes.append(i)
            new_values.append(value)

    # Repeat the source rows by position, then overwrite the target
    # column; .copy() ensures the assignment never touches `df`.
    new_df = df.iloc[indexes, :].copy()
    new_df[column] = new_values
    return new_df
#%%=====================================================================
#end of tidy_split()