Adding clean files for rerun of the 35k dataset
This commit is contained in:
parent
0973717287
commit
8f460347b4
32 changed files with 157 additions and 44550 deletions
60
scripts/tidy_split.py
Normal file
60
scripts/tidy_split.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
'''
|
||||
Created on Tue Aug 6 12:56:03 2019
|
||||
|
||||
@author: tanu
|
||||
'''
|
||||
#=======================================================================
|
||||
#%% load libraries
|
||||
import os, sys
|
||||
import pandas as pd
|
||||
#import numpy as np
|
||||
#=======================================================================
|
||||
#%% homedir, current dir and local imports
|
||||
#homedir = os.path.expanduser('~')
|
||||
# set working dir
|
||||
#os.getcwd()
|
||||
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
|
||||
#os.getcwd()
|
||||
#%%=====================================================================
|
||||
# define the split function
|
||||
def tidy_split(df, column, sep='|', keep=False):
    '''
    Split the values of a column and expand so the new DataFrame has one
    split value per row.

    Every value in `column` is converted to a string before it is split, so
    rows with a missing value are NOT dropped: they are kept as a single row
    holding the stringified value (e.g. NaN becomes the string 'nan').
    Call `df.dropna(subset=[column])` before this function if such rows
    should be excluded.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row (only applies
        to values that actually split into more than one piece)

    Returns
    -------
    pandas.DataFrame
        A new dataframe with the same columns as `df`. Each input row is
        repeated once per split value and keeps its original index label,
        so index labels repeat for rows that were expanded. `column` holds
        the split values as strings.
    '''
    positions = []   # positional row numbers to pull from df, one per output row
    new_values = []  # value of `column` for each output row

    for pos, presplit in enumerate(df[column].astype(str)):
        # Some pandas versions leave missing values untouched by astype(str);
        # stringify them here so .split() works for every element.
        if not isinstance(presplit, str):
            presplit = str(presplit)

        values = presplit.split(sep)

        # Optionally emit the unsplit value first, ahead of its pieces.
        if keep and len(values) > 1:
            positions.append(pos)
            new_values.append(presplit)

        positions.extend([pos] * len(values))
        new_values.extend(values)

    # Positional selection repeats rows as needed; copy so df is not modified.
    new_df = df.iloc[positions, :].copy()
    new_df[column] = new_values
    return new_df
|
||||
#%%=====================================================================
|
||||
#end of tidy_split()
|
||||
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue