adding clean files for rerun 35k dataset

This commit is contained in:
Tanushree Tunstall 2020-07-07 18:28:55 +01:00
parent 943513a338
commit a7f21cfb14
32 changed files with 157 additions and 44550 deletions

60
scripts/tidy_split.py Normal file
View file

@ -0,0 +1,60 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Created on Tue Aug 6 12:56:03 2019
@author: tanu
'''
#=======================================================================
#%% load libraries
import os, sys
import pandas as pd
#import numpy as np
#=======================================================================
#%% homedir and curr dir and local imports
#homedir = os.path.expanduser('~')
# set working dir
#os.getcwd()
#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
#os.getcwd()
#%%=====================================================================
# define the split function
def tidy_split(df, column, sep = '|', keep = False, dropna = False):
    '''
    Split the values of a column and expand so the new DataFrame has one split
    value per row.

    The column is converted with ``astype(str)`` before splitting, so the
    returned column always holds strings. By default rows with a missing
    value in `column` are NOT filtered: they go through ``astype(str)`` like
    every other value. Pass ``dropna = True`` to remove those rows first.

    Params
    ------
    df : pandas.DataFrame
        dataframe with the column to split and expand
    column : str
        the column to split and expand
    sep : str
        the string used to split the column's values
    keep : bool
        whether to retain the presplit value as its own row; the presplit
        row is only added when the value actually splits into more than
        one piece, and it is placed before its pieces
    dropna : bool
        whether to drop rows where `column` is missing before splitting
        (default False, i.e. no rows are dropped)

    Returns
    -------
    pandas.DataFrame
        Returns a new dataframe with the same columns as `df`; `df` itself
        is not modified. Each source row is repeated once per split value
        and keeps its original index label.
    '''
    if dropna:
        # Rebinds the local name only; the caller's dataframe is untouched.
        df = df.dropna(subset = [column])

    positions = []
    new_values = []
    for pos, presplit in enumerate(df[column].astype(str)):
        pieces = presplit.split(sep)
        if keep and len(pieces) > 1:
            positions.append(pos)
            new_values.append(presplit)
        # One output row per piece, all pointing back at the same source row.
        positions.extend([pos] * len(pieces))
        new_values.extend(pieces)

    # Select by position (not label) so duplicate index labels are handled.
    new_df = df.iloc[positions, :].copy()
    new_df[column] = new_values
    return new_df
#%%=====================================================================
#end of tidy_split()