adding clean files for rerun 35k dataset

2020-07-07 18:28:55 +01:00 · 2020-07-07 18:28:55 +01:00 · 8f460347b4
commit 8f460347b4
parent 0973717287
32 changed files with 157 additions and 44550 deletions
--- a/scripts/tidy_split.py
+++ b/scripts/tidy_split.py
@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+Created on Tue Aug  6 12:56:03 2019
+
+@author: tanu
+'''
+#=======================================================================
+#%% load libraries
+import os, sys
+import pandas as pd
+#import numpy as np
+#=======================================================================
+#%% homedir and curr dir and local imports
+#homedir = os.path.expanduser('~') 
+# set working dir
+#os.getcwd()
+#os.chdir(homedir + '/git/LSHTM_analysis/scripts')
+#os.getcwd()
+#%%=====================================================================
+# define the split function
+def tidy_split(df, column, sep = '|', keep = False):
+    '''
+    Split the values of a column and expand so the new DataFrame has one split
+    value per row. Filters rows where the column is missing.
+
+    Params
+    ------
+    df : pandas.DataFrame
+        dataframe with the column to split and expand
+    column : str
+        the column to split and expand
+    sep : str
+        the string used to split the column's values
+    keep : bool
+        whether to retain the presplit value as it's own row
+
+    Returns
+    -------
+    pandas.DataFrame
+        Returns a dataframe with the same columns as `df`.
+    '''
+    indexes = list()
+    new_values = list()
+    #df = df.dropna(subset=[column])#!!!!-----see this incase you need to uncomment based on use case
+    for i, presplit in enumerate(df[column].astype(str)):
+        values = presplit.split(sep)
+        if keep and len(values) > 1:
+            indexes.append(i)
+            new_values.append(presplit)
+        for value in values:
+            indexes.append(i)
+            new_values.append(value)
+    new_df = df.iloc[indexes, :].copy()
+    new_df[column] = new_values
+    return new_df 
+#%%=====================================================================
+#end of tidy_split()
+
+