adding clean files for rerun of 35k dataset

2020-07-07 18:28:55 +01:00 · 2020-07-07 18:28:55 +01:00 · a7f21cfb14
commit a7f21cfb14
parent 943513a338
32 changed files with 157 additions and 44550 deletions
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@ -21,6 +21,9 @@ Created on Tue Aug  6 12:56:03 2019
 # where each row is a separate mutation
 # sample ids AND mutations are NOT unique, but the COMBINATION (sample id + mutation) = unique

+# NOTE
+#drtype is renamed to 'resistance' in the 35k dataset
+
 # output files: all lower case
 # 0) <gene>_common_ids.csv
 # 1) <gene>_ambiguous_muts.csv
@ -60,6 +63,7 @@ os.getcwd()

 # import aa dict
 from reference_dict import my_aa_dict # CHECK DIR STRUC THERE!
+from tidy_split import tidy_split
 #=======================================================================
 #%% command line args
 arg_parser = argparse.ArgumentParser()
@ -96,8 +100,8 @@ datadir = homedir + '/' + 'git/Data'
 #=======
 # input
 #=======
-in_filename  = 'original_tanushree_data_v2.csv'
-#in_filename  = 'mtb_gwas_v3.csv'
+#in_filename  = 'original_tanushree_data_v2.csv' #19k
+in_filename  = 'mtb_gwas_meta_v3.csv' #33k
 infile = datadir + '/' + in_filename
 print('Input file: ', infile
      , '\n============================================================')
@ -121,17 +125,45 @@ master_data  = pd.read_csv(infile, sep = ',')
 #list(master_data.columns)

 # extract elevant columns to extract from meta data related to the drug
-meta_data = master_data[['id'
-       ,'country'
-       ,'lineage'
-       ,'sublineage'
-       ,'drtype'
-       , drug
-       , dr_muts_col
-       , other_muts_col
-        ]] 

-del(master_data)
+#meta_data_ch = master_data[['id'
+#, 'country'
+#, 'lineage'
+#, 'sublineage'
+##, 'drtype' #19k only
+#, 'resistance'
+#, drug
+#, dr_muts_col
+#, other_muts_col]] 
+
+
+core_cols = ['id'
+    , 'country'
+    , 'country2'
+    , 'geographic_source'
+    , 'region'
+    , 'date'
+    , 'strain'
+    , 'lineage'
+    , 'sublineage'
+    , 'resistance' # named 'drtype' in the 19k dataset; renamed to 'resistance' in the 35k dataset
+    , 'location'
+    , 'host_body_site'
+    , 'environment_material'
+    , 'host_status'
+    , 'hiv_status'
+    , 'HIV_status'
+    , 'isolation_source']
+
+variable_based_cols = [drug
+                       , dr_muts_col
+                       , other_muts_col]
+       
+cols_to_extract = core_cols + variable_based_cols
+
+meta_data = master_data[cols_to_extract]  
+   
+del(master_data, variable_based_cols, cols_to_extract)

 # checks and results
 total_samples = meta_data['id'].nunique() 
@ -269,14 +301,23 @@ print('gene to extract:', gene_match )
 #===============
 # FIXME: replace drug with variable containing the drug name
 # !!! important !!!
-meta_data_dr = meta_data[['id'
-       ,'country'
-       ,'lineage'
-       ,'sublineage'
-       ,'drtype'
-       , drug
-       , dr_muts_col
-        ]] 
+#meta_data_dr = meta_data[['id'
+#       ,'country'
+#       ,'lineage'
+#       ,'sublineage'
+#       ,'drtype'
+#       , drug
+#       , dr_muts_col
+#        ]] 
+
+dr_based_cols = [drug, dr_muts_col]
+       
+cols_to_extract = core_cols + dr_based_cols
+
+meta_data_dr = meta_data[cols_to_extract]
+
+del(dr_based_cols, cols_to_extract)
+
 print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
 print('actual dim:', meta_data_dr.shape
 	, '\n===============================================================')
@ -306,14 +347,22 @@ dr_id = pd.Series(dr_id)
 print('Extracting dr_muts from:', other_muts_col,'with other meta_data')
 # FIXME: replace drug with variable containing the drug name
 # !!! important !!!
-meta_data_other = meta_data[['id'
-       ,'country'
-       ,'lineage'
-       ,'sublineage'
-       ,'drtype'
-       , drug
-       , other_muts_col
-        ]] 
+#meta_data_other = meta_data[['id'
+#       ,'country'
+#       ,'lineage'
+#       ,'sublineage'
+##       ,'drtype'
+#       , drug
+#       , other_muts_col
+#        ]] 
+
+dr_based_cols = [drug, other_muts_col]
+       
+cols_to_extract = core_cols + dr_based_cols
+
+meta_data_other = meta_data[cols_to_extract]
+
+del(dr_based_cols, cols_to_extract)

 print('expected dim should be:', len(meta_data), (len(meta_data.columns)-1) )
 print('actual dim:', meta_data_other.shape
@ -373,7 +422,7 @@ print('Writing file:'
      , '\nExpected no. of rows:', len(common_ids)
      , '\n=============================================================')

-common_ids.to_csv(outfile0)
+common_ids.to_csv(outfile0, index = False)
 del(out_filename0)

 # clear variables
@ -419,44 +468,15 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have
 #https://stackoverflow.com/questions/41476150/removing-space-from-dataframe-columns-in-pandas
 print('Performing tidy_split(): to separate the mutations into indivdual rows')

-# define the split function
-def tidy_split(df, column, sep='|', keep=False):
-    '''
-    Split the values of a column and expand so the new DataFrame has one split
-    value per row. Filters rows where the column is missing.

-    Params
-    ------
-    df : pandas.DataFrame
-        dataframe with the column to split and expand
-    column : str
-        the column to split and expand
-    sep : str
-        the string used to split the column's values
-    keep : bool
-        whether to retain the presplit value as it's own row

-    Returns
-    -------
-    pandas.DataFrame
-        Returns a dataframe with the same columns as `df`.
-    '''
-    indexes = list()
-    new_values = list()
-    #df = df.dropna(subset=[column])#!!!!-----see this incase you need to uncomment based on use case
-    for i, presplit in enumerate(df[column].astype(str)):
-        values = presplit.split(sep)
-        if keep and len(values) > 1:
-            indexes.append(i)
-            new_values.append(presplit)
-        for value in values:
-            indexes.append(i)
-            new_values.append(value)
-    new_df = df.iloc[indexes, :].copy()
-    new_df[column] = new_values
-    return new_df
-    
-#%% end of tidy_split()
+
+# tidy_split() was previously defined inline here; it is now imported from the tidy_split module
+
+
+
+
+
 #=========
 # DF1: dr_muts_col
 #=========
@ -761,12 +781,11 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
 out_filename1 = gene.lower() + '_ambiguous_muts.csv'
 outfile1 = outdir + '/' + out_filename1
 print('Writing file: ambiguous muts'
-      , '\nFilename:', out_filename1
-      , '\nPath:',  outdir)
+      , '\nFilename:', outfile1)

 #common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test
 inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-inspect.to_csv(outfile1)
+inspect.to_csv(outfile1, index = False)

 print('Finished writing:', out_filename1
      , '\nNo. of rows:', len(inspect)
@ -1069,13 +1088,13 @@ else:
    print('FAIL: SNP has NA, Possible mapping issues from dict?'
          , '\nDebug please!'
          , '\n=========================================================')
+sys.exit() # NOTE(review): not indented under the 'else:' above, so this appears to stop the script unconditionally before any output below is written -- confirm whether it was meant to sit inside the FAIL branch

 out_filename2 = gene.lower() + '_mcsm_snps.csv'
 outfile2 = outdir + '/' + out_filename2

 print('Writing file: mCSM style muts'
-      , '\nFilename:', out_filename2
-      , '\nPath:', outdir
+      , '\nFilename:', outfile2
      , '\nmutation format (SNP): {WT}<POS>{MUT}'
      , '\nNo. of distinct muts:', len(snps_only)
      , '\nNo. of distinct positions:', len(pos_only)
@ -1083,7 +1102,7 @@ print('Writing file: mCSM style muts'

 snps_only.to_csv(outfile2, header = False, index = False)

-print('Finished writing:', out_filename2
+print('Finished writing:', outfile2
      , '\nNo. of rows:', len(snps_only)
      , '\nNo. of cols:', len(snps_only.columns)
      , '\n=============================================================')
@ -1099,7 +1118,7 @@ print('Writing file: LF formatted data'
      , '\n============================================================')

 gene_LF1.to_csv(outfile3, header = True, index = False)
-print('Finished writing:', out_filename3
+print('Finished writing:', outfile3
      , '\nNo. of rows:', len(gene_LF1)
      , '\nNo. of cols:', len(gene_LF1.columns)
      , '\n=============================================================')
@ -1118,11 +1137,11 @@ all_muts_msa.columns.dtype
 all_muts_msa_sorted = all_muts_msa.sort_values(by = 'mutationinformation')

 # create an extra column with protein name
-all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1') 
-all_muts_msa_sorted.head()
+#all_muts_msa_sorted = all_muts_msa_sorted.assign(fasta_name = '3PL1') 
+#all_muts_msa_sorted.head()

 # rearrange columns so the fasta name is the first column (required for mutate.script)
-all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
+#all_muts_msa_sorted = all_muts_msa_sorted[['fasta_name', 'mutationinformation']]
 all_muts_msa_sorted.head()

 print('Checking NA in snps...')# should be 0
@ -1138,15 +1157,14 @@ out_filename4 = gene.lower() +'_all_muts_msa.csv'
 outfile4 = outdir + '/' + out_filename4

 print('Writing file: mCSM style muts for msa',
-      '\nFilename:', out_filename4,
-      '\nPath:', outdir,
+      '\nFilename:', outfile4,
      '\nmutation format (SNP): {WT}<POS>{MUT}',
      '\nNo.of lines of msa:', len(all_muts_msa),  
 )

 all_muts_msa_sorted.to_csv(outfile4, header = False, index = False)

-print('Finished writing:', out_filename4
+print('Finished writing:', outfile4
      , '\nNo. of rows:', len(all_muts_msa)
      , '\nNo. of cols:', len(all_muts_msa.columns)
      , '\n=============================================================')
@ -1177,7 +1195,7 @@ print('Writing file: mutational positions'

 pos_only_sorted.to_csv(outfile5, header = True, index = False)

-print('Finished writing:', out_filename5
+print('Finished writing:', outfile5
      , '\nNo. of rows:', len(pos_only_sorted)
      , '\nNo. of cols:', len(pos_only_sorted.columns)
      , '\n=============================================================')