From 436125745dfa10a17467c2a062a4c1a4f4f9f717 Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 8 Jul 2020 16:01:54 +0100 Subject: [PATCH] minor changes in data extraction --- scripts/data_extraction.py | 467 +++++++++++++++++-------------------- 1 file changed, 211 insertions(+), 256 deletions(-) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 1cb703c..d045fb9 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -105,7 +105,7 @@ outdir = datadir + '/' + drug + '/' + 'output' #======= # input #======= -in_filename = 'original_tanushree_data_v2.csv' #19k +#in_filename = 'original_tanushree_data_v2.csv' #19k in_filename = 'mtb_gwas_meta_v3.csv' #33k infile = datadir + '/' + in_filename print('Input file: ', infile @@ -129,44 +129,46 @@ master_data = pd.read_csv(infile, sep = ',') # extract elevant columns to extract from meta data related to the drug -meta_data = master_data[['id' -, 'country' -, 'lineage' -, 'sublineage' -, 'drtype' #19k only -, drug -, dr_muts_col -, other_muts_col]] - - -core_cols = ['id' - , 'country' - , 'country2' - , 'geographic_source' - , 'region' - , 'date' - , 'strain' - , 'lineage' - , 'sublineage' #drtype renamed to resistance - , 'resistance' - , 'location' - , 'host_body_site' - , 'environment_material' - , 'host_status' - , 'hiv_status' - , 'HIV_status' - , 'isolation_source'] - -variable_based_cols = [drug - , dr_muts_col - , other_muts_col] +if in_filename == 'original_tanushree_data_v2.csv': + meta_data = master_data[['id' + , 'country' + , 'lineage' + , 'sublineage' + , 'drtype' #19k only + , drug + , dr_muts_col + , other_muts_col]] + +if in_filename == 'mtb_gwas_meta_v3.csv': + core_cols = ['id' + , 'country' + , 'country2' + , 'geographic_source' + , 'region' + , 'date' + , 'strain' + , 'lineage' + , 'sublineage' #drtype renamed to resistance + , 'resistance' + , 'location' + , 'host_body_site' + , 'environment_material' + , 'host_status' + , 'hiv_status' + , 'HIV_status' + , 'isolation_source'] + + variable_based_cols = [drug + , dr_muts_col + , other_muts_col] -cols_to_extract = core_cols + variable_based_cols -print('Extracting', len(cols_to_extract), 'columns from master data') + cols_to_extract = core_cols + variable_based_cols + print('Extracting', len(cols_to_extract), 'columns from master data') -meta_data = master_data[cols_to_extract] - -del(master_data, variable_based_cols, cols_to_extract) + meta_data = master_data[cols_to_extract] + del(master_data, variable_based_cols, cols_to_extract) + +print('Extracted meta data:', meta_data.shape) # checks and results total_samples = meta_data['id'].nunique() @@ -190,9 +192,6 @@ meta_data[drug].value_counts() print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() , '\n===========================================================') -# clear variables -del(in_filename, infile) -#del(outdir) #%% # !!!IMPORTANT!!! sanity check: # This is to find out how many samples have 1 and more than 1 mutation,so you @@ -217,7 +216,7 @@ if len(clean_df) == (total_samples - na_count): , '\nNo.of NAs in', dr_muts_col, '=', na_count, '/', total_samples , '\n==========================================================') else: - sys.exit('FAIL: Could not drop NA') + sys.exit('FAIL: Could not drop NAs') dr_gene_count = 0 wt = 0 @@ -225,14 +224,14 @@ id_dr = [] id2_dr = [] for i, id in enumerate(clean_df.id): -# print (i, id) -# id_dr.append(id) + #print (i, id) + #id_dr.append(id) count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) if count_gene_dr > 0: id_dr.append(id) if count_gene_dr > 1: id2_dr.append(id) -# print(id, count_gene_dr) + #print(id, count_gene_dr) dr_gene_count = dr_gene_count + count_gene_dr count_wt = clean_df[dr_muts_col].iloc[i].count('WT') wt = wt + count_wt @@ -261,28 +260,26 @@ if len(clean_df) == (total_samples - na_count): , '\nNo.of NAs =', na_count, '/', total_samples , '\n=========================================================') else: - print('FAIL: dropping NA failed' - , '\n=========================================================') -sys.exit() + sys.exit('FAIL: Could not drop NAs') other_gene_count = 0 wt_other = 0 id_other = [] id2_other = [] - + for i, id in enumerate(clean_df.id): -# print (i, id) -# id_other.append(id) -# count_gene_other = clean_df[other_muts_col].iloc[i].count('gene_match') - count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) + #print (i, id) + #id_other.append(id) + count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) if count_gene_other > 0: - id_other.append(id) + id_other.append(id) if count_gene_other > 1: id2_other.append(id) -# print(id, count_gene_other) - other_gene_count = other_gene_count + count_gene_other + #print(id, count_gene_other) + other_gene_count = other_gene_count + count_gene_other count_wt = clean_df[other_muts_col].iloc[i].count('WT') - wt_other = wt_other + count_wt + wt_other = wt_other + count_wt + print('RESULTS:') print('Total WT in other_muts_col:', wt_other) print('Total matches of', gene_match, 'in', other_muts_col, ':', other_gene_count) @@ -307,50 +304,53 @@ print('gene to extract:', gene_match ) #=============== # FIXME: replace drug with variable containing the drug name # !!! important !!! -#meta_data_dr = meta_data[['id' -# ,'country' -# ,'lineage' -# ,'sublineage' -# ,'drtype' -# , drug -# , dr_muts_col -# ]] - -dr_based_cols = [drug, dr_muts_col] - -cols_to_extract = core_cols + dr_based_cols -print('Extracting', len(cols_to_extract), 'columns from meta data') - -meta_data_dr = meta_data[cols_to_extract] - -del(dr_based_cols, cols_to_extract) +if in_filename == 'original_tanushree_data_v2.csv': + meta_data_dr = meta_data[['id' + ,'country' + ,'lineage' + ,'sublineage' + ,'drtype' + , drug + , dr_muts_col]] +if in_filename == 'mtb_gwas_meta_v3.csv': + dr_based_cols = [drug, dr_muts_col] + cols_to_extract = core_cols + dr_based_cols + print('Extracting', len(cols_to_extract), 'columns from meta data') + meta_data_dr = meta_data[cols_to_extract] + del(dr_based_cols, cols_to_extract) if meta_data_dr.shape[0] == len(meta_data) and meta_data_dr.shape[1] == (len(meta_data.columns)-1): - print('PASS: Dimensions match') + print('PASS: Dimensions match' + , '\n===============================================================') else: print('FAIL: Dimensions mismatch:' - , 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1) + , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1) , '\nGot:', meta_data_dr.shape , '\n===============================================================') -sys.exit() + sys.exit() +# FIXME FIXME FIXME FIXME # Extract within this the gene of interest using string match #meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] - +print('gene_match in dr:', len(meta_gene_dr)) + +#!!!!! USE THIS ONCE VERIFIED!!!! +meta_gene_dr_snp = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +print('gene_snp_match in dr:', len(meta_gene_dr_snp)) +#!!!!! USE THIS ONCE VERIFIED!!!! + dr_id = meta_gene_dr['id'].unique() print('RESULT: No. of samples with dr muts in pncA:', len(dr_id)) -print('checking RESULT:', - '\nexpected len =', len(id_dr), - '\nactual len =', len(meta_gene_dr) ) if len(id_dr) == len(meta_gene_dr): print('PASS: lengths match' , '\n===============================================================') else: print('FAIL: length mismatch' - , '\n===============================================================') -sys.exit() + , '\nExpected len:', len(id_dr) + , '\nGot:', len(meta_gene_dr)) + sys.exit() dr_id = pd.Series(dr_id) @@ -358,52 +358,54 @@ dr_id = pd.Series(dr_id) # other mutations: extract gene_match entries from other_muts_col #================== print('Extracting other_muts from:', other_muts_col,'with other meta_data') -# FIXME: replace drug with variable containing the drug name -# !!! important !!! -#meta_data_other = meta_data[['id' -# ,'country' -# ,'lineage' -# ,'sublineage' -## ,'drtype' -# , drug -# , other_muts_col -# ]] - -other_based_cols = [drug, other_muts_col] - -cols_to_extract = core_cols + other_based_cols -print('Extracting', len(cols_to_extract), 'columns from meta data') - -meta_data_other = meta_data[cols_to_extract] - -del(other_based_cols, cols_to_extract) +if in_filename == 'original_tanushree_data_v2.csv': + meta_data_other = meta_data[['id' + , 'country' + , 'lineage' + , 'sublineage' + , 'drtype' + , drug + , other_muts_col]] + +if in_filename == 'mtb_gwas_meta_v3.csv': + other_based_cols = [drug, other_muts_col] + cols_to_extract = core_cols + other_based_cols + print('Extracting', len(cols_to_extract), 'columns from meta data') + meta_data_other = meta_data[cols_to_extract] + del(other_based_cols, cols_to_extract) if meta_data_other.shape[0] == len(meta_data) and meta_data_other.shape[1] == (len(meta_data.columns)-1): - print('PASS: Dimensions match') + print('PASS: Dimensions match' + , '\n===============================================================') else: print('FAIL: Dimensions mismatch:' - , 'Expected dim should be:', len(meta_data), (len(meta_data.columns)-1) + , 'Expected dim:', len(meta_data), (len(meta_data.columns)-1) , '\nGot:', meta_data_other.shape , '\n===============================================================') -sys.exit() + sys.exit() +# FIXME FIXME FIXME FIXME # Extract within this the gene of interest using string match meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] +print('gene_match in other:', len(meta_gene_other)) +#!!!!! USE THIS ONCE VERIFIED!!!! +meta_gene_other_snp = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +print('gene_snp_match in other:', len(meta_gene_other_snp)) + +#!!!!! USE THIS ONCE VERIFIED!!!! other_id = meta_gene_other['id'].unique() print('RESULT: No. of samples with other muts:', len(other_id)) -print('checking RESULT:', - '\nexpected len =', len(id_other), - '\nactual len =', len(meta_gene_other) ) if len(id_other) == len(meta_gene_other): print('PASS: lengths match' , '\n==============================================================') else: print('FAIL: length mismatch' - , '\n===============================================================') -sys.exit() + , '\nExpected len:', len(id_other) + , '\nGot:', len(meta_gene_other)) + sys.exit() other_id = pd.Series(other_id) #%% Find common IDs @@ -435,9 +437,9 @@ else: sys.exit('FAIL: Further cross checks on common ids') # good sanity check: use it later to check gene_sample_counts -expected_gene_samples = ( len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids ) -print('Expected no. of gene samples:', expected_gene_samples) -print('=================================================================') +expected_gene_samples = (len(meta_gene_dr) + len(meta_gene_other) - common_mut_ids) +print('Expected no. of gene samples:', expected_gene_samples + , '\n=================================================================') #%% write file #print(outdir) out_filename_cid = gene.lower() + '_common_ids.csv' @@ -455,6 +457,7 @@ del(out_filename_cid) # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) + #%% Now extract 'all' pncA mutations: i.e 'gene_match*' print('extracting from string match:', gene_match, 'mutations from cols:\n' , dr_muts_col, 'and', other_muts_col, 'using string match:' @@ -491,7 +494,6 @@ print('This is still dirty data: samples have ', gene_match, 'muts but may have , '\nsince the format for mutations is mut1; mut2, etc.' , '\n=============================================================') - print('Performing tidy_split(): to separate the mutations into indivdual rows') #========= @@ -522,13 +524,11 @@ if len(dr_gene_WF0) == dr_gene_count: print('PASS: length of dr_gene_WF0 match with expected length' , '\n===============================================================') else: - print('FAIL: lengths mismatch' - , '\n===============================================================') -sys.exit() + sys.exit('FAIL: lengths mismatch') # count the freq of 'dr_muts' samples dr_muts_df = dr_gene_WF0 [['id', dr_muts_col]] -print('dim of dr_muts_df:', dr_muts_df.shape) +print('Dim of dr_muts_df:', dr_muts_df.shape) # add freq column dr_muts_df['dr_sample_freq'] = dr_muts_df.groupby('id')['id'].transform('count') @@ -550,25 +550,12 @@ else: #!!! Important !!! # Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -dr_df_all = dr_gene_WF0.assign(mutation_info = dr_muts_col) -print('Dim of dr_df:', dr_df_all.shape +dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) +print('Dim of dr_df:', dr_df.shape , '\n==============================================================' , '\nEnd of tidy split() on dr_muts, and added an extra column relecting mut_category' , '\n===============================================================') -# Further clean up to remove stop mutations as these are similar in format to -# gene match but dont have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde) -dr_stop = dr_df_all[dr_muts_col].str.count('\*').sum() -print('Removing stop mutations denoted by an asterix' - , '\nNo. of stop mutations:',dr_stop) - -dr_df = dr_df_all[~dr_df_all[dr_muts_col].str.contains('\*')] - -if len(dr_df) == len(dr_df_all) - dr_stop: - print('PASS: Successfully removed stop mutatiosn from dr_df') -else: - sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?') - #%% #========= # DF2: other_mutations_pyrazinamdie @@ -623,26 +610,12 @@ else: #!!! Important !!! # Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df -other_df_all = other_gene_WF1.assign(mutation_info = other_muts_col) -print('dim of other_df:', other_df_all.shape +other_df = other_gene_WF1.assign(mutation_info = other_muts_col) +print('dim of other_df:', other_df.shape , '\n===============================================================' , '\nEnd of tidy split() on other_muts, and added an extra column relecting mut_category' , '\n===============================================================') - -# Further clean up to remove stop mutations as these are similar in format to -# gene match but don't have a 'mutant aa' (for e.g: abc10* vs nssnp: abc10cde) -other_stop = other_df_all[other_muts_col].str.count('\*').sum() -print('Removing stop mutations denoted by an asterix' - , '\nNo. of stop mutations:',other_stop) - -other_df = other_df_all[~other_df_all[other_muts_col].str.contains('\*')] - -if len(other_df) == len(other_df_all) - other_stop: - print('PASS: Successfully removed stop mutatiosn from other_df') -else: - sys.exit('FAIL: Could not remove stop mutations. Check regex or variable names?') - #%% #========== # Concatentating the two dfs: equivalent of rbind in R @@ -673,7 +646,7 @@ else: sys.exit('FAIL: No. of cols mismatch for concatenating') # checking colnames before concat -print('checking colnames BEFORE concatenating the two dfs...') +print('Checking colnames BEFORE concatenating the two dfs...') if (set(dr_df.columns) == set(other_df.columns)): print('PASS: column names match necessary for merging two dfs') else: @@ -683,33 +656,38 @@ else: print('Now appending the two dfs: Rbind') gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) -print('Finding stop mutations in concatenated df') stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum() -stop_muts_group = gene_LF_comb['mutation'].str.counts('\*').value_counts -gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count()) - +print('Finding stop mutations in concatenated df') +if stop_muts > 0: + print('stop mutations detected' + , '\nNo. of stop muts:', stop_muts, '\n' + , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count()) + , '\nNow removing them') + +gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')] +print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp)) gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case = False)] +print('snps only by direct regex:', len(gene_LF0)) - - - - - - - - +if gene_LF0_nssnp.equals(gene_LF0): + print('PASS: regex for extracting nssnp worked correctly & stop mutations succeessfully removed' + , '\nUsing the regex extracted df') +else: + sys.exit('FAIL: posssibly regex or no of stop mutations' + , 'Regex being used:', nssnp_match) + #sys.exit() # checking colnames and length after concat -print('checking colnames AFTER concatenating the two dfs...') +print('Checking colnames AFTER concatenating the two dfs...') if (set(dr_df.columns) == set(gene_LF0.columns)): - print('PASS: column names match + print('PASS: column names match' , '\n=============================================================') else: sys.exit('FAIL: Colnames mismatch AFTER concatenating') -print('checking concatened df') -if len(gene_LF0) == len(dr_df) + len(other_df): +print('Checking concatenated df') +if len(gene_LF0) == (len(dr_df) + len(other_df))- stop_muts : print('PASS:length of df after concat match' , '\n===============================================================') else: @@ -722,80 +700,63 @@ else: # mutations corresponding to gene_match* (string match pattern) # this will be your list you run OR calcs ########### -#my_regex = 'pncA_p.[A-Z]{3}[0-9]+[A-Z]{3}' - - -gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, case = False)] -len(gene_LF1) - - - - - - - - - - - - - -print('length of gene_LF0:', len(gene_LF0), - '\nThis should be what you need. But just double check and extract', gene_match, - '\nfrom LF0 (concatenated data) using string match:', gene_match - , '\nand double checking for stop mutations') - -if gene_LF0['mutation'].str.count('\*').sum() > 0: - sys.exit('FAIL: Stop mutations detected post concatenating dfs. Resolve at source!') - -print('Double checking and creating df: gene_LF1') -gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(gene_match)] - +print('Length of gene_LF0:', len(gene_LF0) + , '\nThis should be what we need. But just double checking and extracting nssnp for', gene_match + , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match) +gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] if len(gene_LF0) == len(gene_LF1): print('PASS: length of gene_LF0 and gene_LF1 match', - '\nconfirming extraction and concatenating worked correctly') + '\nConfirming extraction and concatenating worked correctly' + , '\n==========================================================') else: print('FAIL: BUT NOT FATAL!' , '\ngene_LF0 and gene_LF1 lengths differ' , '\nsuggesting error in extraction process' , ' use gene_LF1 for downstreama analysis' - , '\n=========================================================') -print('length of dfs pre and post processing...' + , '\n=========================================================') + +print('Length of dfs pre and post processing...' , '\ngene WF data (unique samples in each row):', extracted_gene_samples , '\ngene LF data (unique mutation in each row):', len(gene_LF1) , '\n=============================================================') +if extracted_gene_samples > len(gene_LF1): + print('Stop mutations removed after concatentaing') + #%% sanity check for extraction +# FIXME: This ought to pass if nsnsp_match happens right at the beginning of creating +#expected_rows print('Verifying whether extraction process worked correctly...') if len(gene_LF1) == expected_rows: print('PASS: extraction process performed correctly' - , '\nexpected length:', expected_rows - , '\ngot:', len(gene_LF1) + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) , '\nRESULT: Total no. of mutant sequences for logo plot:', len(gene_LF1) , '\n=========================================================') else: print('FAIL: extraction process has bugs' - , '\nexpected length:', expected_rows - , '\ngot:', len(gene_LF1) - , ', \Debug please' + , '\nExpected length:', expected_rows + , '\nGot:', len(gene_LF1) + , '\nDebug please' , '\n=========================================================') -#%% +#%% FIXME FIXME FIXME FIXME +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! print('Performing some more sanity checks...') # From LF1: -# no. of unique muts +# no. of unique muts: useful for OR counts distinct_muts = gene_LF1.mutation.value_counts() print('Distinct genomic mutations:', len(distinct_muts)) -# no. of samples contributing the unique muta +# no. of samples contributing the unique muts gene_LF1.id.nunique() -print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique() ) +print('No.of samples contributing to distinct genomic muts:', gene_LF1.id.nunique()) # no. of dr and other muts mut_grouped = gene_LF1.groupby('mutation_info').mutation.nunique() -print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique() ) +print('No.of unique dr and other muts:\n', gene_LF1.groupby('mutation_info').mutation.nunique()) # sanity check if len(distinct_muts) == mut_grouped.sum() : @@ -803,17 +764,20 @@ if len(distinct_muts) == mut_grouped.sum() : , '\n===============================================================') else: print('FAIL: Mistmatch in count of muts' - , '\nexpected count:', len(distinct_muts) - , '\nactual count:', mut_grouped.sum() - , '\nmuts should be distinct within dr* and other* type' - , '\ninspecting ...' + , '\nExpected count:', len(distinct_muts) + , '\nActual count:', mut_grouped.sum() + , '\nMuts should be distinct within dr* and other* type' + , '\nInspecting...' , '\n=========================================================') + muts_split = list(gene_LF1.groupby('mutation_info')) dr_muts = muts_split[0][1].mutation other_muts = muts_split[1][1].mutation print('splitting muts by mut_info:', muts_split) print('no.of dr_muts samples:', len(dr_muts)) -print('no. of other_muts samples', len(other_muts)) +print('no. of other_muts samples', len(other_muts)) +#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# FIXME FIXME FIXME #%% # !!! IMPORTANT !!!! # sanity check: There should not be any common muts @@ -831,14 +795,13 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('Finding ambiguous muts...' , '\n=========================================================' , '\nTotal no. of samples in dr_muts present in other_muts:', dr_muts.isin(other_muts).sum() - , '\nThese are:\n', dr_muts[dr_muts.isin(other_muts)] + , '\nThese are:', dr_muts[dr_muts.isin(other_muts)] , '\n=========================================================' , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum() , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)] , '\n=========================================================') else: - print('Error: ambiguous muts present, but extraction failed. Debug!' - , '\n===============================================================') + sys.exit('Error: ambiguous muts present, but extraction failed. Debug!') print('Counting no. of ambiguous muts...') @@ -866,23 +829,23 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m #dr_muts.to_csv('dr_muts.csv', header = True) #other_muts.to_csv('other_muts.csv', header = True) -out_filename1 = gene.lower() + '_ambiguous_muts.csv' -outfile1 = outdir + '/' + out_filename1 +out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' +outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts print('Writing file: ambiguous muts' - , '\nFilename:', outfile1) + , '\nFilename:', outfile_ambig_muts) #common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] -inspect.to_csv(outfile1, index = False) +inspect.to_csv(outfile_ambig_muts, index = False) -print('Finished writing:', out_filename1 +print('Finished writing:', out_filename_ambig_muts , '\nNo. of rows:', len(inspect) , '\nNo. of cols:', len(inspect.columns) , '\nNo. of rows = no. of samples with the ambiguous muts present:' , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() , '\n=============================================================') -del(out_filename1) +del(out_filename_ambig_muts) #%% end of data extraction and some files writing. Below are some more files writing. #============================================================================= #%% Formatting df: read aa dict and pull relevant info @@ -1173,44 +1136,40 @@ if snps_only.mutationinformation.isna().sum() == 0: print ('PASS: NO NAs/missing entries for SNPs' , '\n===============================================================') else: - print('FAIL: SNP has NA, Possible mapping issues from dict?' - , '\nDebug please!' - , '\n=========================================================') -sys.exit() + sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') -out_filename2 = gene.lower() + '_mcsm_snps.csv' -outfile2 = outdir + '/' + out_filename2 +out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv' +outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps print('Writing file: mCSM style muts' - , '\nFilename:', outfile2 + , '\nFilename:', outfile_mcsmsnps , '\nmutation format (SNP): {WT}{MUT}' , '\nNo. of distinct muts:', len(snps_only) , '\nNo. of distinct positions:', len(pos_only) , '\n=============================================================') -snps_only.to_csv(outfile2, header = False, index = False) +snps_only.to_csv(outfile_mcsmsnps, header = False, index = False) -print('Finished writing:', outfile2 +print('Finished writing:', outfile_mcsmsnps , '\nNo. of rows:', len(snps_only) , '\nNo. of cols:', len(snps_only.columns) , '\n=============================================================') -del(out_filename2) +del(out_filename_mcsmsnps) #%% Write file: gene_metadata (i.e gene_LF1) # where each row has UNIQUE mutations NOT unique sample ids -out_filename3 = gene.lower() + '_metadata.csv' -outfile3 = outdir + '/' + out_filename3 +out_filename_metadata = gene.lower() + '_metadata.csv' +outfile_metadata = outdir + '/' + out_filename_metadata print('Writing file: LF formatted data' - , '\nFilename:', out_filename3 - , '\nPath:', outdir + , '\nFilename:', outfile_metadata , '\n============================================================') -gene_LF1.to_csv(outfile3, header = True, index = False) -print('Finished writing:', outfile3 +gene_LF1.to_csv(outfile_metadata, header = True, index = False) +print('Finished writing:', outfile_metadata , '\nNo. of rows:', len(gene_LF1) , '\nNo. of cols:', len(gene_LF1.columns) , '\n=============================================================') -del(out_filename3) +del(out_filename_metadata) #%% write file: mCSM style but with repitions for MSA and logo plots all_muts_msa = pd.DataFrame(gene_LF1['mutationinformation']) @@ -1237,27 +1196,24 @@ if all_muts_msa.mutationinformation.isna().sum() == 0: print ('PASS: NO NAs/missing entries for SNPs' , '\n===============================================================') else: - print('FAIL: SNP has NA, Possible mapping issues from dict?' - , '\nDebug please!' - , '\n=========================================================') + sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') -out_filename4 = gene.lower() +'_all_muts_msa.csv' -outfile4 = outdir + '/' + out_filename4 +out_filename_msa = gene.lower() +'_all_muts_msa.csv' +outfile_msa = outdir + '/' + out_filename_msa print('Writing file: mCSM style muts for msa', - '\nFilename:', outfile4, + '\nFilename:', outfile_msa, '\nmutation format (SNP): {WT}{MUT}', - '\nNo.of lines of msa:', len(all_muts_msa), -) + '\nNo.of lines of msa:', len(all_muts_msa)) -all_muts_msa_sorted.to_csv(outfile4, header = False, index = False) +all_muts_msa_sorted.to_csv(outfile_msa, header = False, index = False) -print('Finished writing:', outfile4 +print('Finished writing:', outfile_msa , '\nNo. of rows:', len(all_muts_msa) , '\nNo. of cols:', len(all_muts_msa.columns) , '\n=============================================================') -del(out_filename4) +del(out_filename_msa) #%% write file for mutational positions # count how many positions this corresponds to @@ -1272,23 +1228,22 @@ pos_only.position.dtype # sort by position value pos_only_sorted = pos_only.sort_values(by = 'position', ascending = True) -out_filename5 = gene.lower() + '_mutational_positons.csv' -outfile5 = outdir + '/' + out_filename5 +out_filename_pos = gene.lower() + '_mutational_positons.csv' +outfile_pos = outdir + '/' + out_filename_pos print('Writing file: mutational positions' , '\nNo. of distinct positions:', len(pos_only_sorted) - , '\nFilename:', out_filename5 - , '\nPath:', outdir + , '\nFile:', outfile_pos , '\n=============================================================') -pos_only_sorted.to_csv(outfile5, header = True, index = False) +pos_only_sorted.to_csv(outfile_pos, header = True, index = False) -print('Finished writing:', outfile5 +print('Finished writing:', outfile_pos , '\nNo. of rows:', len(pos_only_sorted) , '\nNo. of cols:', len(pos_only_sorted.columns) , '\n=============================================================') -del(out_filename5) +del(out_filename_pos) #======================================================================= print(u'\u2698' * 50, '\nEnd of script: Data extraction and writing files'