From 0e71b2375995f2e5275e9ce49b7f2b658014b84f Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Wed, 8 Jul 2020 18:47:22 +0100 Subject: [PATCH] modified extraction to be explicit for extracting nsSNP for specified gene --- scripts/data_extraction.py | 290 ++++++++++++++++--------------------- 1 file changed, 122 insertions(+), 168 deletions(-) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index d045fb9..b82da65 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -80,7 +80,19 @@ gene = args.gene #gene = 'pncA' gene_match = gene + '_p.' +print('mut pattern for gene', gene, ':', gene_match) + nssnp_match = gene_match+'[A-Z]{3}[0-9]+[A-Z]{3}' +print('nsSNP for gene', gene, ':', nssnp_match) + +wt_regex = gene_match.lower()+'(\w{3})' +print('wt regex:', wt_regex) + +mut_regex = r'\d+(\w{3})$' +print('mt regex:', mut_regex) + +pos_regex = r'(\d+)' +print('position regex:', pos_regex) # building cols to extract dr_muts_col = 'dr_mutations_' + drug @@ -105,10 +117,10 @@ outdir = datadir + '/' + drug + '/' + 'output' #======= # input #======= -#in_filename = 'original_tanushree_data_v2.csv' #19k -in_filename = 'mtb_gwas_meta_v3.csv' #33k -infile = datadir + '/' + in_filename -print('Input file: ', infile +#in_filename_master_master = 'original_tanushree_data_v2.csv' #19k +in_filename_master = 'mtb_gwas_meta_v3.csv' #33k +infile_master = datadir + '/' + in_filename_master +print('Input file: ', infile_master , '\n============================================================') #======= @@ -122,14 +134,13 @@ print('Output filename: in the respective sections' #%%end of variable assignment for input and output files #======================================================================= #%% Read input file -master_data = pd.read_csv(infile, sep = ',') +master_data = pd.read_csv(infile_master, sep = ',') # column names #list(master_data.columns) # extract elevant columns to extract from meta data related to the drug - -if in_filename == 'original_tanushree_data_v2.csv': +if in_filename_master == 'original_tanushree_data_v2.csv': meta_data = master_data[['id' , 'country' , 'lineage' @@ -139,7 +150,7 @@ if in_filename == 'original_tanushree_data_v2.csv': , dr_muts_col , other_muts_col]] -if in_filename == 'mtb_gwas_meta_v3.csv': +if in_filename_master == 'mtb_gwas_meta_v3.csv': core_cols = ['id' , 'country' , 'country2' @@ -168,7 +179,8 @@ if in_filename == 'mtb_gwas_meta_v3.csv': meta_data = master_data[cols_to_extract] del(master_data, variable_based_cols, cols_to_extract) -print('Extracted meta data:', meta_data.shape) +print('Extracted meta data from filename:', in_filename_master + , '\nDim:', meta_data.shape) # checks and results total_samples = meta_data['id'].nunique() @@ -193,7 +205,7 @@ print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() , '\n===========================================================') #%% -# !!!IMPORTANT!!! 
sanity check: +# IMPORTANT sanity check: # This is to find out how many samples have 1 and more than 1 mutation,so you # can use it to check if your data extraction process for dr_muts # and other_muts has worked correctly AND also to check the dim of the @@ -203,8 +215,7 @@ print('RESULT: Sus and Res samples:\n', meta_data[drug].value_counts() #======== # First: counting mutations in dr_muts_col column #======== -print('Now counting WT &', gene_match, 'muts within the column:', dr_muts_col) - +print('Now counting WT &', nssnp_match, 'muts within the column:', dr_muts_col) # drop na and extract a clean df clean_df = meta_data.dropna(subset=[dr_muts_col]) @@ -222,11 +233,14 @@ dr_gene_count = 0 wt = 0 id_dr = [] id2_dr = [] +#nssnp_match_regex = re.compile(nssnp_match) for i, id in enumerate(clean_df.id): #print (i, id) #id_dr.append(id) - count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) + #count_gene_dr = clean_df[dr_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_dr = len(re.findall(nssnp_match, clean_df[dr_muts_col].iloc[i], re.IGNORECASE)) + #print(count_gene_dr) if count_gene_dr > 0: id_dr.append(id) if count_gene_dr > 1: @@ -238,8 +252,8 @@ for i, id in enumerate(clean_df.id): print('RESULTS:') print('Total WT in dr_muts_col:', wt) -print('Total matches of', gene_match, 'in dr_muts_col:', dr_gene_count) -print('Total samples with > 1', gene_match, 'muts in dr_muts_col:', len(id2_dr) ) +print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count) +print('Total samples with > 1', gene, 'muts in dr_muts_col:', len(id2_dr) ) print('=================================================================') del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) @@ -247,8 +261,7 @@ del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) #======== # Second: counting mutations in dr_muts_col column #======== -print('Now counting WT &', gene_match, 'muts within the column:', other_muts_col) - +print('Now counting WT &', nssnp_match, 'muts within the column:', other_muts_col) # drop na and extract a clean df clean_df = meta_data.dropna(subset=[other_muts_col]) @@ -270,7 +283,8 @@ id2_other = [] for i, id in enumerate(clean_df.id): #print (i, id) #id_other.append(id) - count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) + #count_gene_other = clean_df[other_muts_col].iloc[i].count(gene_match) # can include stop muts + count_gene_other = len(re.findall(nssnp_match, clean_df[other_muts_col].iloc[i], re.IGNORECASE)) if count_gene_other > 0: id_other.append(id) if count_gene_other > 1: @@ -282,8 +296,8 @@ for i, id in enumerate(clean_df.id): print('RESULTS:') print('Total WT in other_muts_col:', wt_other) -print('Total matches of', gene_match, 'in', other_muts_col, ':', other_gene_count) -print('Total samples with > 1', gene_match, 'muts in other_muts_col:', len(id2_other) ) +print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count) +print('Total samples with > 1', gene, 'muts in other_muts_col:', len(id2_other) ) print('=================================================================') print('Predicting total no. 
of rows in the curated df:', dr_gene_count + other_gene_count @@ -297,14 +311,12 @@ del(i, id, wt_other, clean_df, na_count, id2_other, count_gene_other, count_wt) # extracting dr and other muts separately along with the common cols ############# print('Extracting dr_muts from col:', dr_muts_col, 'with other meta_data') -print('gene to extract:', gene_match ) +print('muts to extract:', nssnp_match ) #=============== # dr mutations: extract gene_match entries with meta data and ONLY dr_muts col #=============== -# FIXME: replace drug with variable containing the drug name -# !!! important !!! -if in_filename == 'original_tanushree_data_v2.csv': +if in_filename_master == 'original_tanushree_data_v2.csv': meta_data_dr = meta_data[['id' ,'country' ,'lineage' @@ -312,7 +324,7 @@ if in_filename == 'original_tanushree_data_v2.csv': ,'drtype' , drug , dr_muts_col]] -if in_filename == 'mtb_gwas_meta_v3.csv': +if in_filename_master == 'mtb_gwas_meta_v3.csv': dr_based_cols = [drug, dr_muts_col] cols_to_extract = core_cols + dr_based_cols print('Extracting', len(cols_to_extract), 'columns from meta data') @@ -329,16 +341,10 @@ else: , '\n===============================================================') sys.exit() -# FIXME FIXME FIXME FIXME -# Extract within this the gene of interest using string match -#meta_gene_dr = meta_data.loc[meta_data[dr_muts_col].str.contains('gene_match*', na = False)] -meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] -print('gene_match in dr:', len(meta_gene_dr)) - -#!!!!! USE THIS ONCE VERIFIED!!!! -meta_gene_dr_snp = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] -print('gene_snp_match in dr:', len(meta_gene_dr_snp)) -#!!!!! USE THIS ONCE VERIFIED!!!! +# Extract within this the nsSNPs for gene of interest using string match +#meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(gene_match, na = False)] +meta_gene_dr = meta_data_dr.loc[meta_data_dr[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +print('gene_snp_match in dr:', len(meta_gene_dr)) dr_id = meta_gene_dr['id'].unique() print('RESULT: No. 
of samples with dr muts in pncA:', len(dr_id)) @@ -355,11 +361,12 @@ else: dr_id = pd.Series(dr_id) #================= -# other mutations: extract gene_match entries from other_muts_col +# other mutations: extract nssnp_match entries from other_muts_col #================== print('Extracting other_muts from:', other_muts_col,'with other meta_data') +print('muts to extract:', nssnp_match) -if in_filename == 'original_tanushree_data_v2.csv': +if in_filename_master == 'original_tanushree_data_v2.csv': meta_data_other = meta_data[['id' , 'country' , 'lineage' @@ -368,7 +375,7 @@ if in_filename == 'original_tanushree_data_v2.csv': , drug , other_muts_col]] -if in_filename == 'mtb_gwas_meta_v3.csv': +if in_filename_master == 'mtb_gwas_meta_v3.csv': other_based_cols = [drug, other_muts_col] cols_to_extract = core_cols + other_based_cols print('Extracting', len(cols_to_extract), 'columns from meta data') @@ -385,16 +392,11 @@ else: , '\n===============================================================') sys.exit() -# FIXME FIXME FIXME FIXME -# Extract within this the gene of interest using string match -meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] -print('gene_match in other:', len(meta_gene_other)) +# Extract within this nSSNP for gene of interest using string match +#meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(gene_match, na = False)] +meta_gene_other = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] +print('gene_snp_match in other:', len(meta_gene_other)) -#!!!!! USE THIS ONCE VERIFIED!!!! -meta_gene_other_snp = meta_data_other.loc[meta_data_other[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False)] -print('gene_snp_match in other:', len(meta_gene_other_snp)) - -#!!!!! USE THIS ONCE VERIFIED!!!! other_id = meta_gene_other['id'].unique() print('RESULT: No. of samples with other muts:', len(other_id)) @@ -445,7 +447,6 @@ print('Expected no. of gene samples:', expected_gene_samples out_filename_cid = gene.lower() + '_common_ids.csv' outfile_cid = outdir + '/' + out_filename_cid -#FIXME: CHECK line len(common_ids) print('Writing file:' , '\nFile:', outfile_cid , '\nNo. 
of rows:', len(common_ids) @@ -458,12 +459,12 @@ del(out_filename_cid) # clear variables del(dr_id, other_id, meta_data_dr, meta_data_other, common_ids, common_mut_ids, common_ids2) -#%% Now extract 'all' pncA mutations: i.e 'gene_match*' -print('extracting from string match:', gene_match, 'mutations from cols:\n' +#%% Now extract 'all' gene specific nsSNP mutations: i.e 'nssnp_match' +print('Extracting nsSNP match:', gene, 'mutations from cols:\n' , dr_muts_col, 'and', other_muts_col, 'using string match:' , '\n===================================================================') -#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match) | meta_data[other_muts_col].str.contains(gene_match) ] -meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ] +#meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(gene_match, na = False) | meta_data[other_muts_col].str.contains(gene_match, na = False) ] +meta_gene_all = meta_data.loc[meta_data[dr_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) | meta_data[other_muts_col].str.contains(nssnp_match, na = False, regex = True, case = False) ] extracted_gene_samples = meta_gene_all['id'].nunique() print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples @@ -472,7 +473,7 @@ print('RESULT: actual no. of gene samples extracted:', extracted_gene_samples # sanity check: length of gene samples print('Performing sanity check:') if extracted_gene_samples == expected_gene_samples: - print('PASS: expected & actual no. of gene samples match' + print('PASS: expected & actual no. of nssnp gene samples match' , '\nNo. of gene samples:', len(meta_gene_all) , '\n=========================================================') else: @@ -484,13 +485,13 @@ print('No. of gene samples without', drug, 'testing:', gene_na) # use it later to check number of complete samples from LF data comp_gene_samples = len(meta_gene_all) - gene_na -print('comp gene samples tested for', drug, ':', comp_gene_samples) +print('Complete gene samples tested for', drug, ':', comp_gene_samples) print('=================================================================') # Comment: This is still dirty data since these -# are samples that have gene_match muts, but can have others as well +# are samples that have nsSNP muts, but can have others as well # since the format for mutations is mut1; mut2, etc. -print('This is still dirty data: samples have ', gene_match, 'muts but may have others as well' +print('This is still dirty data: samples have ', nssnp_match, 'muts but may have others as well' , '\nsince the format for mutations is mut1; mut2, etc.' 
, '\n=============================================================') @@ -511,17 +512,18 @@ dr_WF0 = tidy_split(meta_gene_dr, col_to_split1, sep = ';') # remove leading white space else these are counted as distinct mutations as well dr_WF0[dr_muts_col] = dr_WF0[dr_muts_col].str.lstrip() -# extract only the samples/rows with gene_match -dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] - -print('Lengths after tidy split and extracting', gene_match, 'muts:' - , '\nold length:' , len(meta_gene_dr) - , '\nlen after split:', len(dr_WF0) - , '\ndr_gene_WF0 length:', len(dr_gene_WF0) - , '\nexpected len:', dr_gene_count) +# extract only the samples/rows with nssnp_match +#dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(gene_match)] +dr_gene_WF0 = dr_WF0.loc[dr_WF0[dr_muts_col].str.contains(nssnp_match, regex = True, case = False)] +print('Lengths after tidy split and extracting', nssnp_match, 'muts:' + , '\nOld length:' , len(meta_gene_dr) + , '\nLength after split:', len(dr_WF0) + , '\nLength of nssnp df:', len(dr_gene_WF0) + , '\nExpected len:', dr_gene_count + , '\n=============================================================') if len(dr_gene_WF0) == dr_gene_count: - print('PASS: length of dr_gene_WF0 match with expected length' + print('PASS: length matches expected length' , '\n===============================================================') else: sys.exit('FAIL: lengths mismatch') @@ -547,8 +549,7 @@ if len(dr_gene_WF0) == c1.sum(): else: sys.exit('FAIL: length mismatch!') -#!!! Important !!! -# Assign 'column name' on which split was performed as an extra column +# Important: Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df dr_df = dr_gene_WF0.assign(mutation_info = dr_muts_col) print('Dim of dr_df:', dr_df.shape @@ -573,17 +574,18 @@ other_WF1 = tidy_split(meta_gene_other, col_to_split2, sep = ';') # remove the leading white spaces in the column other_WF1[other_muts_col] = other_WF1[other_muts_col].str.strip() -# extract only the samples/rows with gene_match -other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] +# extract only the samples/rows with nssnp_match +#other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(gene_match)] +other_gene_WF1 = other_WF1.loc[other_WF1[other_muts_col].str.contains(nssnp_match, regex = True, case = False)] print('Lengths after tidy split and extracting', gene_match, 'muts:', - '\nold length:' , len(meta_gene_other), - '\nlen after split:', len(other_WF1), - '\nother_gene_WF1 length:', len(other_gene_WF1), - '\nexpected len:', other_gene_count) - + '\nOld length:' , len(meta_gene_other), + '\nLength after split:', len(other_WF1), + '\nLength of nssnp df:', len(other_gene_WF1), + '\nExpected len:', other_gene_count + , '\n=============================================================') if len(other_gene_WF1) == other_gene_count: - print('PASS: length matches with expected length' + print('PASS: length matches expected length' , '\n===============================================================') else: sys.exit('FAIL: lengths mismatch') @@ -607,8 +609,7 @@ if len(other_gene_WF1) == c2.sum(): else: sys.exit('FAIL: Length mismatch') -#!!! Important !!! 
-# Assign 'column name' on which split was performed as an extra column +# Important: Assign 'column name' on which split was performed as an extra column # This is so you can identify if mutations are dr_type or other in the final df other_df = other_gene_WF1.assign(mutation_info = other_muts_col) print('dim of other_df:', other_df.shape @@ -620,8 +621,7 @@ print('dim of other_df:', other_df.shape #========== # Concatentating the two dfs: equivalent of rbind in R #========== -#!!! important !!! -# change column names to allow concat: +# Important: Change column names to allow concat: # dr_muts.. & other_muts : 'mutation' print('Now concatenating the two dfs by row' , '\nFirst assigning a common colname: "mutation" to the col containing muts' @@ -638,9 +638,9 @@ other_df.columns if len(dr_df.columns) == len(other_df.columns): print('Checking dfs for concatening by rows:' - , '\ndr_df dim:', dr_df.shape - , '\nother_df dim:', other_df.shape - , '\nexpected nrows:', len(dr_df) + len(other_df) + , '\nDim of dr_df:', dr_df.shape + , '\nDim of other_df:', other_df.shape + , '\nExpected nrows:', len(dr_df) + len(other_df) , '\n=============================================================') else: sys.exit('FAIL: No. of cols mismatch for concatenating') @@ -656,13 +656,15 @@ else: print('Now appending the two dfs: Rbind') gene_LF_comb = pd.concat([dr_df, other_df], ignore_index = True, axis = 0) -stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum() print('Finding stop mutations in concatenated df') -if stop_muts > 0: +stop_muts = gene_LF_comb['mutation'].str.contains('\*').sum() +if stop_muts == 0: + print('PASS: No stop mutations detected') +else: print('stop mutations detected' , '\nNo. of stop muts:', stop_muts, '\n' , gene_LF_comb.groupby(['mutation_info'])['mutation'].apply(lambda x: x[x.str.contains('\*')].count()) - , '\nNow removing them') + , '\nNow removing them') gene_LF0_nssnp = gene_LF_comb[~gene_LF_comb['mutation'].str.contains('\*')] print('snps only: by subtracting stop muts:', len(gene_LF0_nssnp)) @@ -671,7 +673,7 @@ gene_LF0 = gene_LF_comb[gene_LF_comb['mutation'].str.contains(nssnp_match, case print('snps only by direct regex:', len(gene_LF0)) if gene_LF0_nssnp.equals(gene_LF0): - print('PASS: regex for extracting nssnp worked correctly & stop mutations succeessfully removed' + print('PASS: regex for extracting nssnp worked correctly & stop mutations successfully removed' , '\nUsing the regex extracted df') else: sys.exit('FAIL: posssibly regex or no of stop mutations' @@ -697,11 +699,11 @@ else: ########### # This is hopefully clean data, but just double check # Filter LF data so that you only have -# mutations corresponding to gene_match* (string match pattern) +# mutations corresponding to nssnp_match (case insensitive) # this will be your list you run OR calcs ########### print('Length of gene_LF0:', len(gene_LF0) - , '\nThis should be what we need. But just double checking and extracting nssnp for', gene_match + , '\nThis should be what we need. But just double checking and extracting nsSNP for', gene , '\nfrom LF0 (concatenated data) using case insensitive regex match:', nssnp_match) gene_LF1 = gene_LF0[gene_LF0['mutation'].str.contains(nssnp_match, regex = True, case = False)] @@ -722,12 +724,9 @@ print('Length of dfs pre and post processing...' 
, '\ngene LF data (unique mutation in each row):', len(gene_LF1) , '\n=============================================================') -if extracted_gene_samples > len(gene_LF1): - print('Stop mutations removed after concatentaing') #%% sanity check for extraction -# FIXME: This ought to pass if nsnsp_match happens right at the beginning of creating -#expected_rows +# This ought to pass if nsnsp_match happens right at the beginning of creating 'expected_rows' print('Verifying whether extraction process worked correctly...') if len(gene_LF1) == expected_rows: print('PASS: extraction process performed correctly' @@ -741,12 +740,11 @@ else: , '\nGot:', len(gene_LF1) , '\nDebug please' , '\n=========================================================') -#%% FIXME FIXME FIXME FIXME -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +#%% print('Performing some more sanity checks...') -# From LF1: -# no. of unique muts: useful for OR counts +# From LF1: useful for OR counts +# no. of unique muts distinct_muts = gene_LF1.mutation.value_counts() print('Distinct genomic mutations:', len(distinct_muts)) @@ -767,7 +765,8 @@ else: , '\nExpected count:', len(distinct_muts) , '\nActual count:', mut_grouped.sum() , '\nMuts should be distinct within dr* and other* type' - , '\nInspecting...' + , '\nInspecting...possibly ambiguous muts' + , '\nNo. of possible ambiguous muts:', mut_grouped.sum() - len(distinct_muts) , '\n=========================================================') muts_split = list(gene_LF1.groupby('mutation_info')) @@ -776,15 +775,12 @@ other_muts = muts_split[1][1].mutation print('splitting muts by mut_info:', muts_split) print('no.of dr_muts samples:', len(dr_muts)) print('no. of other_muts samples', len(other_muts)) -#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -# FIXME FIXME FIXME + #%% -# !!! IMPORTANT !!!! 
-# sanity check: There should not be any common muts -# i.e the same mutation cannot be classed as a drug AND 'others' +# IMPORTANT: The same mutation cannot be classed as a drug AND 'others' if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('WARNING: Ambiguous muts detected in dr_ and other_ mutation category' - , '\n===============================================================') + , '\n===============================================================') else: print('PASS: NO ambiguous muts detected' , '\nMuts are unique to dr_ and other_ mutation class' @@ -834,7 +830,6 @@ outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts print('Writing file: ambiguous muts' , '\nFilename:', outfile_ambig_muts) -#common_muts = ['gene_matchVal180Phe','gene_matchGln10Pro'] # test inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] inspect.to_csv(outfile_ambig_muts, index = False) @@ -869,8 +864,10 @@ ncol_aa_add = 2 # 2 aa prop add (wt & mut) in each mapping # After importing, convert to mutation to lowercase for compatibility with dict #=========== gene_LF1['mutation'] = gene_LF1.loc[:, 'mutation'].str.lower() -gene_regex = gene_match.lower()+'(\w{3})' -print('gene regex being used:', gene_regex) + +print('wt regex being used:', wt_regex + , '\nmut regex being used:', mut_regex + , '\nposition regex being used:', pos_regex) mylen0 = len(gene_LF1.columns) #======= @@ -888,14 +885,16 @@ print('Adding', ncol_mutf_add, 'more cols:\n') lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['one_letter_code'] -# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on - wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + #wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze()converts to a series that map works on + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() gene_LF1['wild_type'] = wt.map(lookup_dict) - mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() gene_LF1['mutant_type'] = mut.map(lookup_dict) # extract position info from mutation column separetly using string match -gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') +#gene_LF1['position'] = gene_LF1['mutation'].str.extract(r'(\d+)') +gene_LF1['position'] = gene_LF1['mutation'].str.extract(pos_regex) mylen1 = len(gene_LF1.columns) @@ -939,10 +938,10 @@ lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['aa_prop_water'] #print(lookup_dict) -# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on - wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() gene_LF1['wt_prop_water'] = wt.map(lookup_dict) - mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + #mut = gene_LF1['mutation'].str.extract('\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() gene_LF1['mut_prop_water'] = mut.map(lookup_dict) mylen2 = len(gene_LF1.columns) @@ -987,10 +986,10 @@ lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['aa_prop_polarity'] #print(lookup_dict) -# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on - wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() 
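+    # wt_regex (e.g. 'pnca_p.(\w{3})' when gene is pncA) captures the 3-letter
+    # wild-type code from the lowercased mutation string; lookup_dict, built just
+    # above from my_aa_dict, then maps that code to its polarity class.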
gene_LF1['wt_prop_polarity'] = wt.map(lookup_dict) - mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + #mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() gene_LF1['mut_prop_polarity'] = mut.map(lookup_dict) mylen3 = len(gene_LF1.columns) @@ -1033,10 +1032,9 @@ lookup_dict = dict() for k, v in my_aa_dict.items(): lookup_dict[k] = v['aa_calcprop'] #print(lookup_dict) -# wt = gene_LF1['mutation'].str.extract('gene_p.(\w{3})').squeeze() # converts to a series that map works on - wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() + wt = gene_LF1['mutation'].str.extract(wt_regex).squeeze() gene_LF1['wt_calcprop'] = wt.map(lookup_dict) - mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() + mut = gene_LF1['mutation'].str.extract(mut_regex).squeeze() gene_LF1['mut_calcprop'] = mut.map(lookup_dict) mylen4 = len(gene_LF1.columns) @@ -1068,50 +1066,6 @@ else: # clear variables del(k, v, wt, mut, lookup_dict) -#======== -# iterate through the dict, create a lookup dict that i.e -# lookup_dict = {three_letter_code: aa_taylor} -# Do this for both wild_type and mutant as above. -# caution: taylor mapping will create a list within a column -#========= -#print('Adding', ncol_aa_add, 'more cols:\n') -#lookup_dict = dict() -#for k, v in my_aa_dict.items(): -# lookup_dict[k] = v['aa_taylor'] - #print(lookup_dict) -# wt = gene_LF1['mutation'].str.extract(gene_regex).squeeze() -# gene_LF1['wt_taylor'] = wt.map(lookup_dict) -# mut = gene_LF1['mutation'].str.extract(r'\d+(\w{3})$').squeeze() -# gene_LF1['mut_taylor'] = mut.map(lookup_dict) - -#mylen5 = len(gene_LF1.columns) - -# sanity checks -#print('checking if 3-letter wt&mut residue extraction worked correctly') -#if wt.isna().sum() & mut.isna().sum() == 0: -# print('PASS: 3-letter wt&mut residue extraction worked correctly:' -# , '\nNo NAs detected:' -# , '\nwild-type\n', wt -# , '\nmutant-type\n', mut -# , '\ndim of df:', gene_LF1.shape) -#else: -# print('FAIL: 3-letter wt&mut residue extraction failed' -# , '\nNo NAs detected:' -# , '\nwild-type\n', wt -# , '\nmutant-type\n', mut -# , '\ndim of df:', gene_LF1.shape) - -#if mylen5 == mylen4 + ncol_aa_add: -# print('PASS: successfully added', ncol_aa_add, 'cols' -# , '\nold length:', mylen4 -# , '\nnew len:', mylen5) -#else: -# print('FAIL: failed to add cols:' -# , '\nold length:', mylen4 -# , '\nnew len:', mylen5) -# clear variables -#del(k, v, wt, mut, lookup_dict) - ######## # combine the wild_type+poistion+mutant_type columns to generate # mutationinformation (matches mCSM output field) @@ -1119,7 +1073,7 @@ del(k, v, wt, mut, lookup_dict) ######### gene_LF1['mutationinformation'] = gene_LF1['wild_type'] + gene_LF1.position.map(str) + gene_LF1['mutant_type'] print('Created column: mutationinformation' - , '\n=====================================================================' + , '\n=====================================================================\n' , gene_LF1.mutationinformation.head(10)) #%% Write file: mCSM muts @@ -1142,7 +1096,7 @@ out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv' outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps print('Writing file: mCSM style muts' - , '\nFilename:', outfile_mcsmsnps + , '\nFile:', outfile_mcsmsnps , '\nmutation format (SNP): {WT}{MUT}' , '\nNo. of distinct muts:', len(snps_only) , '\nNo. 
of distinct positions:', len(pos_only) @@ -1161,7 +1115,7 @@ del(out_filename_mcsmsnps) out_filename_metadata = gene.lower() + '_metadata.csv' outfile_metadata = outdir + '/' + out_filename_metadata print('Writing file: LF formatted data' - , '\nFilename:', outfile_metadata + , '\nFile:', outfile_metadata , '\n============================================================') gene_LF1.to_csv(outfile_metadata, header = True, index = False) @@ -1202,7 +1156,7 @@ out_filename_msa = gene.lower() +'_all_muts_msa.csv' outfile_msa = outdir + '/' + out_filename_msa print('Writing file: mCSM style muts for msa', - '\nFilename:', outfile_msa, + '\nFile:', outfile_msa, '\nmutation format (SNP): {WT}{MUT}', '\nNo.of lines of msa:', len(all_muts_msa)) @@ -1232,8 +1186,8 @@ out_filename_pos = gene.lower() + '_mutational_positons.csv' outfile_pos = outdir + '/' + out_filename_pos print('Writing file: mutational positions' - , '\nNo. of distinct positions:', len(pos_only_sorted) , '\nFile:', outfile_pos + , '\nNo. of distinct positions:', len(pos_only_sorted) , '\n=============================================================') pos_only_sorted.to_csv(outfile_pos, header = True, index = False)
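A minimal, self-contained sketch (separate from the patch itself) of how the regexes this commit introduces behave on single mutation entries, i.e. on values as they look after tidy_split(). It assumes gene = 'pncA', as in the commented-out example near the top of the script; the sample Series and the three-entry lookup_dict stand-in for my_aa_dict are illustrative only.

import pandas as pd

gene        = 'pncA'
gene_match  = gene + '_p.'                            # 'pncA_p.'
nssnp_match = gene_match + '[A-Z]{3}[0-9]+[A-Z]{3}'   # e.g. pncA_p.Ala134Gly
wt_regex    = gene_match.lower() + r'(\w{3})'         # 3-letter wild-type residue
mut_regex   = r'\d+(\w{3})$'                          # 3-letter mutant residue
pos_regex   = r'(\d+)'                                # position

# three single-mutation entries: two missense, one stop mutation (illustrative values)
muts = pd.Series(['pncA_p.Ala134Gly', 'pncA_p.Trp68Gly', 'pncA_p.Gln10*'])

# nsSNP filter, as used with str.contains(..., regex = True, case = False) in the patch
is_nssnp = muts.str.contains(nssnp_match, regex = True, case = False, na = False)
print(is_nssnp.tolist())          # [True, True, False]: the stop mutation drops out

# wt/position/mut extraction on the lowercased mutation, mirroring the script
snps = muts[is_nssnp].str.lower()
wt  = snps.str.extract(wt_regex).squeeze()            # 'ala', 'trp'
pos = snps.str.extract(pos_regex).squeeze()           # '134', '68'
mt  = snps.str.extract(mut_regex).squeeze()           # 'gly', 'gly'

# tiny illustrative stand-in for my_aa_dict's one_letter_code lookup
lookup_dict = {'ala': 'A', 'trp': 'W', 'gly': 'G'}
print((wt.map(lookup_dict) + pos + mt.map(lookup_dict)).tolist())
# ['A134G', 'W68G']: the mCSM-style 'mutationinformation' strings

Because mut_regex is anchored on the end of the string ('$'), it only behaves as intended on one-mutation-per-row data; that is why the script applies it after tidy_split() and after stop mutations (which end in '*') have been removed.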