From 6b6921d45f7c987c42a62157c2db05ea4072070a Mon Sep 17 00:00:00 2001 From: Tanushree Tunstall Date: Tue, 9 Feb 2021 16:03:02 +0000 Subject: [PATCH] work from thinkpad --- scripts/data_extraction.py | 117 ++++++++++++++++++++----------------- scripts/running_scripts | 34 +++++++---- 2 files changed, 86 insertions(+), 65 deletions(-) diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py index 98a4415..6d46a7d 100755 --- a/scripts/data_extraction.py +++ b/scripts/data_extraction.py @@ -81,8 +81,8 @@ indir = args.input_dir outdir = args.output_dir make_dirs = args.make_dirs -#drug = 'ethambutol' -#gene = 'embB' +#drug = 'streptomycin' +#gene = 'gid' #%% input and output dirs and files #======= @@ -122,15 +122,15 @@ if make_dirs: # handle missing dirs here if not os.path.isdir(datadir): print('ERROR: Data directory does not exist:', datadir - , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs') + , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs') sys.exit() if not os.path.isdir(indir): print('ERROR: Input directory does not exist:', indir - , '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs') + , '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs') sys.exit() if not os.path.isdir(outdir): print('ERROR: Output directory does not exist:', outdir - , '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs') + , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs') sys.exit() # Requires @@ -317,7 +317,7 @@ for i, id in enumerate(clean_df.id): print('RESULTS:') print('Total WT in dr_muts_col:', wt) print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count) -print('Total samples with > 1', gene, 'muts in dr_muts_col:', len(id2_dr) ) +print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) ) print('=================================================================') del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt) @@ -361,7 +361,7 @@ for i, id in enumerate(clean_df.id): print('RESULTS:') print('Total WT in other_muts_col:', wt_other) print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count) -print('Total samples with > 1', gene, 'muts in other_muts_col:', len(id2_other) ) +print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) ) print('=================================================================') print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count @@ -851,7 +851,7 @@ else: , '\nMuts are unique to dr_ and other_ mutation class' , '\n=========================================================') -# inspect dr_muts and other muts +# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected! if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: print('Finding ambiguous muts...' , '\n=========================================================' @@ -860,24 +860,37 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: , '\n=========================================================' , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum() , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)] - , '\n=========================================================') + , '\n=========================================================') + + print('Counting no. of ambiguous muts...' + , '\nNo. of ambiguous muts in dr:' + , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) + , '\nNo. of ambiguous muts in other:' + , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) + , '\n=========================================================') + + if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): + common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() + print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) + , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') + print('\n===========================================================') else: - sys.exit('Error: ambiguous muts present, but extraction failed. Debug!') - -print('Counting no. of ambiguous muts...') - -if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): - common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() - print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) - , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') - print('\n===========================================================') -else: - print('Error: ambiguous muts detected, but extraction failed. Debug!' - , '\nNo. of ambiguous muts in dr:' - , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) - , '\nNo. of ambiguous muts in other:' - , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) - , '\n=========================================================') + #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!') + print('No: ambiguous muts present') + +#print('Counting no. of ambiguous muts...') +#if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): +# common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist() +# print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts)) +# , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n') +# print('\n===========================================================') +#else: +# print('Error: ambiguous muts detected, but extraction failed. Debug!' +# , '\nNo. of ambiguous muts in dr:' +# , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()) +# , '\nNo. of ambiguous muts in other:' +# , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist()) +# , '\n=========================================================') #%% clear variables del(id_dr, id_other @@ -893,25 +906,24 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m #print(outdir) #dr_muts.to_csv('dr_muts.csv', header = True) #other_muts.to_csv('other_muts.csv', header = True) +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' + outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts + print('\n----------------------------------' + , '\nWriting file: ambiguous muts' + , '\n----------------------------------' + , '\nFilename:', outfile_ambig_muts) + inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] + inspect.to_csv(outfile_ambig_muts, index = False) -out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv' -outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts -print('\n----------------------------------' - , '\nWriting file: ambiguous muts' - , '\n----------------------------------' - , '\nFilename:', outfile_ambig_muts) + print('Finished writing:', out_filename_ambig_muts + , '\nNo. of rows:', len(inspect) + , '\nNo. of cols:', len(inspect.columns) + , '\nNo. of rows = no. of samples with the ambiguous muts present:' + , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() + , '\n=============================================================') + del(out_filename_ambig_muts) -inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)] -inspect.to_csv(outfile_ambig_muts, index = False) - -print('Finished writing:', out_filename_ambig_muts - , '\nNo. of rows:', len(inspect) - , '\nNo. of cols:', len(inspect.columns) - , '\nNo. of rows = no. of samples with the ambiguous muts present:' - , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum() - , '\n=============================================================') - -del(out_filename_ambig_muts) #%% end of data extraction and some files writing. Below are some more files writing. #============================================================================= #%% Formatting df: read aa dict and pull relevant info @@ -1181,7 +1193,7 @@ if snps_only.mutationinformation.isna().sum() == 0: else: sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?') -out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv' +out_filename_mcsmsnps = gene.lower() + '_mcsm_style_snps.csv' outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps print('\n----------------------------------' @@ -1215,7 +1227,7 @@ metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = T out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv' outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts print('\n----------------------------------' - , 'Writing file: Metadata poscounts' + , '\nWriting file: Metadata poscounts' , '\n----------------------------------' , '\nFile:', outfile_metadata_poscounts , '\n============================================================') @@ -1309,7 +1321,7 @@ out_filename_pos = gene.lower() + '_mutational_positons.csv' outfile_pos = outdir + '/' + out_filename_pos print('\n----------------------------------' - , 'Writing file: mutational positions' + , '\nWriting file: mutational positions' , '\n----------------------------------' , '\nFile:', outfile_pos , '\nNo. of distinct positions:', len(pos_only_sorted) @@ -1349,15 +1361,14 @@ print('============================================' , '\nTotal no. of samples with missense muts:', len(gene_LF1) , '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique() , '\n' - , '\nTotal no.of samples with common_ids:', nu_common_ids['id'] - , '\nTotal no.of samples with ambiguous muts:', len(inspect) - #, '\nTotal no.of unique ambiguous muts:', len(common_muts) - , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique() - , '\n=============================================================' - , '\n\n\n') - - + , '\nTotal no.of samples with common_ids:', nu_common_ids['id']) +if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0: + print('\nTotal no.of samples with ambiguous muts:', len(inspect) + #, '\nTotal no.of unique ambiguous muts:', len(common_muts) + , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique() + , '\n=============================================================' + , '\n\n\n') #======================================================================= print(u'\u2698' * 50, '\nEnd of script: Data extraction and writing files' diff --git a/scripts/running_scripts b/scripts/running_scripts index 646c654..ee109db 100644 --- a/scripts/running_scripts +++ b/scripts/running_scripts @@ -1,32 +1,42 @@ #======== # data extraction: Must be run first to extract mutations for each drug-gene combination #======== -./data_extraction.py -d pyrazinamide -g pncA +./data_extraction.py -d -g --make_dirs + +#======== +# add chains to a PDB file: for modeller models lack chain ID, this script is used +#======== +add_chains_pdb.py MY_PDB.pdb + +#======== +# pdb data extraction: To find out discontinuity of chain and removing invalid muts to allow foldx and mcsm to run properly! +#======== +In progress... #======== # foldx: specify chain if default is NOT 'A' #======== -./runFoldx.py -d pyrazinamide -g pncA +./runFoldx.py -d -g -c1 A -p /media/tanu/eb1d072a-3f73-427f-aeb8-f6852b5c5216/Data/processing #======== # mcsm: specify chain if default is NOT 'A' #======== -./run_mcsm.py -d pyrazinamide -g pncA -s submit -l PZA --debug -./run_mcsm.py -d pyrazinamide -g pncA -s get -./run_mcsm.py -d pyrazinamide -g pncA -s format +./run_mcsm.py -d -g -s submit -l PZA --debug +./run_mcsm.py -d -g pncA -s get +./run_mcsm.py -d -g pncA -s format #==================== # other struct params #==================== -./dssp_df.py -d pyrazinamide -g pncA +./dssp_df.py -d -g # Errors on matplot.lib warn=, so just comment it out for the timebeing!: MONKEY PATCH -./kd_df.py -d pyrazinamide -g pncA -fasta # fixme: NO of cols says 2, but is actually 3 -./rd_df.py -d pyrazinamide -g pncA # fixme: input tsv file is sourced manually from website! +./kd_df.py -d -g -fasta # fixme: NO of cols says 2, but is actually 3 +./rd_df.py -d -g # fixme: input tsv file is sourced manually from website! #============================== # af_or calcs: different types #============================== -./af_or_calcs.R --d pyrazinamide --gene pncA # fixme: No conditional dir structure +./af_or_calcs.R -d -g # fixme: No conditional dir structure #============================== # af_or calcs: kinship @@ -40,18 +50,18 @@ USE THE BELOW from within the or_kinship_link.py script or something?! as part o # for now use the file already created using some manual wrestling to link # the OR for kinship with mutations -./or_kinship_link.py -d pyrazinamide -g pncA -sc 2288681 -ec 2289241 +./or_kinship_link.py -d -g -sc -ec #============================== # formatting: ns_snp_info.txt #============================== # This adds mcsm style muts -./snpinfo_format.py -d pyrazinamide -g pncA +./snpinfo_format.py -d -g #============================== # combining dfs: combining_dfs.py #============================== # FIXME: combining_FIXME.py -./combining_dfs.py -d pyrazinamide -g pncA +./combining_dfs.py --d -g