From 6b6921d45f7c987c42a62157c2db05ea4072070a Mon Sep 17 00:00:00 2001
From: Tanushree Tunstall <tanu@tunstall.in>
Date: Tue, 9 Feb 2021 16:03:02 +0000
Subject: [PATCH] work from thinkpad

---
 scripts/data_extraction.py | 117 ++++++++++++++++++++-----------------
 scripts/running_scripts    |  34 +++++++----
 2 files changed, 86 insertions(+), 65 deletions(-)

diff --git a/scripts/data_extraction.py b/scripts/data_extraction.py
index 98a4415..6d46a7d 100755
--- a/scripts/data_extraction.py
+++ b/scripts/data_extraction.py
@@ -81,8 +81,8 @@ indir   = args.input_dir
 outdir  = args.output_dir
 make_dirs  = args.make_dirs
 
-#drug = 'ethambutol'
-#gene = 'embB'
+#drug = 'streptomycin'
+#gene = 'gid'
 
 #%% input and output dirs and files
 #=======
@@ -122,15 +122,15 @@ if make_dirs:
 # handle missing dirs here
 if not os.path.isdir(datadir):
     print('ERROR: Data directory does not exist:', datadir
-    , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option ---make_dirs')
+    , '\nPlease create and ensure gwas data is present and then rerun\nelse specify cmd option --make_dirs')
     sys.exit()
 if not os.path.isdir(indir):
     print('ERROR: Input directory does not exist:', indir
-    , '\nPlease either create or specify indir and rerun\nelse specify cmd option ---make_dirs')
+    , '\nPlease either create or specify indir and rerun\nelse specify cmd option --make_dirs')
     sys.exit()
 if not os.path.isdir(outdir):
     print('ERROR: Output directory does not exist:', outdir
-    , '\nPlease create or specify outdir and rerun\nelse specify cmd option ---make_dirs')
+    , '\nPlease create or specify outdir and rerun\nelse specify cmd option --make_dirs')
     sys.exit()
     
 # Requires
@@ -317,7 +317,7 @@ for i, id in enumerate(clean_df.id):
 print('RESULTS:')        
 print('Total WT in dr_muts_col:', wt)
 print('Total matches of', gene, 'SNP matches in', dr_muts_col, ':', dr_gene_count)
-print('Total samples with > 1', gene, 'muts in dr_muts_col:', len(id2_dr) )
+print('Total samples with > 1', gene, 'nsSNPs in dr_muts_col:', len(id2_dr) )
 print('=================================================================')
 
 del(clean_df, na_count, i, id, wt, id2_dr, count_gene_dr, count_wt)
@@ -361,7 +361,7 @@ for i, id in enumerate(clean_df.id):
 print('RESULTS:')
 print('Total WT in other_muts_col:', wt_other)
 print('Total matches of', gene, 'SNP matches in', other_muts_col, ':', other_gene_count)
-print('Total samples with > 1', gene, 'muts in other_muts_col:', len(id2_other) )
+print('Total samples with > 1', gene, 'nsSNPs in other_muts_col:', len(id2_other) )
 print('=================================================================')
 
 print('Predicting total no. of rows in the curated df:', dr_gene_count + other_gene_count
@@ -851,7 +851,7 @@ else:
           , '\nMuts are unique to dr_ and other_ mutation class'
           , '\n=========================================================')
 
-# inspect dr_muts and other muts    
+# inspect dr_muts and other muts: Fixed in case no ambiguous muts detected!
 if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
     print('Finding ambiguous muts...'
           , '\n========================================================='
@@ -860,24 +860,37 @@ if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
           , '\n========================================================='
           , '\nTotal no. of samples in other_muts present in dr_muts:', other_muts.isin(dr_muts).sum()
           , '\nThese are:\n', other_muts[other_muts.isin(dr_muts)]
-          , '\n=========================================================')    
+          , '\n=========================================================')
+    
+    print('Counting no. of ambiguous muts...'
+          , '\nNo. of ambiguous muts in dr:'
+          , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
+          , '\nNo. of ambiguous muts in other:'
+          , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
+          , '\n=========================================================')
+    
+    if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
+        common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
+        print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
+              , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
+        print('\n===========================================================') 
 else:
-    sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
-
-print('Counting no. of ambiguous muts...') 
-
-if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
-    common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
-    print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
-          , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
-    print('\n===========================================================')
-else:
-   print('Error: ambiguous muts detected, but extraction failed. Debug!'
-         , '\nNo. of ambiguous muts in dr:'
-         , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
-         , '\nNo. of ambiguous muts in other:'
-         , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
-         , '\n=========================================================')       
+    #sys.exit('Error: ambiguous muts present, but extraction failed. Debug!')
+    print('No: ambiguous muts present')
+    
+#print('Counting no. of ambiguous muts...')
+#if dr_muts[dr_muts.isin(other_muts)].nunique() == other_muts[other_muts.isin(dr_muts)].nunique(): 
+#    common_muts = dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist()
+#    print('Distinct no. of ambigiuous muts detected:'+ str(len(common_muts))
+#          , '\nlist of ambiguous mutations (see below):', *common_muts, sep = '\n')
+#    print('\n===========================================================')
+#else:
+#   print('Error: ambiguous muts detected, but extraction failed. Debug!'
+#         , '\nNo. of ambiguous muts in dr:'
+#         , len(dr_muts[dr_muts.isin(other_muts)].value_counts().keys().tolist())
+#         , '\nNo. of ambiguous muts in other:'
+#         , len(other_muts[other_muts.isin(dr_muts)].value_counts().keys().tolist())
+#         , '\n=========================================================')       
          
 #%% clear variables
 del(id_dr, id_other
@@ -893,25 +906,24 @@ del(c1, c2, col_to_split1, col_to_split2, comp_gene_samples, dr_WF0, dr_df, dr_m
 #print(outdir)
 #dr_muts.to_csv('dr_muts.csv', header = True)
 #other_muts.to_csv('other_muts.csv', header = True)
+if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
+    out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
+    outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
+    print('\n----------------------------------'
+          , '\nWriting file: ambiguous muts'
+          , '\n----------------------------------'
+          , '\nFilename:', outfile_ambig_muts)
+    inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
+    inspect.to_csv(outfile_ambig_muts, index = False)
 
-out_filename_ambig_muts = gene.lower() + '_ambiguous_muts.csv'
-outfile_ambig_muts = outdir + '/' + out_filename_ambig_muts
-print('\n----------------------------------'
-      , '\nWriting file: ambiguous muts'
-      , '\n----------------------------------'
-      , '\nFilename:', outfile_ambig_muts)
+    print('Finished writing:', out_filename_ambig_muts
+          , '\nNo. of rows:', len(inspect)
+          , '\nNo. of cols:', len(inspect.columns)
+          , '\nNo. of rows = no. of samples with the ambiguous muts present:'
+          , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
+          , '\n=============================================================')
+    del(out_filename_ambig_muts)
 
-inspect = gene_LF1[gene_LF1['mutation'].isin(common_muts)]
-inspect.to_csv(outfile_ambig_muts, index = False)
-
-print('Finished writing:', out_filename_ambig_muts
-      , '\nNo. of rows:', len(inspect)
-      , '\nNo. of cols:', len(inspect.columns)
-      , '\nNo. of rows = no. of samples with the ambiguous muts present:'
-      , dr_muts.isin(other_muts).sum() + other_muts.isin(dr_muts).sum()
-      , '\n=============================================================')
-      
-del(out_filename_ambig_muts)
 #%% end of data extraction and some files writing. Below are some more files writing.
 #=============================================================================
 #%% Formatting df: read aa dict and pull relevant info
@@ -1181,7 +1193,7 @@ if snps_only.mutationinformation.isna().sum() == 0:
 else:
     sys.exit('FAIL: SNP has NA, Possible mapping issues from dict?')
 
-out_filename_mcsmsnps = gene.lower() + '_mcsm_snps.csv'
+out_filename_mcsmsnps = gene.lower() + '_mcsm_style_snps.csv'
 outfile_mcsmsnps = outdir + '/' + out_filename_mcsmsnps
 
 print('\n----------------------------------'
@@ -1215,7 +1227,7 @@ metadata_pos.sort_values(by = ['meta_pos_count'], ascending = False, inplace = T
 out_filename_metadata_poscounts = gene.lower() + '_metadata_poscounts.csv'
 outfile_metadata_poscounts = outdir + '/' + out_filename_metadata_poscounts
 print('\n----------------------------------'
-      , 'Writing file: Metadata poscounts'
+      , '\nWriting file: Metadata poscounts'
       , '\n----------------------------------'
       , '\nFile:', outfile_metadata_poscounts
       , '\n============================================================')
@@ -1309,7 +1321,7 @@ out_filename_pos = gene.lower() + '_mutational_positons.csv'
 outfile_pos = outdir + '/' + out_filename_pos
 
 print('\n----------------------------------'
-      , 'Writing file: mutational positions'
+      , '\nWriting file: mutational positions'
       , '\n----------------------------------'
       , '\nFile:', outfile_pos
       , '\nNo. of distinct positions:', len(pos_only_sorted)
@@ -1349,15 +1361,14 @@ print('============================================'
       , '\nTotal no. of samples with missense muts:', len(gene_LF1)
       , '\nTotal no. of unique samples with missense muts:', gene_LF1['id'].nunique()  
       , '\n'
-      , '\nTotal no.of samples with common_ids:',  nu_common_ids['id']
-      , '\nTotal no.of samples with ambiguous muts:', len(inspect)
-      #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
-      , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
-      , '\n============================================================='
-      , '\n\n\n')
-
-
+      , '\nTotal no.of samples with common_ids:',  nu_common_ids['id'])
 
+if dr_muts.isin(other_muts).sum() & other_muts.isin(dr_muts).sum() > 0:
+    print('\nTotal no.of samples with ambiguous muts:', len(inspect)
+          #, '\nTotal no.of unique ambiguous muts:', len(common_muts)
+          , '\nTotal no.of unique ambiguous muts:', inspect['mutation'].nunique()
+          , '\n============================================================='
+          , '\n\n\n')
 #=======================================================================
 print(u'\u2698' * 50,
       '\nEnd of script: Data extraction and writing files'
diff --git a/scripts/running_scripts b/scripts/running_scripts
index 646c654..ee109db 100644
--- a/scripts/running_scripts
+++ b/scripts/running_scripts
@@ -1,32 +1,42 @@
 #========
 # data extraction: Must be run first to extract mutations for each drug-gene combination
 #========
-./data_extraction.py -d pyrazinamide -g pncA 
+./data_extraction.py -d <drug> -g <gene> --make_dirs
+
+#========
+# add chains to a PDB file: for modeller models lack chain ID, this script is used
+#========
+add_chains_pdb.py <N> MY_PDB.pdb 
+
+#========
+# pdb data extraction: To find out discontinuity of chain and removing invalid muts to allow foldx and mcsm to run properly!
+#========
+In progress...
 
 #========
 # foldx: specify chain if default is NOT 'A'
 #========
-./runFoldx.py -d pyrazinamide -g pncA
+./runFoldx.py -d <drug> -g <gene> -c1 A -p /media/tanu/eb1d072a-3f73-427f-aeb8-f6852b5c5216/Data/processing
 
 #========
 # mcsm: specify chain if default is NOT 'A'
 #========
-./run_mcsm.py -d pyrazinamide -g pncA -s submit -l PZA --debug
-./run_mcsm.py -d pyrazinamide -g pncA -s get
-./run_mcsm.py -d pyrazinamide -g pncA -s format
+./run_mcsm.py -d <drug> -g <gene> -s submit -l PZA --debug
+./run_mcsm.py -d <drug> -g <gene> pncA -s get
+./run_mcsm.py -d <drug> -g <gene> pncA -s format
 
 #====================
 # other struct params
 #====================
-./dssp_df.py -d pyrazinamide -g pncA
+./dssp_df.py -d <drug> -g <gene>
 # Errors on matplot.lib warn=, so just comment it out for the timebeing!: MONKEY PATCH
-./kd_df.py -d pyrazinamide -g pncA  -fasta # fixme: NO of cols says 2, but is actually 3
-./rd_df.py -d pyrazinamide -g pncA # fixme: input tsv file is sourced manually from website!
+./kd_df.py -d <drug> -g <gene>  -fasta # fixme: NO of cols says 2, but is actually 3
+./rd_df.py -d <drug> -g <gene> # fixme: input tsv file is sourced manually from website!
 
 #==============================
 # af_or calcs: different types
 #==============================
-./af_or_calcs.R --d pyrazinamide --gene pncA # fixme: No conditional dir structure
+./af_or_calcs.R -d <drug> -g <gene># fixme: No conditional dir structure
 
 #==============================
 # af_or calcs: kinship
@@ -40,18 +50,18 @@ USE THE BELOW from within the or_kinship_link.py script or something?! as part o
 # for now use the file already created using some manual wrestling to link
 # the OR for kinship with mutations
 
-./or_kinship_link.py  -d pyrazinamide -g pncA -sc 2288681 -ec 2289241
+./or_kinship_link.py -d <drug> -g <gene> -sc <start_coord> -ec <end_coord>
 
 #==============================
 # formatting: ns<gene>_snp_info.txt
 #==============================
 # This adds mcsm style muts
-./snpinfo_format.py -d pyrazinamide -g pncA
+./snpinfo_format.py -d <drug> -g <gene>
 
 #==============================
 # combining dfs: combining_dfs.py
 #==============================
 # FIXME: combining_FIXME.py
-./combining_dfs.py -d pyrazinamide -g pncA
+./combining_dfs.py --d <drug> -g <gene>