diff --git a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py index 04a1cc0..5cc5f09 100755 --- a/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py +++ b/mcsm_analysis/pyrazinamide/scripts/generate_mut_sequences.py @@ -9,6 +9,7 @@ Created on Tue Jun 25 08:46:36 2019 # load libraries import os import pandas as pd +import numpy as np from Bio import SeqIO ############################################ #******************************************************************** @@ -47,7 +48,7 @@ print("Input file is:", infile_meta_data) #======= # output #======= -outdir = 'git/Data/pyrazinamide/output' +outdir = 'git/Data/pyrazinamide/output' # filenames in respective sections ################## end of variable assignment for input and output files @@ -86,7 +87,7 @@ print("new length:", len(my_fasta)) ############# # read mutant_info file and extract cols with positions and mutant_info # This should be all samples with pncA muts -#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15 +#my_data = pd.read_csv('mcsm_complex1_normalised.csv') my_data = pd.read_csv(infile_meta_data) list(my_data.columns) #my_data['OR'].value_counts() @@ -95,12 +96,34 @@ list(my_data.columns) #FIXME: You need a better way to identify this # ideally this file should not contain any non_struc pos # remove positions not in the structure -my_data = my_data[my_data.position != ns_pos_o] #3092, 22 +my_data = my_data[my_data.position != ns_pos_o] # if multiple positions, then try the example below; # https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas #df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] +# count mutations per sample +mut_info = my_data[['id', 'Mutationinformation', 'wild_type', 'position', 'mutant_type']] + +# test +foo = mut_info[mut_info.Mutationinformation.str.contains('C72Y')] + +foo = mut_info.pivot_table(values = ['Mutationinformation'] + , index = ['Mutationinformation', 'id'] +# , columns = + , aggfunc = 'count') + +# table +foo_tab = mut_info.pivot_table(values = ['Mutationinformation'] +# , index = ['Mutationinformation'] + , columns = ['id', 'Mutationinformation'] + , aggfunc = 'count' +# , margins = True) + ) +foo_tab.stack('id') + +mut_info.to_csv('mutinfo.csv') + mut_info1 = my_data[['position', 'mutant_type']] #%% ################ diff --git a/meta_data_analysis/pnca_data_extraction.py b/meta_data_analysis/pnca_data_extraction.py index 454fb5f..25a596e 100755 --- a/meta_data_analysis/pnca_data_extraction.py +++ b/meta_data_analysis/pnca_data_extraction.py @@ -118,14 +118,16 @@ clear variables meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] #2163 {RESULT: samples with dr_muts} dr_id = meta_pza['id'].unique() +dr_id = pd.Series(dr_id) meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] #526 (RESULT: samples with other_muts) other_id = meta_pza['id'].unique() +other_id = pd.Series(other_id) # FIXME: See if the sample ids are unique in each # find any common IDs -dr_id.isin(other_id[1,1]) +dr_id.isin(other_id).sum() del(meta_pza) @@ -159,8 +161,7 @@ del(meta_pnca_other) # Now extract "all" mutations meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ] - -meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples} +meta_pnca_all['id'].nunique() #RESULT: pnca mutations in ALL samples} pnca_samples = len(meta_pnca_all) pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() comp_pnca_samples = pnca_samples - pnca_na @@ -170,8 +171,8 @@ comp_pnca_samples = pnca_samples - pnca_na #=#=#=#=#=#=# # sanity checks -meta_pnca_all.dr_mutations_pyrazinamide.value_counts() -meta_pnca_all.other_mutations_pyrazinamide.value_counts() +foo1 = meta_pnca_all.dr_mutations_pyrazinamide.value_counts() +foo2 = meta_pnca_all.other_mutations_pyrazinamide.value_counts() # more sanity checks # !CAUTION!: muts will change depending on your gene @@ -182,7 +183,7 @@ meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.P meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')] meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows -m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows +meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows # other_muts meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty