saving from work

This commit is contained in:
Tanushree Tunstall 2020-02-27 15:16:20 +00:00
parent 61f8dc57c9
commit a5356cf88b
2 changed files with 33 additions and 9 deletions

View file

@ -9,6 +9,7 @@ Created on Tue Jun 25 08:46:36 2019
# load libraries # load libraries
import os import os
import pandas as pd import pandas as pd
import numpy as np
from Bio import SeqIO from Bio import SeqIO
############################################ ############################################
#******************************************************************** #********************************************************************
@ -86,7 +87,7 @@ print("new length:", len(my_fasta))
############# #############
# read mutant_info file and extract cols with positions and mutant_info # read mutant_info file and extract cols with positions and mutant_info
# This should be all samples with pncA muts # This should be all samples with pncA muts
#my_data = pd.read_csv('mcsm_complex1_normalised.csv') #335, 15 #my_data = pd.read_csv('mcsm_complex1_normalised.csv')
my_data = pd.read_csv(infile_meta_data) my_data = pd.read_csv(infile_meta_data)
list(my_data.columns) list(my_data.columns)
#my_data['OR'].value_counts() #my_data['OR'].value_counts()
@ -95,12 +96,34 @@ list(my_data.columns)
#FIXME: You need a better way to identify this #FIXME: You need a better way to identify this
# ideally this file should not contain any non_struc pos # ideally this file should not contain any non_struc pos
# remove positions not in the structure # remove positions not in the structure
my_data = my_data[my_data.position != ns_pos_o] #3092, 22 my_data = my_data[my_data.position != ns_pos_o]
# if multiple positions, then try the example below; # if multiple positions, then try the example below;
# https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas # https://stackoverflow.com/questions/29017525/deleting-rows-based-on-multiple-conditions-python-pandas
#df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)] #df = df[(df.one > 0) | (df.two > 0) | (df.three > 0) & (df.four < 1)]
# count mutations per sample
# Keep only the sample id and the mutation-describing columns of my_data.
mut_info = my_data[['id', 'Mutationinformation', 'wild_type', 'position', 'mutant_type']]
# test
# Select the rows whose Mutationinformation string contains 'C72Y'.
# NOTE(review): this value of foo is overwritten by the next statement
# before anything reads it -- confirm whether the pivot below was meant
# to run on this filtered subset instead of on mut_info.
foo = mut_info[mut_info.Mutationinformation.str.contains('C72Y')]
# Pivot mut_info with aggfunc 'count', using Mutationinformation and id
# as the row index.
foo = mut_info.pivot_table(values = ['Mutationinformation']
, index = ['Mutationinformation', 'id']
# , columns =
, aggfunc = 'count')
# table
# Same 'count' pivot, but with id and Mutationinformation passed as the
# column keys rather than the row index.
foo_tab = mut_info.pivot_table(values = ['Mutationinformation']
# , index = ['Mutationinformation']
, columns = ['id', 'Mutationinformation']
, aggfunc = 'count'
# , margins = True)
)
# NOTE(review): the result of stack('id') is not assigned to anything, so
# foo_tab itself is left unchanged by this line.
foo_tab.stack('id')
# Write the selected columns to 'mutinfo.csv' (relative path).
mut_info.to_csv('mutinfo.csv')
mut_info1 = my_data[['position', 'mutant_type']] mut_info1 = my_data[['position', 'mutant_type']]
#%% #%%
################ ################

View file

@ -118,14 +118,16 @@ clear variables
meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] meta_pza = meta_data.loc[meta_data.dr_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
#2163 {RESULT: samples with dr_muts} #2163 {RESULT: samples with dr_muts}
dr_id = meta_pza['id'].unique() dr_id = meta_pza['id'].unique()
dr_id = pd.Series(dr_id)
meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)] meta_pza = meta_data.loc[meta_data.other_mutations_pyrazinamide.str.contains('pncA_p.*', na = False)]
#526 (RESULT: samples with other_muts) #526 (RESULT: samples with other_muts)
other_id = meta_pza['id'].unique() other_id = meta_pza['id'].unique()
other_id = pd.Series(other_id)
# FIXME: See if the sample ids are unique in each # FIXME: See if the sample ids are unique in each
# find any common IDs # find any common IDs
dr_id.isin(other_id[1,1]) dr_id.isin(other_id).sum()
del(meta_pza) del(meta_pza)
@ -159,8 +161,7 @@ del(meta_pnca_other)
# Now extract "all" mutations # Now extract "all" mutations
meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ] meta_pnca_all = meta_data_pnca.loc[meta_data_pnca.dr_mutations_pyrazinamide.str.contains('pncA_p.*') | meta_data_pnca.other_mutations_pyrazinamide.str.contains('pncA_p.*') ]
meta_pnca_all['id'].nunique() #RESULT: pnca mutations in ALL samples}
meta_pnca_all['id'].nunique() {#RESULT: pnca mutations in ALL samples}
pnca_samples = len(meta_pnca_all) pnca_samples = len(meta_pnca_all)
pnca_na = meta_pnca_all['pyrazinamide'].isna().sum() pnca_na = meta_pnca_all['pyrazinamide'].isna().sum()
comp_pnca_samples = pnca_samples - pnca_na comp_pnca_samples = pnca_samples - pnca_na
@ -170,8 +171,8 @@ comp_pnca_samples = pnca_samples - pnca_na
#=#=#=#=#=#=# #=#=#=#=#=#=#
# sanity checks # sanity checks
meta_pnca_all.dr_mutations_pyrazinamide.value_counts() foo1 = meta_pnca_all.dr_mutations_pyrazinamide.value_counts()
meta_pnca_all.other_mutations_pyrazinamide.value_counts() foo2 = meta_pnca_all.other_mutations_pyrazinamide.value_counts()
# more sanity checks # more sanity checks
# !CAUTION!: muts will change depending on your gene # !CAUTION!: muts will change depending on your gene
@ -182,7 +183,7 @@ meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.P
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')] meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.Val139Leu')]
meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
m = meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows meta_pnca_all.loc[meta_pnca_all.dr_mutations_pyrazinamide.str.contains('pncA_p.')] # exists # rows
# other_muts # other_muts
meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty meta_pnca_all.loc[meta_pnca_all.other_mutations_pyrazinamide.str.contains('pncA_p.Gln10Pro*')] # empty