renamed files & added or kinship link file

2020-06-19 10:33:26 +01:00 · 2020-06-19 10:33:26 +01:00 · 07258120de
commit 07258120de
parent c36197d75e
3 changed files with 646 additions and 0 deletions
--- a/scripts/AF_and_OR_calcs.R
+++ b/scripts/AF_and_OR_calcs.R
--- a/scripts/af_or_calcs_scratch.R
+++ b/scripts/af_or_calcs_scratch.R
@ -0,0 +1,385 @@
+#########################################################
+# TASK: To compare OR from master data
+# chisq, fisher test and logistic and adjusted logistic
+#########################################################
+getwd()
+setwd('~/git/LSHTM_analysis/scripts')
+getwd()
+
+#install.packages("logistf")
+library(logistf)
+#########################################################
+#%% variable assignment: input and output paths & filenames
+drug = 'pyrazinamide'
+gene = 'pncA'
+gene_match = paste0(gene,'_p.')
+cat(gene_match)
+
+#===========
+# input and output dirs
+#===========
+datadir = paste0('~/git/Data')
+indir = paste0(datadir, '/', drug, '/', 'input')
+outdir = paste0(datadir, '/', drug, '/', 'output')
+
+#===========
+# input and output files
+#===========
+in_filename  = 'original_tanushree_data_v2.csv'
+#in_filename  = 'mtb_gwas_v3.csv'
+infile = paste0(datadir, '/', in_filename)
+cat(paste0('Reading infile1: raw data', ' ', infile) )
+
+# infile2: _gene associated meta data file to extract valid snps and add calcs to.
+# This is outfile3 from data_extraction.py
+in_filename_metadata = paste0(tolower(gene), '_metadata.csv')
+infile_metadata = paste0(outdir, '/', in_filename_metadata)
+cat(paste0('Reading infile2: gene associated metadata:', infile_metadata))
+
+#===========
+# output
+#===========
+out_filename = paste0(tolower(gene),'_', 'meta_data_with_AF_OR.csv')
+outfile = paste0(outdir, '/', out_filename)
+cat(paste0('Output file with full path:', outfile))
+#%% end of variable assignment for input and output files
+
+#########################################################
+# 1: Read master/raw data stored in Data/
+#####################################################
+
+#===============
+# Step 1: read raw data (all remove entries with NA in pza column)
+#===============
+raw_data_all = read.csv(infile, stringsAsFactors = F)
+
+# building cols to extract
+dr_muts_col = paste0('dr_mutations_', drug)
+other_muts_col = paste0('other_mutations_', drug)
+
+cat('Extracting columns based on variables:\n'
+    , drug
+    , '\n'
+    , dr_muts_col
+    , '\n'
+    , other_muts_col
+    , '\n===============================================================')
+
+raw_data = raw_data_all[,c("id"
+                           , drug
+                           , dr_muts_col
+                           , other_muts_col)]
+rm(raw_data_all)
+
+rm(indir, in_filename, infile)
+
+#===========
+# 1a: exclude na
+#===========
+raw_data = raw_data[!is.na(raw_data[[drug]]),]
+
+total_samples = length(unique(raw_data$id))
+cat(paste0('Total samples without NA in', ' ', drug, 'is:', total_samples))
+
+# sanity check: should  be true
+is.numeric(total_samples) 
+
+#===========
+# 1b: combine the two mutation columns
+#===========
+#raw_data$all_mutations_pyrazinamide = paste(raw_data$dr_mutations_pyrazinamide, raw_data$other_mutations_pyrazinamide)
+
+all_muts_colname = paste0('all_mutations_', drug)
+raw_data[[all_muts_colname]] = paste(raw_data[[dr_muts_col]], raw_data[[other_muts_col]])
+head(raw_data[[all_muts_colname]])
+
+#===========
+# 1c: create yet another column that contains all the mutations but in lower case
+#===========
+head(raw_data[[all_muts_colname]])
+raw_data$all_muts_gene = tolower(raw_data[[all_muts_colname]]) 
+head(raw_data$all_muts_gene)
+
+# sanity checks
+#table(grepl("gene_p",raw_data$all_muts_gene))
+cat(paste0('converting gene match:', gene_match, ' ', 'to lowercase'))
+gene_match = tolower(gene_match)
+
+table(grepl(gene_match,raw_data$all_muts_gene))
+
+# sanity check
+if(sum(table(grepl(gene_match, raw_data$all_muts_gene))) == total_samples){
+  cat('PASS: Total no. of samples match')
+} else{
+  cat('FAIL: No. of samples mismatch')
+}
+
+#########################################################
+# 2: Read valid snps for which OR 
+# can be calculated
+#########################################################
+cat(paste0('Reading metadata infile:', infile_metadata))
+
+gene_metadata = read.csv(infile_metadata
+                         #, file.choose()
+                         , stringsAsFactors = F
+                         , header = T)
+
+
+# clear variables
+rm(in_filename_metadata, infile_metadata)
+
+# count na in pyrazinamide column
+tot_pza_na = sum(is.na(gene_metadata$pyrazinamide))
+expected_rows = nrow(gene_metadata) - tot_pza_na
+
+# drop na from the pyrazinamide colum
+gene_snps_or = gene_metadata[!is.na(gene_metadata[[drug]]),]
+
+# sanity check
+if(nrow(gene_snps_or) == expected_rows){
+  cat('PASS: no. of rows match with expected_rows')
+} else{
+  cat('FAIL: nrows mismatch.')
+}
+
+# extract unique snps to iterate over for AF and OR calcs
+gene_snps_unique = unique(gene_snps_or$mutation) 
+
+cat(paste0('Total no. of distinct comp snps to perform OR calcs: ', length(gene_snps_unique)))
+
+#=====================================
+#OR calcs using the following 4
+#1) chisq.test
+#2) fisher
+#3) modified chisq.test
+#4) logistic
+#5) adjusted logistic?
+#6) kinship (separate script)
+
+#======================================
+
+################# modified chisq OR
+# Define OR function
+#x = as.numeric(mut)
+#y = dst
+my_chisq_or = function(x,y){
+  tab = as.matrix(table(x,y))
+  a = tab[2,2]
+  if (a==0){ a<-0.5}
+  b = tab[2,1]
+  if (b==0){ b<-0.5}
+  c = tab[1,2]
+  if (c==0){ c<-0.5}
+  d = tab[1,1]
+  if (d==0){ d<-0.5}
+  (a/b)/(c/d)
+  
+}
+
+#========================
+# TEST WITH ONE
+
+i = "pnca_p.trp68gly"
+i = "pnca_p.gln10pro"
+i = "pnca_p.leu159arg"
+
+# IV
+table(grepl(i,raw_data$all_muts_gene))
+mut = grepl(i,raw_data$all_muts_gene)
+
+# DV
+#dst = raw_data$pyrazinamide
+dst = raw_data[[drug]] # or raw_data[,drug]
+
+# 2X2 table
+table(mut, dst)
+
+# CV
+#c = raw_data$id[mut]
+c = raw_data$id[grepl(i,raw_data$all_muts_gene)] 
+#sid = grepl(raw_data$id[mut], raw_data$id) # warning
+#argument 'pattern' has length > 1 and only the first element will be used
+#grepl(raw_data$id=="ERR2512440", raw_data$id)
+
+sid = grepl(paste(c,collapse="|"), raw_data$id)
+table(sid)   
+
+# 3X2 table
+table(mut, dst, sid)
+
+#============================
+# compare OR
+chisq.test(table(mut,dst))
+fisher.test(table(mut, dst))
+fisher.test(table(mut, dst))$p.value
+fisher.test(table(mut, dst))$estimate
+my_chisq_or(mut,dst)
+
+# logistic or
+summary(model<-glm(dst ~ mut, family = binomial))
+or_logistic = exp(summary(model)$coefficients[2,1]); print(or_logistic)
+pval_logistic = summary(model)$coefficients[2,4]; print(pval_logistic)
+
+# adjusted logistic or
+summary(model2<-glm(dst ~ mut + sid, family = binomial))
+or_logistic2 = exp(summary(model2)$coefficients[2,1]); print(or_logistic2)
+pval_logistic2 = summary(model2)$coefficients[2,4]; print(pval_logistic2)
+
+#=========================
+
+ors = sapply(gene_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_gene)
+  my_chisq_or(mut,dst)
+})
+
+ors
+
+pvals = sapply(gene_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_gene)
+  fisher.test(mut,dst)$p.value
+})
+
+pvals
+
+afs = sapply(gene_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_gene)
+  mean(mut)
+})
+
+afs
+
+# logistic
+logistic_ors = sapply(gene_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_gene)
+  model<-glm(dst ~ mut, family = binomial)
+  or_logistic = exp(summary(model)$coefficients[2,1])
+  #pval_logistic = summary(model)$coefficients[2,4]
+})
+logistic_ors
+
+
+# logistic adj # Doesn't seem to make a difference
+logistic_ors2 = sapply(gene_snps_unique,function(m){
+  mut = grepl(m,raw_data$all_muts_gene)
+  c = raw_data$id[mut]
+  sid = grepl(paste(c,collapse="|"), raw_data$id)
+  model2<-glm(dst ~ mut + sid, family = binomial)
+  or_logistic2 = exp(summary(model2)$coefficients[2,1])
+  #pval_logistic2 = summary(model2)$coefficients[2,4]
+})
+
+logistic_ors2
+
+or_logistic2; pval_logistic2
+
+
+head(logistic_ors)
+#====================================
+
+# logistic
+summary(model<-glm(dst ~ mut
+                   , family = binomial
+                   #, control = glm.control(maxit = 1)
+                   #, options(warn = 1)
+                   ))
+or_logistic_maxit = exp(summary(model)$coefficients[2,1]); print(or_logistic_maxit)
+pval_logistic_maxit = summary(model)$coefficients[2,4]; print(pval_logistic_maxit)
+
+
+#####################
+# iterate: subset
+#####################
+
+snps_test = c("pnca_p.trp68gly", "pnca_p.leu4ser", "pnca_p.leu159arg","pnca_p.his57arg" )
+
+data = snps_test[1:2]
+
+data
+
+
+
+
+################# start loop
+for (i in data){
+  
+  print(i)
+  
+  # IV
+  #mut<-as.numeric(grepl(i,raw_data$all_muts_gene))
+  mut = grepl(i,raw_data$all_muts_gene)
+  table(mut)
+  
+  # DV
+  #dst<-as.numeric(raw_data[[drug]])
+  dst = raw_data[[drug]]
+  
+  # table
+  print(table(dst, mut))
+  
+
+  #=====================
+  # logistic regression, glm.control(maxit = n)
+  #https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression
+  #=====================
+  #n = 1
+  summary(model<-glm(dst ~ mut
+                     , family = binomial
+                     #, control = glm.control(maxit = 1)
+                     #, options(warn = 1)
+                     ))
+  or_logistic_maxit = exp(summary(model)$coefficients[2,1]); print(or_logistic_maxit)
+  pval_logistic_maxit = summary(model)$coefficients[2,4]; print(pval_logistic_maxit)
+  
+  #=====================
+  # fishers test
+  #=====================
+  #attributes(fisher.test(table(dst, mut)))
+  or_fisher = fisher.test(table(dst, mut))$estimate
+  or_fisher = or_fisher[[1]]; or_fisher
+  
+  pval_fisher = fisher.test(table(dst, mut))$p.value ; pval_fisher
+  
+  #=====================
+  # chi square
+  #=====================
+  #chisq.test(y = dst, x = mut)
+  #attributes(chisq.test(table(dst, mut)))
+  est_chisq = chisq.test(table(dst, mut))$statistic 
+  est_chisq = est_chisq[[1]]; est_chisq
+  
+  pval_chisq = chisq.test(table(dst, mut))$p.value; pval_chisq
+  
+  # all output 
+  writeLines(c(paste0("mutation:", i)
+               , paste0("=========================")
+               , paste0("OR_logistic_maxit:", or_logistic_maxit,"--->", "P-val_logistic_maxit:", pval_logistic_maxit )
+               , paste0("OR_fisher:", or_fisher, "--->","P-val_fisher:", pval_fisher )
+               , paste0("Chi_sq_estimate:", est_chisq, "--->","P-val_chisq:", pval_chisq)))
+    
+
+}
+
+
+i  = "gene_p.leu159arg"
+
+mut<-as.numeric(grepl(i,raw_data$all_muts_pza))
+# DV
+dst<-as.numeric(raw_data$pyrazinamide)
+# tablehttps://mail.google.com/mail/?tab=rm&ogbl
+table(dst, mut)
+
+ #=====================
+  # fishers test
+  #=====================
+  #attributes(fisher.test(table(dst, mut)))
+  or_fisher = fisher.test(table(dst, mut))$estimate
+  or_fisher = or_fisher[[1]]; or_fisher
+  
+  pval_fisher = fisher.test(table(dst, mut))$p.value ; pval_fisher
+  
+  
+ # https://stats.stackexchange.com/questions/259635/what-is-the-difference-using-a-fishers-exact-test-vs-a-logistic-regression-for
+  exact2x2(table(dst, mut),tsmethod="central")
+  
+  
--- a/scripts/or_kinship_link.py
+++ b/scripts/or_kinship_link.py
@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Jun 10 11:13:49 2020
+
+@author: tanu
+"""
+#%% useful links
+
+#https://chrisalbon.com/python/data_wrangling/pandas_join_merge_dataframe/
+#https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
+#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
+#%%
+import os, sys
+import pandas as pd
+#import numpy as np
+import re
+from find_missense import find_missense
+import argparse
+
+#%%
+# homedir
+homedir = os.path.expanduser('~')
+#os.chdir(homedir + '/git/Misc/jody_pza')
+#=======================================================================
+#%% command line args
+arg_parser = argparse.ArgumentParser()
+#arg_parser.add_argument('-d', '--drug', help='drug name', default = 'pyrazinamide')
+#arg_parser.add_argument('-g', '--gene', help='gene name', default = 'pncA') # case sensitive
+arg_parser.add_argument('-d', '--drug', help = 'drug name', default = None)
+arg_parser.add_argument('-g', '--gene', help = 'gene name (case sensitive)', default = None) # case sensitive
+
+#arg_parser.add_argument('-p', '--outpath', help = 'output path', default = outpath)
+#arg_parser.add_argument('-o', '--outfile', help = 'output filename', default = outfile_or_kin) 
+
+arg_parser.add_argument('-s', '--start_coord', help = 'start of coding region (cds) of gene', default = 2288681) # pnca cds
+arg_parser.add_argument('-e', '--end_coord', help = 'end of coding region (cds) of gene', default = 2289241) # pnca cds
+
+args = arg_parser.parse_args()
+#=======================================================================
+#%% variables
+#or_file
+#info_file
+#short_info_file
+#gene = 'pncA'
+#drug = 'pyrazinamide'
+start_cds = args.start_coord
+end_cds = args.end_coord
+
+gene = args.gene
+drug = args.drug
+#=======================================================================
+#%% input and output dirs and files
+#=======
+# data dir
+#=======
+datadir = homedir + '/' + 'git/Data'
+indir = datadir + '/' + drug + '/input'
+outdir = datadir + '/' + drug + '/output'
+
+#=======
+# input
+#=======
+
+info_filename  = 'snp_info.txt'
+snp_info = datadir + '/' + info_filename
+print('Info file: ', snp_info
+      , '\n============================================================')
+
+
+gene_info_filename  = 'ns'+ gene.lower()+ '_snp_info.txt'
+gene_info = indir + '/' + gene_info_filename
+print('gene info file: ', gene_info
+      , '\n============================================================')
+
+
+in_filename_or  = 'ns'+ gene.lower()+ '_assoc.txt'
+gene_or = indir + '/' + in_filename_or
+print('gene OR file: ', gene_or
+      , '\n============================================================')
+
+#=======
+# output
+#=======
+gene_or_filename = gene.lower() + '_AF_OR_kinship.csv' # other one is called AFandOR
+outfile_or_kin =  outdir + '/' + gene_or_filename
+print('Output file: ', outfile_or_kin
+      , '\n============================================================')
+
+#%% read files: preformatted using bash
+# or file: '...assoc.txt'
+or_df = pd.read_csv(gene_or, sep = '\t', header = 0, index_col = False) # 182, 12 (without filtering for missense muts, it was 212 i.e we 30 muts weren't missense)
+or_df.head()
+or_df.columns
+#%% snp_info file: master and gene specific ones
+
+# gene info
+#info_df2  = pd.read_csv('nssnp_info_pnca.txt', sep = '\t', header = 0) #303, 10 
+info_df2  = pd.read_csv(gene_info, sep = '\t', header = 0) #303, 10   
+mis_mut_cover = (info_df2['chromosome_number'].nunique()/info_df2['chromosome_number'].count()) * 100
+print('*****RESULT*****'
+      , '\nPercentage of missense mut in pncA:', mis_mut_cover
+      , '\n*****RESULT*****') #65.7%
+
+# large file
+#info_df  = pd.read_csv('snp_info.txt', sep = '\t', header = None) #12010
+info_df  = pd.read_csv(snp_info, sep = '\t') #12010
+#info_df.columns = ['chromosome_number', 'ref_allele', 'alt_allele', 'snp_info'] #12009, 4
+
+info_df['chromosome_number'].nunique() #10257
+mut_cover = (info_df['chromosome_number'].nunique()/info_df['chromosome_number'].count()) * 100
+print('*****RESULT*****'
+      ,'\nPercentage of mutations in pncA:', mut_cover
+      , '\n*****RESULT*****') #85.4%
+
+# extract unique chr position numbers
+genomic_pos = info_df['chromosome_number'].unique()
+genomic_pos_df = pd.DataFrame(genomic_pos, columns = ['chr_pos'])
+genomic_pos_df.dtypes
+
+genomic_pos_min = info_df['chromosome_number'].min()
+genomic_pos_max = info_df['chromosome_number'].max()
+
+# genomic coord for pnca coding region
+#start_cds = 2288681
+#end_cds = 2289241
+cds_len = (end_cds-start_cds) + 1
+pred_prot_len = (cds_len/3) - 1
+
+# mindblowing: difference b/w bitwise (&) and 'and'
+# DO NOT want &: is this bit set to '1' in both variables? Is this what you want?
+#if (genomic_pos_min <= start_cds) & (genomic_pos_max >= end_cds):
+print('*****RESULT*****'
+      , '\nlength of coding region:', cds_len, 'bp'
+      , '\npredicted protein length:', pred_prot_len, 'aa'
+      , '\n*****RESULT*****') 
+
+if genomic_pos_min <= start_cds and genomic_pos_max >= end_cds:    
+    print ('PASS: coding region for gene included in snp_info.txt')
+else:
+    print('FAIL: coding region for gene not included in info file snp_info.txt')
+    
+#%%  Extracting ref allele and alt allele as single letters
+# info_df has some of these params as more than a single letter, which means that 
+# when you try to merge ONLY using chromosome_number, then it messes up... and is WRONG.
+# Hence the  merge needs to be performed on a unique set of attributes which in our case
+# would be chromosome_number, ref_allele and alt_allele
+
+#FIXME: Turn to a function
+orig_len = len(or_df.columns)
+
+#find_missense(or_df, 'ref_allele1', 'alt_allele0')
+find_missense(or_df, ref_allele_column = 'ref_allele1', alt_allele_column = 'alt_allele0')
+
+ncols_add = 4
+if len(or_df.columns) == orig_len + ncols_add:
+    print('PASS: Succesfuly extracted ref and alt alleles for missense muts')
+else:
+    print('FAIL: No. of cols mismatch'
+          ,'\noriginal length:', orig_len
+          , '\nExpected no. of cols:', orig_len + ncols_add
+          , '\nGot no. of cols:', len(or_df.columns))
+    sys.exit()
+del(orig_len, ncols_add)
+
+#%% TRY MERGE
+# check dtypes
+or_df.dtypes
+info_df.dtypes
+or_df.info()
+
+# pandas documentation where it mentions: "Pandas uses the object dtype for storing strings" 
+# check how many unique chr_num in info_df are in or_df
+genomic_pos_df['chr_pos'].isin(or_df['chromosome_number']).sum() #144
+
+# check how many chr_num in or_df are in info_df: should be ALL of them
+or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() #182
+
+# sanity check 2
+if or_df['chromosome_number'].isin(genomic_pos_df['chr_pos']).sum() == len(or_df):
+    print('PASS: all genomic locs in or_df have meta datain info.txt')
+else:
+    print('FAIL: some genomic locs or_df chr number DO NOT have meta data in snp_info.txt')
+
+#%% Perform merge
+
+#join_type = 'inner'
+#join_type = 'outer'
+join_type = 'left'
+#join_type = 'right'
+
+#dfm1 = pd.merge(or_df, info_df, on ='chromosome_number', how = join_type, indicator = True) # not unique!
+dfm1 = pd.merge(or_df, info_df, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = join_type, indicator = True)
+dfm1['_merge'].value_counts()
+
+# count no. of missense mutations ONLY
+dfm1.snp_info.str.count(r'(missense.*)').sum()
+
+dfm2 = pd.merge(or_df, info_df2, on = ['chromosome_number', 'ref_allele', 'alt_allele'], how = join_type, indicator = True) 
+dfm2['_merge'].value_counts()
+
+# count no. of nan
+dfm2['mut_type'].isna().sum()
+
+# drop nan
+dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
+
+#%% sanity check
+# count no. of missense muts
+if len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum() == dfm2['mut_type'].isna().sum():
+    print('PASSED: numbers cross checked'
+          , '\nTotal no. of missense mutations:', dfm1.snp_info.str.count(r'(missense.*)').sum()   
+          , '\nNo. of mutations falsely assumed to be missense:', len(dfm1) - dfm1.snp_info.str.count(r'(missense.*)').sum())
+
+# two ways to filter to get only  missense muts
+test = dfm1[dfm1['snp_info'].str.count('missense.*')>0]
+dfm1_mis = dfm1[dfm1['snp_info'].str.match('(missense.*)') == True]
+test.equals(dfm1_mis)
+
+# drop nan
+dfm2_mis = dfm2[dfm2['mut_type'].notnull()]
+
+if dfm1_mis[['chromosome_number', 'ref_allele', 'alt_allele']].equals(dfm2_mis[['chromosome_number', 'ref_allele', 'alt_allele']]):
+    print('PASS: Further cross checks successful')
+else:
+    print('FAIL: Second cross check unsuccessfull. Debug please!')
+    sys.exit()
+
+orig_len = len(dfm2_mis.columns)
+
+#%% extract mut info into three cols
+dfm2_mis['wild_type'] = dfm2_mis['mut_info'].str.extract('(\w{1})>')
+dfm2_mis['position'] = dfm2_mis['mut_info'].str.extract('(\d+)') 
+dfm2_mis['mutant_type'] = dfm2_mis['mut_info'].str.extract('>\d+(\w{1})')   
+
+dfm2_mis['mutationinformation'] = dfm2_mis['wild_type'] + dfm2_mis['position'] + dfm2_mis['mutant_type']    
+
+# sanity check
+ncols_add = 4
+if len(dfm2_mis.columns) == orig_len + ncols_add:
+    print('PASS: Succesfully extracted and added mutationinformation(mcsm style)')
+else:
+    print('FAIL: No. of cols mismatch'
+          ,'\noriginal length:', orig_len
+          , '\nExpected no. of cols:', orig_len + ncols_add
+          , '\nGot no. of cols:', len(dfm2_mis.columns))
+    sys.exit()
+
+
+#%% write file
+print('Writing output file:\n', outfile_or_kin
+      , '\nNo.of rows:', len(dfm2_mis)
+      , '\nNo. of cols:', len(dfm2_mis.columns))
+dfm2_mis.to_csv(outfile_or_kin, index = False)
+
+#%% diff b/w allele0 and 1: or_df
+#https://stackoverflow.com/questions/40348541/pandas-diff-with-string
+#df = or_df.iloc[[5, 15, 17, 19, 34]]
+#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int) 
+#df[['alt_allele0','ref_allele1']].ne(df[['alt_allele0','ref_allele1']].shift()).any(axis=1).astype(int)
+