#-----import packages-----#
#common python packages
import numpy as np
import string
import random
import os
import pickle
import argparse
import wget
import math
import gc
import sys
import multiprocessing as mp
#biological packages
import pybedtools
from pybedtools import featurefuncs
import pyBigWig
#machine learning packages
import sklearn
from sklearn.utils import shuffle
import matplotlib
from matplotlib import pyplot as plt
import pandas as pd
from scipy.stats import f_oneway
from scipy.stats import ttest_ind
%matplotlib inline
#parsing command line arguments
# -----parsing command line arguments-----#
parser = argparse.ArgumentParser(description='Training CNN model to predict STARR-seq enhancers based on chromatin accessbility and histone marks')
parser.add_argument('-w', '--cell_types', type=str, help='comma separated string of cell_types')
parser.add_argument('-x', '--in_dir', type=str, help='input_directory')
parser.add_argument('-y', '--cell_name', type=str, help='name of the cell')
parser.add_argument('-z', '--out_dir', type=str, help='output_directory')
parser.add_argument('-a', '--track1_peaks', type=str, help='chromatin accessibility peak')
parser.add_argument('-b', '--track2_peaks', type=str, help='ChIP-seq H3K27ac peak')
parser.add_argument('-c', '--track3_peaks', type=str, help='ChIP-seq H3K4me3 peak')
parser.add_argument('-d', '--track4_peaks', type=str, help='ChIP-seq H3K9ac peak')
parser.add_argument('-e', '--track5_peaks', type=str, help='ChIP-seq H3K4me1 peak')
parser.add_argument('-f', '--track1_bw', type=str, help='chromatin accessibility bigWig')
parser.add_argument('-g', '--track2_bw', type=str, help='ChIP-seq H3K27ac bigWig')
parser.add_argument('-i', '--track3_bw', type=str, help='ChIP-seq H3K4me3 bigWig')
parser.add_argument('-j', '--track4_bw', type=str, help='ChIP-seq H3K9ac bigWig')
parser.add_argument('-k', '--track5_bw', type=str, help='ChIP-seq H3K4me1 bigWig')
cell_type = "NPC"
#simulate command line input
seqdir = "./encode/datasets/" + cell_type + "/"
cmdline_str='-w ' + " HepG2,K562,A549,HCT116,MCF-7 " + \
' -x ' + "./encode/dev/encoded_2overlap/DNase/" + \
' -y ' + "NPC" + \
' -z ' + "./encode/dev/output/" + \
' -a ' + seqdir+cell_type+".DNase-seq.narrowPeak" + \
' -b ' + seqdir+cell_type+".ChIP-seq.H3K27ac.narrowPeak" + \
' -c ' + seqdir+cell_type+".ChIP-seq.H3K4me3.narrowPeak" + \
' -d ' + seqdir+cell_type+".ChIP-seq.H3K9ac.narrowPeak" + \
' -e ' + seqdir+cell_type+".ChIP-seq.H3K4me1.narrowPeak" + \
' -f ' + seqdir+cell_type+".DNase-seq.bigWig" + \
' -g ' + seqdir+cell_type+".ChIP-seq.H3K27ac.bigWig" + \
' -i ' + seqdir+cell_type+".ChIP-seq.H3K4me3.bigWig" + \
' -j ' + seqdir+cell_type+".ChIP-seq.H3K9ac.bigWig" + \
' -k ' + seqdir+cell_type+".ChIP-seq.H3K4me1.bigWig"
seq_names = ["DNase", "H3K27ac", "H3K4me3", "H3K9ac", "H3K4me1"]
#check if the files are there
args = parser.parse_args(cmdline_str.split())
args.cell_types = args.cell_types.split(",")
for cell in args.cell_types:
for seq in seq_names:
pos_file = args.in_dir + cell + "." + seq + ".pos.tsv"
if not os.path.exists(pos_file):
print(pos_file + " file does not exist")
exit(1)
neg_file = args.in_dir + cell + "." + seq + ".neg.tsv"
if not os.path.exists(neg_file):
print(neg_file + " file does not exist")
exit(1)
for key, value in vars(args).items():
if key == "cell_types" or key == "in_dir" or key == "out_dir" or key == "cell_name":
continue
else:
if not os.path.exists(value):
print(key + " argument file does not exist")
exit(1)
print("all files found!")
#construct a set of autosome + X chromosome names
chromosomes = []
for i in range(1,23):
chromosomes.append("chr"+str(i))
chromosomes.append("chrX")
print(chromosomes)
print("all files found!")
#load in original bed from prediction
original_bed = pybedtools.BedTool(args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.merged.bed")
#load in bed refined by grad cam
refined_bed = pybedtools.BedTool(args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.filtered.bed")
print("original prediction has a total coverage of", original_bed.total_coverage(), "bp")
print("refined prediction has a total coverage of", refined_bed.total_coverage(), "bp")
matplotlib.rcParams.update({'font.size': 16})
fig = plt.figure(figsize=(4, 5))
plt.bar(np.array([0, 0.3]),
np.array([original_bed.total_coverage(), refined_bed.total_coverage()]),
width = 0.25,
align='center',
color = ["#A6CEE3", "#1F78B4"])
plt.xticks(np.array([0, 0.3]), np.array(["Original", "Refined"]))
fig.suptitle('Total Coverage', fontsize=20)
plt.ylabel('Ten Million Base Pairs', fontsize=16)
#compare tss overlap
tss_snps = pybedtools.BedTool("refTSS_v3.1_human_coordinate.hg38.bed")
print("a total of", tss_snps.count(), "tss snps")
print("original prediction has", original_bed.intersect(tss_snps).count(), "overlap with tss snps")
print("refined prediction has", refined_bed.intersect(tss_snps).count(), "overlap with tss snps")
fig = plt.figure(figsize=(4, 5))
plt.bar(np.array([0, 0.3]),
np.array([original_bed.intersect(tss_snps).count(),
refined_bed.intersect(tss_snps).count()]),
width = 0.25,
color = ["#A6CEE3", "#1F78B4"])
plt.xticks(np.array([0, 0.3]), np.array(["Original", "Refined"]))
fig.suptitle('TSS overlap', fontsize=20)
plt.ylabel('Count', fontsize=16)
phastCons = pyBigWig.open("./encode/dev/hg38.phastCons100way.bw")
original_phastCons = [float(phastCons.stats(x.chrom, x.start, x.stop)[0] or 0) for x in original_bed]
refined_pahstCons = [float(phastCons.stats(x.chrom, x.start, x.stop)[0] or 0) for x in refined_bed]
original_phastCons = np.nan_to_num(np.array(original_phastCons))
refined_pahstCons = np.nan_to_num(np.array(refined_pahstCons))
print("original prediction phastCons score is", np.median(original_phastCons))
print("refined prediction phastCons score is", np.median(refined_pahstCons))
t, p = f_oneway(original_phastCons, refined_pahstCons)
print('{:.2e}'.format(p))
t, p = ttest_ind(original_phastCons, refined_pahstCons)
print('{:.2e}'.format(p))
# fig = plt.figure(figsize=(4, 5))
# box = plt.boxplot(np.array([np.array(original_phastCons),
# np.array(refined_pahstCons)]),
# labels=np.array(["Original", "Refined"]),
# boxprops=dict(color="#A6CEE3"),
# patch_artist=True, notch=True)
# fig.suptitle('PhastCons Score (p-value=' + str(p) + ")", fontsize=20)
# plt.ylabel('Signal Distribution', fontsize=16)
# colors = ["#A6CEE3", "#1F78B4"]
# plt.show();
fig = plt.figure(figsize=(4, 5))
c = "#A6CEE3"
box1 = plt.boxplot(np.array(original_phastCons), positions=[1], patch_artist=True,
boxprops=dict(facecolor=c, color=c),
capprops=dict(color=c),
whiskerprops=dict(color=c),
flierprops=dict(color=c, markeredgecolor=c),
medianprops=dict(color=c))
plt.setp(box1["medians"], color="black")
c2 = "#1F78B4"
box1 = plt.boxplot(np.array(refined_pahstCons),
positions=[2],
patch_artist=True)
for item in ['boxes', 'whiskers', 'fliers', 'medians', 'caps']:
plt.setp(box1[item], color=c2)
plt.setp(box1["boxes"], facecolor=c2)
plt.setp(box1["medians"], color="black")
plt.setp(box1["fliers"], markeredgecolor=c2)
plt.xticks([1,2], [1,2,3])
plt.ylabel('Signal Distribution', fontsize=16)
fig.suptitle('PhastCons Score (p-value<0.001)', fontsize=20)
plt.xticks([1, 2], ['Original', 'Refined'])
plt.show();
# fig = plt.figure(figsize=(4, 5))
# plt.bar(np.array(["Original", "Refined"]), np.array([np.sum(original_phastCons), np.sum(refined_pahstCons)]))
# fig.suptitle('PhastCons Score', fontsize=20)
# plt.ylabel('Total Signal', fontsize=16)
#hg19 genome lift over
liftOver = "~/liftOver "
hg38_original = args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.merged.bed "
hg38_2_hg19_chain = "~/hg38ToHg19.over.chain "
hg19_lifted = args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.merged.hg19.lifted.bed "
hg19_unlift = args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.merged.hg19.unlift.bed "
os.system(liftOver + hg38_original + hg38_2_hg19_chain + hg19_lifted + hg19_unlift)
liftOver = "~/liftOver "
hg38_original = args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.filtered.bed "
hg38_2_hg19_chain = "~/hg38ToHg19.over.chain "
hg19_lifted = args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.filtered.hg19.lifted.bed "
hg19_unlift = args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.filtered.hg19.unlift.bed "
os.system(liftOver + hg38_original + hg38_2_hg19_chain + hg19_lifted + hg19_unlift)
#load in original bed from prediction
original_bed = pybedtools.BedTool(args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.merged.hg19.lifted.bed")
#load in bed refined by grad cam
refined_bed = pybedtools.BedTool(args.out_dir + args.cell_name + ".all.prediction_pos_regions.50.filtered.hg19.lifted.bed")
#ASD summary statistics
ASD_snps = pybedtools.BedTool("./GWAS_catalog/linked_snps/Autism_spectrum_disorder")
print("a total of", ASD_snps.count(), "ASD snps")
print("original prediction has", original_bed.intersect(ASD_snps).count(), "overlap with ASD snps")
print("refined prediction has", refined_bed.intersect(ASD_snps).count(), "overlap with ASD snps")
#Bipolar_disorder summary statistics
Bipolar_disorder_snps = pybedtools.BedTool("./GWAS_catalog/linked_snps/Bipolar_disorder")
print("a total of", Bipolar_disorder_snps.count(), "Bipolar Disorder snps")
print("original prediction has", original_bed.intersect(Bipolar_disorder_snps).count(), "overlap with Bipolar Disorder snps")
print("refined prediction has", refined_bed.intersect(Bipolar_disorder_snps).count(), "overlap with Bipolar Disorder snps")
#Schizophrenia summary statistics
Schizophrenia_snps = pybedtools.BedTool("./GWAS_catalog/linked_snps/Schizophrenia")
print("a total of", Schizophrenia_snps.count(), "Schizophrenia snps")
print("original prediction has", original_bed.intersect(Schizophrenia_snps).count(), "overlap with Schizophrenia snps")
print("refined prediction has", refined_bed.intersect(Schizophrenia_snps).count(), "overlap with Schizophrenia snps")