/
07_linkage.R
100 lines (87 loc) · 3.69 KB
/
07_linkage.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# LD analysis ----
library(LDlinkR)
library(data.table)
library(coloc)
# also requires bash software wget, bgzip and awk
# Set working directory
wd <- "/Users/fredriklandfors/projekt/lipase_genmimicry"
# Access token from https://ldlink.nci.nih.gov/?tab=apiaccess are required. ----
# access_token = # Get access token via instructions from https://ldlink.nci.nih.gov
pop = "EUR"
# Get LD R2 for variant using LDlinkR -----
## ANGPTL8: rs2278426 -----
# angptl8 <- LDlinkR::LDproxy(
# snp = "rs2278426",
# pop = pop,
# r2d = "r2",
# token = access_token,
# file = read.delim(paste0(wd, "/data/coloc/angptl8_ld/angptl8.txt"))
# )
# Read saved file
angptl8 <- data.table::fread(file = paste0(wd, "/data/coloc/angptl8_ld/angptl8.txt"))
angptl8$chr <- 19
angptl8$pos <- sapply(angptl8$Coord, function(x) {
strsplit(x, ":")[[1]][2]
})
angptl8$Alleles2 <- sapply(angptl8$Alleles, function(x) {
y <- gsub("\\(", "", x)
z <- gsub("\\)", "", y)
return(z)
})
angptl8$ref <- sapply(angptl8$Alleles2, function(x) {
strsplit(x, "/")[[1]][1]
})
angptl8$alt <- sapply(angptl8$Alleles2, function(x) {
strsplit(x, "/")[[1]][2]
})
angptl8$variant_id <- paste0(angptl8$chr, "_", angptl8$pos, "_",
angptl8$ref, "_", angptl8$alt, "_b37")
# Import GTEx v7 chromosome 19 EUR expression data ----
## Get data and preprocess in bash ----
# requires wget, bgzip and awk
# rs2278426 is located at 19:11350488 in GRCh37/hg19
# Cut a 1 Mb window liver expression data
# 11350488 - 1000000
# 11350488 + 1000000
# system("gzcat Liver.allpairs.txt.gz | awk 'NR>1 {print $2}' | awk -F'[_/]' 'BEGIN{OFS="\t"; print "#CHROM","POS","REF","ALT", "VER"}{$1=$1}1' > addvar_Liver.allpairs.txt; bgzip -i addvar_Liver.allpairs.txt;")
# system("paste <(gzcat Liver.allpairs.txt.gz) <(gzcat addvar_Liver.allpairs.txt.gz) | awk '$10 == 19 && $11 >= 10350488 && $11 <= 12350488' > angptl8_ld/fin_Liver.allpairs.txt; bgzip -i angptl8_ld/fin_Liver.allpairs.txt;")
#
# Subset annotation frame using awk
# system("gzcat GTEx_Analysis_2016-01-15_v7_WholeGenomeSeq_635Ind_PASS_AB02_GQ20_HETX_MISS15_PLINKQC.lookup_table.txt.gz | awk '$1 == 19 && $2 >= 10350488 && $2 <= 12350488' > angptl8_ld/fin_GTEx_Analysis_2016-01-15_v7_WholeGenomeSeq_635Ind_PASS_AB02_GQ20_HETX_MISS15_PLINKQC.lookup_table.txt; bgzip -i angptl8_ld/fin_GTEx_Analysis_2016-01-15_v7_WholeGenomeSeq_635Ind_PASS_AB02_GQ20_HETX_MISS15_PLINKQC.lookup_table.txt;")
## Import to R ----
chr19_liver_anno <- data.table::fread(
file = paste0(wd, "/data/coloc/angptl8_ld/fin_GTEx_Analysis_2016-01-15_v7_WholeGenomeSeq_635Ind_PASS_AB02_GQ20_HETX_MISS15_PLINKQC.lookup_table.txt.gz")
)
data.table::setnames(
chr19_liver_anno,
c("chr", "variant_pos", "variant_id", "ref", "alt", "num_alt_per_site",
"rs_id_dbSNP147_GRCh37p13")
)
chr19_liver_expression <- data.table::fread(
file = paste0(wd, "/data/coloc/angptl8_ld/fin_Liver.allpairs.txt.gz")
)
data.table::setnames(
chr19_liver_expression,
c("gene_id", "variant_id", "tss_distance", "ma_samples", "ma_count", "maf",
"pval_nominal", "slope", "slope_se", "chr", "pos", "ref", "alt", "hg")
)
merge1 <- data.table::merge.data.table(
x = chr19_liver_anno,
y = chr19_liver_expression,
by = "variant_id",
suffixes = c(".anno", ".expr")
)
# Subset LDLR (ENSG00000130164)
merge2 <- merge1[gene_id %like% "ENSG00000130164"]
# Merge linkage data with gene expression data
merge3 <- data.table::merge.data.table(
x = angptl8,
y = merge2,
by = "variant_id",
suffixes = c(".ldlink", ".expr")
)
# Subset R2 > 0.1
merge4 <- merge3[R2 >= 0.1]
merge4$pval_adjusted <- p.adjust(merge4$pval_nominal, method = "bonferroni")
# Save to file
data.table::fwrite(merge4, file = paste0(wd, "/data/coloc/angptl8_ld/data_out.txt"))