-
Notifications
You must be signed in to change notification settings - Fork 0
/
geneInfo_using_biomart_while.r
41 lines (41 loc) · 2.07 KB
/
geneInfo_using_biomart_while.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
library("biomaRt")
ensembl37 <- useMart(host='grch37.ensembl.org', biomart='ENSEMBL_MART_ENSEMBL') #select the ensembl version that contains hg19 (GRch37)
ensembl <- useDataset("hsapiens_gene_ensembl",mart= ensembl37) #select human genes
attributes <- c("ensembl_gene_id", "ensembl_transcript_id","chromosome_name","percentage_gc_content","entrezgene","hgnc_symbol","hgnc_transcript_name","strand" ,"band","transcript_length") #select the attributes
filter <- "hgnc_symbol" #select the filter
gene_id <- read.table("gene_id.txt") #upload the values of our filter
gene_id <- as.matrix(gene_id)
geneInfo = data.frame() #creat an empty dataframe
#the followinf while loop to handel the gene_id arry 500 by 500 (500 based on the advise on ensmble website beacuse whene we call the getBM function by the whole gene_id array we will miss some values)
finished = FALSE
limit = 500
k=0
while(!finished)
{
if((length(gene_id)-(k*limit))>limit)
{
gene_id_tmp = gene_id[c((limit*k+1):(limit*(k+1)))]
k=k+1
}
else
{
gene_id_tmp = gene_id[c((limit*k+1):length(gene_id))]
finished = TRUE
}
geneInfo <- rbind(geneInfo,getBM(attributes = attributes, filters = filter, values = gene_id_tmp, mart = ensembl))
}
row.na <- apply(geneInfo, 1, function(x){is.na(x[4])}) #select the rows that contains NA in gc content column(also trancript_length)
geneInfo.f <- geneInfo[!row.na,]
gene_list <- split(geneInfo.f, with(geneInfo.f, interaction(hgnc_symbol)), drop = TRUE) #split the data according the gene hgnc_symbolID
max_index<-lapply(gene_list,function(x){which.max(x[,10])}) #find the index of bigger transcript
#build a new data frame contains the required transcripts(the transcript with bigger length )
#I think we can enhance the following segmment of code
geneInfo.unique = data.frame()
for (i in 1:length(gene_list))
{
new_row = gene_list[[i]][max_index[[i]],]
geneInfo.unique <- rbind(geneInfo.unique, new_row)
}
row.names(geneInfo.unique)= geneInfo.unique[,6]
geneInfo.unique[,"percentage_gc_content"]=geneInfo.unique[,"percentage_gc_content"]/100.0
write.csv(geneInfo.unique, file = "geneInfo_bimomart.csv")