# Get_PMID_PMCID.R
# 1. Load R packages (rentrez and plyr)
library("rentrez")
library("plyr")
# 2. Set your working directory
# 3. Load csv file and create a data frame
# "FileName.csv" is a placeholder for the input file name.
# stringsAsFactors = FALSE keeps the doi column as plain text on R < 4.0 too,
# so each DOI reaches the PubMed search as a character string.
PMID_PMCID <- read.csv("FileName.csv", header = TRUE, stringsAsFactors = FALSE)
# The later steps of this script read the doi, id and handle columns; stop
# here with a clear message if any of them is missing from the input file.
required_columns <- c("doi", "id", "handle")
missing_columns <- setdiff(required_columns, names(PMID_PMCID))
if (length(missing_columns) > 0) {
  stop("Input file is missing required column(s): ",
       paste(missing_columns, collapse = ", "), call. = FALSE)
}
# 4. Create two columns with empty value: One for PMID and the other for PMCID
# rep() sizes the placeholder to the table, so an empty table does not fail.
PMID_PMCID$pmid <- rep(" ", nrow(PMID_PMCID))
PMID_PMCID$pmcid <- rep(" ", nrow(PMID_PMCID))
# 5. Function to get Pubmed ID
# Look up the PubMed ID (PMID) for one DOI.
#
# Arguments:
#   x      A single DOI (character; a factor element is converted to text).
#   field  PubMed search field tag appended to the term as "[field]".
#          The default "doi" restricts the match to the DOI field, so a
#          free-text hit on a different article is not returned. Pass NULL
#          to search the bare term.
#
# Returns:
#   The PMID as returned by entrez_search() when the search yields exactly
#   one ID; otherwise the string "None" (also for a missing or blank DOI).
myPubmedID <- function(x, field = "doi") {
  doi <- trimws(as.character(x))
  # Nothing to search for: skip the network call instead of querying PubMed
  # with "NA" or an empty term.
  if (length(doi) != 1L || is.na(doi) || !nzchar(doi)) {
    return("None")
  }
  term <- if (is.null(field)) doi else paste0(doi, "[", field, "]")
  Pubmed <- entrez_search(db = "pubmed", term = term)
  Pubmed_ID <- Pubmed$ids
  # Zero hits or an ambiguous result (several IDs) both count as "not found".
  if (length(Pubmed_ID) == 1L) {
    PMID <- Pubmed_ID
  } else {
    PMID <- "None"
  }
  PMID
}
# 6. Get Pubmed ID
# vapply() enforces exactly one character value per DOI, so the result always
# lines up row-for-row with the table; unlist(lapply()) would silently change
# length if a lookup ever returned zero or several values.
PMID_list <- vapply(
  PMID_PMCID$doi, myPubmedID, character(1), USE.NAMES = FALSE
)
PMID_PMCID$pmid <- PMID_list
## 7: Issue & Solution from Step 6.
### Issue: myPubmedID may return a single Pubmed ID that belongs to an
### associated article rather than to the searched DOI.
### Solution: Identify duplicated ID values (ignoring None) and reset the
### wrong ones to None.
duplicated_IDs <- data.frame(
  value = PMID_PMCID$pmid[duplicated(PMID_PMCID$pmid)]
)
# duplicated() flags every repeat after the first occurrence, so an ID that
# appears n times is listed n-1 times here.
duplicated_IDs <- subset(duplicated_IDs, value != "None")
# Remove None value, then count how often each remaining ID is repeated
# (columns: value, freq).
duplicated_IDs <- data.frame(count(duplicated_IDs))
## 7-1: If the freq is more than 1, add the ID to wrong_PMIDs.
## 7-2: If the freq is equal to 1, check manually. Sometimes, they are
## correct PMID. If not, add the ID to wrong_PMIDs as well.
# IDs are kept as character strings because the pmid column is character.
wrong_PMIDs <- c("23104645")
PMID_PMCID$pmid[PMID_PMCID$pmid %in% wrong_PMIDs] <- "None"
# 8: Function to get Pubmed Central ID
# Look up the PubMed Central ID entry for one PubMed ID.
#
# Arguments:
#   x  A single PubMed ID, or the string "None" for "no PMID available".
#
# Returns:
#   A single string: the first articleids value that contains "pmc-id:" in
#   the PubMed summary for x, or "None" when x is "None", missing or blank,
#   or when the summary has no such value.
myPMCID <- function(x) {
  pmid <- as.character(x)
  # Guard first: comparing NA with "None" inside if() would raise an error,
  # and there is nothing to look up for a missing ID anyway.
  if (length(pmid) != 1L || is.na(pmid) || !nzchar(pmid) || pmid == "None") {
    return("None")
  }
  taxize_summ <- entrez_summary(db = "pubmed", id = x)
  PMC <- data.frame(taxize_summ$articleids)
  PMC_ID <- grep("pmc-id:", PMC$value, value = TRUE)
  if (length(PMC_ID) == 0L) {
    return("None")
  }
  # Return one value only, so the caller gets exactly one result per PMID
  # even if several articleids entries match.
  PMC_ID[1L]
}
# 9: Get Pubmed Central ID
# vapply() requires exactly one character value per row, so a lookup that
# returns several values fails loudly here instead of shifting the column.
PMCID_list <- vapply(
  PMID_PMCID$pmid, myPMCID, character(1), USE.NAMES = FALSE
)
PMID_PMCID$pmcid <- PMCID_list
# 10: Delete all rows with PMID value None
PMID_PMCID <- PMID_PMCID[which(PMID_PMCID$pmid != "None"), ]
# 11: Delete all rows with PMCID value not None
# (only rows that have a PMID but no PMCID are kept)
PMID_PMCID <- PMID_PMCID[which(PMID_PMCID$pmcid == "None"), ]
# 12: Load csv file of items under embargo and create a data frame
# "FileName.csv" is a placeholder for the embargo file name; the file must
# provide the ItemID and Embargo columns used below.
Embargo <- read.csv("FileName.csv", header = TRUE)
# 13: Identify items under Embargo and change column name
items_under_embargo <- data.frame(
  ID = intersect(PMID_PMCID$id, Embargo$ItemID)
)
# Keep a copy of the embargoed rows now: step 14 removes them from
# PMID_PMCID, so they can no longer be selected from it in step 17.
items_under_embargo_from_PMID_PMCID <- subset(
  PMID_PMCID, id %in% items_under_embargo$ID, select = id:pmcid
)
# 14: Remove items under Embargo from PMID_PMCID data table
PMID_PMCID <- subset(
  PMID_PMCID, !(id %in% items_under_embargo$ID), select = id:pmcid
)
# 15: Final Report
# "YourID" and "IconURL" are placeholder values.
PMID_PMCID$ProviderID <- "YourID"
PMID_PMCID$Database <- "PubMed"
PMID_PMCID$IconURL <- "IconURL"
PMID_PMCID$UrlName <- "Full Text from"
PMID_PMCID$SubjectType <- " "
PMID_PMCID$Attribute <- "Full-text PDF"
final <- subset(
  PMID_PMCID,
  select = c(ProviderID, Database, pmid, handle, IconURL, UrlName,
             SubjectType, Attribute)
)
# pmid is reported as UID and handle as URL.
colnames(final) <- c("ProviderID", "Database", "UID", "URL", "IconURL",
                     "UrlName", "SubjectType", "Attribute")
# 16: Save the file in the csv format
write.table(final, file = "PMID_PMCID.csv", sep = ",", row.names = FALSE)
# 17: Identify items under embargo but released soon
# (the matching PMID_PMCID rows were captured before step 14)
items_under_embargo_from_Embargo <- subset(
  Embargo, ItemID %in% items_under_embargo$ID, select = c(ItemID, Embargo)
)
colnames(items_under_embargo_from_Embargo) <- c("id", "Embargo")
# 18: Combine two tables by id
All_Embargo <- merge(
  items_under_embargo_from_PMID_PMCID,
  items_under_embargo_from_Embargo,
  by = "id"
)
# 19: Remove items under indefinite embargo
# (rows whose Embargo value is "9999-01-01" are dropped)
Removed_under_indefinite_embargo <-
  All_Embargo[which(All_Embargo$Embargo != "9999-01-01"), ]
# 20: Save the file of items under definite embargo
write.table(Removed_under_indefinite_embargo, file = "ItemsUnderEmbargo.csv",
            sep = ",", row.names = FALSE)