/
opennlp.Rmd
113 lines (83 loc) · 3.17 KB
/
opennlp.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
---
title: "CWB corpora and openNLP"
author: "Andreas Blätte (andreas.blaette@uni-due.de)"
date: "`r Sys.Date()`"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{CWB corpora and openNLP}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
editor_options:
  chunk_output_type: console
---
## Required packages
```{r packages, eval = FALSE}
library(cwbtools)
library(RcppCWB)
library(NLP)
library(openNLP)
library(data.table)
```
## Interfacing to openNLP
### Decode p-attribute 'word'
```{r decode_word, eval = FALSE}
# Number of tokens in the corpus, taken from the size of the positional
# attribute "word".
corpus_size <- cl_attribute_size(
  "UNGA",
  attribute = "word",
  attribute_type = "p"
)

# One row per corpus position; the positions run from 0 to corpus_size - 1.
dt <- data.table(cpos = 0L:(corpus_size - 1L))

# Resolve every corpus position to its token id, then every id to its string.
# `:=` adds the columns by reference, so no re-assignment of `dt` is needed.
dt[, "id" := cl_cpos2id("UNGA", p_attribute = "word", cpos = cpos)]
dt[, "word" := cl_id2str("UNGA", p_attribute = "word", id = id)]
```
### Reconstruct String
```{r get_txt, eval = FALSE}
# Flag whether a token is followed by a blank: TRUE unless the *next* token is
# one of the listed punctuation marks; the last token never gets a blank.
# Dropping the first element with `[-1L]` (rather than indexing with
# `2L:length(word)`) keeps the result at the right length for a single-token
# table too, where `2L:1L` would count downwards.
dt[, "whitespace" := c(!(word[-1L] %in% c(".", ",", ":", "!", "?", ";")), FALSE)]

# Glue each token to its trailing blank (if any), collapse everything into one
# string and wrap it with String().
txt <- String(
  paste0(dt[["word"]], ifelse(dt[["whitespace"]], " ", ""), collapse = "")
)
```
### Match token annotation
```{r word_offset_annotation, eval = FALSE}
# Number of characters of every token. nchar() is vectorised, so it is applied
# to the whole column at once instead of token by token via sapply().
dt[, "nchar" := nchar(dt[["word"]])]

# Character offset at which each token starts in `txt`: the first token starts
# at 1, every later token starts right after the preceding token plus its
# trailing blank (`whitespace` counts as 0 or 1). head(x, -1L) drops the last
# cumulative sum; unlike `x[1L:(nrow(dt) - 1L)]` it also returns the right
# length for a one-row table.
dt[, "start" := c(1L, head(cumsum(dt[["nchar"]] + dt[["whitespace"]]) + 1L, -1L))]

# Offset of the last character of each token.
dt[, "end" := dt[["start"]] + dt[["nchar"]] - 1L]

# Word-level annotation: one "word" span per token, using the corpus position
# as annotation id.
w <- NLP::Annotation(
  id = dt[["cpos"]],
  type = rep.int("word", nrow(dt)),
  start = dt[["start"]],
  end = dt[["end"]]
)
```
## Create Annotations
### Sentences
```{r sentence_annotation, eval = FALSE}
# Run the sentence annotator on the reconstructed corpus text.
sentence_annotator <- Maxent_Sent_Token_Annotator()
s <- annotate(s = txt, f = sentence_annotator)

# Turn the annotated spans into a table and number them consecutively.
# seq_len(length(s)) is used instead of 1L:length(s) so that an empty
# annotation yields an empty sequence rather than c(1, 0).
spans <- as.data.table(as.Span(s))[, "sentence" := seq_len(length(s))]

# Map character offsets back to corpus positions: the token starting at a
# span's start offset gives the left boundary, the token ending at a span's
# end offset gives the right boundary.
spans[, "cpos_left" := dt[, c("cpos", "start", "end")][spans, on = "start"][["cpos"]]]
spans[, "cpos_right" := dt[, c("cpos", "start", "end")][spans, on = "end"][["cpos"]]]

# Keep the region boundaries and the sentence number only.
regions <- spans[, c("cpos_left", "cpos_right", "sentence")]
```
### Part-of-speech
```{r pos_annotation, eval = FALSE}
# Set up the part-of-speech annotator for English.
pos_annotator <- Maxent_POS_Tag_Annotator(
  language = "en",
  probs = FALSE,
  model = NULL
)

# Annotate the text, passing the word and sentence annotations as input.
p <- annotate(s = txt, f = pos_annotator, a = c(w, s))

# Pull the "POS" entry out of every annotation's feature list; entries without
# a "POS" feature yield NULL and are dropped by unlist().
pos <- unlist(
  lapply(
    as.data.frame(p)[["features"]],
    function(feature) feature[["POS"]]
  )
)
```
### Named entities
```{r, eval = FALSE}
# Install the package "openNLPmodels.en" only if it is not available yet, so
# that re-running this chunk does not trigger a fresh installation every time.
if (!requireNamespace("openNLPmodels.en", quietly = TRUE)) {
  install.packages(pkgs = "openNLPmodels.en", repos = "https://datacube.wu.ac.at")
}

# Entity annotator for English, restricted to entities of kind "person".
ne_annotator <- Maxent_Entity_Annotator(
  language = "en",
  kind = "person",
  probs = FALSE,
  model = NULL
)

# Annotate the text (word and sentence annotations are passed as input) and
# keep the annotations of type "entity" only.
ne <- annotate(s = txt, f = ne_annotator, a = c(w, s))
ne_min <- subset(ne, type == "entity")

# Map the character offsets of the entity spans back to corpus positions.
spans <- as.data.table(as.Span(ne_min))
spans[, "cpos_left" := dt[, c("cpos", "start", "end")][spans, on = "start"][["cpos"]]]
spans[, "cpos_right" := dt[, c("cpos", "start", "end")][spans, on = "end"][["cpos"]]]

# Entity kind per span. vapply() guarantees a character vector, also when no
# entity has been found (sapply() would return an empty list in that case).
spans[, "ne" := vapply(as.data.frame(ne_min)[["features"]], `[[`, character(1L), "kind")]
regions <- spans[, c("cpos_left", "cpos_right", "ne")]

# Rebuild the surface string of every entity from the tokens between its left
# and right corpus position.
# NOTE(review): this reassigns `txt`, replacing the corpus text created in the
# chunk "get_txt" with a character vector of entity strings - confirm that this
# is intended before re-running earlier chunks.
txt <- regions[,
  {
    paste(
      dt[cpos %in% .SD[["cpos_left"]]:.SD[["cpos_right"]]][["word"]],
      collapse = " "
    )
  },
  by = "cpos_left",
  .SDcols = c("cpos_left", "cpos_right")
][["V1"]]
```
## Add annotation to corpus
```{r, eval = FALSE}
# Collect the "POS" feature of every annotation in `p` as one vector;
# annotations without a "POS" feature contribute nothing.
pos <- unlist(
  lapply(
    as.data.frame(p)[["features"]],
    function(feature) feature[["POS"]]
  )
)
```