You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
There is some functionality in the package to work with annotations, but it is not developed very well. It is removed from the package temporarily and stored here, to be reintegrated later on.
#' Inspect Annotations Present in a Partition#' #' @param x a partition object#' @param annotation number of an annotation that will be looked up in values for s-attribute#' @examples#' \dontrun{#' P <- partition(#' "GERMAPARL",#' lp = 17, session = 175, speaker = "Fritz Rudolf K\u00F6rper",#' xml = "nested"#' )#' annotations_inspect(P, annotation = "16")#' }#' @export annotations_inspect#' @importFrom data.table rbindlistannotations_inspect<-function(x, annotation=NULL){
annos<- unique(unlist(strsplit(sAttributes(x, "cap"), split="\\|")))
annos<-annos[-which(annos%in%"")]
if (!is.null(annotation)){
if (!annotation%in%annos) stop("annotation is not present in partition")
annos<-annotation
}
.show<-function(x, anno){
xsub<- partition(x, cap= sprintf("^.*\\|%s\\|.*$", anno), xml="nested", regex=TRUE)
cpos<- unique(unlist(apply(xsub@cpos, 1, function(cpos) cpos[1]:cpos[2])))
H<- html(x)
H<- highlight(H, list(yellow=cpos))
show(H)
}
if (length(annos) >1){
for (annoinannos){
print(anno)
.show(x, anno)
if (readline(prompt="print 'q' to exit") =="q") break
}
} else {
.show(x, annos)
}
invisible(NULL)
}
#' Get Features of Annotations#' #' @param x XXX#' @param pAttribute XXX#' @param annotation XXX#' @export annotation_features#' @importFrom methods asannotation_features<-function(x, pAttribute= c("word", "pos"), annotation){
P<- partition(x, cap= sprintf("^.*\\|%s\\|.*$", annotation), regex=TRUE)
P2<- enrich(P, pAttribute=pAttribute)
annoCount<- as(P2, "count")
gparl<-Corpus$new("GERMAPARL", pAttribute=pAttribute)$as.partition()
gparlCount<- as(gparl, "count")
F<- features(annoCount, gparlCount, included=TRUE)
F<- subset(F, F[["chisquare"]] >10.83)
F<- subset(F, F[["pos"]] %in% c("NN", "ADJA"))
F
}
#' Write CAP Annotations to GermaParl#' #' @param dir directory with CAP annotations#' @importFrom polmineR sAttributes#' @importFrom pbapply pblapply#' @importFrom data.table := rbindlist#' @export germaparl_encode_cap_annotationsgermaparl_encode_cap_annotations<-function(dir="/Lab/gitlab/plprbttxt_annotations/2016_09_21"){
if (!requireNamespace(package="polmineR.anno", quietly=TRUE)){
stop(
"Package 'polmineR.anno' needs to be installed, but is not available.",
"It can be installed from the drat repository of the PolMine Project."
)
}
conll_files<- list.files(dir)
Encoding(conll_files) <-"UTF-8"# Extract metadata information from filename and match it to corpus metadataspeakerNames<- sAttributes("GERMAPARL", "speaker")
.SD<-NULL# just for the sake of R CMD checkdtList<-pbapply::pblapply(
conll_files,
function(filename){
# get legislative period and sessionlp<- gsub("^(\\d+)_.*?$", "\\1", filename)
sessionNo<- gsub("^\\d+_(\\d+)_.*?$", "\\1", filename)
# get party from filenameparty<- gsub("^\\d+_\\d+_([0-9A-Z\u00dc_]+)_.*?$", "\\1", filename)
if (grepl("^.*NDNIS_90_DIE_.*$", filename)) party<-"B90_DIE_GRUENEN"if (grepl("fraktionslos", filename)) party<-"fraktionslos"if (party=="F_D_P_") party<-"FDP"# get speaker from filenamespeakerName<- gsub("^(.*)_\\d+\\.tsv$", "\\1", filename)
speakerName<- gsub("^\\d+_\\d+_[0-9A-Z\u00dc_]+_(.*?)?$", "\\1", speakerName)
if (grepl("NEN_", speakerName)) speakerName<- gsub("^.*NEN_", "", speakerName)
speakerName<- gsub("_", "", speakerName)
speakerName<- gsub("^Dr\\s", "", speakerName)
speakerName<- gsub("\\s+[A-Z]\\s+", "", speakerName)
speakerName<- gsub("^-Ing\\s", "", speakerName)
speakerName<- gsub("^\\s*(.*?)\\s*$", "\\1", speakerName)
# match filename-speaker if (speakerName%in%speakerNames){
isPresent<-speakerName
} else {
matches<- agrep(speakerName, speakerNames)
if (length(matches) >0){
isPresent<-speakerNames[matches[1]]
} else {
tmp<- iconv(strsplit(speakerName, split="")[[1]], from="UTF-8", to="ISO-8859-1")
tmp<-tmp[!is.na(tmp)]
speakerNameNew<- iconv(paste(tmp, collapse=""), from="ISO-8859-1", to="UTF-8")
matches<- agrep(speakerNameNew, speakerNames)
if (length(matches) >0){
isPresent<-speakerNames[matches[1]]
} else {
splitted<- strsplit(speakerName, "")[[1]]
lastName<-splitted[length(splitted)]
lastTry<- grep(paste(lastName, "$", sep=""), speakerNames, value=T)
if (length(lastTry) >0){
isPresent<-lastTry[1]
} else {
isPresent<-"XXX"
}
}
}
}
sAttributes<- data.table(
filename=filename,
lp=lp,
session=sessionNo,
parliamentary_group=party,
speaker=isPresent
)
}
)
dt<- rbindlist(dtList)
dt[["speaker"]] <- enc2utf8(dt[["speaker"]]) # encodings are mixed # manual correction of known problemsdt[dt[["speaker"]] =="Peter Glotz", "speaker":="Peter G\u00F6tz"]
dt[dt[["speaker"]] =="Heinrich L. Kolb", "speaker":="Heinrich Leonhard Kolb"]
dt[dt[["speaker"]] =="Kersten Naumann", "speaker":="Kersten Steinke"]
dt[dt[["speaker"]] =="Sevim Dagdelen", "speaker":="Sevim Dadelen"]
# Get corpus positionsGPARL<-Corpus$new("GERMAPARL", sAttribute= c("session", "speaker", "lp"))
CoNLL_objects<-pbapply::pblapply(
setNames(1L:nrow(dt), dt[["filename"]]),
function(i){
P<- partition(
GPARL,
def= as.list(unlist(dt[i, c("session", "speaker", "lp")])),
verbose=FALSE
)
if (is.null(P)) return( NULL )
C<-polmineR.anno::CoNLL$new(filename= file.path(dir, conll_files[i]), partition=P)
C$getCorpusPositions()
C
}
)
# not used - inspect failed matchesif (FALSE) View(dt[which(sapply(CoNLL_objects, is.null) ==TRUE)])
regions<- rbindlist(lapply(CoNLL_objects, function(x) x[["cpos"]]))
regions<-regions[is.na(regions[["cpos_left"]]) ==FALSE] # check why this is necessary!!
setnames(regions, old="id", new="cap")
regions[, "quote":=NULL]
.aggr<-function(.SD){
data.table(
cpos_left=.SD[["cpos_left"]]:.SD[["cpos_right"]],
cpos_right=.SD[["cpos_left"]]:.SD[["cpos_right"]],
cap=.SD[["cap"]]
)
}
regionsToken<-regions[, .aggr(.SD), by= seq_len(nrow(regions))]
regionsToken[, "seq_len":=NULL]
# remove B- and I-regionsToken[, "cap2":= ifelse(
nchar(gsub("^(B|I)-(\\d+)_(\\d+)$", "\\3", regionsToken[["cap"]])) >=3,
gsub("^(B|I)-(\\d+)_(\\d+)$", "\\3", regionsToken[["cap"]]),
gsub("^(B|I)-(\\d+)_(\\d+)$", "\\2", regionsToken[["cap"]])
)]
regionsToken[, "cap3":= ifelse(
nchar(regionsToken[["cap2"]]) >=3,
gsub("^(\\d+)(\\d{2})$", "\\1-\\2", regionsToken[["cap2"]]),
regionsToken[["cap2"]]
)]
regionsToken[["cap"]] <-regionsToken[["cap3"]]
regionsToken[, "cap2":=NULL][, "cap3":=NULL]
.paste<-function(.SD) list(cap= paste(.SD[["cap"]], collapse="|"))
regionsToken2<-regionsToken[, .paste(.SD), by="cpos_left"]
regionsToken2[, "cap":= paste("|", regionsToken2[["cap"]], "|", sep="")]
regionsToken2[, "cpos_right":=regionsToken2[["cpos_left"]] ]
setcolorder(regionsToken2, neworder= c("cpos_left", "cpos_right", "cap"))
germaparl_regdata<- registry_file_parse(corpus="GERMAPARL", registry_dir= germaparl_regdir())
germaparl_charset<-germaparl_regdata[["properties"]][["charset"]]
germaparl_home<-germaparl_regdata[["home"]]
cwbtools::s_attribute_encode(
values=regionsToken2[["cap"]],
data_dir=germaparl_home,
s_attribute="cap",
corpus="GERMAPARL",
region_matrix= as.matrix(regionsToken2[, c("cpos_left", "cpos_right")]),
method="CWB",
registry_dir= germaparl_regdir(),
encoding=germaparl_charset
)
invisible( NULL )
}
The text was updated successfully, but these errors were encountered:
There is some functionality in the package to work with annotations, but it is not developed very well. It is removed from the package temporarily and stored here, to be reintegrated later on.
The text was updated successfully, but these errors were encountered: