Skip to content

Commit

Permalink
fix: Exclude characters with special meaning in Lucene Query Parser s…
Browse files Browse the repository at this point in the history
…yntax from searchbylabel search (DEV-1446) (#2269)
  • Loading branch information
irinaschubert committed Oct 28, 2022
1 parent 68f19c3 commit b359916
Show file tree
Hide file tree
Showing 13 changed files with 316 additions and 19 deletions.
13 changes: 9 additions & 4 deletions docs/03-apis/api-v2/reading-and-searching-resources.md
Expand Up @@ -462,12 +462,17 @@ character:
- Zeitglöcklein des Lebens

With each character added to the last term, the selection gets more
specific. The first term should at least contain four characters. To
specific. The first term should contain at least three characters. To
make this kind of "search as you type" possible, a wildcard character is
automatically added to the last search term.
Search by label automatically adds Lucene operators,
search strings are expected not to contain any characters with a special meaning in
[Lucene Query Parser syntax](../../07-lucene/index.md).

Characters provided by the user that have a special meaning in the Lucene Query Parser
syntax are replaced by a whitespace character for this search. For example, if a user types
"Zeit-Glöcklein", it is interpreted as "Zeit Glöcklein". Whitespace is normalized afterwards.
The special characters that are replaced are:
`+`, `-`, `&`, `|`, `!`, `(`, `)`, `[`, `]`, `{`, `}`, `^`, `"`, `~`, `*`, `?`, `:`, `\`

A resource whose `rdfs:label` contains one of these special characters can still be found.

```
HTTP GET to http://host/v2/searchbylabel/searchValue[limitToResourceClass=resourceClassIRI]
Expand Down
4 changes: 2 additions & 2 deletions mkdocs.yml
Expand Up @@ -43,8 +43,8 @@ nav:
- Getting Lists: 03-apis/api-v2/getting-lists.md
- XML to Standoff Mapping: 03-apis/api-v2/xml-to-standoff-mapping.md
- Gravsearch - Virtual Graph Search: 03-apis/api-v2/query-language.md
- Editing Resources: 03-apis/api-v2/editing-resources.md
- Editing Values: 03-apis/api-v2/editing-values.md
- Creating and Editing Resources: 03-apis/api-v2/editing-resources.md
- Creating and Editing Values: 03-apis/api-v2/editing-values.md
- Querying, Creating, and Updating Ontologies: 03-apis/api-v2/ontology-information.md
- TEI/XML: 03-apis/api-v2/tei-xml.md
- Permalinks: 03-apis/api-v2/permalinks.md
Expand Down
4 changes: 2 additions & 2 deletions test_data/all_data/anything-data.ttl
Expand Up @@ -2269,7 +2269,7 @@
knora-base:attachedToUser <http://rdfh.ch/users/9XBCrDV3SRa7kS1WwynB4Q> ;
knora-base:attachedToProject <http://rdfh.ch/projects/Lw3FC39BSzCwvmdOaTyLqQ> ;
knora-base:hasPermissions "V knora-admin:UnknownUser|M knora-admin:ProjectMember" ;
knora-base:creationDate "2019-11-29T10:00:00.673298Z"^^xsd:dateTime ;
knora-base:valueCreationDate "2019-11-29T10:00:00.673298Z"^^xsd:dateTime ;
knora-base:isDeleted false .

# Video Sequence Resource
Expand Down Expand Up @@ -2355,7 +2355,7 @@
knora-base:attachedToUser <http://rdfh.ch/users/9XBCrDV3SRa7kS1WwynB4Q> ;
knora-base:attachedToProject <http://rdfh.ch/projects/Lw3FC39BSzCwvmdOaTyLqQ> ;
knora-base:hasPermissions "V knora-admin:UnknownUser|M knora-admin:ProjectMember" ;
knora-base:creationDate "2019-11-29T10:00:00.673298Z"^^xsd:dateTime ;
knora-base:valueCreationDate "2019-11-29T10:00:00.673298Z"^^xsd:dateTime ;
knora-base:isDeleted false .

# Audio Sequence Resource
Expand Down
20 changes: 20 additions & 0 deletions test_data/all_data/books-data.ttl
Expand Up @@ -50,6 +50,16 @@
knora-base:hasPermissions "CR knora-admin:Creator|M knora-admin:ProjectMember|V knora-admin:KnownUser|RV knora-admin:UnknownUser" ;
knora-base:attachedToUser <http://rdfh.ch/users/BhkfBc3hTeS_IDo-JgXRbQ> .

<http://rdfh.ch/0001/book-instance-05/values/has-title-value-05>
a knora-base:TextValue ;
knora-base:valueHasUUID "dFLLWiihTAqxJsh0iTQ1HQ"^^xsd:string ;
knora-base:isDeleted false ;
knora-base:valueCreationDate "2018-05-29T16:42:04.381Z"^^xsd:dateTime ;
knora-base:valueHasOrder 0 ;
knora-base:valueHasString "A title" ;
knora-base:hasPermissions "CR knora-admin:Creator|M knora-admin:ProjectMember|V knora-admin:KnownUser|RV knora-admin:UnknownUser" ;
knora-base:attachedToUser <http://rdfh.ch/users/BhkfBc3hTeS_IDo-JgXRbQ> .

<http://rdfh.ch/0001/page-instance-01/values/has-page-number-value-01>
a knora-base:IntValue ;
knora-base:valueHasUUID "SZyeLLmOTcCCuS3B0VksHQ"^^xsd:string ;
Expand Down Expand Up @@ -137,6 +147,16 @@
rdfs:label "Treasure Island" ;
knora-base:isDeleted false .

<http://rdfh.ch/0001/book-instance-05>
a books:Book ;
knora-base:attachedToUser <http://rdfh.ch/users/9XBCrDV3SRa7kS1WwynB4Q> ;
knora-base:attachedToProject <http://rdfh.ch/projects/Lw3FC39BSzCwvmdOaTyLqQ> ;
knora-base:hasPermissions "CR knora-admin:Creator|M knora-admin:ProjectMember|V knora-admin:KnownUser,knora-admin:UnknownUser" ;
knora-base:creationDate "2019-11-29T10:00:00.673298Z"^^xsd:dateTime ;
books:hasTitle <http://rdfh.ch/0001/book-instance-05/values/has-title-value-05> ;
rdfs:label "this .,:; is + a - test & with \\ special ( characters ) in [] {} | the || label?!" ;
knora-base:isDeleted false .

<http://rdfh.ch/0001/page-instance-01>
a books:Page ;
knora-base:attachedToUser <http://rdfh.ch/users/9XBCrDV3SRa7kS1WwynB4Q> ;
Expand Down
80 changes: 80 additions & 0 deletions test_data/searchR2RV2/SearchbylabelSimple.jsonld
@@ -0,0 +1,80 @@
{
"knora-api:arkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=04K",
"@type": "xsd:anyURI"
},
"rdfs:label": "Treasure Island",
"knora-api:versionArkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=04K.20191129T100000673298Z",
"@type": "xsd:anyURI"
},
"knora-api:attachedToProject": {
"@id": "http://rdfh.ch/projects/Lw3FC39BSzCwvmdOaTyLqQ"
},
"knora-api:userHasPermission": "RV",
"knora-api:creationDate": {
"@value": "2019-11-29T10:00:00.673298Z",
"@type": "xsd:dateTimeStamp"
},
"knora-api:hasPermissions": "CR knora-admin:Creator|M knora-admin:ProjectMember|V knora-admin:KnownUser|RV knora-admin:UnknownUser",
"@type": "books:Book",
"books:hasTextType": {
"knora-api:arkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=04K/00list_value_adventurQo",
"@type": "xsd:anyURI"
},
"knora-api:userHasPermission": "RV",
"knora-api:valueCreationDate": {
"@value": "2018-05-29T16:42:04.381Z",
"@type": "xsd:dateTimeStamp"
},
"knora-api:attachedToUser": {
"@id": "http://rdfh.ch/users/BhkfBc3hTeS_IDo-JgXRbQ"
},
"knora-api:valueHasUUID": "00list_value_adventurQ",
"knora-api:hasPermissions": "CR knora-admin:Creator|M knora-admin:ProjectMember|V knora-admin:KnownUser|RV knora-admin:UnknownUser",
"@type": "knora-api:ListValue",
"@id": "http://rdfh.ch/0001/book-instance-04/values/has-list-value-03",
"knora-api:listValueAsListNode": {
"@id": "http://rdfh.ch/lists/0001/ynm02-05"
},
"knora-api:versionArkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=04K/00list_value_adventurQo.20180529T164204381Z",
"@type": "xsd:anyURI"
}
},
"@id": "http://rdfh.ch/0001/book-instance-04",
"books:hasTitle": {
"knora-api:valueAsString": "Treasure Island",
"knora-api:arkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=04K/IN4R19yYR0ygi3K2VEHpUQe",
"@type": "xsd:anyURI"
},
"knora-api:versionArkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=04K/IN4R19yYR0ygi3K2VEHpUQe.20180529T164204381Z",
"@type": "xsd:anyURI"
},
"knora-api:userHasPermission": "RV",
"knora-api:valueCreationDate": {
"@value": "2018-05-29T16:42:04.381Z",
"@type": "xsd:dateTimeStamp"
},
"knora-api:attachedToUser": {
"@id": "http://rdfh.ch/users/BhkfBc3hTeS_IDo-JgXRbQ"
},
"knora-api:valueHasUUID": "IN4R19yYR0ygi3K2VEHpUQ",
"knora-api:hasPermissions": "CR knora-admin:Creator|M knora-admin:ProjectMember|V knora-admin:KnownUser|RV knora-admin:UnknownUser",
"@type": "knora-api:TextValue",
"@id": "http://rdfh.ch/0001/book-instance-04/values/has-title-value-04"
},
"knora-api:attachedToUser": {
"@id": "http://rdfh.ch/users/9XBCrDV3SRa7kS1WwynB4Q"
},
"@context": {
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"knora-api": "http://api.knora.org/ontology/knora-api/v2#",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"books": "http://0.0.0.0:3333/ontology/0001/books/v2#",
"xsd": "http://www.w3.org/2001/XMLSchema#"
}
}
55 changes: 55 additions & 0 deletions test_data/searchR2RV2/SearchbylabelSpecialCharacters.jsonld
@@ -0,0 +1,55 @@
{
"knora-api:arkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=05I",
"@type": "xsd:anyURI"
},
"rdfs:label": "this .,:; is + a - test & with \\ special ( characters ) in [] {} | the || label?!",
"knora-api:versionArkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=05I.20191129T100000673298Z",
"@type": "xsd:anyURI"
},
"knora-api:attachedToProject": {
"@id": "http://rdfh.ch/projects/Lw3FC39BSzCwvmdOaTyLqQ"
},
"knora-api:userHasPermission": "V",
"knora-api:creationDate": {
"@value": "2019-11-29T10:00:00.673298Z",
"@type": "xsd:dateTimeStamp"
},
"knora-api:hasPermissions": "CR knora-admin:Creator|M knora-admin:ProjectMember|V knora-admin:KnownUser,knora-admin:UnknownUser",
"@type": "books:Book",
"@id": "http://rdfh.ch/0001/book-instance-05",
"books:hasTitle": {
"knora-api:valueAsString": "A title",
"knora-api:arkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=05I/dFLLWiihTAqxJsh0iTQ1HQ0",
"@type": "xsd:anyURI"
},
"knora-api:versionArkUrl": {
"@value": "http://0.0.0.0:3336/ark:/72163/1/0001/book=instance=05I/dFLLWiihTAqxJsh0iTQ1HQ0.20180529T164204381Z",
"@type": "xsd:anyURI"
},
"knora-api:userHasPermission": "RV",
"knora-api:valueCreationDate": {
"@value": "2018-05-29T16:42:04.381Z",
"@type": "xsd:dateTimeStamp"
},
"knora-api:attachedToUser": {
"@id": "http://rdfh.ch/users/BhkfBc3hTeS_IDo-JgXRbQ"
},
"knora-api:valueHasUUID": "dFLLWiihTAqxJsh0iTQ1HQ",
"knora-api:hasPermissions": "CR knora-admin:Creator|M knora-admin:ProjectMember|V knora-admin:KnownUser|RV knora-admin:UnknownUser",
"@type": "knora-api:TextValue",
"@id": "http://rdfh.ch/0001/book-instance-05/values/has-title-value-05"
},
"knora-api:attachedToUser": {
"@id": "http://rdfh.ch/users/9XBCrDV3SRa7kS1WwynB4Q"
},
"@context": {
"rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
"knora-api": "http://api.knora.org/ontology/knora-api/v2#",
"rdfs": "http://www.w3.org/2000/01/rdf-schema#",
"books": "http://0.0.0.0:3333/ontology/0001/books/v2#",
"xsd": "http://www.w3.org/2001/XMLSchema#"
}
}
Expand Up @@ -43,13 +43,13 @@ trait InstrumentationSupport {
/**
* NOTE: The elapsed time of the span is saved somewhere by kamon, but
* I have no idea how to get to it and this is why I'm calculating
* it in the metricsLogger.info line. This is a quick and dirty hack to
* it in the metricsLogger.debug line. This is a quick and dirty hack to
* have at least something.
*/
val start = System.currentTimeMillis()
Kamon.span(name) {
future.andThen { case Success(_) =>
metricsLogger.info(s"$name: {} ms", System.currentTimeMillis() - start)
metricsLogger.debug(s"$name: {} ms", System.currentTimeMillis() - start)
}
}
}
Expand Down
Expand Up @@ -1666,8 +1666,6 @@ class StringFormatter private (
/**
* Makes a string safe to be entered in the triplestore by escaping special chars.
*
* If the param `revert` is set to `true`, the string is unescaped.
*
* @param s a string.
* @param errorFun a function that throws an exception. It will be called if the string is empty or contains
* a carriage return (`\r`).
Expand Down Expand Up @@ -1698,6 +1696,17 @@ class StringFormatter private (
SparqlEscapeInput
)

/**
 * Prepares a search string for the Lucene Query Parser: each occurrence of one of the characters
 * `+ - & | ! ( ) { } [ ] ^ " ~ * ? : \` is replaced by a single space, and the result is then
 * passed through `StringUtils.normalizeSpace`.
 *
 * NOTE(review): `/` is listed as a special character by recent Lucene query parser versions but
 * is not replaced here - confirm whether that is intentional.
 *
 * @param s the search string to clean up.
 * @return the string with the special characters replaced by spaces and its whitespace normalized.
 */
def replaceLuceneQueryParserSyntaxCharacters(s: String): String = {
  // Character class matching every character that is replaced by a space.
  val luceneSpecialCharacterPattern = "[\\+\\-&\\|!\\(\\)\\{\\}\\[\\]\\^\"~\\*\\?:\\\\]"
  val blanked                       = s.replaceAll(luceneSpecialCharacterPattern, " ")
  // Replacing characters by spaces can leave runs of blanks; normalize them in a final step.
  StringUtils.normalizeSpace(blanked)
}

/**
* Encodes a string for use in JSON, and encloses it in quotation marks.
*
Expand Down
Expand Up @@ -1801,7 +1801,7 @@ class UsersResponderADM(responderData: ResponderData) extends Responder(responde
*/
private def getUserFromCacheOrTriplestore(
identifier: UserIdentifierADM
): Future[Option[UserADM]] = // tracedFuture("admin-user-get-user-from-cache-or-triplestore") {
): Future[Option[UserADM]] = tracedFuture("admin-user-get-user-from-cache-or-triplestore") {
if (cacheServiceSettings.cacheServiceEnabled) {
// caching enabled
getUserFromCache(identifier).flatMap {
Expand Down Expand Up @@ -1830,7 +1830,7 @@ class UsersResponderADM(responderData: ResponderData) extends Responder(responde
log.debug("getUserFromCacheOrTriplestore - caching disabled. getting from triplestore.")
getUserFromTriplestore(identifier = identifier)
}
// }
}

/**
* Tries to retrieve a [[UserADM]] from the triplestore.
Expand Down
Expand Up @@ -25,12 +25,12 @@ trait HealthCheck {

protected def healthCheck(state: State): UIO[HttpResponse] =
for {
_ <- ZIO.logInfo("get application state")
_ <- ZIO.logDebug("get application state")
state <- state.get
result <- setHealthState(state)
_ <- ZIO.logInfo("set health state")
_ <- ZIO.logDebug("set health state")
response <- createResponse(result)
_ <- ZIO.logInfo("getting application state done")
_ <- ZIO.logDebug("getting application state done")
} yield response

private def setHealthState(state: AppState): UIO[HealthCheckResult] =
Expand Down
Expand Up @@ -429,11 +429,12 @@ class SearchRouteV2(routeData: KnoraRouteData) extends KnoraRoute(routeData) wit
"v2" / "searchbylabel" / Segment
) { searchval => // TODO: if a space is encoded as a "+", this is not converted back to a space
get { requestContext =>
val searchString =
val sparqlEncodedSearchString =
stringFormatter.toSparqlEncodedString(
searchval,
throw BadRequestException(s"Invalid search string: '$searchval'")
)
val searchString = stringFormatter.replaceLuceneQueryParserSyntaxCharacters(sparqlEncodedSearchString)

if (searchString.length < routeData.appConfig.v2.fulltextSearch.searchValueMinLength) {
throw BadRequestException(
Expand Down
Expand Up @@ -31,7 +31,7 @@
SELECT (count(distinct ?resource) as ?count)
}
WHERE {
?resource <http://jena.apache.org/text#query> "@searchTerm.generateLiteralForLuceneIndexWithoutExactSequence" .
?resource <http://jena.apache.org/text#query> (rdfs:label "@searchTerm.generateLiteralForLuceneIndexWithoutExactSequence") .

?resource a ?resourceClass ;
rdfs:label ?label .
Expand Down

0 comments on commit b359916

Please sign in to comment.