Skip to content

Commit

Permalink
Refactor the utf-8 json cleaning process to make it resuable.
Browse files Browse the repository at this point in the history
And also provide a bit more documentation on some of the choices made.
  • Loading branch information
GUI committed May 4, 2024
1 parent ccbc9d9 commit d7046c1
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 28 deletions.
2 changes: 1 addition & 1 deletion src/api-umbrella/proxy/hooks/init_preload_modules.lua
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@ require "api-umbrella.utils.shared_dict_retry"
require "api-umbrella.utils.string_template"
require "api-umbrella.utils.url_build"
require "api-umbrella.utils.url_parse"
require "api-umbrella.utils.utf8_safe_json_encode"
require "api-umbrella.utils.worker_group"
require "api-umbrella.utils.xpcall_error_handler"
require "cjson"
require "etlua"
require "libcidr-ffi"
require "lua-utf8"
require "lustache"
require "ngx.re"
require "pl.path"
Expand Down
23 changes: 6 additions & 17 deletions src/api-umbrella/proxy/hooks/log_initial_proxy.lua
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
local log_utils = require "api-umbrella.proxy.log_utils"
local utf8_clean = require("lua-utf8").clean

local ignore_request = log_utils.ignore_request
local ngx_ctx = ngx.ctx
Expand All @@ -9,7 +8,7 @@ if ignore_request(ngx_ctx) then
end

local flatten_headers = require "api-umbrella.utils.flatten_headers"
local json_encode = require "api-umbrella.utils.json_encode"
local utf8_safe_json_encode = require "api-umbrella.utils.utf8_safe_json_encode"
local xpcall_error_handler = require "api-umbrella.utils.xpcall_error_handler"

local cache_new_city_geocode = log_utils.cache_new_city_geocode
Expand Down Expand Up @@ -100,21 +99,11 @@ end
local function log_request()
-- Build the log message and send to Fluent Bit for processing.
local data = build_log_data()
local original_json = json_encode(normalized_data(data))
local message, was_valid_utf8 = utf8_clean(original_json)
if not was_valid_utf8 then
ngx.log(ngx.WARN, "log message contained invalid utf-8, original: ", original_json)
ngx.log(ngx.WARN, "log message contained invalid utf-8, cleaned: ", message)
elseif not message then
ngx.log(ngx.ERR, "failed to log message, message missing: ", original_json)
end

if message then
local _, err = send_message(message)
if err then
ngx.log(ngx.ERR, "failed to log message: ", err)
return
end
local message = utf8_safe_json_encode(normalized_data(data))
local _, err = send_message(message)
if err then
ngx.log(ngx.ERR, "failed to log message: ", err)
return
end

-- After logging, cache any new cities we see from GeoIP in our database.
Expand Down
4 changes: 1 addition & 3 deletions src/api-umbrella/utils/json_encode.lua
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,4 @@ local cjson_encode = cjson.encode
-- Increase precision in order to handle encoding bigger numbers.
cjson.encode_number_precision(16)

return function(data)
return cjson_encode(data)
end
return cjson_encode
33 changes: 33 additions & 0 deletions src/api-umbrella/utils/utf8_safe_json_encode.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
local json_encode = require "api-umbrella.utils.json_encode"
local utf8_clean = require("lua-utf8").clean

-- A JSON encoding method that ensures valid UTF-8 output. Technically JSON
-- must conform to UTF-8, but the normal `cjson` library does not enforce this.
-- There is extra overhead in this processing, so we will only use this in
-- cases where we have untrusted input and is necessary (for example, logging
-- to OpenSearch, since otherwise OpenSearch also doesn't perform any
-- validations, so we can end up with unparseable data if we send OpenSearch
-- invalid UTF-8 data).
--
-- Note that this takes the approach of encoding to a single JSON string first,
-- and then sanitizing that after the fact. In benchmarks, this appears
-- normally faster than trying to sanitize all of the inputs first (eg, on a
-- nested table). I believe this approach is still safe and should always
-- produce valid JSON output, but I'm not 100% sure that some edge-case doesn't
-- exist that might lead to invalid JSON (eg, if somehow the utf-cleaning
-- process messes with the quotes inside the JSON). So we could revisit this
-- approach if we discover edge cases, but in the meantime, we'll go with this
-- (the main risk would be invalid utf-8 input could lead to invalid JSON
-- output, in which case we might miss logging those entries).
return function(data)
local original_json = json_encode(data)
local clean_json, was_valid_utf8 = utf8_clean(original_json)
if not was_valid_utf8 then
ngx.log(ngx.WARN, "json contained invalid utf-8, original: ", original_json)
ngx.log(ngx.WARN, "json contained invalid utf-8, cleaned: ", clean_json)
elseif not clean_json then
ngx.log(ngx.ERR, "failed to perform utf-8 cleaning for json: ", original_json)
end

return clean_json
end
14 changes: 7 additions & 7 deletions test/proxy/logging/test_special_chars.rb
Original file line number Diff line number Diff line change
Expand Up @@ -173,9 +173,9 @@ def test_invalid_utf8_encoding_in_url_path_url_params_headers
assert_equal(expected_raw_in_header, record["request_origin"])
assert_equal(expected_raw_utf8_in_header, record["request_accept"])

log = log_tail.read_until(/log message contained invalid utf-8, cleaned/)
assert_match(/\[warn\].*log message contained invalid utf-8, original: \{/, log)
assert_match(/\[warn\].*log message contained invalid utf-8, cleaned: \{/, log)
log = log_tail.read_until(/json contained invalid utf-8, cleaned/)
assert_match(/\[warn\].*json contained invalid utf-8, original: \{/, log)
assert_match(/\[warn\].*json contained invalid utf-8, cleaned: \{/, log)
end

def test_encoded_strings_as_given
Expand Down Expand Up @@ -301,9 +301,9 @@ def test_invalid_quotes
record = wait_for_log(response)[:hit_source]
assert_equal("{\"user_agent\": \"foo \uFFFD bar\"}", record["api_key"])

log = log_tail.read_until(/log message contained invalid utf-8, cleaned/)
assert_match(/\[warn\].*log message contained invalid utf-8, original: \{/, log)
assert_match(/\[warn\].*log message contained invalid utf-8, cleaned: \{/, log)
log = log_tail.read_until(/json contained invalid utf-8, cleaned/)
assert_match(/\[warn\].*json contained invalid utf-8, original: \{/, log)
assert_match(/\[warn\].*json contained invalid utf-8, cleaned: \{/, log)
end

def test_utf8_json_quoting
Expand All @@ -324,6 +324,6 @@ def test_utf8_json_quoting
assert_equal("\" \" \\u0022 %22 " " \u201D", record["api_key"])

log = log_tail.read
refute_match("log message contained invalid utf-8", log)
refute_match("json contained invalid utf-8", log)
end
end

0 comments on commit d7046c1

Please sign in to comment.