Skip to content

Commit

Permalink
More complete solution for handling invalid UTF-8 in any part of log
Browse files Browse the repository at this point in the history
In various benchmarks, lua-utf8 applied once at the end seems to be the
most performant solution for handling this.
  • Loading branch information
GUI committed May 4, 2024
1 parent b153dd2 commit 8e41b90
Show file tree
Hide file tree
Showing 5 changed files with 72 additions and 57 deletions.
1 change: 1 addition & 0 deletions src/api-umbrella/proxy/hooks/init_preload_modules.lua
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ require "api-umbrella.utils.xpcall_error_handler"
require "cjson"
require "etlua"
require "libcidr-ffi"
require "lua-utf8"
require "lustache"
require "ngx.re"
require "pl.path"
Expand Down
21 changes: 16 additions & 5 deletions src/api-umbrella/proxy/hooks/log_initial_proxy.lua
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
local log_utils = require "api-umbrella.proxy.log_utils"
local utf8_clean = require("lua-utf8").clean

local ignore_request = log_utils.ignore_request
local ngx_ctx = ngx.ctx
Expand Down Expand Up @@ -99,11 +100,21 @@ end
local function log_request()
-- Build the log message and send to Fluent Bit for processing.
local data = build_log_data()
local message = json_encode(normalized_data(data))
local _, err = send_message(message)
if err then
ngx.log(ngx.ERR, "failed to log message: ", err)
return
local original_json = json_encode(normalized_data(data))
local message, was_valid_utf8 = utf8_clean(original_json)
if not was_valid_utf8 then
ngx.log(ngx.WARN, "log message contained invalid utf-8, original: ", original_json)
ngx.log(ngx.WARN, "log message contained invalid utf-8, cleaned: ", message)
elseif not message then
ngx.log(ngx.ERR, "failed to log message, message missing: ", original_json)
end

if message then
local _, err = send_message(message)
if err then
ngx.log(ngx.ERR, "failed to log message: ", err)
return
end
end

-- After logging, cache any new cities we see from GeoIP in our database.
Expand Down
26 changes: 8 additions & 18 deletions src/api-umbrella/proxy/log_utils.lua
Original file line number Diff line number Diff line change
Expand Up @@ -51,16 +51,6 @@ local function uppercase_truncate(value, max_length)
return string.upper(truncate_string(value, max_length))
end

local function sanitize_request_header(value)
local cleaned_value, _, gsub_err = re_gsub(value, "\xC0\xA2", "??", "jo")
if gsub_err then
ngx.log(ngx.ERR, "regex error: ", gsub_err)
return value
end

return cleaned_value
end

local function remove_envoy_empty_header_value(value)
if value and value == "-" then
return nil
Expand Down Expand Up @@ -296,20 +286,20 @@ function _M.normalized_data(data)
api_backend_response_flags = remove_envoy_empty_header_value(truncate(data["api_backend_response_flags"], 20)),
gatekeeper_denied_code = lowercase_truncate(data["gatekeeper_denied_code"], 50),
request_id = lowercase_truncate(data["request_id"], 20),
request_accept = sanitize_request_header(truncate(data["request_accept"], 200)),
request_accept_encoding = sanitize_request_header(truncate(data["request_accept_encoding"], 200)),
request_accept = truncate(data["request_accept"], 200),
request_accept_encoding = truncate(data["request_accept_encoding"], 200),
request_basic_auth_username = truncate(data["request_basic_auth_username"], 200),
request_connection = sanitize_request_header(truncate(data["request_connection"], 200)),
request_content_type = sanitize_request_header(truncate(data["request_content_type"], 200)),
request_connection = truncate(data["request_connection"], 200),
request_content_type = truncate(data["request_content_type"], 200),
request_ip = lowercase_truncate(data["request_ip"], 45),
request_ip_city = truncate(data["request_ip_city"], 200),
request_ip_country = uppercase_truncate(data["request_ip_country"], 2),
request_ip_region = uppercase_truncate(data["request_ip_region"], 2),
request_method = uppercase_truncate(data["request_method"], 10),
request_origin = sanitize_request_header(truncate(data["request_origin"], 200)),
request_referer = sanitize_request_header(truncate(data["request_referer"], 200)),
request_origin = truncate(data["request_origin"], 200),
request_referer = truncate(data["request_referer"], 200),
request_size = tonumber(data["request_size"]),
request_host = sanitize_request_header(lowercase_truncate(data["request_url_host"], 200)),
request_host = lowercase_truncate(data["request_url_host"], 200),
request_path = truncate(data["request_url_path"], 4000),
request_url_hierarchy_level0 = truncate(data["request_url_hierarchy_level0"], 200),
request_url_hierarchy_level1 = truncate(data["request_url_hierarchy_level1"], 200),
Expand All @@ -320,7 +310,7 @@ function _M.normalized_data(data)
request_url_hierarchy_level6 = truncate(data["request_url_hierarchy_level6"], 200),
request_url_query = truncate(data["request_url_query"], 4000),
request_scheme = lowercase_truncate(data["request_url_scheme"], 10),
request_user_agent = sanitize_request_header(truncate(data["request_user_agent"], 400)),
request_user_agent = truncate(data["request_user_agent"], 400),
request_user_agent_family = truncate(data["request_user_agent_family"], 100),
request_user_agent_type = truncate(data["request_user_agent_type"], 100),
response_age = tonumber(data["response_age"]),
Expand Down
77 changes: 45 additions & 32 deletions test/proxy/logging/test_special_chars.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
class Test::Proxy::Logging::TestSpecialChars < Minitest::Test
include ApiUmbrellaTestHelpers::Setup
include ApiUmbrellaTestHelpers::Logging
parallelize_me!

def setup
super
Expand Down Expand Up @@ -107,7 +106,7 @@ def test_valid_utf8_encoding_in_url_path_url_params_headers
end

def test_invalid_utf8_encoding_in_url_path_url_params_headers
log_tail = LogTail.new("fluent-bit/current")
log_tail = LogTail.new("nginx/current")

# Test various encodings of the ISO-8859-1 pound symbol: £ (but since this
# is the ISO-8859-1 version, it's not valid UTF-8).
Expand All @@ -127,33 +126,12 @@ def test_invalid_utf8_encoding_in_url_path_url_params_headers

record = wait_for_log(response)[:hit_source]

log = log_tail.read.encode("UTF-8", invalid: :replace)
# Fluent Bit's UTF-8 handling appears to be different on x86_64 versus
# ARM64. I believe related to this open issue:
# https://github.com/fluent/fluent-bit/issues/7995
#
# So on ARM64, it complains about UTF-8, which I think is more expected
# since we are sending in invalid encoded data. But on x86-64 systems, this
# doesn't appear to happen. I think we're okay with either behavior,
# really, we mainly just want to make sure things don't crash when
# encountering this type of weird input.
if RUBY_PLATFORM.start_with?("x86_64")
refute_match("invalid UTF-8 bytes found, skipping bytes", log)
else
assert_match("invalid UTF-8 bytes found, skipping bytes", log)
end

# Since the encoding of this string wasn't actually a valid UTF-8 string,
# we test situations where it's sent as the raw ISO-8859-1 value, as well
# as the UTF-8 replacement character.
expected_raw_in_url_path = url_encoded.downcase
expected_raw_in_url_query = url_encoded
# See above for differences in platform.
if RUBY_PLATFORM.start_with?("x86_64")
expected_raw_in_header = "\uE0A3"
else
expected_raw_in_header = ""
end
expected_raw_in_header = "\uFFFD"
expected_raw_utf8_in_url_path = "%ef%bf%bd"
expected_raw_utf8_in_url_query = "%EF%BF%BD"
expected_raw_utf8_in_header = Base64.decode64("77+9").force_encoding("utf-8")
Expand Down Expand Up @@ -194,6 +172,10 @@ def test_invalid_utf8_encoding_in_url_path_url_params_headers
assert_equal(base64ed, record["request_referer"])
assert_equal(expected_raw_in_header, record["request_origin"])
assert_equal(expected_raw_utf8_in_header, record["request_accept"])

log = log_tail.read_until(/log message contained invalid utf-8, cleaned/)
assert_match(/\[warn\].*log message contained invalid utf-8, original: \{/, log)
assert_match(/\[warn\].*log message contained invalid utf-8, cleaned: \{/, log)
end

def test_encoded_strings_as_given
Expand Down Expand Up @@ -295,6 +277,8 @@ def test_slashes_and_backslashes
end

def test_invalid_quotes
log_tail = LogTail.new("nginx/current")

response = Typhoeus.get("http://127.0.0.1:9080/api/hello", log_http_options.deep_merge({
:headers => {
"User-Agent" => Base64.decode64("eyJ1c2VyX2FnZW50IjogImZvbyDAp8CiIGJhciJ9"),
Expand All @@ -304,13 +288,42 @@ def test_invalid_quotes
assert_response_code(200, response)

record = wait_for_log(response)[:hit_source]
# CI returns a slightly different response than local dev for some reason.
if record["request_referer"].include?("foo ?? ")
assert_equal("{\"user_agent\": \"foo ?? bar\"}", record["request_referer"])
assert_equal("{\"user_agent\": \"foo ?? bar\"}", record["request_user_agent"])
else
assert_equal("{\"user_agent\": \"foo \xC0\xA7?? bar\"}", record["request_referer"])
assert_equal("{\"user_agent\": \"foo \xC0\xA7?? bar\"}", record["request_user_agent"])
end
assert_equal("{\"user_agent\": \"foo \uFFFD bar\"}", record["request_referer"])
assert_equal("{\"user_agent\": \"foo \uFFFD bar\"}", record["request_user_agent"])

response = Typhoeus.get("http://127.0.0.1:9080/api/hello", log_http_options.deep_merge({
:headers => {
"X-Api-Key" => Base64.decode64("eyJ1c2VyX2FnZW50IjogImZvbyDAp8CiIGJhciJ9"),
},
}))
assert_response_code(403, response)

record = wait_for_log(response)[:hit_source]
assert_equal("{\"user_agent\": \"foo \uFFFD bar\"}", record["api_key"])

log = log_tail.read_until(/log message contained invalid utf-8, cleaned/)
assert_match(/\[warn\].*log message contained invalid utf-8, original: \{/, log)
assert_match(/\[warn\].*log message contained invalid utf-8, cleaned: \{/, log)
end

def test_utf8_json_quoting
log_tail = LogTail.new("nginx/current")

response = Typhoeus.get("http://127.0.0.1:9080/api/hello", log_http_options.deep_merge({
:headers => {
"User-Agent" => "\" \u0022 \\u0022 %22 &#34; &#x22; \u201D",
"Referer" => "\" \u0022 \\u0022 %22 &#34; &#x22; \u201D",
"X-Api-Key" => "\" \u0022 \\u0022 %22 &#34; &#x22; \u201D",
},
}))
assert_response_code(403, response)

record = wait_for_log(response)[:hit_source]
assert_equal("\" \" \\u0022 %22 &#34; &#x22; \u201D", record["request_referer"])
assert_equal("\" \" \\u0022 %22 &#34; &#x22; \u201D", record["request_user_agent"])
assert_equal("\" \" \\u0022 %22 &#34; &#x22; \u201D", record["api_key"])

log = log_tail.read
refute_match("log message contained invalid utf-8", log)
end
end
4 changes: 2 additions & 2 deletions test/support/log_tail.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def read
output = nil
File.open(@path) do |file|
file.seek(@pos)
output = file.read
output = file.read.encode("UTF-8", invalid: :replace)
@pos = file.pos
end

Expand All @@ -23,7 +23,7 @@ def read_until(regex, timeout: 5)
begin
Timeout.timeout(timeout) do
loop do
output << self.read
output << self.read.encode("UTF-8", invalid: :replace)
break if output.match(regex)

sleep 0.1
Expand Down

0 comments on commit 8e41b90

Please sign in to comment.