Skip to content

Commit

Permalink
Implement support for different UUID binary formats (#42108)
Browse files Browse the repository at this point in the history
* Implement support for different UUID binary formats

* Declare error codes that the implementation uses

* Make single-argument constructor explicit

* Modernize parts of the solution

* Make improvements based on review comments

* Declare an error code being used
  • Loading branch information
ltrk2 committed Oct 25, 2022
1 parent b0d5e7d commit 2c902bb
Show file tree
Hide file tree
Showing 7 changed files with 195 additions and 73 deletions.
8 changes: 0 additions & 8 deletions docs/en/sql-reference/functions/encoding-functions.md
Expand Up @@ -376,14 +376,6 @@ Result:
└─────┘
```

## UUIDStringToNum(str)

Accepts a string containing 36 characters in the format `123e4567-e89b-12d3-a456-426655440000`, and returns it as a set of bytes in a FixedString(16).

## UUIDNumToString(str)

Accepts a FixedString(16) value. Returns a string containing 36 characters in text format.

## bitmaskToList(num)

Accepts an integer. Returns a string containing the list of powers of two that total the source number when summed. They are comma-separated without spaces in text format, in ascending order.
Expand Down
46 changes: 42 additions & 4 deletions docs/en/sql-reference/functions/uuid-functions.md
Expand Up @@ -211,12 +211,19 @@ SELECT toUUIDOrZero('61f0c404-5cb3-11e7-907b-a6006ad3dba0T') AS uuid

## UUIDStringToNum

Accepts a string containing 36 characters in the format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, and returns it as a set of bytes in a [FixedString(16)](../../sql-reference/data-types/fixedstring.md).
Accepts a `string` containing 36 characters in the format `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`, and returns its binary representation as a [FixedString(16)](../../sql-reference/data-types/fixedstring.md), with the binary format optionally specified by `variant` (`Big-endian` by default).

**Syntax**

``` sql
UUIDStringToNum(String)
UUIDStringToNum(string[, variant = 1])
```

**Arguments**

- `string` — String of 36 characters or FixedString(36). [String](../../sql-reference/syntax.md#syntax-string-literal).
- `variant` — Integer, representing a variant as specified by [RFC4122](https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.1). 1 = `Big-endian` (default), 2 = `Microsoft`.

**Returned value**

FixedString(16)
Expand All @@ -235,14 +242,33 @@ SELECT
└──────────────────────────────────────┴──────────────────┘
```

``` sql
SELECT
'612f3c40-5d3b-217e-707b-6a546a3d7b29' AS uuid,
UUIDStringToNum(uuid, 2) AS bytes
```

``` text
┌─uuid─────────────────────────────────┬─bytes────────────┐
│ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │ @</a;]~!p{jTj={) │
└──────────────────────────────────────┴──────────────────┘
```

## UUIDNumToString

Accepts a [FixedString(16)](../../sql-reference/data-types/fixedstring.md) value, and returns a string containing 36 characters in text format.
Accepts `binary`, a binary representation of a UUID whose format is optionally specified by `variant` (`Big-endian` by default), and returns a string containing 36 characters in text format.

**Syntax**

``` sql
UUIDNumToString(FixedString(16))
UUIDNumToString(binary[, variant = 1])
```

**Arguments**

- `binary` — [FixedString(16)](../../sql-reference/data-types/fixedstring.md) as a binary representation of a UUID.
- `variant` — Integer, representing a variant as specified by [RFC4122](https://datatracker.ietf.org/doc/html/rfc4122#section-4.1.1). 1 = `Big-endian` (default), 2 = `Microsoft`.

**Returned value**

String.
Expand All @@ -261,6 +287,18 @@ SELECT
└──────────────────┴──────────────────────────────────────┘
```

``` sql
SELECT
'@</a;]~!p{jTj={)' AS bytes,
UUIDNumToString(toFixedString(bytes, 16), 2) AS uuid
```

``` text
┌─bytes────────────┬─uuid─────────────────────────────────┐
│ @</a;]~!p{jTj={) │ 612f3c40-5d3b-217e-707b-6a546a3d7b29 │
└──────────────────┴──────────────────────────────────────┘
```

## serverUUID()

Returns the random, unique UUID that is generated when the server is first started and is stored forever. The UUID is written to the file `uuid` created in the ClickHouse server directory `/var/lib/clickhouse/`.
Expand Down
187 changes: 142 additions & 45 deletions src/Functions/FunctionsCodingUUID.cpp
Expand Up @@ -13,43 +13,160 @@
#include <Interpreters/Context_fwd.h>
#include <Interpreters/castColumn.h>

namespace DB
#include <span>

namespace DB::ErrorCodes
{
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int LOGICAL_ERROR;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}

namespace
{
/// Order in which the bytes of one UUID field are laid out in the binary form,
/// relative to the order of the hex digit pairs in the text form.
enum class Representation
{
    /// First binary byte corresponds to the first pair of hex digits.
    BigEndian = 0,
    /// Last binary byte corresponds to the first pair of hex digits.
    LittleEndian = 1
};

/// Returns {start index, step} for walking a binary buffer of `num_bytes` bytes
/// in the order that matches the left-to-right order of the hex text:
///  - BigEndian:    start at the first byte and move forward;
///  - LittleEndian: start at the last byte and move backward.
/// Throws LOGICAL_ERROR for a representation value that is neither of the two.
std::pair<int, int> determineBinaryStartIndexWithIncrement(const ptrdiff_t num_bytes, const Representation representation)
{
    switch (representation)
    {
        case Representation::BigEndian:
            return {0, 1};
        case Representation::LittleEndian:
            return {num_bytes - 1, -1};
    }

    /// Reached only when `representation` holds a value outside the enumerators above.
    throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "{} is not handled yet", magic_enum::enum_name(representation));
}

void formatHex(const std::span<const UInt8> src, UInt8 * dst, const Representation representation)
{
const auto src_size = std::ssize(src);
const auto [src_start_index, src_increment] = determineBinaryStartIndexWithIncrement(src_size, representation);
for (int src_pos = src_start_index, dst_pos = 0; src_pos >= 0 && src_pos < src_size; src_pos += src_increment, dst_pos += 2)
writeHexByteLowercase(src[src_pos], dst + dst_pos);
}

/// Decodes 2 * dst.size() hex characters starting at `src` into the bytes of `dst`.
/// `src` is always consumed front to back; `representation` decides whether `dst`
/// is filled front to back (BigEndian) or back to front (LittleEndian).
void parseHex(const UInt8 * __restrict src, const std::span<UInt8> dst, const Representation representation)
{
    const auto byte_count = std::ssize(dst);
    const auto [first_index, step] = determineBinaryStartIndexWithIncrement(byte_count, representation);

    const auto * hex_digits = reinterpret_cast<const char *>(src);
    /// Both bounds are checked because the walk may run in either direction.
    for (auto index = first_index; index >= 0 && index < byte_count; index += step)
    {
        dst[index] = unhex2(hex_digits);
        hex_digits += 2;
    }
}

class UUIDSerializer
{
public:
enum class Variant
{
Default = 1,
Microsoft = 2
};

explicit UUIDSerializer(const Variant variant)
: first_half_binary_representation(variant == Variant::Microsoft ? Representation::LittleEndian : Representation::BigEndian)
{
if (variant != Variant::Default && variant != Variant::Microsoft)
throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "{} is not handled yet", magic_enum::enum_name(variant));
}

void deserialize(const UInt8 * src16, UInt8 * dst36) const
{
formatHex({src16, 4}, &dst36[0], first_half_binary_representation);
dst36[8] = '-';
formatHex({src16 + 4, 2}, &dst36[9], first_half_binary_representation);
dst36[13] = '-';
formatHex({src16 + 6, 2}, &dst36[14], first_half_binary_representation);
dst36[18] = '-';
formatHex({src16 + 8, 2}, &dst36[19], Representation::BigEndian);
dst36[23] = '-';
formatHex({src16 + 10, 6}, &dst36[24], Representation::BigEndian);
}

void serialize(const UInt8 * src36, UInt8 * dst16) const
{
/// If string is not like UUID - implementation specific behaviour.
parseHex(&src36[0], {dst16 + 0, 4}, first_half_binary_representation);
parseHex(&src36[9], {dst16 + 4, 2}, first_half_binary_representation);
parseHex(&src36[14], {dst16 + 6, 2}, first_half_binary_representation);
parseHex(&src36[19], {dst16 + 8, 2}, Representation::BigEndian);
parseHex(&src36[24], {dst16 + 10, 6}, Representation::BigEndian);
}

private:
Representation first_half_binary_representation;
};

/// Ensures the function received either one or two arguments.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH otherwise; `function_name` is used only in the message.
void checkArgumentCount(const DB::DataTypes & arguments, const std::string_view function_name)
{
    const auto argument_count = std::ssize(arguments);
    if (argument_count == 1 || argument_count == 2)
        return;

    throw DB::Exception(
        DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
        "Number of arguments for function {} doesn't match: passed {}, should be 1 or 2",
        function_name,
        argument_count);
}

/// Ensures the optional second (variant) argument, when present, has type Int8 or UInt8.
/// Does nothing when fewer than two arguments were passed.
/// Throws ILLEGAL_TYPE_OF_ARGUMENT otherwise; `function_name` is used only in the message.
void checkFormatArgument(const DB::DataTypes & arguments, const std::string_view function_name)
{
    if (std::ssize(arguments) <= 1)
        return;

    /// Build the type classifier once instead of once per check.
    const DB::WhichDataType variant_type(arguments[1]);
    if (variant_type.isInt8() || variant_type.isUInt8())
        return;

    throw DB::Exception(
        DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
        "Illegal type {} of second argument of function {}, expected Int8 or UInt8 type",
        arguments[1]->getName(),
        function_name);
}

/// Reads the UUID variant from the optional second argument.
/// Returns Variant::Default when fewer than two arguments were passed.
/// Throws ARGUMENT_OUT_OF_BOUND when the value does not match any Variant enumerator.
/// Only row 0 of the argument column is read.
/// NOTE(review): presumably the variant is expected to be a constant — confirm against callers.
UUIDSerializer::Variant parseVariant(const DB::ColumnsWithTypeAndName & arguments)
{
    const bool variant_given = arguments.size() >= 2;
    if (!variant_given)
        return UUIDSerializer::Variant::Default;

    using RawVariant = magic_enum::underlying_type_t<UUIDSerializer::Variant>;
    const auto raw_variant = static_cast<RawVariant>(arguments[1].column->getInt(0));

    if (const auto variant = magic_enum::enum_cast<UUIDSerializer::Variant>(raw_variant); variant.has_value())
        return *variant;

    throw DB::Exception(DB::ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Expected UUID variant, got {}", raw_variant);
}
}

namespace DB
{
/// Size in bytes of the binary UUID form (the FixedString(16) side of the conversion).
constexpr size_t uuid_bytes_length = 16;
/// Length in characters of the textual UUID form (the FixedString(36) side); a trailing terminator is not counted.
constexpr size_t uuid_text_length = 36;

class FunctionUUIDNumToString : public IFunction
{

public:
static constexpr auto name = "UUIDNumToString";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionUUIDNumToString>(); }

String getName() const override
{
return name;
}

size_t getNumberOfArguments() const override { return 1; }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 0; }
bool isInjective(const ColumnsWithTypeAndName &) const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
bool isVariadic() const override { return true; }

/// Validates the argument list and reports String as the result type.
/// Accepted forms: (FixedString(16)) or (FixedString(16), Int8/UInt8 variant).
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH or ILLEGAL_TYPE_OF_ARGUMENT on mismatch.
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
    checkArgumentCount(arguments, name);

    const auto * fixed_string_type = checkAndGetDataType<DataTypeFixedString>(arguments[0].get());
    const bool is_binary_uuid = fixed_string_type != nullptr && fixed_string_type->getN() == uuid_bytes_length;
    if (!is_binary_uuid)
        throw Exception("Illegal type " + arguments[0]->getName() +
            " of argument of function " + getName() +
            ", expected FixedString(" + toString(uuid_bytes_length) + ")",
            ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

    checkFormatArgument(arguments, name);

    return std::make_shared<DataTypeString>();
}

Expand All @@ -59,7 +176,7 @@ class FunctionUUIDNumToString : public IFunction
{
const ColumnWithTypeAndName & col_type_name = arguments[0];
const ColumnPtr & column = col_type_name.column;

const auto variant = parseVariant(arguments);
if (const auto * col_in = checkAndGetColumn<ColumnFixedString>(column.get()))
{
if (col_in->getN() != uuid_bytes_length)
Expand All @@ -82,9 +199,10 @@ class FunctionUUIDNumToString : public IFunction
size_t src_offset = 0;
size_t dst_offset = 0;

const UUIDSerializer uuid_serializer(variant);
for (size_t i = 0; i < size; ++i)
{
formatUUID(&vec_in[src_offset], &vec_res[dst_offset]);
uuid_serializer.deserialize(&vec_in[src_offset], &vec_res[dst_offset]);
src_offset += uuid_bytes_length;
dst_offset += uuid_text_length;
vec_res[dst_offset] = 0;
Expand All @@ -104,55 +222,33 @@ class FunctionUUIDNumToString : public IFunction

class FunctionUUIDStringToNum : public IFunction
{
private:
/// Decodes 2 * num_bytes hex characters at `src` into `num_bytes` bytes at `dst`, front to back.
static void parseHex(const UInt8 * __restrict src, UInt8 * __restrict dst, const size_t num_bytes)
{
    const auto * hex_digits = reinterpret_cast<const char *>(src);
    for (size_t byte_index = 0; byte_index < num_bytes; ++byte_index)
        dst[byte_index] = unhex2(hex_digits + 2 * byte_index);
}

/// Converts the 36-character text form at `src36` into 16 bytes at `dst16`.
/// Each hex group is decoded front to back; separator positions are skipped without being checked.
static void parseUUID(const UInt8 * src36, UInt8 * dst16)
{
    /// If the string does not look like a UUID, the result is implementation specific.

    parseHex(src36, dst16, 4);
    parseHex(src36 + 9, dst16 + 4, 2);
    parseHex(src36 + 14, dst16 + 6, 2);
    parseHex(src36 + 19, dst16 + 8, 2);
    parseHex(src36 + 24, dst16 + 10, 6);
}

public:
static constexpr auto name = "UUIDStringToNum";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionUUIDStringToNum>(); }

String getName() const override
{
return name;
}

size_t getNumberOfArguments() const override { return 1; }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 0; }
bool isInjective(const ColumnsWithTypeAndName &) const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
bool isVariadic() const override { return true; }

/// Validates the argument list and reports FixedString(16) as the result type.
/// Accepted forms: (String | FixedString(36)) optionally followed by an Int8/UInt8 variant.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH or ILLEGAL_TYPE_OF_ARGUMENT on mismatch.
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
    checkArgumentCount(arguments, name);

    /// String or FixedString(36)
    if (!isString(arguments[0]))
    {
        const auto * ptr = checkAndGetDataType<DataTypeFixedString>(arguments[0].get());
        if (!ptr || ptr->getN() != uuid_text_length)
            /// The message names the argument position once ("first argument"),
            /// matching the "second argument" wording of checkFormatArgument.
            throw Exception("Illegal type " + arguments[0]->getName() +
                " of first argument of function " + getName() +
                ", expected FixedString(" + toString(uuid_text_length) + ")",
                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
    }

    checkFormatArgument(arguments, name);

    return std::make_shared<DataTypeFixedString>(uuid_bytes_length);
}

Expand All @@ -163,6 +259,7 @@ class FunctionUUIDStringToNum : public IFunction
const ColumnWithTypeAndName & col_type_name = arguments[0];
const ColumnPtr & column = col_type_name.column;

const UUIDSerializer uuid_serializer(parseVariant(arguments));
if (const auto * col_in = checkAndGetColumn<ColumnString>(column.get()))
{
const auto & vec_in = col_in->getChars();
Expand All @@ -184,7 +281,7 @@ class FunctionUUIDStringToNum : public IFunction

size_t string_size = offsets_in[i] - src_offset;
if (string_size == uuid_text_length + 1)
parseUUID(&vec_in[src_offset], &vec_res[dst_offset]);
uuid_serializer.serialize(&vec_in[src_offset], &vec_res[dst_offset]);
else
memset(&vec_res[dst_offset], 0, uuid_bytes_length);

Expand Down Expand Up @@ -216,7 +313,7 @@ class FunctionUUIDStringToNum : public IFunction

for (size_t i = 0; i < size; ++i)
{
parseUUID(&vec_in[src_offset], &vec_res[dst_offset]);
uuid_serializer.serialize(&vec_in[src_offset], &vec_res[dst_offset]);
src_offset += uuid_text_length;
dst_offset += uuid_bytes_length;
}
Expand Down
13 changes: 0 additions & 13 deletions src/IO/WriteHelpers.cpp
Expand Up @@ -18,19 +18,6 @@ void formatHex(IteratorSrc src, IteratorDst dst, size_t num_bytes)
}
}

/// Formats the 16 bytes at `src16` as 36 characters at `dst36`:
/// five hex groups of 4, 2, 2, 2 and 6 bytes, separated by '-'.
void formatUUID(const UInt8 * src16, UInt8 * dst36)
{
    /// Separators sit between the hex groups, outside the ranges formatHex writes.
    dst36[8] = '-';
    dst36[13] = '-';
    dst36[18] = '-';
    dst36[23] = '-';

    formatHex(src16, dst36, 4);
    formatHex(src16 + 4, dst36 + 9, 2);
    formatHex(src16 + 6, dst36 + 14, 2);
    formatHex(src16 + 8, dst36 + 19, 2);
    formatHex(src16 + 10, dst36 + 24, 6);
}

/** Function used when byte ordering is important when parsing uuid
* ex: When we create an UUID type
*/
Expand Down

0 comments on commit 2c902bb

Please sign in to comment.