timescale · akuzm · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024 · Feb 16, 2024
diff --git a/test/src/adt_tests.c b/test/src/adt_tests.c
@@ -22,7 +22,7 @@ TS_FUNCTION_INFO_V1(ts_test_adts);
 /* We have to stub this for the unit tests. */
 #ifndef CheckCompressedData
 #define CheckCompressedData(X) Assert(X)
-#define GLOBAL_MAX_ROWS_PER_COMPRESSION 1015
+#define GLOBAL_MAX_ROWS_PER_COMPRESSION 4096
 #endif
 
 #include <adts/bit_array.h>

diff --git a/tsl/src/compression/compression.c b/tsl/src/compression/compression.c
@@ -64,6 +64,8 @@
 #include "ts_catalog/compression_settings.h"
 #include "ts_catalog/compression_chunk_size.h"
 
+StaticAssertDecl(GLOBAL_MAX_ROWS_PER_COMPRESSION >= MAX_ROWS_PER_COMPRESSION, "max row numbers must be harmonized");
+
 static const CompressionAlgorithmDefinition definitions[_END_COMPRESSION_ALGORITHMS] = {
 	[COMPRESSION_ALGORITHM_ARRAY] = ARRAY_ALGORITHM_DEFINITION,
 	[COMPRESSION_ALGORITHM_DICTIONARY] = DICTIONARY_ALGORITHM_DEFINITION,

diff --git a/tsl/src/compression/compression.h b/tsl/src/compression/compression.h
@@ -32,7 +32,7 @@ typedef struct BulkInsertStateData *BulkInsertState;
 	char vl_len_[4];                                                                               \
 	uint8 compression_algorithm
 
-#define MAX_ROWS_PER_COMPRESSION 1000
+#define MAX_ROWS_PER_COMPRESSION 4096
 /* gap in sequence id between rows, potential for adding rows in gap later */
 #define SEQUENCE_NUM_GAP 10
 
@@ -403,6 +403,6 @@ consumeCompressedData(StringInfo si, int bytes)
  * Normal compression uses 1k rows, but the regression tests use up to 1015.
  * We use this limit for sanity checks in case the compressed data is corrupt.
  */
-#define GLOBAL_MAX_ROWS_PER_COMPRESSION 1015
+#define GLOBAL_MAX_ROWS_PER_COMPRESSION 4096
 
 const CompressionAlgorithmDefinition *algorithm_definition(CompressionAlgorithm algo);
diff --git a/tsl/src/compression/create.c b/tsl/src/compression/create.c
@@ -98,7 +98,8 @@ compressed_column_metadata_attno(CompressionSettings *settings, Oid chunk_reloid
 		return get_attnum(compressed_reloid, metadata_name);
 	}
 
-	return InvalidAttrNumber;
+	char* metadata_name = psprintf("_ts_meta_v2_%d_%s", chunk_attno, metadata_type);
+	return get_attnum(compressed_reloid, metadata_name);
 }
 
 /*
@@ -181,6 +182,26 @@ build_columndefs(CompressionSettings *settings, Oid src_relid)
 														   attr->atttypmod,
 														   attr->attcollation));
 		}
+		else if (!is_segmentby && attr->attbyval && attr->attlen <= 8 && attr->atttypid != BOOLOID)
+		{
+			TypeCacheEntry *type = lookup_type_cache(attr->atttypid, TYPECACHE_LT_OPR);
+
+			if (OidIsValid(type->lt_opr))
+			{
+				char * name = psprintf("_ts_meta_v2_%d_min", attr->attnum);
+				compressed_column_defs = lappend(compressed_column_defs,
+												 makeColumnDef(name,
+															   attr->atttypid,
+															   attr->atttypmod,
+															   attr->attcollation));
+				name = psprintf("_ts_meta_v2_%d_max", attr->attnum);
+				compressed_column_defs = lappend(compressed_column_defs,
+												 makeColumnDef(name,
+															   attr->atttypid,
+															   attr->atttypmod,
+															   attr->attcollation));
+			}
+		}
 
 		if (is_segmentby)
 		{

diff --git a/tsl/src/compression/gorilla.c b/tsl/src/compression/gorilla.c
@@ -860,16 +860,16 @@ unpack_leading_zeros_array(BitArray *bitarray, uint8 *restrict dest)
 	 * We do have to check that the result fits into the maximum number of rows,
 	 * because we get the length from user input.
 	 */
-	const int16 n_bytes_packed = bitarray->buckets.num_elements * sizeof(uint64);
-	const int16 n_lanes = (n_bytes_packed + LANE_INPUTS - 1) / LANE_INPUTS;
-	const int16 n_outputs = n_lanes * LANE_OUTPUTS;
+	const uint16 n_bytes_packed = bitarray->buckets.num_elements * sizeof(uint64);
+	const uint16 n_lanes = (n_bytes_packed + LANE_INPUTS - 1) / LANE_INPUTS;
+	const uint16 n_outputs = n_lanes * LANE_OUTPUTS;
 	CheckCompressedData(n_outputs <= MAX_NUM_LEADING_ZEROS_PADDED_N64);
 
-	for (int lane = 0; lane < n_lanes; lane++)
+	for (uint16 lane = 0; lane < n_lanes; lane++)
 	{
 		uint8 *restrict lane_dest = &dest[lane * LANE_OUTPUTS];
 		const uint8 *restrict lane_src = &((uint8 *) bitarray->buckets.data)[lane * LANE_INPUTS];
-		for (int output_in_lane = 0; output_in_lane < LANE_OUTPUTS; output_in_lane++)
+		for (uint16 output_in_lane = 0; output_in_lane < LANE_OUTPUTS; output_in_lane++)
 		{
 			const int startbit_abs = output_in_lane * BITS_PER_LEADING_ZEROS;
 			const int startbit_rel = startbit_abs % 8;

diff --git a/tsl/src/nodes/decompress_chunk/decompress_chunk.c b/tsl/src/nodes/decompress_chunk/decompress_chunk.c
@@ -859,6 +859,7 @@ ts_decompress_chunk_generate_paths(PlannerInfo *root, RelOptInfo *chunk_rel, Hyp
 
 				batch_merge_path->reverse = (merge_result != SCAN_FORWARD);
 				batch_merge_path->batch_sorted_merge = true;
+				batch_merge_path->enable_bulk_decompression = false;
 
 				/* The segment by optimization is only enabled if it can deliver the tuples in the
 				 * same order as the query requested it. So, we can just copy the pathkeys of the
@@ -1746,6 +1747,7 @@ decompress_chunk_path_create(PlannerInfo *root, CompressionInfo *info, int paral
 	path->custom_path.flags = 0;
 	path->custom_path.methods = &decompress_chunk_path_methods;
 	path->batch_sorted_merge = false;
+	path->enable_bulk_decompression = ts_guc_enable_bulk_decompression;
 
 	/* To prevent a non-parallel path with this node appearing
 	 * in a parallel plan we only set parallel_safe to true

diff --git a/tsl/src/nodes/decompress_chunk/decompress_chunk.h b/tsl/src/nodes/decompress_chunk/decompress_chunk.h
@@ -80,9 +80,9 @@ typedef struct DecompressChunkPath
 	List *bulk_decompression_column;
 
 	/*
-	 * If we produce at least some columns that support bulk decompression.
+	 * Whether the bulk decompression is enabled.
 	 */
-	bool have_bulk_decompression_columns;
+	bool enable_bulk_decompression;
 
 	/*
 	 * Maps the uncompressed chunk attno to the respective column compression

diff --git a/tsl/src/nodes/decompress_chunk/exec.c b/tsl/src/nodes/decompress_chunk/exec.c
@@ -617,14 +617,17 @@ perform_vectorized_sum_int4(DecompressChunkState *chunk_state, Aggref *aggref)
 			MemoryContextReset(dcontext->bulk_decompression_context);
 			MemoryContextSwitchTo(batch_state->per_batch_context);
 
-			/* A compressed batch consists of 1000 tuples (see MAX_ROWS_PER_COMPRESSION). The
-			 * attribute value is a int32 with a max value of 2^32. Even if all tuples have the max
-			 * value, the max sum is = 1000 * 2^32 < 2^10 * 2^32 = 2^42. This is smaller than 2^64,
-			 * which is the max value of the int64 variable. The same is true for negative values).
-			 * Therefore, we don't need to check for overflows within the loop, which would slow
-			 * down the calculation. */
-			Assert(arrow->length <= MAX_ROWS_PER_COMPRESSION);
-			Assert(MAX_ROWS_PER_COMPRESSION <= 1024);
+			/*
+			 * We accumulate the sum as int64, so we can sum INT_MAX = 2^31 - 1
+			 * at least 2^31 times without incurrint an overflow of the int64
+			 * accumulator. The same is true for negative numbers. The
+			 * compressed batch size is currently capped at 1000 rows, but even
+			 * if it's changed in the future, it's unlikely that we support
+			 * batches larger than 65536 rows, not to mention 2^31. Therefore,
+			 * we don't need to check for overflows within the loop, which would
+			 * slow down the calculation.
+			 */
+			Assert(arrow->length <= INT_MAX);
 
 			int64 batch_sum = 0;
 			for (int i = 0; i < arrow->length; i++)

diff --git a/tsl/src/nodes/decompress_chunk/planner.c b/tsl/src/nodes/decompress_chunk/planner.c
@@ -123,7 +123,7 @@ build_decompression_map(PlannerInfo *root, DecompressChunkPath *path, List *scan
 	 * Go over the scan targetlist and determine to which output column each
 	 * scan column goes, saving other additional info as we do that.
 	 */
-	path->have_bulk_decompression_columns = false;
+	bool bulk_decompression_possible_for_some_columns = false;
 	path->decompression_map = NIL;
 	foreach (lc, scan_tlist)
 	{
@@ -222,9 +222,9 @@ build_decompression_map(PlannerInfo *root, DecompressChunkPath *path, List *scan
 			!is_segment && destination_attno_in_uncompressed_chunk > 0 &&
 			tsl_get_decompress_all_function(compression_get_default_algorithm(typoid), typoid) !=
 				NULL;
-		path->have_bulk_decompression_columns |= bulk_decompression_possible;
 		path->bulk_decompression_column =
 			lappend_int(path->bulk_decompression_column, bulk_decompression_possible);
+		bulk_decompression_possible_for_some_columns |= bulk_decompression_possible;
 
 		/*
 		 * Save information about decompressed columns in uncompressed chunk
@@ -251,6 +251,16 @@ build_decompression_map(PlannerInfo *root, DecompressChunkPath *path, List *scan
 		}
 	}
 
+	if (!bulk_decompression_possible_for_some_columns)
+	{
+		/*
+		 * This is mostly a cosmetic thing for EXPLAIN -- don't say that we're
+		 * using bulk decompression, if it is enabled but we have no columns
+		 * that support it.
+		 */
+		path->enable_bulk_decompression = false;
+	}
+
 	/*
 	 * Check that we have found all the needed columns in the compressed targetlist.
 	 * We can't conveniently check that we have all columns for all-row vars, so
@@ -1021,18 +1031,15 @@ decompress_chunk_plan_create(PlannerInfo *root, RelOptInfo *rel, CustomPath *pat
 
 	Assert(list_length(custom_plans) == 1);
 
-	const bool enable_bulk_decompression = !dcpath->batch_sorted_merge &&
-										   ts_guc_enable_bulk_decompression &&
-										   dcpath->have_bulk_decompression_columns;
-
 	/*
 	 * For some predicates, we have more efficient implementation that work on
 	 * the entire compressed batch in one go. They go to this list, and the rest
 	 * goes into the usual scan.plan.qual.
 	 */
 	List *vectorized_quals = NIL;
-	if (enable_bulk_decompression)
+	if (dcpath->enable_bulk_decompression)
 	{
+		Assert(!dcpath->batch_sorted_merge);
 		List *nonvectorized_quals = NIL;
 		find_vectorized_quals(dcpath,
 							  decompress_plan->scan.plan.qual,
@@ -1064,7 +1071,7 @@ decompress_chunk_plan_create(PlannerInfo *root, RelOptInfo *rel, CustomPath *pat
 							  dcpath->info->chunk_rte->relid,
 							  dcpath->reverse,
 							  dcpath->batch_sorted_merge,
-							  enable_bulk_decompression,
+							  dcpath->enable_bulk_decompression,
 							  dcpath->perform_vectorized_aggregation);
 
 	/*

diff --git a/tsl/src/planner.c b/tsl/src/planner.c
@@ -23,6 +23,7 @@
 #include "nodes/frozen_chunk_dml/frozen_chunk_dml.h"
 #include "nodes/decompress_chunk/decompress_chunk.h"
 #include "nodes/gapfill/gapfill.h"
+#include "nodes/chunk_append/chunk_append.h"
 #include "planner.h"
 
 #include <math.h>
@@ -44,6 +45,126 @@ is_osm_present()
 }
 #endif
 
+/*
+ * Try to disable bulk decompression on DecompressChunkPath, skipping the above
+ * Projection path and also handling Lists.
+ */
+static void
+try_disable_bulk_decompression(PlannerInfo *root, Node *node)
+{
+	if (node == NULL)
+	{
+		return;
+	}
+
+	if (IsA(node, ProjectionPath))
+	{
+		try_disable_bulk_decompression(root, (Node *) castNode(ProjectionPath, node)->subpath);
+		return;
+	}
+
+	if (IsA(node, List))
+	{
+		ListCell *lc;
+		foreach (lc, (List *) node)
+		{
+			try_disable_bulk_decompression(root, (Node *) lfirst(lc));
+		}
+		return;
+	}
+
+	if (!IsA(node, CustomPath))
+	{
+		return;
+	}
+
+	CustomPath *custom_child = castNode(CustomPath, node);
+	if (strcmp(custom_child->methods->CustomName, "DecompressChunk") != 0)
+	{
+		return;
+	}
+
+	DecompressChunkPath *dcpath = (DecompressChunkPath *) custom_child;
+	ListCell *column_cell;
+	foreach (column_cell, dcpath->bulk_decompression_column)
+	{
+		lfirst(column_cell) = false;
+	}
+	dcpath->enable_bulk_decompression = false;
+}
+
+/*
+ * When we have a small limit above chunk decompression, it is more efficient to
+ * use the row-by-row decompression iterators than the bulk decompression. Since
+ * bulk decompression is about 10x faster than row-by-row, this advantage goes
+ * away on limits > 100.
+ * This hook disables bulk decompression under small limits.
+ */
+static void
+check_limit_bulk_decompression(PlannerInfo *root, Node *node)
+{
+	if (node == NULL)
+	{
+		return;
+	}
+
+	ListCell *lc;
+	switch (node->type)
+	{
+		case T_List:
+			foreach (lc, (List *) node)
+			{
+				check_limit_bulk_decompression(root, lfirst(lc));
+			}
+			break;
+		case T_LimitPath:
+		{
+			LimitPath *path = castNode(LimitPath, node);
+			/*
+			 * Theoretically we could handle plain Limit as well, but it is more
+			 * complicated because the limit and offset are specified not by
+			 * plain constants there, but by some expression nodes.
+			 * This matters when we have a single chunk (direct select or due to
+			 * plan-time exclusion).
+			 */
+			check_limit_bulk_decompression(root, (Node *) path->subpath);
+			break;
+		}
+		case T_MemoizePath:
+			check_limit_bulk_decompression(root, (Node *) castNode(MemoizePath, node)->subpath);
+			break;
+		case T_ProjectionPath:
+			check_limit_bulk_decompression(root, (Node *) castNode(ProjectionPath, node)->subpath);
+			break;
+		case T_SubqueryScanPath:
+			check_limit_bulk_decompression(root,
+										   (Node *) castNode(SubqueryScanPath, node)->subpath);
+			break;
+		case T_NestPath:
+		case T_MergePath:
+		case T_HashPath:
+			check_limit_bulk_decompression(root, (Node *) ((JoinPath *) node)->outerjoinpath);
+			check_limit_bulk_decompression(root, (Node *) ((JoinPath *) node)->innerjoinpath);
+			break;
+		case T_CustomPath:
+		{
+			CustomPath *custom_this = castNode(CustomPath, node);
+			if (strcmp(custom_this->methods->CustomName, "ChunkAppend") != 0)
+			{
+				break;
+			}
+			ChunkAppendPath *chunk_append = (ChunkAppendPath *) custom_this;
+			if (chunk_append->limit_tuples > 0 && chunk_append->limit_tuples < 100)
+			{
+				try_disable_bulk_decompression(root, (Node *) chunk_append->cpath.custom_paths);
+			}
+			return;
+		}
+		default:
+			break;
+	}
+}
+
 void
 tsl_create_upper_paths_hook(PlannerInfo *root, UpperRelationKind stage, RelOptInfo *input_rel,
 							RelOptInfo *output_rel, TsRelType input_reltype, Hypertable *ht,
@@ -62,6 +183,9 @@ tsl_create_upper_paths_hook(PlannerInfo *root, UpperRelationKind stage, RelOptIn
 		case UPPERREL_DISTINCT:
 			tsl_skip_scan_paths_add(root, input_rel, output_rel);
 			break;
+		case UPPERREL_FINAL:
+			check_limit_bulk_decompression(root, (Node *) output_rel->pathlist);
+			break;
 		default:
 			break;
 	}