sparklyr::sdf_bind_cols seems to fail when combining a lot of columns that were generated from another table.
I needed to process some fixed-width files in Spark. There isn't a fixed-width reader in Spark, so I wrote some code to do it manually.
The way I would handle this in local memory is to use purrr to split things into a list of tables with a single column each, then combine them.
At least with the way I generated those columns, sparklyr::sdf_bind_cols fails when you get too many columns. I know I'm not running the latest version of sparklyr. I booted up posit.cloud and tried my reprex there with the latest versions of all the packages; it didn't fail with the same error, but it ran for hours without completing, so I figured that this older reprex is sufficient.
I can certainly work around this. I show another approach below that stitches together the SQL manually and does the same task that way. But it would certainly be nice if the straightforward approach worked. Maybe I'm missing something that would avoid the recursion, but I tried a few things and couldn't get around the fact that the laziness of the tables seems to compound when you stitch them together, and then it errors out.
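For reference, the fixed-width split itself is only substring arithmetic; a minimal base-R illustration on a toy record (the record and layout here are hypothetical, not the real metadata):

```r
# toy fixed-width record: alternating 1-char number / 1-char letter (hypothetical)
line  <- "1a2b3c"
start <- 1:6
width <- rep(1, 6)

# substring() is vectorized over first/last, so one call splits every field
vals <- substring(line, start, start + width - 1)
print(vals)
#> [1] "1" "a" "2" "b" "3" "c"
```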
library(magrittr)
# library(sparklyr)
# library(purrr)
# library(dplyr)
SparkR::sparkR.session()
#> Spark package found in SPARK_HOME: /databricks/spark
#> Java ref type org.apache.spark.sql.SparkSession id 1
sc <- sparklyr::spark_connect(method = "databricks")
## pushing to a temp table
tempname <- sparklyr::spark_table_name("")
# I get data that is all one big string each line that I will need to parse
# creating some placeholder text
line_text <- "1a2b3c4d5e6f7g8h9i10jj11kk12ll13mm14nn15oo16pp17qq18rr19ss20tt21uu22vv23ww24xx25yy26zz27AA28BB29CC30DD31EE32FF33GG34HH35II36JJ37KK38LL39MM40NN41OO42PP43QQ44RR45SS46TT47UU48VV49WW50XX51YY52ZZ5000.01aaaaaaa50.0001bbbbbbb"
# repeating it a couple of times with a little variation
lines_text <- c(
line_text,
toupper(line_text),
tolower(line_text)
)
# changing to a table
my_table <- tibble::tibble(
line=lines_text
)
# uploading table to spark
initial_fwf <- dplyr::copy_to(
sc,
my_table,
name=tempname,
overwrite=TRUE)
# I get metadata with column name, width, scale (for numeric variables), and type
# creating some placeholder metadata here
this_metadata_tbl <- tibble::tibble(
id=1:108,
name=glue::glue("var_{1:108}"),
width= c(rep(1,18),rep(2,86),rep(7,4)),
scale= c(rep(c(0,NA),52),2,NA,4,NA)
) %>%
dplyr::mutate(type=dplyr::case_when(
id%%2==0~"string",
TRUE~"decimal"
))
print(this_metadata_tbl)
#> # A tibble: 108 x 5
#>       id name   width scale type   
#>    <int> <glue> <dbl> <dbl> <chr>  
#>  1     1 var_1      1     0 decimal
#>  2     2 var_2      1    NA string 
#>  3     3 var_3      1     0 decimal
#>  4     4 var_4      1    NA string 
#>  5     5 var_5      1     0 decimal
#>  6     6 var_6      1    NA string 
#>  7     7 var_7      1     0 decimal
#>  8     8 var_8      1    NA string 
#>  9     9 var_9      1     0 decimal
#> 10    10 var_10     1    NA string 
#> # ... with 98 more rows
print(tail(this_metadata_tbl))
#> # A tibble: 6 x 5
#>      id name    width scale type   
#>   <int> <glue>  <dbl> <dbl> <chr>  
#> 1   103 var_103     2     0 decimal
#> 2   104 var_104     2    NA string 
#> 3   105 var_105     7     2 decimal
#> 4   106 var_106     7    NA string 
#> 5   107 var_107     7     4 decimal
#> 6   108 var_108     7    NA string
# calculating the start and stop characters for each column
this_split_tbl <- this_metadata_tbl %>%
dplyr::mutate(lag_width=dplyr::lag(width)) %>%
dplyr::mutate(lag_width=dplyr::case_when(
is.na(lag_width) ~0,
TRUE~lag_width
)) %>%
dplyr::mutate(cume_lag_width= cumsum(lag_width)) %>%
dplyr::mutate(start=1+cume_lag_width) %>%
dplyr::mutate(stop=start+width-1) %>%
dplyr::select(
name, start, width, type, scale
)
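The start/stop bookkeeping in the pipeline above is just a cumulative sum of the preceding widths; a base-R sketch with a few hypothetical widths (not the real 108-column metadata):

```r
# hypothetical widths standing in for the metadata table
widths <- c(1, 1, 2, 2, 7)

# start of each field = 1 + total width of everything before it
start <- 1 + c(0, cumsum(widths)[-length(widths)])
stop  <- start + widths - 1

cat(start, "\n")
#> 1 2 3 5 7
cat(stop, "\n")
#> 1 2 4 6 13
```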
print(this_split_tbl)
#> # A tibble: 108 x 5
#>    name   start width type    scale
#>    <glue> <dbl> <dbl> <chr>   <dbl>
#>  1 var_1      1     1 decimal     0
#>  2 var_2      2     1 string     NA
#>  3 var_3      3     1 decimal     0
#>  4 var_4      4     1 string     NA
#>  5 var_5      5     1 decimal     0
#>  6 var_6      6     1 string     NA
#>  7 var_7      7     1 decimal     0
#>  8 var_8      8     1 string     NA
#>  9 var_9      9     1 decimal     0
#> 10 var_10    10     1 string     NA
#> # ... with 98 more rows
print(tail(this_split_tbl))
#> # A tibble: 6 x 5
#>   name    start width type    scale
#>   <glue>  <dbl> <dbl> <chr>   <dbl>
#> 1 var_103   187     2 decimal     0
#> 2 var_104   189     2 string     NA
#> 3 var_105   191     7 decimal     2
#> 4 var_106   198     7 string     NA
#> 5 var_107   205     7 decimal     4
#> 6 var_108   212     7 string     NA
split_fwf <- this_split_tbl %>%
purrr::pmap(
~initial_fwf %>%
dplyr::mutate(!!..1:=dplyr::case_when(
..4=="decimal"~ as.double(substr(line, ..2, ..3)),
TRUE~ as.character(substr(line, ..2, ..3)))
) %>%
dplyr::select(!!..1)
)
# everything has worked great up to this point
# i have a list with 108 spark tables, all 1 column each
# bind together all of our new columns
split_fwf_tbl <- sparklyr::sdf_bind_cols(split_fwf)
#> Error in validObject(.Object): invalid class "SQL" object: Error : C stack usage 7972468 is too close to the limit
Here is the workaround I did. It works fine... it just involves stitching together a big SQL query manually to subset all the variables.
Again, it's possible to get the work done this way. It would just be nice if I could use sparklyr::sdf_bind_cols instead.
The nice thing about using SQL for this is that the DECIMAL precisions I was given can actually be relevant: I can load these columns in as DECIMAL (so I did). But for my purposes, I would have loved the easier code above and would have just treated all the numeric variables the same.
library(magrittr)
# library(sparklyr)
# library(purrr)
# library(dplyr)
SparkR::sparkR.session()
#> Spark package found in SPARK_HOME: /databricks/spark
#> Java ref type org.apache.spark.sql.SparkSession id 1
sc <- sparklyr::spark_connect(method = "databricks")
# for some cases reading in as DECIMAL with the width and scale isn't important
# so I've set it up with a parameter here
strict_decimal_types <- TRUE
## pushing to a temp table
tempname <- sparklyr::spark_table_name("")
# I get data that is all one big string each line that I will need to parse
# creating some placeholder text
line_text <- "1a2b3c4d5e6f7g8h9i10jj11kk12ll13mm14nn15oo16pp17qq18rr19ss20tt21uu22vv23ww24xx25yy26zz27AA28BB29CC30DD31EE32FF33GG34HH35II36JJ37KK38LL39MM40NN41OO42PP43QQ44RR45SS46TT47UU48VV49WW50XX51YY52ZZ5000.01aaaaaaa50.0001bbbbbbb"
# repeating it a couple of times with a little variation
lines_text <- c(
line_text,
toupper(line_text),
tolower(line_text)
)
# changing to a table
my_table <- tibble::tibble(
line=lines_text
)
# uploading table to spark
initial_fwf <- dplyr::copy_to(
sc,
my_table,
name=tempname,
overwrite=TRUE)
# I get metadata with column name, width, scale (for numeric variables), and type
# creating some placeholder metadata here
this_metadata_tbl <- tibble::tibble(
id=1:108,
name=glue::glue("var_{1:108}"),
width= c(rep(1,18),rep(2,86),rep(7,4)),
scale= c(rep(c(0,NA),52),2,NA,4,NA)
) %>%
dplyr::mutate(type=dplyr::case_when(
id%%2==0~"string",
TRUE~"decimal"
))
print(this_metadata_tbl)
#> # A tibble: 108 x 5
#>       id name   width scale type   
#>    <int> <glue> <dbl> <dbl> <chr>  
#>  1     1 var_1      1     0 decimal
#>  2     2 var_2      1    NA string 
#>  3     3 var_3      1     0 decimal
#>  4     4 var_4      1    NA string 
#>  5     5 var_5      1     0 decimal
#>  6     6 var_6      1    NA string 
#>  7     7 var_7      1     0 decimal
#>  8     8 var_8      1    NA string 
#>  9     9 var_9      1     0 decimal
#> 10    10 var_10     1    NA string 
#> # ... with 98 more rows
print(tail(this_metadata_tbl))
#> # A tibble: 6 x 5
#>      id name    width scale type   
#>   <int> <glue>  <dbl> <dbl> <chr>  
#> 1   103 var_103     2     0 decimal
#> 2   104 var_104     2    NA string 
#> 3   105 var_105     7     2 decimal
#> 4   106 var_106     7    NA string 
#> 5   107 var_107     7     4 decimal
#> 6   108 var_108     7    NA string
# calculating the start and stop characters for each column
this_split_tbl <- this_metadata_tbl %>%
dplyr::mutate(lag_width=dplyr::lag(width)) %>%
dplyr::mutate(lag_width=dplyr::case_when(
is.na(lag_width) ~0,
TRUE~lag_width
)) %>%
dplyr::mutate(cume_lag_width= cumsum(lag_width)) %>%
dplyr::mutate(start=1+cume_lag_width) %>%
dplyr::mutate(stop=start+width-1) %>%
dplyr::select(
name, start, width, type, scale
)
print(this_split_tbl)
#> # A tibble: 108 x 5
#>    name   start width type    scale
#>    <glue> <dbl> <dbl> <chr>   <dbl>
#>  1 var_1      1     1 decimal     0
#>  2 var_2      2     1 string     NA
#>  3 var_3      3     1 decimal     0
#>  4 var_4      4     1 string     NA
#>  5 var_5      5     1 decimal     0
#>  6 var_6      6     1 string     NA
#>  7 var_7      7     1 decimal     0
#>  8 var_8      8     1 string     NA
#>  9 var_9      9     1 decimal     0
#> 10 var_10    10     1 string     NA
#> # ... with 98 more rows
print(tail(this_split_tbl))
#> # A tibble: 6 x 5
#>   name    start width type    scale
#>   <glue>  <dbl> <dbl> <chr>   <dbl>
#> 1 var_103   187     2 decimal     0
#> 2 var_104   189     2 string     NA
#> 3 var_105   191     7 decimal     2
#> 4 var_106   198     7 string     NA
#> 5 var_107   205     7 decimal     4
#> 6 var_108   212     7 string     NA
working_fwf <- initial_fwf %>%
dplyr::mutate(id=dplyr::row_number())
code_split_fwf<-this_split_tbl %>%
purrr::pmap(
~if ("decimal"==..4) {
if (strict_decimal_types==TRUE){
glue::glue('CAST(SUBSTR(line FROM {as.numeric(..2)} FOR {as.numeric(..3)}) AS DECIMAL({as.numeric(..3)},{as.numeric(..5)})) AS `{..1}`')
} else {
glue::glue('CAST(SUBSTR(line FROM {as.numeric(..2)} FOR {as.numeric(..3)}) AS DOUBLE) AS `{..1}`')
}
} else {
glue::glue('SUBSTR(line FROM {as.numeric(..2)} FOR {as.numeric(..3)}) AS `{..1}`')
}
)
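Each generated snippet follows one of two templates; the same generation can be sketched in base R with sprintf (glue-free; the helper name and example values are hypothetical):

```r
# build one SELECT-list item from a metadata row (hypothetical helper)
make_item <- function(name, start, width, type, scale) {
  if (type == "decimal") {
    # decimal columns get a CAST with the given width/scale
    sprintf("CAST(SUBSTR(line FROM %d FOR %d) AS DECIMAL(%d,%d)) AS `%s`",
            start, width, width, scale, name)
  } else {
    # string columns are just the raw substring
    sprintf("SUBSTR(line FROM %d FOR %d) AS `%s`", start, width, name)
  }
}

make_item("var_105", 191L, 7L, "decimal", 2L)
#> [1] "CAST(SUBSTR(line FROM 191 FOR 7) AS DECIMAL(7,2)) AS `var_105`"
```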
print(head(code_split_fwf))
#> [[1]]
#> CAST(SUBSTR(line FROM 1 FOR 1) AS DECIMAL(1,0)) AS `var_1`
#> 
#> [[2]]
#> SUBSTR(line FROM 2 FOR 1) AS `var_2`
#> 
#> [[3]]
#> CAST(SUBSTR(line FROM 3 FOR 1) AS DECIMAL(1,0)) AS `var_3`
#> 
#> [[4]]
#> SUBSTR(line FROM 4 FOR 1) AS `var_4`
#> 
#> [[5]]
#> CAST(SUBSTR(line FROM 5 FOR 1) AS DECIMAL(1,0)) AS `var_5`
#> 
#> [[6]]
#> SUBSTR(line FROM 6 FOR 1) AS `var_6`
print(tail(code_split_fwf))
#> [[1]]
#> CAST(SUBSTR(line FROM 187 FOR 2) AS DECIMAL(2,0)) AS `var_103`
#> 
#> [[2]]
#> SUBSTR(line FROM 189 FOR 2) AS `var_104`
#> 
#> [[3]]
#> CAST(SUBSTR(line FROM 191 FOR 7) AS DECIMAL(7,2)) AS `var_105`
#> 
#> [[4]]
#> SUBSTR(line FROM 198 FOR 7) AS `var_106`
#> 
#> [[5]]
#> CAST(SUBSTR(line FROM 205 FOR 7) AS DECIMAL(7,4)) AS `var_107`
#> 
#> [[6]]
#> SUBSTR(line FROM 212 FOR 7) AS `var_108`
code_expression <- paste0(
"SELECT ",
paste(code_split_fwf, collapse=",\n"),
" FROM ",
tempname
)
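Joining the snippets into a single SELECT is just a paste with comma-newline separators; a two-snippet sketch (snippets and table name hypothetical):

```r
# two hypothetical snippets standing in for the 108 generated above
snippets <- c(
  "CAST(SUBSTR(line FROM 1 FOR 1) AS DECIMAL(1,0)) AS `var_1`",
  "SUBSTR(line FROM 2 FOR 1) AS `var_2`"
)

# assemble the full query the same way the reprex does
query <- paste0("SELECT ", paste(snippets, collapse = ",\n"), " FROM some_temp_table")
cat(query, "\n")
```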
extended_table<-dplyr::tbl(sc, dplyr::sql(code_expression)) %>% dplyr::compute()
dplyr::glimpse(extended_table)
#> Rows: ??
#> Warning in sdf_collect_static(object, impl, ...): NAs introduced by coercion to
#> integer range
#> Columns: 108
#> Database: spark_connection
#> $ var_1   <dbl> 1, 1, 1
#> $ var_2   <chr> "a", "A", "a"
#> $ var_3   <dbl> 2, 2, 2
#> $ var_4   <chr> "b", "B", "b"
#> $ var_5   <dbl> 3, 3, 3
#> $ var_6   <chr> "c", "C", "c"
#> $ var_7   <dbl> 4, 4, 4
#> $ var_8   <chr> "d", "D", "d"
#> $ var_9   <dbl> 5, 5, 5
#> $ var_10  <chr> "e", "E", "e"
#> $ var_11  <dbl> 6, 6, 6
#> $ var_12  <chr> "f", "F", "f"
#> $ var_13  <dbl> 7, 7, 7
#> $ var_14  <chr> "g", "G", "g"
#> $ var_15  <dbl> 8, 8, 8
#> $ var_16  <chr> "h", "H", "h"
#> $ var_17  <dbl> 9, 9, 9
#> $ var_18  <chr> "i", "I", "i"
#> $ var_19  <dbl> 10, 10, 10
#> $ var_20  <chr> "jj", "JJ", "jj"
#> $ var_21  <dbl> 11, 11, 11
#> $ var_22  <chr> "kk", "KK", "kk"
#> $ var_23  <dbl> 12, 12, 12
#> $ var_24  <chr> "ll", "LL", "ll"
#> $ var_25  <dbl> 13, 13, 13
#> $ var_26  <chr> "mm", "MM", "mm"
#> $ var_27  <dbl> 14, 14, 14
#> $ var_28  <chr> "nn", "NN", "nn"
#> $ var_29  <dbl> 15, 15, 15
#> $ var_30  <chr> "oo", "OO", "oo"
#> $ var_31  <dbl> 16, 16, 16
#> $ var_32  <chr> "pp", "PP", "pp"
#> $ var_33  <dbl> 17, 17, 17
#> $ var_34  <chr> "qq", "QQ", "qq"
#> $ var_35  <dbl> 18, 18, 18
#> $ var_36  <chr> "rr", "RR", "rr"
#> $ var_37  <dbl> 19, 19, 19
#> $ var_38  <chr> "ss", "SS", "ss"
#> $ var_39  <dbl> 20, 20, 20
#> $ var_40  <chr> "tt", "TT", "tt"
#> $ var_41  <dbl> 21, 21, 21
#> $ var_42  <chr> "uu", "UU", "uu"
#> $ var_43  <dbl> 22, 22, 22
#> $ var_44  <chr> "vv", "VV", "vv"
#> $ var_45  <dbl> 23, 23, 23
#> $ var_46  <chr> "ww", "WW", "ww"
#> $ var_47  <dbl> 24, 24, 24
#> $ var_48  <chr> "xx", "XX", "xx"
#> $ var_49  <dbl> 25, 25, 25
#> $ var_50  <chr> "yy", "YY", "yy"
#> $ var_51  <dbl> 26, 26, 26
#> $ var_52  <chr> "zz", "ZZ", "zz"
#> $ var_53  <dbl> 27, 27, 27
#> $ var_54  <chr> "AA", "AA", "aa"
#> $ var_55  <dbl> 28, 28, 28
#> $ var_56  <chr> "BB", "BB", "bb"
#> $ var_57  <dbl> 29, 29, 29
#> $ var_58  <chr> "CC", "CC", "cc"
#> $ var_59  <dbl> 30, 30, 30
#> $ var_60  <chr> "DD", "DD", "dd"
#> $ var_61  <dbl> 31, 31, 31
#> $ var_62  <chr> "EE", "EE", "ee"
#> $ var_63  <dbl> 32, 32, 32
#> $ var_64  <chr> "FF", "FF", "ff"
#> $ var_65  <dbl> 33, 33, 33
#> $ var_66  <chr> "GG", "GG", "gg"
#> $ var_67  <dbl> 34, 34, 34
#> $ var_68  <chr> "HH", "HH", "hh"
#> $ var_69  <dbl> 35, 35, 35
#> $ var_70  <chr> "II", "II", "ii"
#> $ var_71  <dbl> 36, 36, 36
#> $ var_72  <chr> "JJ", "JJ", "jj"
#> $ var_73  <dbl> 37, 37, 37
#> $ var_74  <chr> "KK", "KK", "kk"
#> $ var_75  <dbl> 38, 38, 38
#> $ var_76  <chr> "LL", "LL", "ll"
#> $ var_77  <dbl> 39, 39, 39
#> $ var_78  <chr> "MM", "MM", "mm"
#> $ var_79  <dbl> 40, 40, 40
#> $ var_80  <chr> "NN", "NN", "nn"
#> $ var_81  <dbl> 41, 41, 41
#> $ var_82  <chr> "OO", "OO", "oo"
#> $ var_83  <dbl> 42, 42, 42
#> $ var_84  <chr> "PP", "PP", "pp"
#> $ var_85  <dbl> 43, 43, 43
#> $ var_86  <chr> "QQ", "QQ", "qq"
#> $ var_87  <dbl> 44, 44, 44
#> $ var_88  <chr> "RR", "RR", "rr"
#> $ var_89  <dbl> 45, 45, 45
#> $ var_90  <chr> "SS", "SS", "ss"
#> $ var_91  <dbl> 46, 46, 46
#> $ var_92  <chr> "TT", "TT", "tt"
#> $ var_93  <dbl> 47, 47, 47
#> $ var_94  <chr> "UU", "UU", "uu"
#> $ var_95  <dbl> 48, 48, 48
#> $ var_96  <chr> "VV", "VV", "vv"
#> $ var_97  <dbl> 49, 49, 49
#> $ var_98  <chr> "WW", "WW", "ww"
#> $ var_99  <dbl> 50, 50, 50
#> $ var_100 <chr> "XX", "XX", "xx"
#> $ var_101 <dbl> 51, 51, 51
#> $ var_102 <chr> "YY", "YY", "yy"
#> $ var_103 <dbl> 52, 52, 52
#> $ var_104 <chr> "ZZ", "ZZ", "zz"
#> $ var_105 <dbl> 5000.01, 5000.01, 5000.01
#> $ var_106 <chr> "aaaaaaa", "AAAAAAA", "aaaaaaa"
#> $ var_107 <dbl> 50.0001, 50.0001, 50.0001
#> $ var_108 <chr> "bbbbbbb", "BBBBBBB", "bbbbbbb"
Created on 2023-08-18 by the reprex package (v2.0.1)