Skip to content

Commit

Permalink
Add support for ORC file format (#194)
Browse files Browse the repository at this point in the history
* Added ORC file support

* Added test for ORC file support

* Updated transformORC to use map for row

* Updated test for transformORCFile

* Changed m to be map slice rather than interface slice

* Finished adding support for ORC and added option in UI

* Fixed bug where ORC, Parquet, and ODS options were not displaying on desktop

* Ran formatter
  • Loading branch information
gl28 committed Mar 11, 2022
1 parent 6c773f1 commit aba843b
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 8 deletions.
44 changes: 43 additions & 1 deletion runner/file.go
Expand Up @@ -11,8 +11,9 @@ import (
"runtime"
"strings"

"github.com/multiprocessio/go-json"
jsonutil "github.com/multiprocessio/go-json"
"github.com/multiprocessio/go-openoffice"
"github.com/scritchley/orc"

"github.com/xitongsys/parquet-go-source/local"
"github.com/xitongsys/parquet-go/reader"
Expand Down Expand Up @@ -142,6 +143,42 @@ func transformParquetFile(in string, out io.Writer) error {
return transformParquet(r, out)
}

// transformORC reads every row from the ORC reader in and emits each one as a
// JSON object keyed by column name, using the encoder that
// withJSONArrayOutWriterFile supplies for out.
//
// It returns the first encoding error, or otherwise whatever error the ORC
// cursor reports once iteration has finished.
func transformORC(in *orc.Reader, out io.Writer) error {
	columns := in.Schema().Columns()
	cursor := in.Select(columns...)

	emitRows := func(enc *jsonutil.StreamEncoder) error {
		// A single map is reused for all rows. Every column key is
		// overwritten on each row, so no value from a previous row survives.
		// NOTE(review): this relies on EncodeRow not retaining the map after
		// it returns — confirm against the encoder.
		record := make(map[string]interface{}, len(columns))

		for cursor.Stripes() {
			for cursor.Next() {
				values := cursor.Row()
				for idx := range columns {
					record[columns[idx]] = values[idx]
				}

				if err := enc.EncodeRow(record); err != nil {
					return err
				}
			}
		}

		// Stripes/Next only signal "done"; a read failure surfaces here.
		return cursor.Err()
	}

	return withJSONArrayOutWriterFile(out, emitRows)
}

// transformORCFile opens the ORC file at path in and hands the resulting
// reader to transformORC, which writes the converted rows to out. The reader
// is closed when the function returns. An error from opening the file is
// returned unchanged.
func transformORCFile(in string, out io.Writer) error {
	reader, openErr := orc.Open(in)
	if openErr != nil {
		return openErr
	}
	defer reader.Close()

	return transformORC(reader, out)
}

func writeSheet(rows [][]string, w *jsonutil.StreamEncoder) error {
var header []string
isHeader := true
Expand Down Expand Up @@ -481,6 +518,7 @@ const (
ExcelOpenXMLMimeType = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
OpenOfficeSheetMimeType = "application/vnd.oasis.opendocument.spreadsheet"
ParquetMimeType = "parquet"
ORCMimeType = "orc"
ApacheErrorMimeType = "text/apache2error"
ApacheAccessMimeType = "text/apache2access"
NginxAccessMimeType = "text/nginxaccess"
Expand Down Expand Up @@ -511,6 +549,8 @@ func GetMimeType(fileName string, ct ContentTypeInfo) MimeType {
return OpenOfficeSheetMimeType
case ".parquet":
return ParquetMimeType
case ".orc":
return ORCMimeType
}

return UnknownMimeType
Expand Down Expand Up @@ -552,6 +592,8 @@ func TransformFile(fileName string, cti ContentTypeInfo, out io.Writer) error {
return transformXLSXFile(fileName, out)
case ParquetMimeType:
return transformParquetFile(fileName, out)
case ORCMimeType:
return transformORCFile(fileName, out)
case JSONConcatMimeType:
return transformJSONConcatFile(fileName, out)
case RegexpLinesMimeType:
Expand Down
69 changes: 69 additions & 0 deletions runner/file_test.go
Expand Up @@ -4,10 +4,12 @@ import (
"encoding/json"
"fmt"
"io/ioutil"
"math/rand"
"os"
"testing"
"time"

"github.com/scritchley/orc"
"github.com/stretchr/testify/assert"
)

Expand Down Expand Up @@ -122,6 +124,73 @@ func Test_transformJSONConcat(t *testing.T) {
}
}

func Test_transformORCFile(t *testing.T) {
inTmp, err := ioutil.TempFile("", "")
assert.Nil(t, err)
defer os.Remove(inTmp.Name())
defer inTmp.Close()

// define column types for ORC file
schema, err := orc.ParseSchema("struct<username:string,administrator:boolean,score:double,nested:struct<randomnumber:double,correct:boolean>>")
assert.Nil(t, err)

w, err := orc.NewWriter(inTmp, orc.SetSchema(schema))
assert.Nil(t, err)

length := 2 // number of rows to create

// will hold output data for test
var expJson []map[string]interface{}

// generate test data
for i := 0; i < length; i++ {
nestedValues := []interface{}{
rand.Float64(),
rand.Int63n(10000) > 5000,
}

values := []interface{}{
fmt.Sprintf("%x", rand.Int63n(1000)),
rand.Int63n(10000) > 4444,
rand.Float64(),
nestedValues,
}

expJson = append(expJson, map[string]interface{}{
"username": values[0],
"administrator": values[1],
"score": values[2],
"nested": map[string]interface{}{
"randomnumber": nestedValues[0],
"correct": nestedValues[1],
},
})

err = w.Write(values...)
assert.Nil(t, err)
}

err = w.Close()
assert.Nil(t, err)

outTmp, err := ioutil.TempFile("", "")
defer os.Remove(outTmp.Name())
assert.Nil(t, err)

err = transformORCFile(inTmp.Name(), outTmp)
assert.Nil(t, err)

var m []map[string]interface{}
outTmpBs, err := ioutil.ReadFile(outTmp.Name())
assert.Nil(t, err)

err = json.Unmarshal(outTmpBs, &m)
assert.Nil(t, err)

assert.Equal(t, expJson, m)

}

func Test_transformGeneric(t *testing.T) {
tests := []struct {
in string
Expand Down
1 change: 1 addition & 0 deletions runner/go.mod
Expand Up @@ -89,6 +89,7 @@ require (
github.com/prometheus/procfs v0.7.3 // indirect
github.com/richardlehane/mscfb v1.0.3 // indirect
github.com/richardlehane/msoleps v1.0.1 // indirect
github.com/scritchley/orc v0.0.0-20210513144143-06dddf1ad665 // indirect
github.com/shopspring/decimal v1.3.1 // indirect
github.com/sirupsen/logrus v1.8.1 // indirect
github.com/xuri/efp v0.0.0-20210322160811-ab561f5b45e3 // indirect
Expand Down
2 changes: 2 additions & 0 deletions runner/go.sum
Expand Up @@ -490,6 +490,8 @@ github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTK
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/ruudk/golang-pdf417 v0.0.0-20181029194003-1af4ab5afa58/go.mod h1:6lfFZQK844Gfx8o5WFuvpxWRwnSoipWe/p622j1v06w=
github.com/scritchley/orc v0.0.0-20210513144143-06dddf1ad665 h1:W7Y6ejGhTaW9WlWhTtxE8f+SOa3c1NoFWsU9XT2cUOY=
github.com/scritchley/orc v0.0.0-20210513144143-06dddf1ad665/go.mod h1:U4h1RViHcbDQl9stSaImdd7N3/ZnUkZ2yombj5cSgEY=
github.com/shirou/gopsutil v2.19.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA=
github.com/shirou/w32 v0.0.0-20160930032740-bb4de0191aa4/go.mod h1:qsXQc7+bwAM3Q1u/4XEfrquwF8Lw7D7y5cD8CuHnfIc=
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
Expand Down
16 changes: 16 additions & 0 deletions runner/http.go
Expand Up @@ -260,6 +260,22 @@ func TransformReader(r io.Reader, fileName string, cti ContentTypeInfo, out io.W
}

return transformParquetFile(w.Name(), out)
case ORCMimeType:
w, err := ioutil.TempFile("", "http-orc-temp")
if err != nil {
return err
}
defer os.Remove(w.Name())

_, err = w.ReadFrom(r)
if err == io.EOF {
err = nil
}
if err != nil {
return err
}

return transformORCFile(w.Name(), out)
case RegexpLinesMimeType:
// There are probably weird cases this won't work but
// let's wait for a bug report to do more intelligent
Expand Down
15 changes: 9 additions & 6 deletions ui/components/ContentTypePicker.tsx
Expand Up @@ -38,12 +38,15 @@ export function ContentTypePicker({
<option value="text/csv">CSV</option>
<option value="text/tab-separated-values">TSV</option>
<option value={XLSX_MIME_TYPE}>Excel</option>
{!inMemoryEval /* This is getting ridiculous. Really need to find a plugin architecture */ && (
<React.Fragment>
<option value={ODS_MIME_TYPE}>ODS</option>
<option value="parquet">Parquet</option>
</React.Fragment>
)}
<option disabled={inMemoryEval} value={ODS_MIME_TYPE}>
ODS
</option>
<option disabled={inMemoryEval} value="parquet">
Parquet
</option>
<option disabled={inMemoryEval} value="orc">
ORC
</option>
<option value="application/json">JSON</option>
<option value="application/jsonlines">
Newline-delimited JSON
Expand Down
2 changes: 1 addition & 1 deletion ui/panels/FilePanel.tsx
Expand Up @@ -65,7 +65,7 @@ export function FilePanelDetails({
/>
</div>
<ContentTypePicker
inMemoryEval={MODE !== 'browser'}
inMemoryEval={MODE === 'browser'}
value={panel.file.contentTypeInfo}
onChange={(cti: { type: string; customLineRegexp: string }) => {
panel.file.contentTypeInfo = cti;
Expand Down

0 comments on commit aba843b

Please sign in to comment.