Skip to content

Commit

Permalink
[pkg/ottl]: Add ParseXML converter (#31487)
Browse files Browse the repository at this point in the history
**Description:**
* Adds a ParseXML converter function that can be used to parse an XML
document to a pcommon.Map value

**Link to tracking Issue:** Closes #31133

**Testing:**
Unit tests
Manually tested parsing XML logs

**Documentation:**
Added documentation for the ParseXML function to the ottl_funcs README.

---------

Co-authored-by: Evan Bradley <11745660+evan-bradley@users.noreply.github.com>
  • Loading branch information
BinaryFissionGames and evan-bradley committed Mar 15, 2024
1 parent c485615 commit 0d9b1b0
Show file tree
Hide file tree
Showing 6 changed files with 546 additions and 0 deletions.
13 changes: 13 additions & 0 deletions .chloggen/feat_ottl_xml-parse-function.yaml
@@ -0,0 +1,13 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: "enhancement"

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: pkg/ottl

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Add `ParseXML` function for parsing XML from a target string.

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [31133]
16 changes: 16 additions & 0 deletions pkg/ottl/e2e/e2e_test.go
Expand Up @@ -485,6 +485,22 @@ func Test_e2e_converters(t *testing.T) {
m.PutStr("k2", "v2__!__v2")
},
},
{
statement: `set(attributes["test"], ParseXML("<Log id=\"1\"><Message>This is a log message!</Message></Log>"))`,
want: func(tCtx ottllog.TransformContext) {
log := tCtx.GetLogRecord().Attributes().PutEmptyMap("test")
log.PutStr("tag", "Log")

attrs := log.PutEmptyMap("attributes")
attrs.PutStr("id", "1")

logChildren := log.PutEmptySlice("children")

message := logChildren.AppendEmpty().SetEmptyMap()
message.PutStr("tag", "Message")
message.PutStr("content", "This is a log message!")
},
},
{
statement: `set(attributes["test"], Seconds(Duration("1m")))`,
want: func(tCtx ottllog.TransformContext) {
Expand Down
73 changes: 73 additions & 0 deletions pkg/ottl/ottlfuncs/README.md
Expand Up @@ -403,6 +403,7 @@ Available Converters:
- [ParseCSV](#parsecsv)
- [ParseJSON](#parsejson)
- [ParseKeyValue](#parsekeyvalue)
- [ParseXML](#parsexml)
- [Seconds](#seconds)
- [SHA1](#sha1)
- [SHA256](#sha256)
Expand Down Expand Up @@ -913,6 +914,78 @@ Examples:
- `ParseKeyValue(attributes["pairs"])`


### ParseXML

`ParseXML(target)`

The `ParseXML` Converter returns a `pcommon.Map` struct that is the result of parsing the target string as an XML document.

`target` is a Getter that returns a string. This string should be in XML format.
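If `target` is not a string, is nil, or cannot be parsed as XML, `ParseXML` will return an error. An error is also returned if any data remains in the string after the closing tag of the root element.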

Unmarshalling XML is done using the following rules:
1. All character data for an XML element is trimmed, joined, and placed into the `content` field.
2. The tag for an XML element is trimmed, and placed into the `tag` field.
3. The attributes for an XML element are placed as a `pcommon.Map` into the `attributes` field.
4. Processing instructions, directives, and comments are ignored and not represented in the resultant map.
5. All child elements are parsed as above, and placed in a `pcommon.Slice`, which is then placed into the `children` field.

For example, the following XML document:
```xml
<?xml version="1.0" encoding="UTF-8" ?>
<Log>
<User>
<ID>00001</ID>
<Name type="first">Joe</Name>
<Email>joe.smith@example.com</Email>
</User>
<Text>User fired alert A</Text>
</Log>
```

will be parsed as:
```json
{
"tag": "Log",
"children": [
{
"tag": "User",
"children": [
{
"tag": "ID",
"content": "00001"
},
{
"tag": "Name",
"content": "Joe",
"attributes": {
"type": "first"
}
},
{
"tag": "Email",
"content": "joe.smith@example.com"
}
]
},
{
"tag": "Text",
"content": "User fired alert A"
}
]
}
```

Examples:

- `ParseXML(body)`

- `ParseXML(attributes["xml"])`

- `ParseXML("<HostInfo hostname=\"example.com\" zone=\"east-1\" cloudprovider=\"aws\" />")`



### Seconds

`Seconds(value)`
Expand Down
134 changes: 134 additions & 0 deletions pkg/ottl/ottlfuncs/func_parse_xml.go
@@ -0,0 +1,134 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package ottlfuncs // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl/ottlfuncs"

import (
"bytes"
"context"
"encoding/xml"
"errors"
"fmt"
"strings"

"go.opentelemetry.io/collector/pdata/pcommon"

"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl"
)

// ParseXMLArguments holds the arguments accepted by the ParseXML converter.
type ParseXMLArguments[K any] struct {
// Target is the getter that supplies the string to be parsed as XML.
Target ottl.StringGetter[K]
}

// NewParseXMLFactory returns the ottl.Factory that registers the converter
// under the name "ParseXML", using ParseXMLArguments as its argument struct
// and createParseXMLFunction as its constructor.
func NewParseXMLFactory[K any]() ottl.Factory[K] {
	emptyArgs := &ParseXMLArguments[K]{}

	return ottl.NewFactory("ParseXML", emptyArgs, createParseXMLFunction[K])
}

// createParseXMLFunction builds the ParseXML expression from the supplied
// arguments. It returns an error when oArgs is not a *ParseXMLArguments[K].
func createParseXMLFunction[K any](_ ottl.FunctionContext, oArgs ottl.Arguments) (ottl.ExprFunc[K], error) {
	if xmlArgs, isXMLArgs := oArgs.(*ParseXMLArguments[K]); isXMLArgs {
		return parseXML(xmlArgs.Target), nil
	}

	return nil, fmt.Errorf("ParseXMLFactory args must be of type *ParseXMLArguments[K]")
}

// parseXML builds the expression behind the ParseXML converter.
//
// The returned function reads the target string, decodes a single XML element
// from it into an xmlElement tree, and converts that tree into a pcommon.Map.
// It returns an error when the target cannot be read, when decoding fails, or
// when the decoder stops before the end of the string (that is, when anything
// follows the decoded element).
func parseXML[K any](target ottl.StringGetter[K]) ottl.ExprFunc[K] {
	return func(ctx context.Context, tCtx K) (any, error) {
		raw, err := target.Get(ctx, tCtx)
		if err != nil {
			return nil, err
		}

		var root xmlElement
		dec := xml.NewDecoder(strings.NewReader(raw))
		if decodeErr := dec.Decode(&root); decodeErr != nil {
			return nil, fmt.Errorf("unmarshal xml: %w", decodeErr)
		}

		// Decode stops once the element is complete; any input left over
		// past that point is treated as an error.
		if consumed := dec.InputOffset(); consumed != int64(len(raw)) {
			return nil, errors.New("trailing bytes after parsing xml")
		}

		result := pcommon.NewMap()
		root.intoMap(result)

		return result, nil
	}
}

// xmlElement is the intermediate representation of a single parsed XML
// element, filled in by UnmarshalXML.
type xmlElement struct {
	// tag is the local name of the element.
	tag string
	// attributes are the attributes found on the element's start tag.
	attributes []xml.Attr
	// text is every character-data run directly inside the element, each one
	// trimmed of surrounding whitespace and then concatenated.
	text string
	// children are the direct child elements, in document order.
	children []xmlElement
}

// UnmarshalXML implements xml.Unmarshaler for xmlElement.
//
// It consumes tokens from d up to and including the end tag that closes start,
// recording the tag, attributes, text and children of the element and of all
// of its descendants. Comments, processing instructions and directives are
// skipped.
//
// Descendants are tracked with an explicit stack instead of one nested
// d.DecodeElement call per child, so goroutine stack usage stays constant no
// matter how deeply the document is nested.
//
// An error is returned when the decoder fails to produce the next token (this
// includes malformed or truncated input) or yields an unknown token type.
func (a *xmlElement) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	a.tag = start.Name.Local
	a.attributes = start.Attr

	// open holds the descendants of a whose end tag has not been seen yet,
	// outermost first. The receiver itself is never stored in it.
	var open []xmlElement

	// innermost returns the element that is currently being filled in.
	innermost := func() *xmlElement {
		if n := len(open); n > 0 {
			return &open[n-1]
		}
		return a
	}

	for {
		tok, err := d.Token()
		if err != nil {
			return fmt.Errorf("decode next token: %w", err)
		}

		switch t := tok.(type) {
		case xml.StartElement:
			open = append(open, xmlElement{tag: t.Name.Local, attributes: t.Attr})
		case xml.EndElement:
			// d.Token only returns properly matched end tags, so this one
			// closes the innermost open element.
			last := len(open) - 1
			if last < 0 {
				// The end tag belongs to the receiver: parsing is complete.
				return nil
			}

			finished := open[last]
			open[last] = xmlElement{} // drop the stale copy held by the stack
			open = open[:last]

			// A child is attached to its parent only once it is complete.
			parent := innermost()
			parent.children = append(parent.children, finished)
		case xml.CharData:
			// Strip leading/trailing spaces to ignore newlines and
			// indentation in formatted XML. The bytes are only valid until
			// the next call to Token, so they are copied into a string here.
			cur := innermost()
			cur.text += string(bytes.TrimSpace(t))
		case xml.Comment: // ignore comments
		case xml.ProcInst: // ignore processing instructions
		case xml.Directive: // ignore directives
		default:
			return fmt.Errorf("unexpected token type %T", t)
		}
	}
}

// intoMap writes the element into m using the following keys:
//   - "tag": the element's tag (always written);
//   - "content": the element's text, written only when it is non-empty;
//   - "attributes": a map from attribute local name to attribute value,
//     written only when the element has attributes;
//   - "children": a slice with one map per child element, each filled in by
//     a recursive call, written only when the element has children.
func (a xmlElement) intoMap(m pcommon.Map) {
	// At most four keys are ever written below.
	m.EnsureCapacity(4)
	m.PutStr("tag", a.tag)

	if len(a.text) != 0 {
		m.PutStr("content", a.text)
	}

	if attrCount := len(a.attributes); attrCount > 0 {
		attrMap := m.PutEmptyMap("attributes")
		attrMap.EnsureCapacity(attrCount)

		for i := range a.attributes {
			attr := &a.attributes[i]
			attrMap.PutStr(attr.Name.Local, attr.Value)
		}
	}

	if childCount := len(a.children); childCount > 0 {
		childSlice := m.PutEmptySlice("children")
		childSlice.EnsureCapacity(childCount)

		for i := range a.children {
			target := childSlice.AppendEmpty().SetEmptyMap()
			a.children[i].intoMap(target)
		}
	}
}

0 comments on commit 0d9b1b0

Please sign in to comment.