Skip to content

Commit

Permalink
reworked parser for response format html to include comments; see #68928
Browse files Browse the repository at this point in the history
  • Loading branch information
Philipp Hempel committed Jul 6, 2023
1 parent d71c419 commit 0ec6a2e
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 21 deletions.
65 changes: 45 additions & 20 deletions pkg/lib/util/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/clbanning/mxj"
"github.com/pkg/errors"
"github.com/programmfabrik/golib"
"golang.org/x/net/html"
)

func Max(x, y int) int {
Expand Down Expand Up @@ -111,7 +112,17 @@ func Html2Json(rawHtml []byte) ([]byte, error) {
return []byte{}, errors.Wrap(err, "Could not parse html")
}

htmlData := parseHtmlNode(htmlDoc.Selection)
htmlData := map[string]any{}
htmlDoc.Selection.Contents().Each(func(_ int, node *goquery.Selection) {
switch node.Get(0).Type {
case html.ElementNode:
htmlData = parseHtmlNode(node)
return
default:
return
}
})

jsonStr, err := golib.JsonBytesIndent(htmlData, "", " ")
if err != nil {
return []byte{}, errors.Wrap(err, "Could not convert html to json")
Expand All @@ -123,23 +134,40 @@ func Html2Json(rawHtml []byte) ([]byte, error) {
// parseHtmlNode recursively parses the html node and adds it to a map
// the resulting structure is the same as the result of format "xml2" (using mxj.NewMapXmlSeq)
func parseHtmlNode(node *goquery.Selection) map[string]any {
tagName := node.Get(0).Data
tagData := map[string]any{}

childrenByName := map[string][]any{}
comments := []string{}

node.Contents().Each(func(i int, content *goquery.Selection) {
switch content.Get(0).Type {
case html.ElementNode:
// recursively parse child nodes
for childName, childContent := range parseHtmlNode(content) {
childrenByName[childName] = append(childrenByName[childName], childContent)
}
case html.CommentNode:
comments = append(comments, strings.Trim(content.Get(0).Data, " \n\t"))
default:
return
}
})

// include attributes
for _, attr := range node.Get(0).Attr {
tagData["-"+attr.Key] = attr.Val
}

// recursively parse child nodes
childrenByName := map[string][]any{}
node.Children().Each(func(i int, childNode *goquery.Selection) {
for childName, childContent := range parseHtmlNode(childNode) {
childrenByName[childName] = append(childrenByName[childName], childContent)
}
})
// include comments
if len(comments) == 1 {
tagData["#comment"] = comments[0]
} else if len(comments) > 1 {
tagData["#comment"] = comments
}

// include children
for childName, children := range childrenByName {
if len(children) < 1 {
if len(children) == 0 {
continue
}
if len(children) == 1 {
Expand All @@ -149,18 +177,15 @@ func parseHtmlNode(node *goquery.Selection) map[string]any {
tagData[childName] = children
}

text := strings.Trim(node.Text(), " \n\t")
if len(text) > 0 && len(childrenByName) < 1 {
// include the text only if there are no children, since goquery would render all children into a single string
tagData["#text"] = text
}

// there might be an empty top level tag (eg '<!DOCTYPE')
if tagName == "" {
return tagData
// include tag text only if there are no children, since goquery would render all children into a single string
if len(childrenByName) == 0 {
text := strings.Trim(node.Text(), " \n\t")
if len(text) > 0 {
tagData["#text"] = text
}
}

return map[string]any{
tagName: tagData,
node.Get(0).Data: tagData,
}
}
9 changes: 9 additions & 0 deletions test/response/format/html/result_html.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"must_not_exist": true
},
"body": {
"#comment": "comment body",
"#text:control": {
"must_not_exist": true
},
Expand All @@ -24,6 +25,10 @@
"must_not_exist": true
},
"footer": {
"#comment": [
"1. comment in footer",
"2. comment in footer"
],
"#text:control": {
"must_not_exist": true
},
Expand Down Expand Up @@ -153,6 +158,9 @@
},
{
"-class": "error-summary",
"#comment:control": {
"match": "comment form\\n\\s*multiline"
},
"#text": "Das Formular enthält Fehler"
}
]
Expand All @@ -163,6 +171,7 @@
}
},
"head": {
"#comment": "comment head",
"#text:control": {
"must_not_exist": true
},
Expand Down
12 changes: 11 additions & 1 deletion test/response/format/html/sample.html
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
<html lang="de">

<head>
<!-- comment head -->
<meta charset="utf-8" />
<title>fylr</title>
<meta name="description" content="fylr - manage your data" />
Expand Down Expand Up @@ -46,6 +47,7 @@
</head>

<body>
<!-- comment body -->
<main class="page-register">
<article class="access-container access-container--wide">
<div class="container">
Expand All @@ -56,7 +58,11 @@ <h1>Registrieren</h1>
<p class="required-information"><sup>*</sup>Pflichtfelder<br>
<p class="error-summary">Das Formular enthält Fehler

<hr>
<!--
comment form
multiline
-->
<hr>

<fieldset data-width="1" data-field-name="email">
<label for="email">
Expand All @@ -79,6 +85,9 @@ <h1>Registrieren</h1>
</section>

<footer>
<!--
1. comment in footer
-->
<nav>
<div class="language-switcher">
<select onChange="onLanguageChange(event)">
Expand All @@ -94,6 +103,7 @@ <h1>Registrieren</h1>
}
</script>
</nav>
<!-- 2. comment in footer -->
</footer>

</div>
Expand Down

0 comments on commit 0ec6a2e

Please sign in to comment.