Skip to content

Commit 6f950a9

Browse files
djaglowskievan-bradley
authored andcommitted
[pkg/ottl] Add ParseSimplifiedXML Converter (open-telemetry#35421)
This adds a converter called `ParseSimplifiedXML`. This serves as the final step described in open-telemetry#35281, which will allow users to parse any arbitrary XML document into a user-friendly result, by first transforming the document in place with other functions (e.g. open-telemetry#35328 and open-telemetry#35364) and then calling this function. --------- Co-authored-by: Evan Bradley <[email protected]>
1 parent 270cefa commit 6f950a9

File tree

6 files changed

+576
-0
lines changed

6 files changed

+576
-0
lines changed

.chloggen/ottl-parse-simple-xml.yaml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# Use this changelog template to create an entry for release notes.
2+
3+
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
4+
change_type: enhancement
5+
6+
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
7+
component: pkg/ottl
8+
9+
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
10+
note: Add ParseSimplifiedXML Converter
11+
12+
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
13+
issues: [35421]
14+
15+
# (Optional) One or more lines of additional information to render under the primary note.
16+
# These lines will be padded with 2 spaces and then inserted directly into the document.
17+
# Use pipe (|) for multiline entries.
18+
subtext:
19+
20+
# If your change doesn't affect end users or the exported elements of any package,
21+
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
22+
# Optional: The change log or logs in which this entry should be included.
23+
# e.g. '[user]' or '[user, api]'
24+
# Include 'user' if the change is relevant to end users.
25+
# Include 'api' if there is a change to a library API.
26+
# Default: '[user]'
27+
change_logs: []

pkg/ottl/e2e/e2e_test.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -663,6 +663,15 @@ func Test_e2e_converters(t *testing.T) {
663663
tCtx.GetLogRecord().Attributes().PutStr("test", "k1=v1 k2=\"v2=v3\"")
664664
},
665665
},
666+
{
667+
statement: `set(attributes["test"], ParseSimplifiedXML("<Log><id>1</id><Message>This is a log message!</Message></Log>"))`,
668+
want: func(tCtx ottllog.TransformContext) {
669+
attr := tCtx.GetLogRecord().Attributes().PutEmptyMap("test")
670+
log := attr.PutEmptyMap("Log")
671+
log.PutStr("id", "1")
672+
log.PutStr("Message", "This is a log message!")
673+
},
674+
},
666675
{
667676
statement: `set(attributes["test"], ParseXML("<Log id=\"1\"><Message>This is a log message!</Message></Log>"))`,
668677
want: func(tCtx ottllog.TransformContext) {

pkg/ottl/ottlfuncs/README.md

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -449,6 +449,7 @@ Available Converters:
449449
- [ParseCSV](#parsecsv)
450450
- [ParseJSON](#parsejson)
451451
- [ParseKeyValue](#parsekeyvalue)
452+
- [ParseSimplifiedXML](#parsesimplifiedxml)
452453
- [ParseXML](#parsexml)
453454
- [RemoveXML](#removexml)
454455
- [Seconds](#seconds)
@@ -1335,6 +1336,132 @@ Examples:
13351336
- `ParseKeyValue("k1!v1_k2!v2_k3!v3", "!", "_")`
13361337
- `ParseKeyValue(attributes["pairs"])`
13371338

1339+
### ParseSimplifiedXML
1340+
1341+
`ParseSimplifiedXML(target)`
1342+
1343+
The `ParseSimplifiedXML` Converter returns a `pcommon.Map` struct that is the result of parsing the target string without preservation of attributes or extraneous text content.
1344+
1345+
The goal of this Converter is to produce a more user-friendly representation of XML data than the `ParseXML` Converter.
1346+
This Converter should be preferred over `ParseXML` when minor semantic details (e.g. order of elements) are not critically important, when subsequent processing or querying of the result is expected, or when human-readability is a concern.
1347+
1348+
This Converter disregards certain aspects of XML, specifically attributes and extraneous text content, in order to produce
1349+
a direct representation of XML data. Users are encouraged to simplify their XML documents prior to using `ParseSimplifiedXML`.
1350+
1351+
See other functions which may be useful for preparing XML documents:
1352+
1353+
- `ConvertAttributesToElementsXML`
1354+
- `ConvertTextToElementsXML`
1355+
- `RemoveXML`
1356+
- `InsertXML`
1357+
- `GetXML`
1358+
1359+
#### Formal Definitions
1360+
1361+
A "Simplified XML" document contains no attributes and no extraneous text content.
1362+
1363+
An element has "extraneous text content" when it contains both text and element content. e.g.
1364+
1365+
```xml
1366+
<foo>
1367+
bar <!-- extraneous text content -->
1368+
<hello>world</hello> <!-- element content -->
1369+
</foo>
1370+
```
1371+
1372+
#### Parsing logic
1373+
1374+
1. Declaration elements, attributes, comments, and extraneous text content are ignored.
1375+
2. Elements which contain a value are converted into key/value pairs.
1376+
e.g. `<foo>bar</foo>` becomes `"foo": "bar"`
1377+
3. Elements which contain child elements are converted into a key/value pair where the value is a map.
1378+
e.g. `<foo> <bar>baz</bar> </foo>` becomes `"foo": { "bar": "baz" }`
1379+
4. Sibling elements that share the same tag will be combined into a slice.
1380+
e.g. `<a> <b>1</b> <c>2</c> <c>3</c> </a>` becomes `"a": { "b": "1", "c": [ "2", "3" ] }`.
1381+
5. Empty elements are dropped, but they still count as siblings when determining whether a value should be wrapped in a slice.
1382+
e.g. `<a> <b>1</b> <b/> </a>` becomes `"a": { "b": [ "1" ] }` instead of `"a": { "b": "1" }`
1383+
1384+
#### Examples
1385+
1386+
Parse a Simplified XML document from the body:
1387+
1388+
```xml
1389+
<event>
1390+
<id>1</id>
1391+
<user>jane</user>
1392+
<details>
1393+
<time>2021-10-01T12:00:00Z</time>
1394+
<description>Something happened</description>
1395+
<cause>unknown</cause>
1396+
</details>
1397+
</event>
1398+
```
1399+
1400+
```json
1401+
{
1402+
"event": {
1403+
"id": "1",
1404+
"user": "jane",
1405+
"details": {
1406+
"time": "2021-10-01T12:00:00Z",
1407+
"description": "Something happened",
1408+
"cause": "unknown"
1409+
}
1410+
}
1411+
}
1412+
```
1413+
1414+
Parse a Simplified XML document with unique child elements:
1415+
1416+
```xml
1417+
<x>
1418+
<y>1</y>
1419+
<z>2</z>
1420+
</x>
1421+
```
1422+
1423+
```json
1424+
{
1425+
"x": {
1426+
"y": "1",
1427+
"z": "2"
1428+
}
1429+
}
1430+
```
1431+
1432+
Parse a Simplified XML document with multiple elements of the same tag:
1433+
1434+
```xml
1435+
<a>
1436+
<b>1</b>
1437+
<b>2</b>
1438+
</a>
1439+
```
1440+
1441+
```json
1442+
{
1443+
"a": {
1444+
"b": ["1", "2"]
1445+
}
1446+
}
1447+
```
1448+
1449+
Parse a Simplified XML document with a CDATA element:
1450+
1451+
```xml
1452+
<a>
1453+
<b>1</b>
1454+
<b><![CDATA[2]]></b>
1455+
</a>
1456+
```
1457+
1458+
```json
1459+
{
1460+
"a": {
1461+
"b": ["1", "2"]
1462+
}
1463+
}
1464+
```
13381465

13391466
### ParseXML
13401467

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
// Copyright The OpenTelemetry Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package ottlfuncs // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl/ottlfuncs"
5+
6+
import (
7+
"context"
8+
"fmt"
9+
10+
"github.com/antchfx/xmlquery"
11+
"go.opentelemetry.io/collector/pdata/pcommon"
12+
13+
"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/ottl"
14+
)
15+
16+
type ParseSimplifiedXMLArguments[K any] struct {
17+
Target ottl.StringGetter[K]
18+
}
19+
20+
func NewParseSimplifiedXMLFactory[K any]() ottl.Factory[K] {
21+
return ottl.NewFactory("ParseSimplifiedXML", &ParseSimplifiedXMLArguments[K]{}, createParseSimplifiedXMLFunction[K])
22+
}
23+
24+
func createParseSimplifiedXMLFunction[K any](_ ottl.FunctionContext, oArgs ottl.Arguments) (ottl.ExprFunc[K], error) {
25+
args, ok := oArgs.(*ParseSimplifiedXMLArguments[K])
26+
27+
if !ok {
28+
return nil, fmt.Errorf("ParseSimplifiedXML args must be of type *ParseSimplifiedXMLAguments[K]")
29+
}
30+
31+
return parseSimplifiedXML(args.Target), nil
32+
}
33+
34+
// The `ParseSimplifiedXML` Converter returns a `pcommon.Map` struct that is the result of parsing the target
35+
// string without preservation of attributes or extraneous text content.
36+
func parseSimplifiedXML[K any](target ottl.StringGetter[K]) ottl.ExprFunc[K] {
37+
return func(ctx context.Context, tCtx K) (any, error) {
38+
var doc *xmlquery.Node
39+
if targetVal, err := target.Get(ctx, tCtx); err != nil {
40+
return nil, err
41+
} else if doc, err = parseNodesXML(targetVal); err != nil {
42+
return nil, err
43+
}
44+
45+
docMap := pcommon.NewMap()
46+
parseElement(doc, &docMap)
47+
return docMap, nil
48+
}
49+
}
50+
51+
func parseElement(parent *xmlquery.Node, parentMap *pcommon.Map) {
52+
// Count the number of each element tag so we know whether it will be a member of a slice or not
53+
childTags := make(map[string]int)
54+
for child := parent.FirstChild; child != nil; child = child.NextSibling {
55+
if child.Type != xmlquery.ElementNode {
56+
continue
57+
}
58+
childTags[child.Data]++
59+
}
60+
if len(childTags) == 0 {
61+
return
62+
}
63+
64+
// Convert the children, now knowing whether they will be a member of a slice or not
65+
for child := parent.FirstChild; child != nil; child = child.NextSibling {
66+
if child.Type != xmlquery.ElementNode || child.FirstChild == nil {
67+
continue
68+
}
69+
70+
leafValue := leafValueFromElement(child)
71+
72+
// Slice of the same element
73+
if childTags[child.Data] > 1 {
74+
// Get or create the slice of children
75+
var childrenSlice pcommon.Slice
76+
childrenValue, ok := parentMap.Get(child.Data)
77+
if ok {
78+
childrenSlice = childrenValue.Slice()
79+
} else {
80+
childrenSlice = parentMap.PutEmptySlice(child.Data)
81+
}
82+
83+
// Add the child's text content to the slice
84+
if leafValue != "" {
85+
childrenSlice.AppendEmpty().SetStr(leafValue)
86+
continue
87+
}
88+
89+
// Parse the child to make sure there's something to add
90+
childMap := pcommon.NewMap()
91+
parseElement(child, &childMap)
92+
if childMap.Len() == 0 {
93+
continue
94+
}
95+
96+
sliceValue := childrenSlice.AppendEmpty()
97+
sliceMap := sliceValue.SetEmptyMap()
98+
childMap.CopyTo(sliceMap)
99+
continue
100+
}
101+
102+
if leafValue != "" {
103+
parentMap.PutStr(child.Data, leafValue)
104+
continue
105+
}
106+
107+
// Child will be a map
108+
childMap := pcommon.NewMap()
109+
parseElement(child, &childMap)
110+
if childMap.Len() == 0 {
111+
continue
112+
}
113+
114+
childMap.CopyTo(parentMap.PutEmptyMap(child.Data))
115+
}
116+
}
117+
118+
func leafValueFromElement(node *xmlquery.Node) string {
119+
// First check if there are any child elements. If there are, ignore any extraneous text.
120+
for child := node.FirstChild; child != nil; child = child.NextSibling {
121+
if child.Type == xmlquery.ElementNode {
122+
return ""
123+
}
124+
}
125+
126+
// No child elements, so return the first text or CDATA content
127+
for child := node.FirstChild; child != nil; child = child.NextSibling {
128+
switch child.Type {
129+
case xmlquery.TextNode, xmlquery.CharDataNode:
130+
return child.Data
131+
}
132+
}
133+
return ""
134+
}

0 commit comments

Comments
 (0)