Skip to content

Commit 72eeeab

Browse files
authored
Merge pull request #9 from setnicka/master
Allow parsing of elements of multiple types
2 parents 00de614 + 75d36e1 commit 72eeeab

File tree

3 files changed

+105
-53
lines changed

3 files changed

+105
-53
lines changed

README.md

Lines changed: 23 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## xml stream parser
2-
xml-stream-parser is xml parser for GO. It is efficient to parse large xml data with streaming fashion.
1+
# xml stream parser
32

4-
### Usage
3+
xml-stream-parser is an XML parser for Go. It parses large XML data efficiently in a streaming fashion.
4+
5+
## Usage
56

67
```xml
78
<?xml version="1.0" encoding="UTF-8"?>
@@ -22,46 +23,50 @@ xml-stream-parser is xml parser for GO. It is efficient to parse large xml data
2223
<userComment rating="4">Excellent overview of world literature.</userComment>
2324
</comments>
2425
</book>
26+
<journal>
27+
<title>Journal of XML parsing</title>
28+
<issue>1</issue>
29+
</journal>
2530
</bookstore>
2631
```
2732

28-
<b>Stream</b> over books
29-
```go
30-
33+
**Stream** over books and journals
3134

35+
```go
3236
f, _ := os.Open("input.xml")
3337
br := bufio.NewReaderSize(f,65536)
34-
parser := xmlparser.NewXMLParser(br, "book")
38+
parser := xmlparser.NewXMLParser(br, "book", "journal")
3539

3640
for xml := range parser.Stream() {
37-
fmt.Println(xml.Childs["title"][0].InnerText)
38-
fmt.Println(xml.Childs["comments"][0].Childs["userComment"][0].Attrs["rating"])
39-
fmt.Println(xml.Childs["comments"][0].Childs["userComment"][0].InnerText)
41+
fmt.Println(xml.Childs["title"][0].InnerText)
42+
if xml.Name == "book" {
43+
fmt.Println(xml.Childs["comments"][0].Childs["userComment"][0].Attrs["rating"])
44+
fmt.Println(xml.Childs["comments"][0].Childs["userComment"][0].InnerText)
45+
}
4046
}
41-
4247
```
4348

44-
<b>Skip</b> tags for speed
49+
**Skip** tags for speed
50+
4551
```go
4652
parser := xmlparser.NewXMLParser(br, "book").SkipElements([]string{"price", "comments"})
4753
```
4854

49-
<b>Error</b> handlings
55+
**Error** handling
56+
5057
```go
5158
for xml := range parser.Stream() {
52-
if xml.Err !=nil {
59+
if xml.Err !=nil {
5360
// handle error
5461
}
5562
}
5663
```
5764

58-
<b>Progress</b> of parsing
65+
**Progress** of parsing
66+
5967
```go
6068
// total bytes read, used to calculate the progress of parsing
6169
parser.TotalReadSize
6270
```
6371

64-
65-
66-
6772
If you are interested, also check out the [json parser](https://github.com/tamerh/jsparser), which works similarly.

xmlparser.go

Lines changed: 37 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import (
88

99
type XMLParser struct {
1010
reader *bufio.Reader
11-
loopElement string
11+
loopElements map[string]bool
1212
resultChannel chan *XMLElement
1313
skipElements map[string]bool
1414
skipOuterElements bool
@@ -18,22 +18,29 @@ type XMLParser struct {
1818
}
1919

2020
type XMLElement struct {
21+
Name string
2122
Attrs map[string]string
2223
InnerText string
2324
Childs map[string][]XMLElement
2425
Err error
2526
}
2627

27-
func NewXMLParser(reader *bufio.Reader, loopElement string) *XMLParser {
28+
func NewXMLParser(reader *bufio.Reader, loopElements ...string) *XMLParser {
2829

2930
x := &XMLParser{
3031
reader: reader,
31-
loopElement: loopElement,
32+
loopElements: map[string]bool{},
3233
resultChannel: make(chan *XMLElement, 256),
3334
skipElements: map[string]bool{},
3435
scratch: &scratch{data: make([]byte, 1024)},
3536
scratch2: &scratch{data: make([]byte, 1024)},
3637
}
38+
39+
// Register loop elements
40+
for _, e := range loopElements {
41+
x.loopElements[e] = true
42+
}
43+
3744
return x
3845
}
3946

@@ -69,7 +76,6 @@ func (x *XMLParser) parse() {
6976

7077
defer close(x.resultChannel)
7178
var element *XMLElement
72-
var tagName string
7379
var tagClosed bool
7480
var err error
7581
var b byte
@@ -116,29 +122,29 @@ func (x *XMLParser) parse() {
116122
continue
117123
}
118124

119-
tagName, element, tagClosed, err = x.startElement()
125+
element, tagClosed, err = x.startElement()
120126

121127
if err != nil {
122128
x.sendError()
123129
return
124130
}
125131

126-
if tagName == x.loopElement {
132+
if _, found := x.loopElements[element.Name]; found {
127133
if tagClosed {
128134
x.resultChannel <- element
129135
continue
130136
}
131137

132-
element = x.getElementTree(tagName, element)
138+
element = x.getElementTree(element)
133139
x.resultChannel <- element
134140
if element.Err != nil {
135141
return
136142
}
137143
} else if x.skipOuterElements {
138144

139-
if _, ok := x.skipElements[tagName]; ok && !tagClosed {
145+
if _, ok := x.skipElements[element.Name]; ok && !tagClosed {
140146

141-
err = x.skipElement(tagName)
147+
err = x.skipElement(element.Name)
142148
if err != nil {
143149
x.sendError()
144150
return
@@ -154,7 +160,7 @@ func (x *XMLParser) parse() {
154160

155161
}
156162

157-
func (x *XMLParser) getElementTree(tagName string, result *XMLElement) *XMLElement {
163+
func (x *XMLParser) getElementTree(result *XMLElement) *XMLElement {
158164

159165
if result.Err != nil {
160166
return result
@@ -166,7 +172,6 @@ func (x *XMLParser) getElementTree(tagName string, result *XMLElement) *XMLEleme
166172
var element *XMLElement
167173
var tagClosed bool
168174
x.scratch2.reset() // this hold the inner text
169-
var tagName2 string
170175
var iscomment bool
171176

172177
for {
@@ -219,7 +224,7 @@ func (x *XMLParser) getElementTree(tagName string, result *XMLElement) *XMLEleme
219224
return result
220225
}
221226

222-
if tag == tagName {
227+
if tag == result.Name {
223228
if len(result.Childs) == 0 {
224229
result.InnerText = string(x.scratch2.bytes())
225230
}
@@ -229,34 +234,34 @@ func (x *XMLParser) getElementTree(tagName string, result *XMLElement) *XMLEleme
229234
x.unreadByte()
230235
}
231236

232-
tagName2, element, tagClosed, err = x.startElement()
237+
element, tagClosed, err = x.startElement()
233238

234239
if err != nil {
235240
result.Err = err
236241
return result
237242
}
238243

239-
if _, ok := x.skipElements[tagName2]; ok && !tagClosed {
240-
err = x.skipElement(tagName2)
244+
if _, ok := x.skipElements[element.Name]; ok && !tagClosed {
245+
err = x.skipElement(element.Name)
241246
if err != nil {
242247
result.Err = err
243248
return result
244249
}
245250
continue
246251
}
247252
if !tagClosed {
248-
element = x.getElementTree(tagName2, element)
253+
element = x.getElementTree(element)
249254
}
250255

251-
if _, ok := result.Childs[tagName2]; ok {
252-
result.Childs[tagName2] = append(result.Childs[tagName2], *element)
256+
if _, ok := result.Childs[element.Name]; ok {
257+
result.Childs[element.Name] = append(result.Childs[element.Name], *element)
253258
} else {
254259
var childs []XMLElement
255260
childs = append(childs, *element)
256261
if result.Childs == nil {
257262
result.Childs = map[string][]XMLElement{}
258263
}
259-
result.Childs[tagName2] = childs
264+
result.Childs[element.Name] = childs
260265
}
261266

262267
} else {
@@ -302,7 +307,7 @@ func (x *XMLParser) skipElement(elname string) error {
302307
}
303308
}
304309

305-
func (x *XMLParser) startElement() (string, *XMLElement, bool, error) {
310+
func (x *XMLParser) startElement() (*XMLElement, bool, error) {
306311

307312
x.scratch.reset()
308313

@@ -313,26 +318,27 @@ func (x *XMLParser) startElement() (string, *XMLElement, bool, error) {
313318
// a tag have 3 forms * <abc > ** <abc type="foo" val="bar"/> *** <abc />
314319
var attr string
315320
var attrVal string
316-
var tagName string
317321
for {
318322

319323
cur, err = x.readByte()
320324

321325
if err != nil {
322-
return "", nil, false, x.defaultError()
326+
return nil, false, x.defaultError()
323327
}
324328

325329
if x.isWS(cur) {
326-
tagName = string(x.scratch.bytes())
330+
result.Name = string(x.scratch.bytes())
327331
x.scratch.reset()
328332
goto search_close_tag
329333
}
330334

331335
if cur == '>' {
332336
if prev == '/' {
333-
return string(x.scratch.bytes()[:len(x.scratch.bytes())-1]), result, true, nil
337+
result.Name = string(x.scratch.bytes()[:len(x.scratch.bytes())-1])
338+
return result, true, nil
334339
}
335-
return string(x.scratch.bytes()), result, false, nil
340+
result.Name = string(x.scratch.bytes())
341+
return result, false, nil
336342
}
337343
x.scratch.add(cur)
338344
prev = cur
@@ -344,7 +350,7 @@ search_close_tag:
344350
cur, err = x.readByte()
345351

346352
if err != nil {
347-
return "", nil, false, x.defaultError()
353+
return nil, false, x.defaultError()
348354
}
349355

350356
if x.isWS(cur) {
@@ -359,17 +365,17 @@ search_close_tag:
359365
cur, err = x.readByte()
360366

361367
if err != nil {
362-
return "", nil, false, x.defaultError()
368+
return nil, false, x.defaultError()
363369
}
364370

365371
if !(cur == '"' || cur == '\'') {
366-
return "", nil, false, x.defaultError()
372+
return nil, false, x.defaultError()
367373
}
368374

369375
attr = string(x.scratch.bytes())
370376
attrVal, err = x.string(cur)
371377
if err != nil {
372-
return "", nil, false, x.defaultError()
378+
return nil, false, x.defaultError()
373379
}
374380
result.Attrs[attr] = attrVal
375381
x.scratch.reset()
@@ -378,9 +384,9 @@ search_close_tag:
378384

379385
if cur == '>' { //if tag name not found
380386
if prev == '/' { //tag special close
381-
return tagName, result, true, nil
387+
return result, true, nil
382388
}
383-
return tagName, result, false, nil
389+
return result, false, nil
384390
}
385391

386392
x.scratch.add(cur)

xmlparser_test.go

Lines changed: 45 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,18 @@ import (
66
"testing"
77
)
88

9-
func getparser(prop string) *XMLParser {
9+
func getparser(prop ...string) *XMLParser {
1010

11-
return getparserFile("sample.xml", prop)
11+
return getparserFile("sample.xml", prop...)
1212
}
1313

14-
func getparserFile(filename, prop string) *XMLParser {
14+
func getparserFile(filename string, prop ...string) *XMLParser {
1515

1616
file, _ := os.Open(filename)
1717

1818
br := bufio.NewReader(file)
1919

20-
p := NewXMLParser(br, prop)
20+
p := NewXMLParser(br, prop...)
2121

2222
return p
2323

@@ -232,6 +232,47 @@ func TestError(t *testing.T) {
232232

233233
}
234234

235+
func TestMultipleTags(t *testing.T) {
236+
p := getparser("tag1", "tag2")
237+
238+
tagCount := map[string]int{}
239+
for xml := range p.Stream() {
240+
if xml.Name != "tag1" && xml.Name != "tag2" {
241+
t.Errorf("Only 'tag1' and 'tag2' expected, but '%s' returned", xml.Name)
242+
}
243+
tagCount[xml.Name]++
244+
}
245+
246+
if tagCount["tag1"] != 2 {
247+
t.Errorf("There should be 2 parsed 'tag1', but %d found", tagCount["tag1"])
248+
}
249+
if tagCount["tag2"] != 2 {
250+
t.Errorf("There should be 2 parsed 'tag2', but %d found", tagCount["tag2"])
251+
}
252+
}
253+
254+
func TestMultipleTagsNested(t *testing.T) {
255+
p := getparser("tag1", "tag11")
256+
257+
tagCount := map[string]int{}
258+
for xml := range p.Stream() {
259+
if xml.Name != "tag1" && xml.Name != "tag11" {
260+
t.Errorf("Only 'tag1' and 'tag11' expected, but '%s' returned", xml.Name)
261+
}
262+
tagCount[xml.Name]++
263+
}
264+
265+
if tagCount["tag1"] != 2 {
266+
t.Errorf("There should be 2 parsed 'tag1', but %d found", tagCount["tag1"])
267+
}
268+
if tagCount["tag11"] != 1 {
269+
if tagCount["tag11"] == 4 {
270+
t.Errorf("There should be only 1 parsed 'tag11', but 'tag11' nested under 'tag1' were parsed too")
271+
}
272+
t.Errorf("There should be 1 parsed 'tag11', but %d found", tagCount["tag11"])
273+
}
274+
}
275+
235276
func Benchmark1(b *testing.B) {
236277

237278
for n := 0; n < b.N; n++ {

0 commit comments

Comments
 (0)