@@ -48,29 +48,15 @@ class BaseParser
48
48
REFERENCE = "&(?:#{ NAME } ;|#\\ d+;|#x[0-9a-fA-F]+;)"
49
49
REFERENCE_RE = /#{ REFERENCE } /
50
50
51
- DOCTYPE_START = /\A \s *<!DOCTYPE\s /um
52
- DOCTYPE_END = /\A \s *\] \s *>/um
53
51
ATTRIBUTE_PATTERN = /\s *(#{ QNAME_STR } )\s *=\s *(["'])(.*?)\4 /um
54
- COMMENT_START = /\A <!--/u
55
- COMMENT_PATTERN = /<!--(.*?)-->/um
56
- CDATA_START = /\A <!\[ CDATA\[ /u
57
- CDATA_END = /\A \s *\] \s *>/um
58
- CDATA_PATTERN = /<!\[ CDATA\[ (.*?)\] \] >/um
59
- XMLDECL_START = /\A <\? xml\s /u ;
60
- XMLDECL_PATTERN = /<\? xml\s +(.*?)\? >/um
61
- INSTRUCTION_START = /\A <\? /u
62
- INSTRUCTION_PATTERN = /<\? #{ NAME } (\s +.*?)?\? >/um
63
- TAG_MATCH = /\A <((?>#{ QNAME_STR } ))/um
64
- CLOSE_MATCH = /\A \s *<\/ (#{ QNAME_STR } )\s *>/um
52
+ INSTRUCTION_PATTERN = /#{ NAME } (\s +.*?)?\? >/um
53
+ TAG_MATCH = /((?>#{ QNAME_STR } ))/um
54
+ CLOSE_MATCH = /(#{ QNAME_STR } )\s *>/um
65
55
66
56
VERSION = /\b version\s *=\s *["'](.*?)['"]/um
67
57
ENCODING = /\b encoding\s *=\s *["'](.*?)['"]/um
68
58
STANDALONE = /\b standalone\s *=\s *["'](.*?)['"]/um
69
59
70
- ENTITY_START = /\A \s *<!ENTITY/
71
- ELEMENTDECL_START = /\A \s *<!ELEMENT/um
72
- ELEMENTDECL_PATTERN = /\A \s *(<!ELEMENT.*?)>/um
73
- SYSTEMENTITY = /\A \s *(%.*?;)\s *$/um
74
60
ENUMERATION = "\\ (\\ s*#{ NMTOKEN } (?:\\ s*\\ |\\ s*#{ NMTOKEN } )*\\ s*\\ )"
75
61
NOTATIONTYPE = "NOTATION\\ s+\\ (\\ s*#{ NAME } (?:\\ s*\\ |\\ s*#{ NAME } )*\\ s*\\ )"
76
62
ENUMERATEDTYPE = "(?:(?:#{ NOTATIONTYPE } )|(?:#{ ENUMERATION } ))"
@@ -79,10 +65,7 @@ class BaseParser
79
65
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\ s+)?#{ ATTVALUE } ))"
80
66
ATTDEF = "\\ s+#{ NAME } \\ s+#{ ATTTYPE } \\ s+#{ DEFAULTDECL } "
81
67
ATTDEF_RE = /#{ ATTDEF } /
82
- ATTLISTDECL_START = /\A \s *<!ATTLIST/um
83
- ATTLISTDECL_PATTERN = /\A \s *<!ATTLIST\s +#{ NAME } (?:#{ ATTDEF } )*\s *>/um
84
-
85
- TEXT_PATTERN = /\A ([^<]*)/um
68
+ ATTLISTDECL_PATTERN = /\s +#{ NAME } (?:#{ ATTDEF } )*\s *>/um
86
69
87
70
# Entity constants
88
71
PUBIDCHAR = "\x20 \x0D \x0A a-zA-Z0-9\\ -()+,./:=?;!*@$_%#"
@@ -94,11 +77,10 @@ class BaseParser
94
77
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{ PEREFERENCE } |#{ REFERENCE } )*")|(?:'([^%&']|#{ PEREFERENCE } |#{ REFERENCE } )*'))}
95
78
PEDEF = "(?:#{ ENTITYVALUE } |#{ EXTERNALID } )"
96
79
ENTITYDEF = "(?:#{ ENTITYVALUE } |(?:#{ EXTERNALID } (#{ NDATADECL } )?))"
97
- PEDECL = "<!ENTITY \\ s+(%)\\ s+#{ NAME } \\ s+#{ PEDEF } \\ s*>"
98
- GEDECL = "<!ENTITY \\ s+#{ NAME } \\ s+#{ ENTITYDEF } \\ s*>"
99
- ENTITYDECL = /\s * (?:#{ GEDECL } )|\s * (?:#{ PEDECL } )/um
80
+ PEDECL = "\\ s+(%)\\ s+#{ NAME } \\ s+#{ PEDEF } \\ s*>"
81
+ GEDECL = "\\ s+#{ NAME } \\ s+#{ ENTITYDEF } \\ s*>"
82
+ ENTITYDECL = /(?:#{ GEDECL } )|(?:#{ PEDECL } )/um
100
83
101
- NOTATIONDECL_START = /\A \s *<!NOTATION/um
102
84
EXTERNAL_ID_PUBLIC = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s +#{ SYSTEMLITERAL } \s */um
103
85
EXTERNAL_ID_SYSTEM = /\A \s *SYSTEM\s +#{ SYSTEMLITERAL } \s */um
104
86
PUBLIC_ID = /\A \s *PUBLIC\s +#{ PUBIDLITERAL } \s */um
@@ -198,65 +180,67 @@ def pull_event
198
180
#STDERR.puts @source.encoding
199
181
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
200
182
if @document_status == nil
201
- word = @source . match ( /\A ((?:\s +)|(?:<[^>]*>))/um )
202
- word = word [ 1 ] unless word . nil?
203
- #STDERR.puts "WORD = #{word.inspect}"
204
- case word
205
- when COMMENT_START
206
- return [ :comment , @source . match ( COMMENT_PATTERN , true ) [ 1 ] ]
207
- when XMLDECL_START
208
- #STDERR.puts "XMLDECL"
209
- results = @source . match ( XMLDECL_PATTERN , true ) [ 1 ]
210
- version = VERSION . match ( results )
211
- version = version [ 1 ] unless version . nil?
212
- encoding = ENCODING . match ( results )
213
- encoding = encoding [ 1 ] unless encoding . nil?
214
- if need_source_encoding_update? ( encoding )
215
- @source . encoding = encoding
216
- end
217
- if encoding . nil? and /\A UTF-16(?:BE|LE)\z /i =~ @source . encoding
218
- encoding = "UTF-16"
219
- end
220
- standalone = STANDALONE . match ( results )
221
- standalone = standalone [ 1 ] unless standalone . nil?
222
- return [ :xmldecl , version , encoding , standalone ]
223
- when INSTRUCTION_START
224
- return process_instruction
225
- when DOCTYPE_START
226
- base_error_message = "Malformed DOCTYPE"
227
- @source . match ( DOCTYPE_START , true )
228
- @nsstack . unshift ( curr_ns = Set . new )
229
- name = parse_name ( base_error_message )
230
- if @source . match ( /\A \s *\[ /um , true )
231
- id = [ nil , nil , nil ]
232
- @document_status = :in_doctype
233
- elsif @source . match ( /\A \s *>/um , true )
234
- id = [ nil , nil , nil ]
235
- @document_status = :after_doctype
236
- else
237
- id = parse_id ( base_error_message ,
238
- accept_external_id : true ,
239
- accept_public_id : false )
240
- if id [ 0 ] == "SYSTEM"
241
- # For backward compatibility
242
- id [ 1 ] , id [ 2 ] = id [ 2 ] , nil
183
+ @source . read
184
+ if @source . match ( "<?" , true , false )
185
+ if results = @source . match ( /xml\s +(.*?)\? >/um , true , false )
186
+ results = results [ 1 ]
187
+ version = VERSION . match ( results )
188
+ version = version [ 1 ] unless version . nil?
189
+ encoding = ENCODING . match ( results )
190
+ encoding = encoding [ 1 ] unless encoding . nil?
191
+ if need_source_encoding_update? ( encoding )
192
+ @source . encoding = encoding
243
193
end
244
- if @source . match ( /\A \s *\[ /um , true )
194
+ if encoding . nil? and /\A UTF-16(?:BE|LE)\z /i =~ @source . encoding
195
+ encoding = "UTF-16"
196
+ end
197
+ standalone = STANDALONE . match ( results )
198
+ standalone = standalone [ 1 ] unless standalone . nil?
199
+ return [ :xmldecl , version , encoding , standalone ]
200
+ else # instruction
201
+ return process_instruction
202
+ end
203
+ elsif @source . match ( "<!" , true , false )
204
+ if @source . match ( "--" , true , false )
205
+ return [ :comment , @source . match ( /(.*?)-->/um , true ) [ 1 ] ]
206
+ elsif @source . match ( /DOCTYPE\s /um , true , false )
207
+ base_error_message = "Malformed DOCTYPE"
208
+ @nsstack . unshift ( curr_ns = Set . new )
209
+ name = parse_name ( base_error_message )
210
+ if @source . match ( /\s *\[ /um , true )
211
+ id = [ nil , nil , nil ]
245
212
@document_status = :in_doctype
246
- elsif @source . match ( /\A \s *>/um , true )
213
+ elsif @source . match ( /\s *>/um , true )
214
+ id = [ nil , nil , nil ]
247
215
@document_status = :after_doctype
248
216
else
249
- message = "#{ base_error_message } : garbage after external ID"
250
- raise REXML ::ParseException . new ( message , @source )
217
+ id = parse_id ( base_error_message ,
218
+ accept_external_id : true ,
219
+ accept_public_id : false )
220
+ if id [ 0 ] == "SYSTEM"
221
+ # For backward compatibility
222
+ id [ 1 ] , id [ 2 ] = id [ 2 ] , nil
223
+ end
224
+ if @source . match ( /\s *\[ /um , true )
225
+ @document_status = :in_doctype
226
+ elsif @source . match ( /\s *>/um , true )
227
+ @document_status = :after_doctype
228
+ else
229
+ message = "#{ base_error_message } : garbage after external ID"
230
+ raise REXML ::ParseException . new ( message , @source )
231
+ end
251
232
end
233
+ args = [ :start_doctype , name , *id ]
234
+ if @document_status == :after_doctype
235
+ @source . match ( /\s */um , true )
236
+ @stack << [ :end_doctype ]
237
+ end
238
+ return args
239
+ else
240
+ message = "Invalid XML"
241
+ raise REXML ::ParseException . new ( message , @source )
252
242
end
253
- args = [ :start_doctype , name , *id ]
254
- if @document_status == :after_doctype
255
- @source . match ( /\A \s */um , true )
256
- @stack << [ :end_doctype ]
257
- end
258
- return args
259
- when /\A \s +/
243
+ elsif @source . match ( /\s +/ , false , false )
260
244
else
261
245
@document_status = :after_doctype
262
246
if @source . encoding == "UTF-8"
@@ -265,16 +249,13 @@ def pull_event
265
249
end
266
250
end
267
251
if @document_status == :in_doctype
268
- md = @source . match ( /\A \s *(.*?>)/um )
269
- case md [ 1 ]
270
- when SYSTEMENTITY
271
- match = @source . match ( SYSTEMENTITY , true ) [ 1 ]
272
- return [ :externalentity , match ]
273
-
274
- when ELEMENTDECL_START
275
- return [ :elementdecl , @source . match ( ELEMENTDECL_PATTERN , true ) [ 1 ] ]
276
-
277
- when ENTITY_START
252
+ @source . read
253
+ @source . match ( /\s */um , true , false ) # skip spaces
254
+ if match = @source . match ( /(%.*?;)\s *$/um , true , false )
255
+ return [ :externalentity , match [ 1 ] ]
256
+ elsif match = @source . match ( /(<!ELEMENT.*?)>/um , true , false )
257
+ return [ :elementdecl , match [ 1 ] ]
258
+ elsif @source . match ( "<!ENTITY" , true , false )
278
259
match = [ :entitydecl , *@source . match ( ENTITYDECL , true ) . captures . compact ]
279
260
ref = false
280
261
if match [ 1 ] == '%'
@@ -300,7 +281,7 @@ def pull_event
300
281
end
301
282
match << '%' if ref
302
283
return match
303
- when ATTLISTDECL_START
284
+ elsif @source . match ( "<!ATTLIST" , true , false )
304
285
md = @source . match ( ATTLISTDECL_PATTERN , true )
305
286
raise REXML ::ParseException . new ( "Bad ATTLIST declaration!" , @source ) if md . nil?
306
287
element = md [ 1 ]
@@ -320,42 +301,41 @@ def pull_event
320
301
end
321
302
end
322
303
return [ :attlistdecl , element , pairs , contents ]
323
- when NOTATIONDECL_START
304
+ elsif @source . match ( "<!NOTATION" , true , false )
324
305
base_error_message = "Malformed notation declaration"
325
- unless @source . match ( /\A \s *<!NOTATION \ s +/um , true )
326
- if @source . match ( /\A \s *<!NOTATION \ s *>/um )
306
+ unless @source . match ( /\s +/um , true )
307
+ if @source . match ( /\s *>/um )
327
308
message = "#{ base_error_message } : name is missing"
328
309
else
329
310
message = "#{ base_error_message } : invalid declaration name"
330
311
end
312
+ @source . string = " <!NOTATION" + @source . buffer
331
313
raise REXML ::ParseException . new ( message , @source )
332
314
end
333
315
name = parse_name ( base_error_message )
334
316
id = parse_id ( base_error_message ,
335
317
accept_external_id : true ,
336
318
accept_public_id : true )
337
- unless @source . match ( /\A \ s *>/um , true )
319
+ unless @source . match ( /\s *>/um , true )
338
320
message = "#{ base_error_message } : garbage before end >"
339
321
raise REXML ::ParseException . new ( message , @source )
340
322
end
341
323
return [ :notationdecl , name , *id ]
342
- when DOCTYPE_END
324
+ elsif @source . match ( / \] \s *>/um , true , false )
343
325
@document_status = :after_doctype
344
- @source . match ( DOCTYPE_END , true )
345
326
return [ :end_doctype ]
346
327
end
347
328
end
348
329
if @document_status == :after_doctype
349
- @source . match ( /\A \ s */um , true )
330
+ @source . match ( /\s */um , true )
350
331
end
351
332
begin
352
333
next_data = @source . buffer
353
334
if next_data . size < 2
354
335
@source . read
355
- next_data = @source . buffer
356
336
end
357
- if next_data [ 0 ] == ?<
358
- if next_data [ 1 ] == ?/
337
+ if @source . match ( "<" , true , false )
338
+ if @source . match ( "/" , true , false )
359
339
@nsstack . shift
360
340
last_tag = @tags . pop
361
341
md = @source . match ( CLOSE_MATCH , true )
@@ -366,15 +346,16 @@ def pull_event
366
346
if md . nil? or last_tag != md [ 1 ]
367
347
message = "Missing end tag for '#{ last_tag } '"
368
348
message += " (got '#{ md [ 1 ] } ')" if md
349
+ @source . string = "</" + @source . buffer if md . nil?
369
350
raise REXML ::ParseException . new ( message , @source )
370
351
end
371
352
return [ :end_element , last_tag ]
372
- elsif next_data [ 1 ] == ?!
373
- md = @source . match ( /\A ( \s * [^>]*>)/um )
353
+ elsif @source . match ( "!" , true , false )
354
+ md = @source . match ( /( [^>]*>)/um )
374
355
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
375
356
raise REXML ::ParseException . new ( "Malformed node" , @source ) unless md
376
- if md [ 0 ] [ 2 ] == ?-
377
- md = @source . match ( COMMENT_PATTERN , true )
357
+ if md [ 0 ] [ 0 ] == ?-
358
+ md = @source . match ( /--(.*?)-->/um , true )
378
359
379
360
case md [ 1 ]
380
361
when /--/ , /-\z /
@@ -383,17 +364,18 @@ def pull_event
383
364
384
365
return [ :comment , md [ 1 ] ] if md
385
366
else
386
- md = @source . match ( CDATA_PATTERN , true )
367
+ md = @source . match ( / \[ CDATA \[ (.*?) \] \] >/um , true )
387
368
return [ :cdata , md [ 1 ] ] if md
388
369
end
389
370
raise REXML ::ParseException . new ( "Declarations can only occur " +
390
371
"in the doctype declaration." , @source )
391
- elsif next_data [ 1 ] == ??
372
+ elsif @source . match ( "?" , true , false )
392
373
return process_instruction
393
374
else
394
375
# Get the next tag
395
376
md = @source . match ( TAG_MATCH , true )
396
377
unless md
378
+ @source . string = "<" + @source . buffer
397
379
raise REXML ::ParseException . new ( "malformed XML: missing tag start" , @source )
398
380
end
399
381
tag = md [ 1 ]
@@ -418,7 +400,7 @@ def pull_event
418
400
return [ :start_element , tag , attributes ]
419
401
end
420
402
else
421
- md = @source . match ( TEXT_PATTERN , true )
403
+ md = @source . match ( /([^<]*)/um , true )
422
404
text = md [ 1 ]
423
405
return [ :text , text ]
424
406
end
@@ -579,6 +561,7 @@ def process_instruction
579
561
match_data = @source . match ( INSTRUCTION_PATTERN , true )
580
562
unless match_data
581
563
message = "Invalid processing instruction node"
564
+ @source . string = "<?" + @source . buffer
582
565
raise REXML ::ParseException . new ( message , @source )
583
566
end
584
567
[ :processing_instruction , match_data [ 1 ] , match_data [ 2 ] ]
0 commit comments