Skip to content

Commit 9a075bf

Browse files
committed
Changed the processing in REXML::Parsers::BaseParser#pull_event from regular-expression matching to incremental scanning with StringScanner.
## Why

Improve maintainability by restructuring the parser so that parsing advances through the input using StringScanner#scan.

## Changed

- Added read_source option to IOSource#match to suppress reading from @source.
- Added Source#string= method for error message output.

## Benchmark

```
RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22]
Calculating -------------------------------------
                         before       after  before(YJIT)  after(YJIT)
                 dom     11.308      11.437        17.833       18.369 i/s -     100.000 times in 8.843230s 8.743769s 5.607477s 5.443861s
                 sax     31.280      31.835        48.002       51.767 i/s -     100.000 times in 3.196881s 3.141148s 2.083259s 1.931737s
                pull     36.954      37.981        59.502       62.359 i/s -     100.000 times in 2.706080s 2.632914s 1.680629s 1.603608s
              stream     34.328      36.263        50.594       56.571 i/s -     100.000 times in 2.913063s 2.757657s 1.976527s 1.767694s

Comparison:
                              dom
         after(YJIT):        18.4 i/s
        before(YJIT):        17.8 i/s - 1.03x  slower
               after:        11.4 i/s - 1.61x  slower
              before:        11.3 i/s - 1.62x  slower

                              sax
         after(YJIT):        51.8 i/s
        before(YJIT):        48.0 i/s - 1.08x  slower
               after:        31.8 i/s - 1.63x  slower
              before:        31.3 i/s - 1.65x  slower

                             pull
         after(YJIT):        62.4 i/s
        before(YJIT):        59.5 i/s - 1.05x  slower
               after:        38.0 i/s - 1.64x  slower
              before:        37.0 i/s - 1.69x  slower

                           stream
         after(YJIT):        56.6 i/s
        before(YJIT):        50.6 i/s - 1.12x  slower
               after:        36.3 i/s - 1.56x  slower
              before:        34.3 i/s - 1.65x  slower
```

- YJIT=ON : 1.03x - 1.12x faster
- YJIT=OFF : 1.01x - 1.05x faster
1 parent 0656925 commit 9a075bf

File tree

2 files changed

+96
-109
lines changed

2 files changed

+96
-109
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 89 additions & 106 deletions
Original file line numberDiff line numberDiff line change
@@ -48,29 +48,15 @@ class BaseParser
4848
REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
4949
REFERENCE_RE = /#{REFERENCE}/
5050

51-
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
52-
DOCTYPE_END = /\A\s*\]\s*>/um
5351
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
54-
COMMENT_START = /\A<!--/u
55-
COMMENT_PATTERN = /<!--(.*?)-->/um
56-
CDATA_START = /\A<!\[CDATA\[/u
57-
CDATA_END = /\A\s*\]\s*>/um
58-
CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
59-
XMLDECL_START = /\A<\?xml\s/u;
60-
XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
61-
INSTRUCTION_START = /\A<\?/u
62-
INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um
63-
TAG_MATCH = /\A<((?>#{QNAME_STR}))/um
64-
CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um
52+
INSTRUCTION_PATTERN = /#{NAME}(\s+.*?)?\?>/um
53+
TAG_MATCH = /((?>#{QNAME_STR}))/um
54+
CLOSE_MATCH = /(#{QNAME_STR})\s*>/um
6555

6656
VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
6757
ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
6858
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
6959

70-
ENTITY_START = /\A\s*<!ENTITY/
71-
ELEMENTDECL_START = /\A\s*<!ELEMENT/um
72-
ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um
73-
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
7460
ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
7561
NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
7662
ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
@@ -79,10 +65,7 @@ class BaseParser
7965
DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
8066
ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
8167
ATTDEF_RE = /#{ATTDEF}/
82-
ATTLISTDECL_START = /\A\s*<!ATTLIST/um
83-
ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
84-
85-
TEXT_PATTERN = /\A([^<]*)/um
68+
ATTLISTDECL_PATTERN = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
8669

8770
# Entity constants
8871
PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
@@ -94,11 +77,10 @@ class BaseParser
9477
ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
9578
PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
9679
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
97-
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
98-
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
99-
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
80+
PEDECL = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
81+
GEDECL = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
82+
ENTITYDECL = /(?:#{GEDECL})|(?:#{PEDECL})/um
10083

101-
NOTATIONDECL_START = /\A\s*<!NOTATION/um
10284
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
10385
EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um
10486
PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um
@@ -198,65 +180,67 @@ def pull_event
198180
#STDERR.puts @source.encoding
199181
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
200182
if @document_status == nil
201-
word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
202-
word = word[1] unless word.nil?
203-
#STDERR.puts "WORD = #{word.inspect}"
204-
case word
205-
when COMMENT_START
206-
return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
207-
when XMLDECL_START
208-
#STDERR.puts "XMLDECL"
209-
results = @source.match( XMLDECL_PATTERN, true )[1]
210-
version = VERSION.match( results )
211-
version = version[1] unless version.nil?
212-
encoding = ENCODING.match(results)
213-
encoding = encoding[1] unless encoding.nil?
214-
if need_source_encoding_update?(encoding)
215-
@source.encoding = encoding
216-
end
217-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
218-
encoding = "UTF-16"
219-
end
220-
standalone = STANDALONE.match(results)
221-
standalone = standalone[1] unless standalone.nil?
222-
return [ :xmldecl, version, encoding, standalone ]
223-
when INSTRUCTION_START
224-
return process_instruction
225-
when DOCTYPE_START
226-
base_error_message = "Malformed DOCTYPE"
227-
@source.match(DOCTYPE_START, true)
228-
@nsstack.unshift(curr_ns=Set.new)
229-
name = parse_name(base_error_message)
230-
if @source.match(/\A\s*\[/um, true)
231-
id = [nil, nil, nil]
232-
@document_status = :in_doctype
233-
elsif @source.match(/\A\s*>/um, true)
234-
id = [nil, nil, nil]
235-
@document_status = :after_doctype
236-
else
237-
id = parse_id(base_error_message,
238-
accept_external_id: true,
239-
accept_public_id: false)
240-
if id[0] == "SYSTEM"
241-
# For backward compatibility
242-
id[1], id[2] = id[2], nil
183+
@source.read
184+
if @source.match("<?", true, false)
185+
if results = @source.match(/xml\s+(.*?)\?>/um, true, false)
186+
results = results[1]
187+
version = VERSION.match( results )
188+
version = version[1] unless version.nil?
189+
encoding = ENCODING.match(results)
190+
encoding = encoding[1] unless encoding.nil?
191+
if need_source_encoding_update?(encoding)
192+
@source.encoding = encoding
243193
end
244-
if @source.match(/\A\s*\[/um, true)
194+
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
195+
encoding = "UTF-16"
196+
end
197+
standalone = STANDALONE.match(results)
198+
standalone = standalone[1] unless standalone.nil?
199+
return [ :xmldecl, version, encoding, standalone ]
200+
else # instruction
201+
return process_instruction
202+
end
203+
elsif @source.match("<!", true, false)
204+
if @source.match("--", true, false)
205+
return [ :comment, @source.match( /(.*?)-->/um, true )[1] ]
206+
elsif @source.match(/DOCTYPE\s/um, true, false)
207+
base_error_message = "Malformed DOCTYPE"
208+
@nsstack.unshift(curr_ns=Set.new)
209+
name = parse_name(base_error_message)
210+
if @source.match(/\s*\[/um, true)
211+
id = [nil, nil, nil]
245212
@document_status = :in_doctype
246-
elsif @source.match(/\A\s*>/um, true)
213+
elsif @source.match(/\s*>/um, true)
214+
id = [nil, nil, nil]
247215
@document_status = :after_doctype
248216
else
249-
message = "#{base_error_message}: garbage after external ID"
250-
raise REXML::ParseException.new(message, @source)
217+
id = parse_id(base_error_message,
218+
accept_external_id: true,
219+
accept_public_id: false)
220+
if id[0] == "SYSTEM"
221+
# For backward compatibility
222+
id[1], id[2] = id[2], nil
223+
end
224+
if @source.match(/\s*\[/um, true)
225+
@document_status = :in_doctype
226+
elsif @source.match(/\s*>/um, true)
227+
@document_status = :after_doctype
228+
else
229+
message = "#{base_error_message}: garbage after external ID"
230+
raise REXML::ParseException.new(message, @source)
231+
end
251232
end
233+
args = [:start_doctype, name, *id]
234+
if @document_status == :after_doctype
235+
@source.match(/\s*/um, true)
236+
@stack << [ :end_doctype ]
237+
end
238+
return args
239+
else
240+
message = "Invalid XML"
241+
raise REXML::ParseException.new(message, @source)
252242
end
253-
args = [:start_doctype, name, *id]
254-
if @document_status == :after_doctype
255-
@source.match(/\A\s*/um, true)
256-
@stack << [ :end_doctype ]
257-
end
258-
return args
259-
when /\A\s+/
243+
elsif @source.match( /\s+/, false, false )
260244
else
261245
@document_status = :after_doctype
262246
if @source.encoding == "UTF-8"
@@ -265,16 +249,13 @@ def pull_event
265249
end
266250
end
267251
if @document_status == :in_doctype
268-
md = @source.match(/\A\s*(.*?>)/um)
269-
case md[1]
270-
when SYSTEMENTITY
271-
match = @source.match( SYSTEMENTITY, true )[1]
272-
return [ :externalentity, match ]
273-
274-
when ELEMENTDECL_START
275-
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
276-
277-
when ENTITY_START
252+
@source.read
253+
@source.match(/\s*/um, true, false) # skip spaces
254+
if match = @source.match( /(%.*?;)\s*$/um, true, false)
255+
return [ :externalentity, match[1] ]
256+
elsif match = @source.match(/(<!ELEMENT.*?)>/um, true, false)
257+
return [ :elementdecl, match[1] ]
258+
elsif @source.match( "<!ENTITY", true, false)
278259
match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
279260
ref = false
280261
if match[1] == '%'
@@ -300,7 +281,7 @@ def pull_event
300281
end
301282
match << '%' if ref
302283
return match
303-
when ATTLISTDECL_START
284+
elsif @source.match( "<!ATTLIST", true, false)
304285
md = @source.match( ATTLISTDECL_PATTERN, true )
305286
raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
306287
element = md[1]
@@ -320,42 +301,41 @@ def pull_event
320301
end
321302
end
322303
return [ :attlistdecl, element, pairs, contents ]
323-
when NOTATIONDECL_START
304+
elsif @source.match( "<!NOTATION", true, false)
324305
base_error_message = "Malformed notation declaration"
325-
unless @source.match(/\A\s*<!NOTATION\s+/um, true)
326-
if @source.match(/\A\s*<!NOTATION\s*>/um)
306+
unless @source.match(/\s+/um, true)
307+
if @source.match(/\s*>/um)
327308
message = "#{base_error_message}: name is missing"
328309
else
329310
message = "#{base_error_message}: invalid declaration name"
330311
end
312+
@source.string = " <!NOTATION" + @source.buffer
331313
raise REXML::ParseException.new(message, @source)
332314
end
333315
name = parse_name(base_error_message)
334316
id = parse_id(base_error_message,
335317
accept_external_id: true,
336318
accept_public_id: true)
337-
unless @source.match(/\A\s*>/um, true)
319+
unless @source.match(/\s*>/um, true)
338320
message = "#{base_error_message}: garbage before end >"
339321
raise REXML::ParseException.new(message, @source)
340322
end
341323
return [:notationdecl, name, *id]
342-
when DOCTYPE_END
324+
elsif @source.match( /\]\s*>/um, true, false)
343325
@document_status = :after_doctype
344-
@source.match( DOCTYPE_END, true )
345326
return [ :end_doctype ]
346327
end
347328
end
348329
if @document_status == :after_doctype
349-
@source.match(/\A\s*/um, true)
330+
@source.match(/\s*/um, true)
350331
end
351332
begin
352333
next_data = @source.buffer
353334
if next_data.size < 2
354335
@source.read
355-
next_data = @source.buffer
356336
end
357-
if next_data[0] == ?<
358-
if next_data[1] == ?/
337+
if @source.match("<", true, false)
338+
if @source.match("/", true, false)
359339
@nsstack.shift
360340
last_tag = @tags.pop
361341
md = @source.match( CLOSE_MATCH, true )
@@ -366,15 +346,16 @@ def pull_event
366346
if md.nil? or last_tag != md[1]
367347
message = "Missing end tag for '#{last_tag}'"
368348
message += " (got '#{md[1]}')" if md
349+
@source.string = "</" + @source.buffer if md.nil?
369350
raise REXML::ParseException.new(message, @source)
370351
end
371352
return [ :end_element, last_tag ]
372-
elsif next_data[1] == ?!
373-
md = @source.match(/\A(\s*[^>]*>)/um)
353+
elsif @source.match("!", true, false)
354+
md = @source.match(/([^>]*>)/um)
374355
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
375356
raise REXML::ParseException.new("Malformed node", @source) unless md
376-
if md[0][2] == ?-
377-
md = @source.match( COMMENT_PATTERN, true )
357+
if md[0][0] == ?-
358+
md = @source.match( /--(.*?)-->/um, true )
378359

379360
case md[1]
380361
when /--/, /-\z/
@@ -383,17 +364,18 @@ def pull_event
383364

384365
return [ :comment, md[1] ] if md
385366
else
386-
md = @source.match( CDATA_PATTERN, true )
367+
md = @source.match( /\[CDATA\[(.*?)\]\]>/um, true )
387368
return [ :cdata, md[1] ] if md
388369
end
389370
raise REXML::ParseException.new( "Declarations can only occur "+
390371
"in the doctype declaration.", @source)
391-
elsif next_data[1] == ??
372+
elsif @source.match("?", true, false)
392373
return process_instruction
393374
else
394375
# Get the next tag
395376
md = @source.match(TAG_MATCH, true)
396377
unless md
378+
@source.string = "<" + @source.buffer
397379
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
398380
end
399381
tag = md[1]
@@ -418,7 +400,7 @@ def pull_event
418400
return [ :start_element, tag, attributes ]
419401
end
420402
else
421-
md = @source.match( TEXT_PATTERN, true )
403+
md = @source.match( /([^<]*)/um, true )
422404
text = md[1]
423405
return [ :text, text ]
424406
end
@@ -579,6 +561,7 @@ def process_instruction
579561
match_data = @source.match(INSTRUCTION_PATTERN, true)
580562
unless match_data
581563
message = "Invalid processing instruction node"
564+
@source.string = "<?" + @source.buffer
582565
raise REXML::ParseException.new(message, @source)
583566
end
584567
[:processing_instruction, match_data[1], match_data[2]]

lib/rexml/source.rb

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,14 +68,18 @@ def encoding=(enc)
6868
def read
6969
end
7070

71-
def match(pattern, cons=false)
71+
def match(pattern, cons=false, read_source=false)
7272
if cons
7373
@scanner.scan(pattern).nil? ? nil : @scanner
7474
else
7575
@scanner.check(pattern).nil? ? nil : @scanner
7676
end
7777
end
7878

79+
def string=(string)
80+
@scanner.string = string
81+
end
82+
7983
# @return true if the Source is exhausted
8084
def empty?
8185
@scanner.eos?
@@ -155,13 +159,13 @@ def read
155159
end
156160
end
157161

158-
def match( pattern, cons=false )
162+
def match( pattern, cons=false, read_source=true )
159163
if cons
160164
md = @scanner.scan(pattern)
161165
else
162166
md = @scanner.check(pattern)
163167
end
164-
while md.nil? and @source
168+
while read_source && md.nil? && @source
165169
begin
166170
@scanner << readline
167171
if cons

0 commit comments

Comments
 (0)