From e06b3fb2660c682423e10d59b92d192c42e9825d Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 7 Jun 2024 14:34:25 +0900 Subject: [PATCH] Improve text parse performance If there are many ">"s in text, parsing is very slow. Calculating ------------------------------------- rexml 3.2.6 master 3.2.6(YJIT) master(YJIT) attribute 1.116 3.618k 1.117 1.941k i/s - 10.000 times in 8.957748s 0.002764s 8.951665s 0.005152s text 27.089 2.262k 42.632 1.033k i/s - 10.000 times in 0.369147s 0.004421s 0.234566s 0.009683s Comparison: attribute master: 3617.6 i/s master(YJIT): 1941.1 i/s - 1.86x slower 3.2.6(YJIT): 1.1 i/s - 3238.31x slower rexml 3.2.6: 1.1 i/s - 3240.51x slower text master: 2261.8 i/s master(YJIT): 1032.7 i/s - 2.19x slower 3.2.6(YJIT): 42.6 i/s - 53.05x slower rexml 3.2.6: 27.1 i/s - 83.49x slower --- benchmark/gt.yaml | 34 +++++++++++++++++++++++++++++++++ lib/rexml/parsers/baseparser.rb | 10 ++++++++-- lib/rexml/source.rb | 19 +++++++++--------- 3 files changed, 52 insertions(+), 11 deletions(-) create mode 100644 benchmark/gt.yaml diff --git a/benchmark/gt.yaml b/benchmark/gt.yaml new file mode 100644 index 00000000..3f6af739 --- /dev/null +++ b/benchmark/gt.yaml @@ -0,0 +1,34 @@ +loop_count: 10 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require "rexml" + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require "rexml" + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require "rexml" + RubyVM::YJIT.enable + +prelude: | + require "rexml/document" + + n = 10000 + gts = ">" * n + in_attribute = "<test name=\"#{gts}\"></test>" + in_text = "<test>#{gts}</test>" + +benchmark: + "attribute": REXML::Document.new(in_attribute) + "text": REXML::Document.new(in_text) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 82575685..eadc78f7 100644 --- a/lib/rexml/parsers/baseparser.rb 
+++ b/lib/rexml/parsers/baseparser.rb @@ -373,6 +373,10 @@ def pull_event begin start_position = @source.position if @source.match("<", true) + # :text's read_until may leave only "<" in the buffer. In that + # case, the buffer is empty here. So we need to fill the buffer + # here explicitly. + @source.ensure_buffer if @source.match("/", true) @nsstack.shift last_tag = @tags.pop @@ -438,8 +442,10 @@ def pull_event return [ :start_element, tag, attributes ] end else - md = @source.match(/([^<]*)/um, true) - text = md[1] + text = @source.read_until("<") + if text.chomp!("<") + @source.position -= "<".bytesize + end return [ :text, text ] end rescue REXML::UndefinedNamespaceException diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 542b76a6..982aa84a 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -36,7 +36,7 @@ class Source module Private PRE_DEFINED_TERM_PATTERNS = {} - pre_defined_terms = ["'", '"'] + pre_defined_terms = ["'", '"', "<"] pre_defined_terms.each do |term| PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/ end @@ -192,17 +192,18 @@ def read(term = nil) def read_until(term) pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/ term = encode(term) - begin - until str = @scanner.scan_until(pattern) - @scanner << readline(term) - end - rescue EOFError + until str = @scanner.scan_until(pattern) + break if @source.nil? + break if @source.eof? + @scanner << readline(term) + end + if str + read if @scanner.eos? and !@source.eof? + str + else rest = @scanner.rest @scanner.pos = @scanner.string.bytesize rest - else - read if @scanner.eos? and !@source.eof? - str end end