Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve the speed of parsing markdown input five times #482

Merged
merged 6 commits into from
Aug 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,10 @@ jobs:
- name: Test Lua command-line interface
run: |
set -ex
RESULT="$(printf '%s\n' 'Hello *Markdown*! $a_x + b_x = c_x$' | markdown-cli hybrid=true underscores=false)"
test "$RESULT" = '\markdownRendererDocumentBegin
Hello \markdownRendererEmphasis{Markdown}! $a_x + b_x = c_x$\markdownRendererDocumentEnd'
printf '%s\n' 'Hello *Markdown*! $a_x + b_x = c_x$' | (time markdown-cli hybrid=true underscores=false) 1>stdout 2>stderr
test "$(cat stdout)" = '\markdownRendererDocumentBegin
Hello \markdownRendererEmphasis{Markdown}! $a_x + b_x = c_x$\markdownRendererDocumentEnd' # Check that the output is correct.
grep 'real\s*0m0' stderr # Check that the command finishes in less than a second.
- name: Run tests
if: matrix.texlive == 'latest' || github.event_name != 'pull_request_target' || github.event.pull_request.draft == false
run: make FAIL_FAST=${{ github.event_name == 'pull_request_target' }} test
Expand Down
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ Speed improvements:

- Precompile snippets to improve the speed of setting them.
(#467, #479, inspired by the TUG 2024 talk by @josephwright)
- Make the parsing of markdown input five times faster.
(#458, #474, #482, co-authored by @Yggdrasil128)

Deprecation:

Expand Down
75 changes: 65 additions & 10 deletions markdown.dtx
Original file line number Diff line number Diff line change
Expand Up @@ -26625,31 +26625,86 @@ parsers.ascii_punctuation = S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
%
% \end{markdown}
% \begin{macrocode}
parsers.punctuation = {}
(function()
;(function()
local pathname = assert(kpse.find_file("UnicodeData.txt"),
[[Could not locate file "UnicodeData.txt"]])
local file = assert(io.open(pathname, "r"),
[[Could not open file "UnicodeData.txt"]])
% \end{macrocode}
% \par
% \begin{markdown}
%
% In order to minimize the size of the parser and maximize its speed, we
% will first construct a prefix tree of UTF-8 encodings for all codepoints
% of a given code length.
%
% \end{markdown}
% \begin{macrocode}
local prefix_trees = {}
for line in file:lines() do
local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)")
if major_category == "P" or major_category == "S" then
local code = unicode.utf8.char(tonumber(codepoint, 16))
if parsers.punctuation[#code] == nil then
parsers.punctuation[#code] = parsers.fail
if prefix_trees[#code] == nil then
prefix_trees[#code] = {}
end
local code_parser = parsers.succeed
local node = prefix_trees[#code]
for i = 1, #code do
local byte = code:sub(i, i)
local byte_parser = S(byte)
code_parser = code_parser
* byte_parser
if i < #code then
if node[byte] == nil then
node[byte] = {}
end
node = node[byte]
else
table.insert(node, byte)
end
end
parsers.punctuation[#code] = parsers.punctuation[#code]
+ code_parser
end
end
assert(file:close())
% \end{macrocode}
% \par
% \begin{markdown}
%
% Next, we will construct a parser out of the prefix tree.
%
% \end{markdown}
% \begin{macrocode}
-- Walk a prefix tree depth-first, reporting what is found to two callbacks.
--
-- `visit(node, path)` is called when a table node is entered and once for
-- every non-table entry stored in that node; `leave(node, path)` is called
-- after all entries of a table node have been processed. `path` is the
-- concatenation of the keys followed from the root to the current table
-- node. Entries are enumerated with `pairs`, so sibling order is unspecified.
local function depth_first_search(node, path, visit, leave)
  visit(node, path)
  for edge, entry in pairs(node) do
    local is_leaf = type(entry) ~= "table"
    if is_leaf then
      -- Non-table entries are reported under the path of their parent node.
      visit(entry, path)
    else
      depth_first_search(entry, path .. edge, visit, leave)
    end
  end
  leave(node, path)
end

-- Turn each prefix tree into a parser, stored under the same key
-- (`length`) that the tree had in `prefix_trees`.
parsers.punctuation = {}
for length, prefix_tree in pairs(prefix_trees) do
  -- Parsers under construction, keyed by the byte path from the tree root.
  local subparsers = {}

  -- On entering a table node, start from the failing parser; for every
  -- string stored in the node, add it as an alternative.
  local function visit(node, path)
    if type(node) ~= "table" then
      assert(type(node) == "string")
      subparsers[path] = subparsers[path] + S(node)
      return
    end
    subparsers[path] = parsers.fail
  end

  -- On leaving a node, fold its finished parser into its parent as
  -- "last byte of the path, followed by the node's parser". Leaving the
  -- root (empty path) publishes the finished parser instead.
  local function leave(_, path)
    if #path == 0 then
      parsers.punctuation[length] = subparsers[path]
      return
    end
    local byte = path:sub(-1)
    local parent_path = path:sub(1, -2)
    subparsers[parent_path] = subparsers[parent_path]
                            + S(byte) * subparsers[path]
  end

  depth_first_search(prefix_tree, "", visit, leave)
  assert(parsers.punctuation[length] ~= nil)
end
end)()

parsers.escapable = parsers.ascii_punctuation
Expand Down