Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Define parsers.punctuation in a streaming fashion #432

Merged
merged 2 commits into from
Apr 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Fixes:
intervals. (#408, #419)
- Do not misinterpret bracketed e-mails as citations. (#424, #426,
sponsored by @istqborg)
- Comply with CommonMark 0.31.2. (#416, 40b516ee, de8d137d,
- Comply with CommonMark 0.31.2. (#416, 40b516ee, de8d137d, #432,
contributed by @lostenderman)

Documentation:
Expand Down
84 changes: 31 additions & 53 deletions markdown.dtx
Original file line number Diff line number Diff line change
Expand Up @@ -24510,38 +24510,6 @@ end
% \par
% \begin{markdown}
%
%### Unicode punctuation
% This section documents [the Unicode punctuation][unicode-punctuation]
% recognized by the markdown reader. The punctuation is organized in the
% \luamdef{punctuation} table according to the number of bytes occupied after
% conversion to \acro{utf}8.
%
% [unicode-punctuation]: https://spec.commonmark.org/0.31.2/#unicode-punctuation-character
% (CommonMark Spec, Version 0.31.2 (2024-01-28))
%
% \end{markdown}
% \begin{macrocode}
-- Lists of punctuation characters as UTF-8 encoded strings, keyed by the
-- number of bytes each encoded character occupies.
local punctuation = {}
-- The table is filled inside an immediately-invoked function so that the
-- helper locals (pathname, file, ...) do not leak into the enclosing scope.
(function()
-- Ask kpathsea for the location of the Unicode Character Database file.
local pathname = kpse.lookup("UnicodeData.txt")
local file = assert(io.open(pathname, "r"),
[[Could not open file "UnicodeData.txt"]])
for line in file:lines() do
-- Capture the hexadecimal code point from the first semicolon-separated
-- field and the first letter of the third field; the second field is
-- skipped. Lines that do not match yield nil for both captures.
local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)")
-- Keep only entries whose captured category letter is "P" or "S"
-- (presumably the Unicode punctuation and symbol general categories;
-- verify against the UnicodeData.txt field layout).
if major_category == "P" or major_category == "S" then
-- Convert the code point to its UTF-8 string; #code is then the length
-- of that string in bytes.
local code = unicode.utf8.char(tonumber(codepoint, 16))
-- Create the list for this byte length on first use.
if punctuation[#code] == nil then
punctuation[#code] = {}
end
table.insert(punctuation[#code], code)
end
end
-- Raise an error if closing the file fails.
assert(file:close())
end)()
% \end{macrocode}
% \par
% \begin{markdown}
%
%### Plain \TeX{} Writer {#tex-writer}
%
% This section documents the \luamref{writer} object, which implements the
Expand Down Expand Up @@ -25809,36 +25777,46 @@ parsers.fail = P(false)

parsers.internal_punctuation = S(":;,.?")
parsers.ascii_punctuation = S("!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")
% \end{macrocode}
% \par
% \begin{markdown}
%
%### Unicode punctuation
% This section documents [the Unicode punctuation][unicode-punctuation]
% recognized by the markdown reader. The \luamdef{parsers.punctuation} table
% maps the number of bytes that a character occupies after conversion to
% \acro{utf}8 to a parser that matches the punctuation of that length.
%
% [unicode-punctuation]: https://spec.commonmark.org/0.31.2/#unicode-punctuation-character
% (CommonMark Spec, Version 0.31.2 (2024-01-28))
%
% \end{markdown}
% \begin{macrocode}
parsers.punctuation = {}
(function()
for size = 1, 4 do
local codepoint_parser = parsers.fail
if size == 1 then
codepoint_parser = codepoint_parser + parsers.ascii_punctuation
end
for _, code in ipairs(punctuation[size] or {}) do
local pathname = kpse.lookup("UnicodeData.txt")
local file = assert(io.open(pathname, "r"),
[[Could not open file "UnicodeData.txt"]])
for line in file:lines() do
local codepoint, major_category = line:match("^(%x+);[^;]*;(%a)")
if major_category == "P" or major_category == "S" then
local code = unicode.utf8.char(tonumber(codepoint, 16))
if parsers.punctuation[#code] == nil then
parsers.punctuation[#code] = parsers.fail
end
local code_parser = parsers.succeed
assert(#code == size)
for i = 1, size do
for i = 1, #code do
local byte = code:sub(i, i)
local byte_parser = S(byte)
code_parser = code_parser * byte_parser
code_parser = code_parser
* byte_parser
end
codepoint_parser = codepoint_parser + code_parser
parsers.punctuation[#code] = parsers.punctuation[#code]
+ code_parser
end
parsers.punctuation[size] = codepoint_parser
end
assert(file:close())
end)()
% \end{macrocode}
% \par
% \begin{markdown}
%
% Here, we garbage-collect the \luamref{punctuation} table, since we won't need it anymore.
%
% \end{markdown}
% \begin{macrocode}
punctuation = nil
collectgarbage("collect")

parsers.escapable = parsers.ascii_punctuation
parsers.anyescaped = parsers.backslash / "" * parsers.escapable
Expand Down
Loading