From 367218383077d3d7e5eb5548892008db3c3a1e37 Mon Sep 17 00:00:00 2001 From: Nicholas Hirschey Date: Fri, 19 Aug 2022 21:46:22 +0100 Subject: [PATCH 1/5] new emphasis tests from the commonmark spec --- tests/FSharp.Markdown.Tests/Markdown.fs | 136 ++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git a/tests/FSharp.Markdown.Tests/Markdown.fs b/tests/FSharp.Markdown.Tests/Markdown.fs index 0106bbcd2..aca176261 100644 --- a/tests/FSharp.Markdown.Tests/Markdown.fs +++ b/tests/FSharp.Markdown.Tests/Markdown.fs @@ -877,3 +877,139 @@ let ``Underscore inside italic and bold near punctuation is preserved`` () = |> properNewLines Markdown.ToHtml doc |> shouldEqual expected + +[] +let ``emphasis with space`` () = + let doc = "*foo bar*" + let actual = + "

foo bar

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if opening * is followed by whitespace`` () = + let doc = "a * foo bar*" + let actual = + "

a * foo bar*

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if opening * is preceded by alphanumeric and followed by punctuation`` () = + let doc = """a*"foo"*""" + let actual = + """

a*"foo"*

""" + "\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``Intraword emphasis with * is permitted`` () = + let doc = "foo*bar*" + let actual = + "

foobar

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + + let doc2 = "5*6*78" + let actual2 = + "

5678

\r\n" + |> properNewLines + Markdown.ToHtml doc2 |> shouldEqual actual2 + +[] +let ``emphasis using _ with space`` () = + let doc = "_foo bar_" + let actual = + "

foo bar

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if opening _ is followed by whitespace`` () = + let doc = "_ foo bar_" + let actual = + "

_ foo bar_

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if opening _ is preceded by alphanumeric and followed by punctuation`` () = + let doc = """a_"foo"_""" + let actual = + """

a_"foo"_

""" + "\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``Intraword emphasis with _ is not permitted`` () = + let doc = "foo_bar_" + let actual = + "

foo_bar_

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + + let doc2 = "5_6_78" + let actual2 = + "

5_6_78

\r\n" + |> properNewLines + Markdown.ToHtml doc2 |> shouldEqual actual2 + + let doc3 = "пристаням_стремятся_" + let actual3 = + "

пристаням_стремятся_

\r\n" + |> properNewLines + Markdown.ToHtml doc3 |> shouldEqual actual3 + +[] +let ``No emphasis if first _ is right flanking and second is left flanking`` () = + let doc = """aa_"bb"_cc""" + let actual = + """

aa_"bb"_cc

""" + "\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``Emphasis if first _ is left and right flanking and preceded by punctuation`` () = + let doc = "foo-_(bar)_" + let actual = + "

foo-(bar)

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if open and close delim do not match`` () = + let doc = "_foo*" + let actual = + "

_foo*

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if closing * is preceded by whitespace`` () = + let doc1 = "*foo bar *" + let actual1 = + "

*foo bar *

\r\n" + |> properNewLines + Markdown.ToHtml doc1 |> shouldEqual actual1 + + let doc2 = "*foo bar +*" + let actual2 = + "

*foo bar +*

" + "\r\n" + |> properNewLines + Markdown.ToHtml doc2 |> shouldEqual actual2 + +[] +let ``Do not close emphasis if second * is preceded by punctuation and followed by alphanumeric`` () = + let doc = "*(*foo)" + let actual = + "

*(*foo)

\r\n" + |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + + let doc2 = "*(*foo*)*" + let actual2 = + "

(foo)

\r\n" + |> properNewLines + Markdown.ToHtml doc2 |> shouldEqual actual2 From 5fc8e30bb45a31af0af6ce9053bc0f4324827370 Mon Sep 17 00:00:00 2001 From: Nicholas Hirschey Date: Fri, 19 Aug 2022 21:57:11 +0100 Subject: [PATCH 2/5] Better emphasis parsing Makes the code more closely follow the commonmark spec description. --- .../MarkdownParser.fs | 189 +++++++++++++++--- 1 file changed, 161 insertions(+), 28 deletions(-) diff --git a/src/FSharp.Formatting.Markdown/MarkdownParser.fs b/src/FSharp.Formatting.Markdown/MarkdownParser.fs index ae24b9048..e07a0c106 100644 --- a/src/FSharp.Formatting.Markdown/MarkdownParser.fs +++ b/src/FSharp.Formatting.Markdown/MarkdownParser.fs @@ -69,17 +69,135 @@ let inline (|EscapedLatexInlineMathChar|_|) input = | '\\' :: (('$') as c) :: rest -> Some(c, rest) | _ -> None -/// Succeeds when the specified character list starts with a letter or number -let inline (|AlphaNum|_|) input = - let re = """^[a-zA-Z0-9]""" - let match' = Regex.Match(Array.ofList input |> String, re) +/// Succeeds when the specificed character list starts with non-escaped punctuation. +let inline (|Punctuation|_|) input = + match input with + | EscapedChar _ -> None + | _ -> + // from https://github.com/commonmark/commonmark.js/blob/master/lib/inlines.js#L38 + let re = """^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC9\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDF3C-\uDF3E]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]""" + let match' = Regex.Match(Array.ofList input |> String, re) + if match'.Success then + let entity = match'.Value + let _, rest = List.splitAt entity.Length input + Some(char entity, rest) + else + None - if match'.Success then - let entity = match'.Value - let _, rest = List.splitAt entity.Length input - Some(char entity, rest) - else +let inline (|NotPunctuation|_|) input= + match input with + | Punctuation _ -> None + | _ -> Some input + +module Char = + let inline (|WhiteSpace|_|) input = + match input with + | [] -> Some input + | x :: _xs -> + if String.IsNullOrWhiteSpace(string x) then + Some input + else + None + + let inline (|NotWhiteSpace|_|) input = + match input with + | WhiteSpace _ -> None + | _ -> Some input + +/// Succeeds when the specificed character list starts with a delimeter run. +let inline (|DelimiterRun|_|) input = + match input with + | ('*'|'_') :: _tail as (h :: t) -> + let run, rest = List.partitionWhile (fun x -> x = h) (h :: t) + Some (run, rest) + | _ -> None + +/// Succeeds when there's a match to a string of * or _ that could +/// open emphasis. +let inline (|LeftDelimiterRun|_|) input = + match input with + // (1) Not followed by [Unicode whitespace] and + // (2a) not followed by a [Unicode punctuation character] or + // (2b) followed by a [Unicode punctuation character] and + // preceded by [Unicode whitespace] or a [Unicode punctuation character]. + // + // Passes 1 and 2a. + | DelimiterRun(_, Char.NotWhiteSpace _) & + DelimiterRun(run, NotPunctuation xs) -> + Some([], run, xs) + | _ :: DelimiterRun(_, Char.NotWhiteSpace _) & + h :: DelimiterRun(run, NotPunctuation xs) -> + Some([h], run, xs) + // Passes 1 and 2b + | h :: DelimiterRun(run, Punctuation(x, xs)) -> + match [h] with + | Char.WhiteSpace _ + | Punctuation _ -> Some ([h], run, x :: xs) + | _ -> None + // Passes 1 and 2b when the run is at the start of the line. + // |CannotStartEmphasis| ensures that we don't match this + // when we've previously discarded a leading character. + | DelimiterRun(run, Punctuation(x, xs)) -> + Some ([], run, x :: xs) + | _ -> None + +/// Succeeds when there's a match to a string of * or _ that could +/// close emphasis. +let inline (|RightDelimiterRun|_|) input = + match input with + // A right-flanking delimiter run is + // 1. not preceded by [Unicode whitepace] + // 2. And either + // a. not preceded by a [Unicode punctuation character], or + // b. preceded by a [Unicode punctuation character] and + // followed by [Unicode whitespace] or a [Unicode punctuation character] + // + // An escaped character followed by delimiter run matches 1 and 2a. + | EscapedChar(x, DelimiterRun(run, xs)) -> + Some (['\\'; x], run, xs) + | EscapedChar _ -> None + | Char.NotWhiteSpace _ & + x :: DelimiterRun (run, xs) -> + match input with + // 1 and 2a + | NotPunctuation _ -> + Some ([x], run, xs) + // 1 and 2b + | Punctuation(x, DelimiterRun(run, Char.WhiteSpace ys)) -> + Some ([x], run, ys) + // 1 and 2b + | Punctuation(x, DelimiterRun(run, Punctuation(y, ys))) -> + Some ([x], run, y :: ys) + | _ -> None + | _ -> None + +/// Matches ['c',LeftDelimiterRun]::xs that should +/// not open emphasis. This is useful because the +/// parser iterates through characters one by one and +/// in this case we need to skip both 'c' and the LeftDelimiterRun. +/// If we only skipped 'c' then we could match LeftDelimiterRun +/// on the next iteration and we do not want that to happen. +let inline (|CannotOpenEmphasis|_|) input = + match input with + // Rule #2: A single `_` character [can open emphasis] iff + // it is part of a [left-flanking delimiter run] + // and either (a) not part of a [right-flanking delimiter run] + // or (b) part of a [right-flanking delimiter run] + // preceded by a [Unicode punctuation character]. + | LeftDelimiterRun _ & + RightDelimiterRun(pre, ['_'], post) -> + match List.rev pre with + | Punctuation _ -> None + | revPre-> Some ('_' :: revPre, post) + // We cannot pass 1 and 2b of the left flanking rule + // when h is neither white space nor punctuation. + | h :: DelimiterRun(run, Punctuation(x, xs)) -> + match [h] with + | Char.WhiteSpace _ + | Punctuation _ -> None + | _ -> Some (List.rev (h :: run), x :: xs) + | _ -> None /// Matches a list if it starts with a sub-list that is delimited /// using the specified delimiters. Returns a wrapped list and the rest. @@ -88,20 +206,33 @@ let inline (|AlphaNum|_|) input = let (|DelimitedMarkdown|_|) bracket input = let _startl, endl = bracket, bracket // Like List.partitionUntilEquals, but skip over escaped characters - let rec loop acc = + let rec loop acc count = function - | EscapedChar (x, xs) -> loop (x :: '\\' :: acc) xs - | input when List.startsWith endl input -> - let rest = List.skip bracket.Length input - - match rest with - | AlphaNum (x, xs) -> loop (x :: endl @ acc) xs - | _ -> Some(List.rev acc, input) - | x :: xs -> loop (x :: acc) xs + | (RightDelimiterRun(pre, ['_'], post) as input) + when endl = ['_'] -> + match input with + | LeftDelimiterRun(pre, run, (Punctuation _ as post)) -> + if count = 0 then + Some ((List.rev acc) @ pre, run @ post) + else + loop ((List.rev (pre @ run)) @ acc) (count - 1) post + | LeftDelimiterRun(pre, run, post) -> + loop ((List.rev (pre @ run)) @ acc) (count + 1) post + | _ -> Some ((List.rev acc) @ pre, ['_'] @ post) + | RightDelimiterRun(pre, run, post) + when endl = run -> + if count = 0 then + Some ((List.rev acc) @ pre, run @ post) + else + loop ((List.rev (pre @ run)) @ acc) (count - 1) post + | EscapedChar(x, xs) -> loop (x :: '\\' :: acc) count xs + | LeftDelimiterRun(pre, run, post) when run = endl -> + loop ((List.rev (pre @ run)) @ acc) (count + 1) post + | x :: xs -> loop (x :: acc) count xs | [] -> None // If it starts with 'startl', let's search for 'endl' if List.startsWith bracket input then - match loop [] (List.skip bracket.Length input) with + match loop [] 0 (List.skip bracket.Length input) with | Some (pre, post) -> Some(pre, List.skip bracket.Length post) | None -> None else @@ -183,16 +314,15 @@ let (|AutoLink|_|) input = /// TODO: This does not handle nested emphasis well. let (|Emphasised|_|) = function - | (('_' - | '*') :: _tail) as input -> - match input with + | LeftDelimiterRun(pre, run, post) -> + match run @ post with | DelimitedMarkdown [ '_'; '_'; '_' ] (body, rest) | DelimitedMarkdown [ '*'; '*'; '*' ] (body, rest) -> - Some(body, Emphasis >> List.singleton >> (fun s -> Strong(s, None)), rest) + Some(pre, body, Emphasis >> List.singleton >> (fun s -> Strong(s, None)), rest) | DelimitedMarkdown [ '_'; '_' ] (body, rest) - | DelimitedMarkdown [ '*'; '*' ] (body, rest) -> Some(body, Strong, rest) + | DelimitedMarkdown [ '*'; '*' ] (body, rest) -> Some(pre, body, Strong, rest) | DelimitedMarkdown [ '_' ] (body, rest) - | DelimitedMarkdown [ '*' ] (body, rest) -> Some(body, Emphasis, rest) + | DelimitedMarkdown [ '*' ] (body, rest) -> Some(pre, body, Emphasis, rest) | _ -> None | _ -> None @@ -392,13 +522,16 @@ let rec parseChars acc input (ctx: ParsingContext) = yield IndirectImage(String(Array.ofList body), original, key, ctx.CurrentRange) yield! parseChars [] rest ctx - // Handle emphasised text - | Emphasised (body, f, rest) -> + // Handle Emphasis + | CannotOpenEmphasis (revPre, post) -> yield! parseChars (revPre @ acc) post ctx + | Emphasised (pre, body, f, rest) -> let (value, ctx) = accLiterals.Value - yield! value + yield! value + yield! parseChars [] pre ctx let body = parseChars [] body ctx |> List.ofSeq yield f (body, ctx.CurrentRange) yield! parseChars [] rest ctx + // Encode '<' char if it is not link or inline HTML | '<' :: rest -> yield! parseChars (';' :: 't' :: 'l' :: '&' :: acc) rest ctx | '>' :: rest -> yield! parseChars (';' :: 't' :: 'g' :: '&' :: acc) rest ctx From 275f1a5de6f41546c4bd7d78367e24061cd23b29 Mon Sep 17 00:00:00 2001 From: Nicholas Hirschey Date: Fri, 19 Aug 2022 22:03:35 +0100 Subject: [PATCH 3/5] fantomas format --- .../MarkdownParser.fs | 98 ++++++++----------- tests/FSharp.Markdown.Tests/Markdown.fs | 76 +++++--------- 2 files changed, 66 insertions(+), 108 deletions(-) diff --git a/src/FSharp.Formatting.Markdown/MarkdownParser.fs b/src/FSharp.Formatting.Markdown/MarkdownParser.fs index e07a0c106..0e8a3a127 100644 --- a/src/FSharp.Formatting.Markdown/MarkdownParser.fs +++ b/src/FSharp.Formatting.Markdown/MarkdownParser.fs @@ -73,10 +73,13 @@ let inline (|EscapedLatexInlineMathChar|_|) input = let inline (|Punctuation|_|) input = match input with | EscapedChar _ -> None - | _ -> + | _ -> // from https://github.com/commonmark/commonmark.js/blob/master/lib/inlines.js#L38 - let re = """^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC9\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDF3C-\uDF3E]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]""" + let re = + """^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC9\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDF3C-\uDF3E]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]""" + let match' = Regex.Match(Array.ofList input |> String, re) + if match'.Success then let entity = match'.Value let _, rest = List.splitAt entity.Length input @@ -84,7 +87,7 @@ let inline (|Punctuation|_|) input = else None -let inline (|NotPunctuation|_|) input= +let inline (|NotPunctuation|_|) input = match input with | Punctuation _ -> None | _ -> Some input @@ -93,10 +96,10 @@ module Char = let inline (|WhiteSpace|_|) input = match input with | [] -> Some input - | x :: _xs -> - if String.IsNullOrWhiteSpace(string x) then - Some input - else + | x :: _xs -> + if String.IsNullOrWhiteSpace(string x) then + Some input + else None let inline (|NotWhiteSpace|_|) input = @@ -107,9 +110,10 @@ module Char = /// Succeeds when the specificed character list starts with a delimeter run. let inline (|DelimiterRun|_|) input = match input with - | ('*'|'_') :: _tail as (h :: t) -> + | ('*' + | '_') :: _tail as (h :: t) -> let run, rest = List.partitionWhile (fun x -> x = h) (h :: t) - Some (run, rest) + Some(run, rest) | _ -> None /// Succeeds when there's a match to a string of * or _ that could @@ -122,30 +126,25 @@ let inline (|LeftDelimiterRun|_|) input = // preceded by [Unicode whitespace] or a [Unicode punctuation character]. // // Passes 1 and 2a. - | DelimiterRun(_, Char.NotWhiteSpace _) & - DelimiterRun(run, NotPunctuation xs) -> - Some([], run, xs) - | _ :: DelimiterRun(_, Char.NotWhiteSpace _) & - h :: DelimiterRun(run, NotPunctuation xs) -> - Some([h], run, xs) + | DelimiterRun (_, Char.NotWhiteSpace _) & DelimiterRun (run, NotPunctuation xs) -> Some([], run, xs) + | _ :: DelimiterRun (_, Char.NotWhiteSpace _) & h :: DelimiterRun (run, NotPunctuation xs) -> Some([ h ], run, xs) // Passes 1 and 2b - | h :: DelimiterRun(run, Punctuation(x, xs)) -> - match [h] with + | h :: DelimiterRun (run, Punctuation (x, xs)) -> + match [ h ] with | Char.WhiteSpace _ - | Punctuation _ -> Some ([h], run, x :: xs) + | Punctuation _ -> Some([ h ], run, x :: xs) | _ -> None // Passes 1 and 2b when the run is at the start of the line. // |CannotStartEmphasis| ensures that we don't match this // when we've previously discarded a leading character. - | DelimiterRun(run, Punctuation(x, xs)) -> - Some ([], run, x :: xs) + | DelimiterRun (run, Punctuation (x, xs)) -> Some([], run, x :: xs) | _ -> None /// Succeeds when there's a match to a string of * or _ that could /// close emphasis. let inline (|RightDelimiterRun|_|) input = match input with - // A right-flanking delimiter run is + // A right-flanking delimiter run is // 1. not preceded by [Unicode whitepace] // 2. And either // a. not preceded by a [Unicode punctuation character], or @@ -153,22 +152,16 @@ let inline (|RightDelimiterRun|_|) input = // followed by [Unicode whitespace] or a [Unicode punctuation character] // // An escaped character followed by delimiter run matches 1 and 2a. - | EscapedChar(x, DelimiterRun(run, xs)) -> - Some (['\\'; x], run, xs) - | EscapedChar _ -> - None - | Char.NotWhiteSpace _ & - x :: DelimiterRun (run, xs) -> + | EscapedChar (x, DelimiterRun (run, xs)) -> Some([ '\\'; x ], run, xs) + | EscapedChar _ -> None + | Char.NotWhiteSpace _ & x :: DelimiterRun (run, xs) -> match input with // 1 and 2a - | NotPunctuation _ -> - Some ([x], run, xs) + | NotPunctuation _ -> Some([ x ], run, xs) // 1 and 2b - | Punctuation(x, DelimiterRun(run, Char.WhiteSpace ys)) -> - Some ([x], run, ys) + | Punctuation (x, DelimiterRun (run, Char.WhiteSpace ys)) -> Some([ x ], run, ys) // 1 and 2b - | Punctuation(x, DelimiterRun(run, Punctuation(y, ys))) -> - Some ([x], run, y :: ys) + | Punctuation (x, DelimiterRun (run, Punctuation (y, ys))) -> Some([ x ], run, y :: ys) | _ -> None | _ -> None @@ -185,18 +178,17 @@ let inline (|CannotOpenEmphasis|_|) input = // and either (a) not part of a [right-flanking delimiter run] // or (b) part of a [right-flanking delimiter run] // preceded by a [Unicode punctuation character]. - | LeftDelimiterRun _ & - RightDelimiterRun(pre, ['_'], post) -> + | LeftDelimiterRun _ & RightDelimiterRun (pre, [ '_' ], post) -> match List.rev pre with | Punctuation _ -> None - | revPre-> Some ('_' :: revPre, post) + | revPre -> Some('_' :: revPre, post) // We cannot pass 1 and 2b of the left flanking rule // when h is neither white space nor punctuation. - | h :: DelimiterRun(run, Punctuation(x, xs)) -> - match [h] with + | h :: DelimiterRun (run, Punctuation (x, xs)) -> + match [ h ] with | Char.WhiteSpace _ | Punctuation _ -> None - | _ -> Some (List.rev (h :: run), x :: xs) + | _ -> Some(List.rev (h :: run), x :: xs) | _ -> None /// Matches a list if it starts with a sub-list that is delimited @@ -208,26 +200,22 @@ let (|DelimitedMarkdown|_|) bracket input = // Like List.partitionUntilEquals, but skip over escaped characters let rec loop acc count = function - | (RightDelimiterRun(pre, ['_'], post) as input) - when endl = ['_'] -> + | (RightDelimiterRun (pre, [ '_' ], post) as input) when endl = [ '_' ] -> match input with - | LeftDelimiterRun(pre, run, (Punctuation _ as post)) -> - if count = 0 then - Some ((List.rev acc) @ pre, run @ post) + | LeftDelimiterRun (pre, run, (Punctuation _ as post)) -> + if count = 0 then + Some((List.rev acc) @ pre, run @ post) else loop ((List.rev (pre @ run)) @ acc) (count - 1) post - | LeftDelimiterRun(pre, run, post) -> - loop ((List.rev (pre @ run)) @ acc) (count + 1) post - | _ -> Some ((List.rev acc) @ pre, ['_'] @ post) - | RightDelimiterRun(pre, run, post) - when endl = run -> + | LeftDelimiterRun (pre, run, post) -> loop ((List.rev (pre @ run)) @ acc) (count + 1) post + | _ -> Some((List.rev acc) @ pre, [ '_' ] @ post) + | RightDelimiterRun (pre, run, post) when endl = run -> if count = 0 then - Some ((List.rev acc) @ pre, run @ post) + Some((List.rev acc) @ pre, run @ post) else loop ((List.rev (pre @ run)) @ acc) (count - 1) post - | EscapedChar(x, xs) -> loop (x :: '\\' :: acc) count xs - | LeftDelimiterRun(pre, run, post) when run = endl -> - loop ((List.rev (pre @ run)) @ acc) (count + 1) post + | EscapedChar (x, xs) -> loop (x :: '\\' :: acc) count xs + | LeftDelimiterRun (pre, run, post) when run = endl -> loop ((List.rev (pre @ run)) @ acc) (count + 1) post | x :: xs -> loop (x :: acc) count xs | [] -> None // If it starts with 'startl', let's search for 'endl' @@ -314,7 +302,7 @@ let (|AutoLink|_|) input = /// TODO: This does not handle nested emphasis well. let (|Emphasised|_|) = function - | LeftDelimiterRun(pre, run, post) -> + | LeftDelimiterRun (pre, run, post) -> match run @ post with | DelimitedMarkdown [ '_'; '_'; '_' ] (body, rest) | DelimitedMarkdown [ '*'; '*'; '*' ] (body, rest) -> @@ -526,7 +514,7 @@ let rec parseChars acc input (ctx: ParsingContext) = | CannotOpenEmphasis (revPre, post) -> yield! parseChars (revPre @ acc) post ctx | Emphasised (pre, body, f, rest) -> let (value, ctx) = accLiterals.Value - yield! value + yield! value yield! parseChars [] pre ctx let body = parseChars [] body ctx |> List.ofSeq yield f (body, ctx.CurrentRange) diff --git a/tests/FSharp.Markdown.Tests/Markdown.fs b/tests/FSharp.Markdown.Tests/Markdown.fs index aca176261..e7119ef1c 100644 --- a/tests/FSharp.Markdown.Tests/Markdown.fs +++ b/tests/FSharp.Markdown.Tests/Markdown.fs @@ -881,135 +881,105 @@ let ``Underscore inside italic and bold near punctuation is preserved`` () = [] let ``emphasis with space`` () = let doc = "*foo bar*" - let actual = - "

foo bar

\r\n" - |> properNewLines + let actual = "

foo bar

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``No emphasis if opening * is followed by whitespace`` () = let doc = "a * foo bar*" - let actual = - "

a * foo bar*

\r\n" - |> properNewLines + let actual = "

a * foo bar*

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``No emphasis if opening * is preceded by alphanumeric and followed by punctuation`` () = let doc = """a*"foo"*""" - let actual = - """

a*"foo"*

""" + "\r\n" - |> properNewLines + let actual = """

a*"foo"*

""" + "\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``Intraword emphasis with * is permitted`` () = let doc = "foo*bar*" - let actual = - "

foobar

\r\n" - |> properNewLines + let actual = "

foobar

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual let doc2 = "5*6*78" - let actual2 = - "

5678

\r\n" - |> properNewLines + let actual2 = "

5678

\r\n" |> properNewLines Markdown.ToHtml doc2 |> shouldEqual actual2 [] let ``emphasis using _ with space`` () = let doc = "_foo bar_" - let actual = - "

foo bar

\r\n" - |> properNewLines + let actual = "

foo bar

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``No emphasis if opening _ is followed by whitespace`` () = let doc = "_ foo bar_" - let actual = - "

_ foo bar_

\r\n" - |> properNewLines + let actual = "

_ foo bar_

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``No emphasis if opening _ is preceded by alphanumeric and followed by punctuation`` () = let doc = """a_"foo"_""" - let actual = - """

a_"foo"_

""" + "\r\n" - |> properNewLines + let actual = """

a_"foo"_

""" + "\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``Intraword emphasis with _ is not permitted`` () = let doc = "foo_bar_" - let actual = - "

foo_bar_

\r\n" - |> properNewLines + let actual = "

foo_bar_

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual let doc2 = "5_6_78" - let actual2 = - "

5_6_78

\r\n" - |> properNewLines + let actual2 = "

5_6_78

\r\n" |> properNewLines Markdown.ToHtml doc2 |> shouldEqual actual2 let doc3 = "пристаням_стремятся_" - let actual3 = - "

пристаням_стремятся_

\r\n" - |> properNewLines + let actual3 = "

пристаням_стремятся_

\r\n" |> properNewLines Markdown.ToHtml doc3 |> shouldEqual actual3 [] let ``No emphasis if first _ is right flanking and second is left flanking`` () = let doc = """aa_"bb"_cc""" - let actual = - """

aa_"bb"_cc

""" + "\r\n" - |> properNewLines + let actual = """

aa_"bb"_cc

""" + "\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``Emphasis if first _ is left and right flanking and preceded by punctuation`` () = let doc = "foo-_(bar)_" - let actual = - "

foo-(bar)

\r\n" - |> properNewLines + let actual = "

foo-(bar)

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``No emphasis if open and close delim do not match`` () = let doc = "_foo*" - let actual = - "

_foo*

\r\n" - |> properNewLines + let actual = "

_foo*

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual [] let ``No emphasis if closing * is preceded by whitespace`` () = let doc1 = "*foo bar *" - let actual1 = - "

*foo bar *

\r\n" - |> properNewLines + let actual1 = "

*foo bar *

\r\n" |> properNewLines Markdown.ToHtml doc1 |> shouldEqual actual1 - let doc2 = "*foo bar + let doc2 = + "*foo bar *" + let actual2 = "

*foo bar -*

" + "\r\n" +*

" + + "\r\n" |> properNewLines + Markdown.ToHtml doc2 |> shouldEqual actual2 [] let ``Do not close emphasis if second * is preceded by punctuation and followed by alphanumeric`` () = let doc = "*(*foo)" - let actual = - "

*(*foo)

\r\n" - |> properNewLines + let actual = "

*(*foo)

\r\n" |> properNewLines Markdown.ToHtml doc |> shouldEqual actual let doc2 = "*(*foo*)*" - let actual2 = - "

(foo)

\r\n" - |> properNewLines + let actual2 = "

(foo)

\r\n" |> properNewLines Markdown.ToHtml doc2 |> shouldEqual actual2 From 9a709b717320215d88c915e60b969bbb6054ee41 Mon Sep 17 00:00:00 2001 From: Nicholas Hirschey Date: Fri, 19 Aug 2022 22:06:57 +0100 Subject: [PATCH 4/5] Update RELEASE_NOTES.md --- RELEASE_NOTES.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 67ef98660..ae674cc01 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,6 @@ +## 16.0.2 +* [Improves markdown emphasis parsing.](https://github.com/fsprojects/FSharp.Formatting/pull/763) + ## 16.0.1 * Custom templaying for menus From e06752ad7046e02040d0af4388482ddada3131bd Mon Sep 17 00:00:00 2001 From: Don Syme Date: Tue, 23 Aug 2022 15:57:16 +0100 Subject: [PATCH 5/5] Update MarkdownParser.fs --- .../MarkdownParser.fs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/FSharp.Formatting.Markdown/MarkdownParser.fs b/src/FSharp.Formatting.Markdown/MarkdownParser.fs index 0e8a3a127..07a45c6e1 100644 --- a/src/FSharp.Formatting.Markdown/MarkdownParser.fs +++ b/src/FSharp.Formatting.Markdown/MarkdownParser.fs @@ -42,7 +42,7 @@ let getLinkAndTitle (StringPosition.TrimBoth (input, _n)) = /// Succeeds when the specified character list starts with an escaped /// character - in that case, returns the character and the tail of the list -let inline (|EscapedChar|_|) input = +let (|EscapedChar|_|) input = match input with | '\\' :: (('*' | '\\' @@ -64,13 +64,13 @@ let inline (|EscapedChar|_|) input = | _ -> None /// Escape dollar inside a LaTex inline math span. -let inline (|EscapedLatexInlineMathChar|_|) input = +let (|EscapedLatexInlineMathChar|_|) input = match input with | '\\' :: (('$') as c) :: rest -> Some(c, rest) | _ -> None /// Succeeds when the specificed character list starts with non-escaped punctuation. -let inline (|Punctuation|_|) input = +let (|Punctuation|_|) input = match input with | EscapedChar _ -> None | _ -> @@ -87,13 +87,13 @@ let inline (|Punctuation|_|) input = else None -let inline (|NotPunctuation|_|) input = +let (|NotPunctuation|_|) input = match input with | Punctuation _ -> None | _ -> Some input module Char = - let inline (|WhiteSpace|_|) input = + let (|WhiteSpace|_|) input = match input with | [] -> Some input | x :: _xs -> @@ -102,13 +102,13 @@ module Char = else None - let inline (|NotWhiteSpace|_|) input = + let (|NotWhiteSpace|_|) input = match input with | WhiteSpace _ -> None | _ -> Some input /// Succeeds when the specificed character list starts with a delimeter run. -let inline (|DelimiterRun|_|) input = +let (|DelimiterRun|_|) input = match input with | ('*' | '_') :: _tail as (h :: t) -> @@ -118,7 +118,7 @@ let inline (|DelimiterRun|_|) input = /// Succeeds when there's a match to a string of * or _ that could /// open emphasis. -let inline (|LeftDelimiterRun|_|) input = +let (|LeftDelimiterRun|_|) input = match input with // (1) Not followed by [Unicode whitespace] and // (2a) not followed by a [Unicode punctuation character] or @@ -142,7 +142,7 @@ let inline (|LeftDelimiterRun|_|) input = /// Succeeds when there's a match to a string of * or _ that could /// close emphasis. -let inline (|RightDelimiterRun|_|) input = +let (|RightDelimiterRun|_|) input = match input with // A right-flanking delimiter run is // 1. not preceded by [Unicode whitepace] @@ -171,7 +171,7 @@ let inline (|RightDelimiterRun|_|) input = /// in this case we need to skip both 'c' and the LeftDelimiterRun. /// If we only skipped 'c' then we could match LeftDelimiterRun /// on the next iteration and we do not want that to happen. -let inline (|CannotOpenEmphasis|_|) input = +let (|CannotOpenEmphasis|_|) input = match input with // Rule #2: A single `_` character [can open emphasis] iff // it is part of a [left-flanking delimiter run]