diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 67ef98660..ae674cc01 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,3 +1,6 @@ +## 16.0.2 +* [Improves markdown emphasis parsing.](https://github.com/fsprojects/FSharp.Formatting/pull/763) + ## 16.0.1 * Custom templaying for menus diff --git a/src/FSharp.Formatting.Markdown/MarkdownParser.fs b/src/FSharp.Formatting.Markdown/MarkdownParser.fs index ae24b9048..07a45c6e1 100644 --- a/src/FSharp.Formatting.Markdown/MarkdownParser.fs +++ b/src/FSharp.Formatting.Markdown/MarkdownParser.fs @@ -42,7 +42,7 @@ let getLinkAndTitle (StringPosition.TrimBoth (input, _n)) = /// Succeeds when the specified character list starts with an escaped /// character - in that case, returns the character and the tail of the list -let inline (|EscapedChar|_|) input = +let (|EscapedChar|_|) input = match input with | '\\' :: (('*' | '\\' @@ -64,22 +64,132 @@ let inline (|EscapedChar|_|) input = | _ -> None /// Escape dollar inside a LaTex inline math span. -let inline (|EscapedLatexInlineMathChar|_|) input = +let (|EscapedLatexInlineMathChar|_|) input = match input with | '\\' :: (('$') as c) :: rest -> Some(c, rest) | _ -> None -/// Succeeds when the specified character list starts with a letter or number -let inline (|AlphaNum|_|) input = - let re = """^[a-zA-Z0-9]""" - let match' = Regex.Match(Array.ofList input |> String, re) +/// Succeeds when the specificed character list starts with non-escaped punctuation. +let (|Punctuation|_|) input = + match input with + | EscapedChar _ -> None + | _ -> + // from https://github.com/commonmark/commonmark.js/blob/master/lib/inlines.js#L38 + let re = + """^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC9\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDF3C-\uDF3E]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]""" - if match'.Success then - let entity = match'.Value - let _, rest = List.splitAt entity.Length input - Some(char entity, rest) - else - None + let match' = Regex.Match(Array.ofList input |> String, re) + + if match'.Success then + let entity = match'.Value + let _, rest = List.splitAt entity.Length input + Some(char entity, rest) + else + None + +let (|NotPunctuation|_|) input = + match input with + | Punctuation _ -> None + | _ -> Some input + +module Char = + let (|WhiteSpace|_|) input = + match input with + | [] -> Some input + | x :: _xs -> + if String.IsNullOrWhiteSpace(string x) then + Some input + else + None + + let (|NotWhiteSpace|_|) input = + match input with + | WhiteSpace _ -> None + | _ -> Some input + +/// Succeeds when the specificed character list starts with a delimeter run. +let (|DelimiterRun|_|) input = + match input with + | ('*' + | '_') :: _tail as (h :: t) -> + let run, rest = List.partitionWhile (fun x -> x = h) (h :: t) + Some(run, rest) + | _ -> None + +/// Succeeds when there's a match to a string of * or _ that could +/// open emphasis. +let (|LeftDelimiterRun|_|) input = + match input with + // (1) Not followed by [Unicode whitespace] and + // (2a) not followed by a [Unicode punctuation character] or + // (2b) followed by a [Unicode punctuation character] and + // preceded by [Unicode whitespace] or a [Unicode punctuation character]. + // + // Passes 1 and 2a. + | DelimiterRun (_, Char.NotWhiteSpace _) & DelimiterRun (run, NotPunctuation xs) -> Some([], run, xs) + | _ :: DelimiterRun (_, Char.NotWhiteSpace _) & h :: DelimiterRun (run, NotPunctuation xs) -> Some([ h ], run, xs) + // Passes 1 and 2b + | h :: DelimiterRun (run, Punctuation (x, xs)) -> + match [ h ] with + | Char.WhiteSpace _ + | Punctuation _ -> Some([ h ], run, x :: xs) + | _ -> None + // Passes 1 and 2b when the run is at the start of the line. + // |CannotStartEmphasis| ensures that we don't match this + // when we've previously discarded a leading character. + | DelimiterRun (run, Punctuation (x, xs)) -> Some([], run, x :: xs) + | _ -> None + +/// Succeeds when there's a match to a string of * or _ that could +/// close emphasis. +let (|RightDelimiterRun|_|) input = + match input with + // A right-flanking delimiter run is + // 1. not preceded by [Unicode whitepace] + // 2. And either + // a. not preceded by a [Unicode punctuation character], or + // b. preceded by a [Unicode punctuation character] and + // followed by [Unicode whitespace] or a [Unicode punctuation character] + // + // An escaped character followed by delimiter run matches 1 and 2a. + | EscapedChar (x, DelimiterRun (run, xs)) -> Some([ '\\'; x ], run, xs) + | EscapedChar _ -> None + | Char.NotWhiteSpace _ & x :: DelimiterRun (run, xs) -> + match input with + // 1 and 2a + | NotPunctuation _ -> Some([ x ], run, xs) + // 1 and 2b + | Punctuation (x, DelimiterRun (run, Char.WhiteSpace ys)) -> Some([ x ], run, ys) + // 1 and 2b + | Punctuation (x, DelimiterRun (run, Punctuation (y, ys))) -> Some([ x ], run, y :: ys) + | _ -> None + | _ -> None + +/// Matches ['c',LeftDelimiterRun]::xs that should +/// not open emphasis. This is useful because the +/// parser iterates through characters one by one and +/// in this case we need to skip both 'c' and the LeftDelimiterRun. +/// If we only skipped 'c' then we could match LeftDelimiterRun +/// on the next iteration and we do not want that to happen. +let (|CannotOpenEmphasis|_|) input = + match input with + // Rule #2: A single `_` character [can open emphasis] iff + // it is part of a [left-flanking delimiter run] + // and either (a) not part of a [right-flanking delimiter run] + // or (b) part of a [right-flanking delimiter run] + // preceded by a [Unicode punctuation character]. + | LeftDelimiterRun _ & RightDelimiterRun (pre, [ '_' ], post) -> + match List.rev pre with + | Punctuation _ -> None + | revPre -> Some('_' :: revPre, post) + // We cannot pass 1 and 2b of the left flanking rule + // when h is neither white space nor punctuation. + | h :: DelimiterRun (run, Punctuation (x, xs)) -> + match [ h ] with + | Char.WhiteSpace _ + | Punctuation _ -> None + | _ -> Some(List.rev (h :: run), x :: xs) + | _ -> None /// Matches a list if it starts with a sub-list that is delimited /// using the specified delimiters. Returns a wrapped list and the rest. @@ -88,20 +198,29 @@ let inline (|AlphaNum|_|) input = let (|DelimitedMarkdown|_|) bracket input = let _startl, endl = bracket, bracket // Like List.partitionUntilEquals, but skip over escaped characters - let rec loop acc = + let rec loop acc count = function - | EscapedChar (x, xs) -> loop (x :: '\\' :: acc) xs - | input when List.startsWith endl input -> - let rest = List.skip bracket.Length input - - match rest with - | AlphaNum (x, xs) -> loop (x :: endl @ acc) xs - | _ -> Some(List.rev acc, input) - | x :: xs -> loop (x :: acc) xs + | (RightDelimiterRun (pre, [ '_' ], post) as input) when endl = [ '_' ] -> + match input with + | LeftDelimiterRun (pre, run, (Punctuation _ as post)) -> + if count = 0 then + Some((List.rev acc) @ pre, run @ post) + else + loop ((List.rev (pre @ run)) @ acc) (count - 1) post + | LeftDelimiterRun (pre, run, post) -> loop ((List.rev (pre @ run)) @ acc) (count + 1) post + | _ -> Some((List.rev acc) @ pre, [ '_' ] @ post) + | RightDelimiterRun (pre, run, post) when endl = run -> + if count = 0 then + Some((List.rev acc) @ pre, run @ post) + else + loop ((List.rev (pre @ run)) @ acc) (count - 1) post + | EscapedChar (x, xs) -> loop (x :: '\\' :: acc) count xs + | LeftDelimiterRun (pre, run, post) when run = endl -> loop ((List.rev (pre @ run)) @ acc) (count + 1) post + | x :: xs -> loop (x :: acc) count xs | [] -> None // If it starts with 'startl', let's search for 'endl' if List.startsWith bracket input then - match loop [] (List.skip bracket.Length input) with + match loop [] 0 (List.skip bracket.Length input) with | Some (pre, post) -> Some(pre, List.skip bracket.Length post) | None -> None else @@ -183,16 +302,15 @@ let (|AutoLink|_|) input = /// TODO: This does not handle nested emphasis well. let (|Emphasised|_|) = function - | (('_' - | '*') :: _tail) as input -> - match input with + | LeftDelimiterRun (pre, run, post) -> + match run @ post with | DelimitedMarkdown [ '_'; '_'; '_' ] (body, rest) | DelimitedMarkdown [ '*'; '*'; '*' ] (body, rest) -> - Some(body, Emphasis >> List.singleton >> (fun s -> Strong(s, None)), rest) + Some(pre, body, Emphasis >> List.singleton >> (fun s -> Strong(s, None)), rest) | DelimitedMarkdown [ '_'; '_' ] (body, rest) - | DelimitedMarkdown [ '*'; '*' ] (body, rest) -> Some(body, Strong, rest) + | DelimitedMarkdown [ '*'; '*' ] (body, rest) -> Some(pre, body, Strong, rest) | DelimitedMarkdown [ '_' ] (body, rest) - | DelimitedMarkdown [ '*' ] (body, rest) -> Some(body, Emphasis, rest) + | DelimitedMarkdown [ '*' ] (body, rest) -> Some(pre, body, Emphasis, rest) | _ -> None | _ -> None @@ -392,13 +510,16 @@ let rec parseChars acc input (ctx: ParsingContext) = yield IndirectImage(String(Array.ofList body), original, key, ctx.CurrentRange) yield! parseChars [] rest ctx - // Handle emphasised text - | Emphasised (body, f, rest) -> + // Handle Emphasis + | CannotOpenEmphasis (revPre, post) -> yield! parseChars (revPre @ acc) post ctx + | Emphasised (pre, body, f, rest) -> let (value, ctx) = accLiterals.Value yield! value + yield! parseChars [] pre ctx let body = parseChars [] body ctx |> List.ofSeq yield f (body, ctx.CurrentRange) yield! parseChars [] rest ctx + // Encode '<' char if it is not link or inline HTML | '<' :: rest -> yield! parseChars (';' :: 't' :: 'l' :: '&' :: acc) rest ctx | '>' :: rest -> yield! parseChars (';' :: 't' :: 'g' :: '&' :: acc) rest ctx diff --git a/tests/FSharp.Markdown.Tests/Markdown.fs b/tests/FSharp.Markdown.Tests/Markdown.fs index 0106bbcd2..e7119ef1c 100644 --- a/tests/FSharp.Markdown.Tests/Markdown.fs +++ b/tests/FSharp.Markdown.Tests/Markdown.fs @@ -877,3 +877,109 @@ let ``Underscore inside italic and bold near punctuation is preserved`` () = |> properNewLines Markdown.ToHtml doc |> shouldEqual expected + +[] +let ``emphasis with space`` () = + let doc = "*foo bar*" + let actual = "

foo bar

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if opening * is followed by whitespace`` () = + let doc = "a * foo bar*" + let actual = "

a * foo bar*

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if opening * is preceded by alphanumeric and followed by punctuation`` () = + let doc = """a*"foo"*""" + let actual = """

a*"foo"*

""" + "\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``Intraword emphasis with * is permitted`` () = + let doc = "foo*bar*" + let actual = "

foobar

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + + let doc2 = "5*6*78" + let actual2 = "

5678

\r\n" |> properNewLines + Markdown.ToHtml doc2 |> shouldEqual actual2 + +[] +let ``emphasis using _ with space`` () = + let doc = "_foo bar_" + let actual = "

foo bar

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if opening _ is followed by whitespace`` () = + let doc = "_ foo bar_" + let actual = "

_ foo bar_

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if opening _ is preceded by alphanumeric and followed by punctuation`` () = + let doc = """a_"foo"_""" + let actual = """

a_"foo"_

""" + "\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``Intraword emphasis with _ is not permitted`` () = + let doc = "foo_bar_" + let actual = "

foo_bar_

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + + let doc2 = "5_6_78" + let actual2 = "

5_6_78

\r\n" |> properNewLines + Markdown.ToHtml doc2 |> shouldEqual actual2 + + let doc3 = "пристаням_стремятся_" + let actual3 = "

пристаням_стремятся_

\r\n" |> properNewLines + Markdown.ToHtml doc3 |> shouldEqual actual3 + +[] +let ``No emphasis if first _ is right flanking and second is left flanking`` () = + let doc = """aa_"bb"_cc""" + let actual = """

aa_"bb"_cc

""" + "\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``Emphasis if first _ is left and right flanking and preceded by punctuation`` () = + let doc = "foo-_(bar)_" + let actual = "

foo-(bar)

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if open and close delim do not match`` () = + let doc = "_foo*" + let actual = "

_foo*

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + +[] +let ``No emphasis if closing * is preceded by whitespace`` () = + let doc1 = "*foo bar *" + let actual1 = "

*foo bar *

\r\n" |> properNewLines + Markdown.ToHtml doc1 |> shouldEqual actual1 + + let doc2 = + "*foo bar +*" + + let actual2 = + "

*foo bar +*

" + + "\r\n" + |> properNewLines + + Markdown.ToHtml doc2 |> shouldEqual actual2 + +[] +let ``Do not close emphasis if second * is preceded by punctuation and followed by alphanumeric`` () = + let doc = "*(*foo)" + let actual = "

*(*foo)

\r\n" |> properNewLines + Markdown.ToHtml doc |> shouldEqual actual + + let doc2 = "*(*foo*)*" + let actual2 = "

(foo)

\r\n" |> properNewLines + Markdown.ToHtml doc2 |> shouldEqual actual2