Skip to content

Commit

Permalink
Merge pull request #763 from nhirschey/emph2
Browse files Browse the repository at this point in the history
Pass more commonmark emphasis parsing tests
  • Loading branch information
dsyme authored Aug 23, 2022
2 parents 7a4aca3 + e06752a commit 31950ff
Show file tree
Hide file tree
Showing 3 changed files with 260 additions and 30 deletions.
3 changes: 3 additions & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
## 16.0.2
* [Improves markdown emphasis parsing.](https://github.com/fsprojects/FSharp.Formatting/pull/763)

## 16.0.1

* Custom templaying for menus
Expand Down
181 changes: 151 additions & 30 deletions src/FSharp.Formatting.Markdown/MarkdownParser.fs
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ let getLinkAndTitle (StringPosition.TrimBoth (input, _n)) =

/// Succeeds when the specified character list starts with an escaped
/// character - in that case, returns the character and the tail of the list
let inline (|EscapedChar|_|) input =
let (|EscapedChar|_|) input =
match input with
| '\\' :: (('*'
| '\\'
Expand All @@ -64,22 +64,132 @@ let inline (|EscapedChar|_|) input =
| _ -> None

/// Escape dollar inside a LaTex inline math span.
let inline (|EscapedLatexInlineMathChar|_|) input =
let (|EscapedLatexInlineMathChar|_|) input =
match input with
| '\\' :: (('$') as c) :: rest -> Some(c, rest)
| _ -> None

/// Succeeds when the specified character list starts with a letter or number
let inline (|AlphaNum|_|) input =
let re = """^[a-zA-Z0-9]"""
let match' = Regex.Match(Array.ofList input |> String, re)
/// Succeeds when the specificed character list starts with non-escaped punctuation.
let (|Punctuation|_|) input =
match input with
| EscapedChar _ -> None
| _ ->
// from https://github.com/commonmark/commonmark.js/blob/master/lib/inlines.js#L38
let re =
"""^[!"#$%&'()*+,\-./:;<=>?@\[\]\\^_`{|}~\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u0AF0\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E42\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC9\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDF3C-\uDF3E]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]"""

if match'.Success then
let entity = match'.Value
let _, rest = List.splitAt entity.Length input
Some(char entity, rest)
else
None
let match' = Regex.Match(Array.ofList input |> String, re)

if match'.Success then
let entity = match'.Value
let _, rest = List.splitAt entity.Length input
Some(char entity, rest)
else
None

let (|NotPunctuation|_|) input =
match input with
| Punctuation _ -> None
| _ -> Some input

module Char =
let (|WhiteSpace|_|) input =
match input with
| [] -> Some input
| x :: _xs ->
if String.IsNullOrWhiteSpace(string x) then
Some input
else
None

let (|NotWhiteSpace|_|) input =
match input with
| WhiteSpace _ -> None
| _ -> Some input

/// Succeeds when the specificed character list starts with a delimeter run.
let (|DelimiterRun|_|) input =
match input with
| ('*'
| '_') :: _tail as (h :: t) ->
let run, rest = List.partitionWhile (fun x -> x = h) (h :: t)
Some(run, rest)
| _ -> None

/// Succeeds when there's a match to a string of * or _ that could
/// open emphasis.
let (|LeftDelimiterRun|_|) input =
match input with
// (1) Not followed by [Unicode whitespace] and
// (2a) not followed by a [Unicode punctuation character] or
// (2b) followed by a [Unicode punctuation character] and
// preceded by [Unicode whitespace] or a [Unicode punctuation character].
//
// Passes 1 and 2a.
| DelimiterRun (_, Char.NotWhiteSpace _) & DelimiterRun (run, NotPunctuation xs) -> Some([], run, xs)
| _ :: DelimiterRun (_, Char.NotWhiteSpace _) & h :: DelimiterRun (run, NotPunctuation xs) -> Some([ h ], run, xs)
// Passes 1 and 2b
| h :: DelimiterRun (run, Punctuation (x, xs)) ->
match [ h ] with
| Char.WhiteSpace _
| Punctuation _ -> Some([ h ], run, x :: xs)
| _ -> None
// Passes 1 and 2b when the run is at the start of the line.
// |CannotStartEmphasis| ensures that we don't match this
// when we've previously discarded a leading character.
| DelimiterRun (run, Punctuation (x, xs)) -> Some([], run, x :: xs)
| _ -> None

/// Succeeds when there's a match to a string of * or _ that could
/// close emphasis.
let (|RightDelimiterRun|_|) input =
match input with
// A right-flanking delimiter run is
// 1. not preceded by [Unicode whitepace]
// 2. And either
// a. not preceded by a [Unicode punctuation character], or
// b. preceded by a [Unicode punctuation character] and
// followed by [Unicode whitespace] or a [Unicode punctuation character]
//
// An escaped character followed by delimiter run matches 1 and 2a.
| EscapedChar (x, DelimiterRun (run, xs)) -> Some([ '\\'; x ], run, xs)
| EscapedChar _ -> None
| Char.NotWhiteSpace _ & x :: DelimiterRun (run, xs) ->
match input with
// 1 and 2a
| NotPunctuation _ -> Some([ x ], run, xs)
// 1 and 2b
| Punctuation (x, DelimiterRun (run, Char.WhiteSpace ys)) -> Some([ x ], run, ys)
// 1 and 2b
| Punctuation (x, DelimiterRun (run, Punctuation (y, ys))) -> Some([ x ], run, y :: ys)
| _ -> None
| _ -> None

/// Matches ['c',LeftDelimiterRun]::xs that should
/// not open emphasis. This is useful because the
/// parser iterates through characters one by one and
/// in this case we need to skip both 'c' and the LeftDelimiterRun.
/// If we only skipped 'c' then we could match LeftDelimiterRun
/// on the next iteration and we do not want that to happen.
let (|CannotOpenEmphasis|_|) input =
match input with
// Rule #2: A single `_` character [can open emphasis] iff
// it is part of a [left-flanking delimiter run]
// and either (a) not part of a [right-flanking delimiter run]
// or (b) part of a [right-flanking delimiter run]
// preceded by a [Unicode punctuation character].
| LeftDelimiterRun _ & RightDelimiterRun (pre, [ '_' ], post) ->
match List.rev pre with
| Punctuation _ -> None
| revPre -> Some('_' :: revPre, post)
// We cannot pass 1 and 2b of the left flanking rule
// when h is neither white space nor punctuation.
| h :: DelimiterRun (run, Punctuation (x, xs)) ->
match [ h ] with
| Char.WhiteSpace _
| Punctuation _ -> None
| _ -> Some(List.rev (h :: run), x :: xs)
| _ -> None

/// Matches a list if it starts with a sub-list that is delimited
/// using the specified delimiters. Returns a wrapped list and the rest.
Expand All @@ -88,20 +198,29 @@ let inline (|AlphaNum|_|) input =
let (|DelimitedMarkdown|_|) bracket input =
let _startl, endl = bracket, bracket
// Like List.partitionUntilEquals, but skip over escaped characters
let rec loop acc =
let rec loop acc count =
function
| EscapedChar (x, xs) -> loop (x :: '\\' :: acc) xs
| input when List.startsWith endl input ->
let rest = List.skip bracket.Length input

match rest with
| AlphaNum (x, xs) -> loop (x :: endl @ acc) xs
| _ -> Some(List.rev acc, input)
| x :: xs -> loop (x :: acc) xs
| (RightDelimiterRun (pre, [ '_' ], post) as input) when endl = [ '_' ] ->
match input with
| LeftDelimiterRun (pre, run, (Punctuation _ as post)) ->
if count = 0 then
Some((List.rev acc) @ pre, run @ post)
else
loop ((List.rev (pre @ run)) @ acc) (count - 1) post
| LeftDelimiterRun (pre, run, post) -> loop ((List.rev (pre @ run)) @ acc) (count + 1) post
| _ -> Some((List.rev acc) @ pre, [ '_' ] @ post)
| RightDelimiterRun (pre, run, post) when endl = run ->
if count = 0 then
Some((List.rev acc) @ pre, run @ post)
else
loop ((List.rev (pre @ run)) @ acc) (count - 1) post
| EscapedChar (x, xs) -> loop (x :: '\\' :: acc) count xs
| LeftDelimiterRun (pre, run, post) when run = endl -> loop ((List.rev (pre @ run)) @ acc) (count + 1) post
| x :: xs -> loop (x :: acc) count xs
| [] -> None
// If it starts with 'startl', let's search for 'endl'
if List.startsWith bracket input then
match loop [] (List.skip bracket.Length input) with
match loop [] 0 (List.skip bracket.Length input) with
| Some (pre, post) -> Some(pre, List.skip bracket.Length post)
| None -> None
else
Expand Down Expand Up @@ -183,16 +302,15 @@ let (|AutoLink|_|) input =
/// TODO: This does not handle nested emphasis well.
let (|Emphasised|_|) =
function
| (('_'
| '*') :: _tail) as input ->
match input with
| LeftDelimiterRun (pre, run, post) ->
match run @ post with
| DelimitedMarkdown [ '_'; '_'; '_' ] (body, rest)
| DelimitedMarkdown [ '*'; '*'; '*' ] (body, rest) ->
Some(body, Emphasis >> List.singleton >> (fun s -> Strong(s, None)), rest)
Some(pre, body, Emphasis >> List.singleton >> (fun s -> Strong(s, None)), rest)
| DelimitedMarkdown [ '_'; '_' ] (body, rest)
| DelimitedMarkdown [ '*'; '*' ] (body, rest) -> Some(body, Strong, rest)
| DelimitedMarkdown [ '*'; '*' ] (body, rest) -> Some(pre, body, Strong, rest)
| DelimitedMarkdown [ '_' ] (body, rest)
| DelimitedMarkdown [ '*' ] (body, rest) -> Some(body, Emphasis, rest)
| DelimitedMarkdown [ '*' ] (body, rest) -> Some(pre, body, Emphasis, rest)
| _ -> None
| _ -> None

Expand Down Expand Up @@ -392,13 +510,16 @@ let rec parseChars acc input (ctx: ParsingContext) =
yield IndirectImage(String(Array.ofList body), original, key, ctx.CurrentRange)
yield! parseChars [] rest ctx

// Handle emphasised text
| Emphasised (body, f, rest) ->
// Handle Emphasis
| CannotOpenEmphasis (revPre, post) -> yield! parseChars (revPre @ acc) post ctx
| Emphasised (pre, body, f, rest) ->
let (value, ctx) = accLiterals.Value
yield! value
yield! parseChars [] pre ctx
let body = parseChars [] body ctx |> List.ofSeq
yield f (body, ctx.CurrentRange)
yield! parseChars [] rest ctx

// Encode '<' char if it is not link or inline HTML
| '<' :: rest -> yield! parseChars (';' :: 't' :: 'l' :: '&' :: acc) rest ctx
| '>' :: rest -> yield! parseChars (';' :: 't' :: 'g' :: '&' :: acc) rest ctx
Expand Down
106 changes: 106 additions & 0 deletions tests/FSharp.Markdown.Tests/Markdown.fs
Original file line number Diff line number Diff line change
Expand Up @@ -877,3 +877,109 @@ let ``Underscore inside italic and bold near punctuation is preserved`` () =
|> properNewLines

Markdown.ToHtml doc |> shouldEqual expected

[<Test>]
let ``emphasis with space`` () =
let doc = "*foo bar*"
let actual = "<p><em>foo bar</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if opening * is followed by whitespace`` () =
let doc = "a * foo bar*"
let actual = "<p>a * foo bar*</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if opening * is preceded by alphanumeric and followed by punctuation`` () =
let doc = """a*"foo"*"""
let actual = """<p>a*"foo"*</p>""" + "\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``Intraword emphasis with * is permitted`` () =
let doc = "foo*bar*"
let actual = "<p>foo<em>bar</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

let doc2 = "5*6*78"
let actual2 = "<p>5<em>6</em>78</p>\r\n" |> properNewLines
Markdown.ToHtml doc2 |> shouldEqual actual2

[<Test>]
let ``emphasis using _ with space`` () =
let doc = "_foo bar_"
let actual = "<p><em>foo bar</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if opening _ is followed by whitespace`` () =
let doc = "_ foo bar_"
let actual = "<p>_ foo bar_</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if opening _ is preceded by alphanumeric and followed by punctuation`` () =
let doc = """a_"foo"_"""
let actual = """<p>a_"foo"_</p>""" + "\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``Intraword emphasis with _ is not permitted`` () =
let doc = "foo_bar_"
let actual = "<p>foo_bar_</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

let doc2 = "5_6_78"
let actual2 = "<p>5_6_78</p>\r\n" |> properNewLines
Markdown.ToHtml doc2 |> shouldEqual actual2

let doc3 = "пристаням_стремятся_"
let actual3 = "<p>пристаням_стремятся_</p>\r\n" |> properNewLines
Markdown.ToHtml doc3 |> shouldEqual actual3

[<Test>]
let ``No emphasis if first _ is right flanking and second is left flanking`` () =
let doc = """aa_"bb"_cc"""
let actual = """<p>aa_"bb"_cc</p>""" + "\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``Emphasis if first _ is left and right flanking and preceded by punctuation`` () =
let doc = "foo-_(bar)_"
let actual = "<p>foo-<em>(bar)</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if open and close delim do not match`` () =
let doc = "_foo*"
let actual = "<p>_foo*</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

[<Test>]
let ``No emphasis if closing * is preceded by whitespace`` () =
let doc1 = "*foo bar *"
let actual1 = "<p>*foo bar *</p>\r\n" |> properNewLines
Markdown.ToHtml doc1 |> shouldEqual actual1

let doc2 =
"*foo bar
*"

let actual2 =
"<p>*foo bar
*</p>"
+ "\r\n"
|> properNewLines

Markdown.ToHtml doc2 |> shouldEqual actual2

[<Test>]
let ``Do not close emphasis if second * is preceded by punctuation and followed by alphanumeric`` () =
let doc = "*(*foo)"
let actual = "<p>*(*foo)</p>\r\n" |> properNewLines
Markdown.ToHtml doc |> shouldEqual actual

let doc2 = "*(*foo*)*"
let actual2 = "<p><em>(<em>foo</em>)</em></p>\r\n" |> properNewLines
Markdown.ToHtml doc2 |> shouldEqual actual2

0 comments on commit 31950ff

Please sign in to comment.